diff --git a/.gitignore b/.gitignore
index debec551d9cd7344a31efbbb709bfbb759a15d3f..801790d0a472080af607e9fbcde0284902a4ead8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,10 +6,14 @@ paddle/fluid/eager/api/generated/*
 paddle/fluid/op_use_default_grad_maker_DEV.spec
 paddle/fluid/op_use_default_grad_maker_PR.spec
 paddle/phi/api/backward/backward_api.h
+paddle/phi/api/backward/sparse_bw_api.h
 paddle/phi/api/include/api.h
+paddle/phi/api/include/sparse_api.h
 paddle/phi/api/lib/api.cc
 paddle/phi/api/lib/dygraph_api.*
 paddle/phi/api/lib/backward_api.cc
+paddle/phi/api/lib/sparse_api.cc
+paddle/phi/api/lib/sparse_bw_api.cc
 paddle/phi/extension.h
 paddle/phi/include/*
 paddle/phi/infermeta/generated.*
@@ -52,6 +56,7 @@ paddle/infrt/dialect/pd_ops.td
 paddle/infrt/dialect/phi/ir/phi_cpu_kernels.td
 paddle/infrt/dialect/phi/ir/phi_gpu_kernels.td
 tools/infrt/kernels.json
+tools/infrt/kernel_signature.json
 paddle/infrt/dialect/pd_ops_info.h
 .lit_test_times.txt
 paddle/infrt/tests/dialect/Output
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 4c5f711d2918bc2a2f8322cc9cd9f3a603c56ab1..6988434996bcc4745726b34278eb6007fdf8605f 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -53,6 +53,7 @@ option(WITH_IPU         "Compile PaddlePaddle with Graphcore IPU"    OFF)
 # to develop some acl related functionality on x86
 option(WITH_ASCEND_CL         "Compile PaddlePaddle with ASCEND CL"        ${WITH_ASCEND})
 option(WITH_ASCEND_CXX11         "Compile PaddlePaddle with ASCEND and CXX11 ABI"        OFF)
+option(WITH_ONNXRUNTIME         "Compile PaddlePaddle with ONNXRUNTIME"          OFF)
 # Note(zhouwei): It use option above, so put here
 include(init)
 include(generic)            # simplify cmake module
diff --git a/README.md b/README.md
index 7dc83aa695cef8ecf177dfc2c444888850342bdc..cdbf2d9f3bf9973fb6c7fe2365ea61f05ce998c1 100644
--- a/README.md
+++ b/README.md
@@ -15,7 +15,7 @@ English | [简体中文](./README_cn.md)
 Welcome to the PaddlePaddle GitHub.
 
 PaddlePaddle, as the only independent R&D deep learning platform in China, has been officially open-sourced to professional communities since 2016. It is an industrial platform with advanced technologies and rich features that cover core deep learning frameworks, basic model libraries, end-to-end development kits, tools & components as well as service platforms.
-PaddlePaddle is originated from industrial practices with dedication and commitments to industrialization. It has been widely adopted by a wide range of sectors including manufacturing, agriculture, enterprise service, and so on while serving more than 2.3 million developers. With such advantages, PaddlePaddle has helped an increasing number of partners commercialize AI.
+PaddlePaddle is originated from industrial practices with dedication and commitments to industrialization. It has been widely adopted by a wide range of sectors including manufacturing, agriculture, enterprise service, and so on while serving more than 4 million developers. With such advantages, PaddlePaddle has helped an increasing number of partners commercialize AI.
 
 
 
diff --git a/README_cn.md b/README_cn.md
index 6b37cfd97b35729dd293452178646db8f1194ca3..3834ee148f940326a2b1e1a8d0fd63a1028b0c96 100644
--- a/README_cn.md
+++ b/README_cn.md
@@ -15,7 +15,7 @@
 
 欢迎来到 PaddlePaddle GitHub
 
-飞桨(PaddlePaddle)以百度多年的深度学习技术研究和业务应用为基础，是中国首个自主研发、功能完备、 开源开放的产业级深度学习平台，集深度学习核心训练和推理框架、基础模型库、端到端开发套件和丰富的工具组件于一体。目前，飞桨累计开发者265万，服务企业10万家，基于飞桨开源深度学习平台产生了34万个模型。飞桨助力开发者快速实现AI想法，快速上线AI业务。帮助越来越多的行业完成AI赋能，实现产业智能化升级。
+飞桨(PaddlePaddle)以百度多年的深度学习技术研究和业务应用为基础，是中国首个自主研发、功能完备、 开源开放的产业级深度学习平台，集深度学习核心训练和推理框架、基础模型库、端到端开发套件和丰富的工具组件于一体。目前，飞桨累计开发者406万，服务企业15.7万家，基于飞桨开源深度学习平台产生了47.6万个模型。飞桨助力开发者快速实现AI想法，快速上线AI业务。帮助越来越多的行业完成AI赋能，实现产业智能化升级。
 
 ## 安装
 
diff --git a/cmake/external/cinn.cmake b/cmake/external/cinn.cmake
index 41b90345c8c5f38afa413bd2411af975c9d0b103..d3f330ba9dd0fa58b26e9ea05a7154184747daff 100644
--- a/cmake/external/cinn.cmake
+++ b/cmake/external/cinn.cmake
@@ -26,7 +26,7 @@ add_definitions(-w)
 ######################################
 include(ExternalProject)
 set(CINN_PREFIX_DIR ${THIRD_PARTY_PATH}/CINN)
-set(CINN_GIT_TAG release/v0.1)
+set(CINN_GIT_TAG 56879b637e2c4db19091eedad03d7cc674e092a2)
 set(CINN_OPTIONAL_ARGS -DPY_VERSION=${PY_VERSION}
                        -DWITH_CUDA=${WITH_GPU}
                        -DWITH_CUDNN=${WITH_GPU}
diff --git a/cmake/external/llvm.cmake b/cmake/external/llvm.cmake
index a7a9e85ffd7314ac7026fccdf45fae2fa3de09d3..5c48afa2806aab10bb08317679c0a00c8f177f7b 100644
--- a/cmake/external/llvm.cmake
+++ b/cmake/external/llvm.cmake
@@ -99,9 +99,10 @@ endfunction()
 
 function(mlir_add_rewriter td_base)
   set(LLVM_TARGET_DEFINITIONS ${td_base}.td)
-  mlir_tablegen(${td_base}.cpp.inc -gen-rewriters "-I${CMAKE_SOURCE_DIR}/infrt/dialect/pass")
-  add_public_tablegen_target(${td_base}_IncGen)
-  add_custom_target(${td_base}_inc DEPENDS ${td_base}_IncGen)
+  set(LLVM_TARGET_DEPENDS  ${LLVM_TARGET_DEPENDS} ${CMAKE_SOURCE_DIR}/paddle/infrt/dialect/infrt/ir/infrt_base.td)
+  mlir_tablegen(${td_base}.cpp.inc -gen-rewriters)
+  add_public_tablegen_target(MLIR${td_base}IncGen)
+  add_dependencies(mlir-headers MLIR${td_base}IncGen)
 endfunction()
 
 # Execute the mlir script with infrt-exec program.
diff --git a/cmake/external/onnxruntime.cmake b/cmake/external/onnxruntime.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..2162f87812d130f19262955798f28e2c2adc4bac
--- /dev/null
+++ b/cmake/external/onnxruntime.cmake
@@ -0,0 +1,94 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if (NOT WITH_ONNXRUNTIME)
+  return()
+endif ()
+
+if (WITH_ARM)
+  message(SEND_ERROR "The current onnxruntime backend doesn't support ARM cpu")
+  return()
+endif ()
+
+INCLUDE(ExternalProject)
+
+add_definitions(-DPADDLE_WITH_ONNXRUNTIME)
+
+SET(ONNXRUNTIME_PROJECT        "extern_onnxruntime")
+SET(ONNXRUNTIME_PREFIX_DIR     ${THIRD_PARTY_PATH}/onnxruntime)
+SET(ONNXRUNTIME_SOURCE_DIR     ${THIRD_PARTY_PATH}/onnxruntime/src/${ONNXRUNTIME_PROJECT})
+SET(ONNXRUNTIME_INSTALL_DIR    ${THIRD_PARTY_PATH}/install/onnxruntime)
+SET(ONNXRUNTIME_INC_DIR        "${ONNXRUNTIME_INSTALL_DIR}/include" CACHE PATH "onnxruntime include directory." FORCE)
+SET(ONNXRUNTIME_LIB_DIR        "${ONNXRUNTIME_INSTALL_DIR}/lib" CACHE PATH "onnxruntime lib directory." FORCE)
+SET(CMAKE_BUILD_RPATH "${CMAKE_BUILD_RPATH}" "${ONNXRUNTIME_LIB_DIR}")
+
+
+if (WIN32)
+  SET(ONNXRUNTIME_URL             "https://github.com/microsoft/onnxruntime/releases/download/v1.10.0/onnxruntime-win-x64-1.10.0.zip")
+elseif (APPLE)
+  SET(ONNXRUNTIME_URL           "https://github.com/microsoft/onnxruntime/releases/download/v1.10.0/onnxruntime-osx-x86_64-1.10.0.tgz")
+else ()
+  SET(ONNXRUNTIME_URL             "https://github.com/microsoft/onnxruntime/releases/download/v1.10.0/onnxruntime-linux-x64-1.10.0.tgz")
+endif()
+
+
+INCLUDE_DIRECTORIES(${ONNXRUNTIME_INC_DIR}) # For ONNXRUNTIME code to include internal headers.
+if (WIN32)
+  SET(ONNXRUNTIME_SOURCE_LIB "${ONNXRUNTIME_SOURCE_DIR}/lib/onnxruntime.dll" CACHE FILEPATH "ONNXRUNTIME source library." FORCE)
+  SET(ONNXRUNTIME_SHARED_LIB "${ONNXRUNTIME_INSTALL_DIR}/lib/onnxruntime.dll" CACHE FILEPATH "ONNXRUNTIME shared library." FORCE)
+  SET(ONNXRUNTIME_LIB "${ONNXRUNTIME_INSTALL_DIR}/lib/onnxruntime.lib" CACHE FILEPATH "ONNXRUNTIME static library." FORCE)
+elseif (APPLE)
+  SET(ONNXRUNTIME_SOURCE_LIB "${ONNXRUNTIME_SOURCE_DIR}/lib/libonnxruntime.1.10.0.dylib" CACHE FILEPATH "ONNXRUNTIME source library." FORCE)
+  SET(ONNXRUNTIME_LIB "${ONNXRUNTIME_INSTALL_DIR}/lib/libonnxruntime.1.10.0.dylib" CACHE FILEPATH "ONNXRUNTIME static library." FORCE)
+  SET(ONNXRUNTIME_SHARED_LIB ${ONNXRUNTIME_LIB} CACHE FILEPATH "ONNXRUNTIME shared library." FORCE)
+else ()
+  SET(ONNXRUNTIME_SOURCE_LIB "${ONNXRUNTIME_SOURCE_DIR}/lib/libonnxruntime.so.1.10.0" CACHE FILEPATH "ONNXRUNTIME source library." FORCE)
+  SET(ONNXRUNTIME_LIB "${ONNXRUNTIME_INSTALL_DIR}/lib/libonnxruntime.so.1.10.0" CACHE FILEPATH "ONNXRUNTIME static library." FORCE)
+  SET(ONNXRUNTIME_SHARED_LIB ${ONNXRUNTIME_LIB} CACHE FILEPATH "ONNXRUNTIME shared library." FORCE)
+endif ()
+
+if (WIN32)
+  ExternalProject_Add(
+      ${ONNXRUNTIME_PROJECT}
+      ${EXTERNAL_PROJECT_LOG_ARGS}
+      URL                 ${ONNXRUNTIME_URL}
+      PREFIX              ${ONNXRUNTIME_PREFIX_DIR}
+      DOWNLOAD_NO_PROGRESS  1
+      CONFIGURE_COMMAND     ""
+      BUILD_COMMAND         ""
+      UPDATE_COMMAND        ""
+      INSTALL_COMMAND       ${CMAKE_COMMAND} -E copy ${ONNXRUNTIME_SOURCE_LIB} ${ONNXRUNTIME_SHARED_LIB} &&
+                            ${CMAKE_COMMAND} -E copy ${ONNXRUNTIME_SOURCE_DIR}/lib/onnxruntime.lib ${ONNXRUNTIME_LIB} &&
+                            ${CMAKE_COMMAND} -E copy_directory ${ONNXRUNTIME_SOURCE_DIR}/include ${ONNXRUNTIME_INC_DIR}
+      BUILD_BYPRODUCTS      ${ONNXRUNTIME_LIB}
+  )
+else ()
+  ExternalProject_Add(
+    ${ONNXRUNTIME_PROJECT}
+    ${EXTERNAL_PROJECT_LOG_ARGS}
+    URL                 ${ONNXRUNTIME_URL}
+    PREFIX              ${ONNXRUNTIME_PREFIX_DIR}
+    DOWNLOAD_NO_PROGRESS  1
+    CONFIGURE_COMMAND     ""
+    BUILD_COMMAND         ""
+    UPDATE_COMMAND        ""
+    INSTALL_COMMAND       ${CMAKE_COMMAND} -E copy ${ONNXRUNTIME_SOURCE_LIB} ${ONNXRUNTIME_LIB} &&
+                          ${CMAKE_COMMAND} -E copy_directory ${ONNXRUNTIME_SOURCE_DIR}/include ${ONNXRUNTIME_INC_DIR}
+    BUILD_BYPRODUCTS      ${ONNXRUNTIME_LIB}
+  )
+endif()
+
+ADD_LIBRARY(onnxruntime STATIC IMPORTED GLOBAL)
+SET_PROPERTY(TARGET onnxruntime PROPERTY IMPORTED_LOCATION ${ONNXRUNTIME_LIB})
+ADD_DEPENDENCIES(onnxruntime ${ONNXRUNTIME_PROJECT})
diff --git a/cmake/external/paddle2onnx.cmake b/cmake/external/paddle2onnx.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..661c3675c84b27a7ed8210fec0cfeaa2c858487c
--- /dev/null
+++ b/cmake/external/paddle2onnx.cmake
@@ -0,0 +1,96 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(NOT WITH_ONNXRUNTIME)
+  return()
+endif()
+
+if (WITH_ARM)
+  message(SEND_ERROR "The current onnxruntime backend doesn't support ARM cpu")
+  return()
+endif ()
+
+INCLUDE(ExternalProject)
+
+SET(PADDLE2ONNX_PROJECT        "extern_paddle2onnx")
+SET(PADDLE2ONNX_PREFIX_DIR     ${THIRD_PARTY_PATH}/paddle2onnx)
+SET(PADDLE2ONNX_INSTALL_DIR    ${THIRD_PARTY_PATH}/install/paddle2onnx)
+SET(PADDLE2ONNX_INC_DIR        "${PADDLE2ONNX_INSTALL_DIR}/include" CACHE PATH "paddle2onnx include directory." FORCE)
+SET(PADDLE2ONNX_REPOSITORY     ${GIT_URL}/PaddlePaddle/Paddle2ONNX.git)
+SET(PADDLE2ONNX_TAG            cpp)
+SET(LIBDIR "lib")
+SET(CMAKE_BUILD_RPATH "${CMAKE_BUILD_RPATH}" "${PADDLE2ONNX_INSTALL_DIR}/${LIBDIR}")
+
+INCLUDE_DIRECTORIES(${PADDLE2ONNX_INC_DIR}) # For PADDLE2ONNX code to include internal headers.
+if(WIN32)
+    SET(PADDLE2ONNX_LIB "${PADDLE2ONNX_INSTALL_DIR}/${LIBDIR}/paddle2onnx.lib" CACHE FILEPATH "paddle2onnx static library." FORCE)
+    SET(PADDLE2ONNX_SHARED_LIB "${PADDLE2ONNX_INSTALL_DIR}/${LIBDIR}/paddle2onnx.dll" CACHE FILEPATH "paddle2onnx shared library." FORCE)
+elseif(APPLE)
+    SET(PADDLE2ONNX_LIB "${PADDLE2ONNX_INSTALL_DIR}/${LIBDIR}/libpaddle2onnx.dylib" CACHE FILEPATH "PADDLE2ONNX library." FORCE)
+else()
+    SET(PADDLE2ONNX_LIB "${PADDLE2ONNX_INSTALL_DIR}/${LIBDIR}/libpaddle2onnx.so" CACHE FILEPATH "PADDLE2ONNX library." FORCE)
+endif(WIN32)
+
+
+# The protoc path is required to compile onnx.
+string(REPLACE "/" ";" PROTOC_BIN_PATH ${PROTOBUF_PROTOC_EXECUTABLE})
+list(POP_BACK PROTOC_BIN_PATH)
+list(JOIN PROTOC_BIN_PATH "/" PROTOC_BIN_PATH)
+
+
+set(PADDLE2ONNX_OPTIONAL_ARGS
+      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
+      -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
+      -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
+      -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
+      -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
+      -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
+      -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}
+      -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}
+      -DONNX_CUSTOM_PROTOC_PATH=${PROTOC_BIN_PATH}
+      -DWITH_STATIC=OFF
+      -DCMAKE_INSTALL_PREFIX=${PADDLE2ONNX_INSTALL_DIR}
+      -DCMAKE_POSITION_INDEPENDENT_CODE=ON
+      -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
+      ${EXTERNAL_OPTIONAL_ARGS}
+)
+
+if (WITH_PYTHON)
+  set(PADDLE2ONNX_OPTIONAL_ARGS ${PADDLE2ONNX_OPTIONAL_ARGS}
+    -DPYTHON_EXECUTABLE:FILEPATH=${PYTHON_EXECUTABLE}
+    -DPYTHON_INCLUDE_DIR:PATH=${PYTHON_INCLUDE_DIR}
+    -DPYTHON_LIBRARY:FILEPATH=${PYTHON_LIBRARY}
+  )
+endif ()
+
+
+ExternalProject_Add(
+    ${PADDLE2ONNX_PROJECT}
+    ${EXTERNAL_PROJECT_LOG_ARGS}
+    ${SHALLOW_CLONE}
+    GIT_REPOSITORY      ${PADDLE2ONNX_REPOSITORY}
+    GIT_TAG             ${PADDLE2ONNX_TAG}
+    DEPENDS             protobuf
+    PREFIX              ${PADDLE2ONNX_PREFIX_DIR}
+    UPDATE_COMMAND      ""
+    CMAKE_ARGS       ${PADDLE2ONNX_OPTIONAL_ARGS}
+    CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${PADDLE2ONNX_INSTALL_DIR}
+                     -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
+                     -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
+    BUILD_BYPRODUCTS    ${PADDLE2ONNX_LIB}
+)
+
+ADD_LIBRARY(paddle2onnx STATIC IMPORTED GLOBAL)
+SET_PROPERTY(TARGET paddle2onnx PROPERTY IMPORTED_LOCATION ${PADDLE2ONNX_LIB})
+ADD_DEPENDENCIES(paddle2onnx ${PADDLE2ONNX_PROJECT})
diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake
index f7cb7716969f5ccaa97d1ad7964510376b86870a..58ff5f0d2b715d117018eb2ff3d5989c8beb0694 100644
--- a/cmake/external/protobuf.cmake
+++ b/cmake/external/protobuf.cmake
@@ -198,7 +198,11 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST)
             "-Dprotobuf_MSVC_STATIC_RUNTIME=${MSVC_STATIC_CRT}")
     ENDIF()
 
-    if(WITH_ASCEND AND NOT WITH_ASCEND_CXX11)
+
+    if(WITH_ONNXRUNTIME)
+        SET(PROTOBUF_REPOSITORY  ${GIT_URL}/protocolbuffers/protobuf.git)
+        SET(PROTOBUF_TAG         v3.18.0)
+    elseif(WITH_ASCEND AND NOT WITH_ASCEND_CXX11)
         SET(PROTOBUF_REPOSITORY  https://gitee.com/tianjianhe/protobuf.git)
         SET(PROTOBUF_TAG         v3.8.0)
     elseif(WITH_ASCEND_CL AND NOT WITH_ASCEND_CXX11)
@@ -248,7 +252,9 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST)
     )
 ENDFUNCTION()
 
-if(WITH_ASCEND OR WITH_ASCEND_CL)
+if(WITH_ONNXRUNTIME)
+    SET(PROTOBUF_VERSION 3.18.0)
+elseif(WITH_ASCEND OR WITH_ASCEND_CL)
     SET(PROTOBUF_VERSION 3.8.0)
 elseif(WITH_IPU)
     SET(PROTOBUF_VERSION 3.6.1)
diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake
index 45a76fdc1f1a2aab66e7f4972eecbbec03af941a..cfbe68eecbaca55c5a288aae2c985bbc33d37be2 100644
--- a/cmake/external/xpu.cmake
+++ b/cmake/external/xpu.cmake
@@ -36,7 +36,7 @@ ENDIF()
 
 if(NOT DEFINED XPU_BASE_URL)
   SET(XPU_BASE_URL_WITHOUT_DATE "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev")
-  SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220228")
+  SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220307")
 else()
   SET(XPU_BASE_URL "${XPU_BASE_URL}")
 endif()
diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index da81575188ffdee91887bd76b75b4a3d6eb60ae9..ba59eae392c66354b419bbfd2688a14a26f2e388 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -651,6 +651,7 @@ function(hip_test TARGET_NAME)
     set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cpu_deterministic=true)
     set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true)
     set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true)
+    set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT "LD_LIBRARY_PATH=${CMAKE_BINARY_DIR}/python/paddle/libs:$LD_LIBRARY_PATH")
   endif()
 endfunction(hip_test)
 
diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake
index c48d31f7e4f90296ecc48acb56e619aae129106e..851bd81403a85e52fbbb3c4c8bf0da1df63c8848 100644
--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@@ -114,6 +114,24 @@ function(copy_part_of_thrid_party TARGET DST)
         endif()
     endif()
 
+    if (WITH_ONNXRUNTIME)
+        set(dst_dir "${DST}/third_party/install/onnxruntime")
+        copy(${TARGET}
+                SRCS ${ONNXRUNTIME_INC_DIR} ${ONNXRUNTIME_LIB_DIR}
+                DSTS ${dst_dir} ${dst_dir})
+
+        set(dst_dir "${DST}/third_party/install/paddle2onnx")
+        if(WIN32)
+            copy(${TARGET}
+                SRCS ${PADDLE2ONNX_INC_DIR}/paddle2onnx ${PADDLE2ONNX_SHARED_LIB} ${PADDLE2ONNX_LIB}
+                DSTS ${dst_dir}/include ${dst_dir}/lib ${dst_dir}/lib)
+        else()
+            copy(${TARGET}
+                SRCS ${PADDLE2ONNX_INC_DIR}/paddle2onnx ${PADDLE2ONNX_LIB}
+                DSTS ${dst_dir}/include ${dst_dir}/lib)
+        endif()
+    endif()
+
     set(dst_dir "${DST}/third_party/install/gflags")
     copy(${TARGET}
             SRCS ${GFLAGS_INCLUDE_DIR} ${GFLAGS_LIBRARIES}
diff --git a/cmake/operators.cmake b/cmake/operators.cmake
index 7affd59de162d5956672e5abfbf9f4b287fb7a83..1291e60cfe4ce13ca9aeeb3f8bdf068af0d5832c 100644
--- a/cmake/operators.cmake
+++ b/cmake/operators.cmake
@@ -293,11 +293,11 @@ function(op_library TARGET)
     # Define operators that don't need pybind here.
     foreach(manual_pybind_op "compare_all_op" "compare_op" "logical_op" "bitwise_op" "nccl_op"
     "tensor_array_read_write_op" "tensorrt_engine_op" "conv_fusion_op")
-    
-            if ("${TARGET}" STREQUAL "${manual_pybind_op}")
-                set(pybind_flag 1)
-            endif()
-        endforeach()
+
+        if ("${TARGET}" STREQUAL "${manual_pybind_op}")
+            set(pybind_flag 1)
+        endif()
+    endforeach()
 
     # The registration of USE_OP, please refer to paddle/fluid/framework/op_registry.h.
     # Note that it's enough to just adding one operator to pybind in a *_op.cc file.
@@ -478,7 +478,7 @@ function(op_library TARGET)
     if (${pybind_flag} EQUAL 0)
       # NOTE(*): activation use macro to regist the kernels, set use_op manually.
       if(${TARGET} STREQUAL "activation")
-        file(APPEND ${pybind_file} "USE_OP(relu);\n")
+        file(APPEND ${pybind_file} "USE_OP_ITSELF(relu);\n")
       elseif(${TARGET} STREQUAL "fake_dequantize")
         file(APPEND ${pybind_file} "USE_OP(fake_dequantize_max_abs);\n")
       elseif(${TARGET} STREQUAL "fake_quantize")
diff --git a/cmake/phi.cmake b/cmake/phi.cmake
index f6e15758379ada165a9dc0e31273a533b06ad2df..ebb686d8ad0f31917e64161d6f7d2ecd4644fadd 100644
--- a/cmake/phi.cmake
+++ b/cmake/phi.cmake
@@ -134,8 +134,8 @@ function(kernel_library TARGET)
             if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/kps/${TARGET}.cu)
                 list(APPEND gpu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/kps/${TARGET}.cu)
             endif()
-            if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/gpudnn/${TARGET}_gpudnn.cu)
-                list(APPEND gpudnn_srcs ${CMAKE_CURRENT_SOURCE_DIR}/gpudnn/${TARGET}_gpudnn.cu)
+            if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/gpudnn/${TARGET}.cu)
+                list(APPEND gpudnn_srcs ${CMAKE_CURRENT_SOURCE_DIR}/gpudnn/${TARGET}.cu)
             endif()
         endif()
         if (WITH_XPU)
@@ -197,92 +197,88 @@ function(kernel_library TARGET)
 
     # kernel source file level
     # level 1: base device kernel
-    # - cpu_srcs / gpu_srcs / xpu_srcs / kps_srcs
+    # - cpu_srcs / gpu_srcs / xpu_srcs / gpudnn_srcs / kps_srcs
     # level 2: device-independent kernel
     # - common_srcs
     # level 3: Kernel implemented by reusing device-independent kernel
     # - selected_rows_srcs
+    set(base_device_kernels)
+    set(device_independent_kernel)
+    set(high_level_kernels)
 
-    # Build Target according different src organization
-    if((${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR
-        ${xpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0 OR ${kps_srcs_len} GREATER 0) AND
-        (${common_srcs_len} GREATER 0 OR ${selected_rows_srcs_len} GREATER 0))
-        # If the common_srcs/selected_rows_srcs depends on specific device srcs, build target using this rule.
+    # 1. Base device kernel compile
+    if (${cpu_srcs_len} GREATER 0)
+        cc_library(${TARGET}_cpu SRCS ${cpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
+        list(APPEND base_device_kernels ${TARGET}_cpu)
+    endif()
+    if (${gpu_srcs_len} GREATER 0)
         if (WITH_GPU)
-            if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0)
-                nv_library(${TARGET}_part SRCS ${cpu_srcs} ${gpu_srcs} ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
-                nv_library(${TARGET} SRCS ${common_srcs} ${selected_rows_srcs} DEPS ${TARGET}_part)
-            endif()
+            nv_library(${TARGET}_gpu SRCS ${gpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
         elseif (WITH_ROCM)
-            if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0)
-                hip_library(${TARGET}_part SRCS ${cpu_srcs} ${gpu_srcs} ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
-                hip_library(${TARGET} SRCS ${common_srcs} ${selected_rows_srcs} DEPS ${TARGET}_part)
-            endif()
-        elseif (WITH_XPU_KP)
-            if (${cpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0 OR ${kps_srcs_len} GREATER 0)
-                xpu_library(${TARGET}_part SRCS ${cpu_srcs} ${xpu_srcs} ${kps_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
-                xpu_library(${TARGET} SRCS ${common_srcs} ${selected_rows_srcs} DEPS ${TARGET}_part)
-            endif()
-        else()
-            if (${cpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0)
-                cc_library(${TARGET}_part SRCS ${cpu_srcs} ${xpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
-                cc_library(${TARGET} SRCS ${common_srcs} ${selected_rows_srcs} DEPS ${TARGET}_part)
-            endif()
+            hip_library(${TARGET}_gpu SRCS ${gpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
         endif()
-    # If there are only specific device srcs, build target using this rule.
-    elseif (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0 OR ${kps_srcs_len} GREATER 0)
+        list(APPEND base_device_kernels ${TARGET}_gpu)
+    endif()
+    if (${xpu_srcs_len} GREATER 0)
+        cc_library(${TARGET}_xpu SRCS ${xpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
+        list(APPEND base_device_kernels ${TARGET}_xpu)
+    endif()
+    if (${gpudnn_srcs_len} GREATER 0)
         if (WITH_GPU)
-            if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0)
-                nv_library(${TARGET} SRCS ${cpu_srcs} ${gpu_srcs} ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
-            endif()
+            nv_library(${TARGET}_gpudnn SRCS ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
         elseif (WITH_ROCM)
-            if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0)
-                hip_library(${TARGET} SRCS ${cpu_srcs} ${gpu_srcs} ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
-            endif()
-        elseif (WITH_XPU_KP)
-            if (${cpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0 OR ${kps_srcs_len} GREATER 0)
-                xpu_library(${TARGET} SRCS ${cpu_srcs} ${xpu_srcs} ${kps_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
-            endif()
-        else()
-            if (${cpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0)
-                cc_library(${TARGET} SRCS ${cpu_srcs} ${xpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
-            endif()
+            hip_library(${TARGET}_gpudnn SRCS ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
         endif()
-    # If the selected_rows_srcs depends on common_srcs, build target using this rule.
-    elseif (${common_srcs_len} GREATER 0 AND ${selected_rows_srcs_len} GREATER 0)
+        list(APPEND base_device_kernels ${TARGET}_gpudnn)
+    endif()
+    if (${kps_srcs_len} GREATER 0)
+        # only when WITH_XPU_KP, the kps_srcs_len can be > 0
+        xpu_library(${TARGET}_kps SRCS ${kps_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
+        list(APPEND base_device_kernels ${TARGET}_kps)
+    endif()
+
+    # 2. Device-independent kernel compile
+    if (${common_srcs_len} GREATER 0)
         if (WITH_GPU)
-            nv_library(${TARGET}_part SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
-            nv_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${TARGET}_part)
+            nv_library(${TARGET}_common SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels})
         elseif (WITH_ROCM)
-            hip_library(${TARGET}_part SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
-            hip_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${TARGET}_part)
+            hip_library(${TARGET}_common SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels})
         elseif (WITH_XPU_KP)
-            xpu_library(${TARGET}_part SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
-            xpu_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${TARGET}_part)
+            xpu_library(${TARGET}_common SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels})
         else()
-            cc_library(${TARGET}_part SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
-            cc_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${TARGET}_part)
+            cc_library(${TARGET}_common SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels})
         endif()
-    # If there are only common_srcs or selected_rows_srcs, build target using below rules.
-    elseif (${common_srcs_len} GREATER 0)
+        list(APPEND device_independent_kernel ${TARGET}_common)
+    endif()
+
+    # 3. Reusing kernel compile
+    if (${selected_rows_srcs_len} GREATER 0)
         if (WITH_GPU)
-            nv_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
+            nv_library(${TARGET}_sr SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel})
         elseif (WITH_ROCM)
-            hip_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
+            hip_library(${TARGET}_sr SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel})
         elseif (WITH_XPU_KP)
-            xpu_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
+            xpu_library(${TARGET}_sr SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel})
         else()
-            cc_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
+            cc_library(${TARGET}_sr SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel})
         endif()
-    elseif (${selected_rows_srcs_len} GREATER 0)
+        list(APPEND high_level_kernels ${TARGET}_sr)
+    endif()
+
+    # 4. Unify target compile
+    list(LENGTH base_device_kernels base_device_kernels_len)
+    list(LENGTH device_independent_kernel device_independent_kernel_len)
+    list(LENGTH high_level_kernels high_level_kernels_len)
+    if (${base_device_kernels_len} GREATER 0 OR ${device_independent_kernel_len} GREATER 0 OR
+        ${high_level_kernels_len} GREATER 0)
         if (WITH_GPU)
-            nv_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
+            nv_library(${TARGET} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel} ${high_level_kernels})
         elseif (WITH_ROCM)
-            hip_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
+            hip_library(${TARGET} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel} ${high_level_kernels})
         elseif (WITH_XPU_KP)
-            xpu_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
+            xpu_library(${TARGET} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel} ${high_level_kernels})
         else()
-            cc_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
+            cc_library(${TARGET} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel} ${high_level_kernels})
         endif()
     else()
         set(target_build_flag 0)
diff --git a/cmake/third_party.cmake b/cmake/third_party.cmake
index ac3eff04d5383ecdf6c771babcaf3e6811600ac3..7df095c6c2ec04e1a694ed2458787af285c96a9a 100644
--- a/cmake/third_party.cmake
+++ b/cmake/third_party.cmake
@@ -250,6 +250,12 @@ IF(WITH_TESTING OR WITH_DISTRIBUTE)
     list(APPEND third_party_deps extern_gtest)
 ENDIF()
 
+if(WITH_ONNXRUNTIME)
+    include(external/onnxruntime)            # download, build, install onnxruntime、paddle2onnx
+    include(external/paddle2onnx)          
+    list(APPEND third_party_deps extern_onnxruntime extern_paddle2onnx)
+endif()
+
 if(WITH_GPU)
     if (${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0)
         include(external/cub)       # download cub
diff --git a/paddle/fluid/distributed/collective/CMakeLists.txt b/paddle/fluid/distributed/collective/CMakeLists.txt
index a5b40f8aa07d77e803f2cad36155b7de1bd03719..3fca45cc068f9916b52b3f99df2baa679d4c3546 100644
--- a/paddle/fluid/distributed/collective/CMakeLists.txt
+++ b/paddle/fluid/distributed/collective/CMakeLists.txt
@@ -1,6 +1,13 @@
 cc_library(processgroup SRCS ProcessGroup.cc DEPS phi phi_api eager_api)
-cc_library(eager_reducer SRCS reducer.cc DEPS eager_api processgroup)
+cc_library(eager_reducer SRCS reducer.cc DEPS eager_api processgroup phi phi_api)
+
+if (WITH_DISTRIBUTE)
+  cc_library(processgroup_gloo SRCS ProcessGroupGloo.cc DEPS phi phi_api eager_api gloo_wrapper)
+endif()
 
 if(WITH_NCCL)
     cc_library(processgroup_nccl SRCS ProcessGroupNCCL.cc DEPS place cuda_stream enforce collective_helper device_context phi phi_api eager_api)
 endif()
+if(WITH_ASCEND_CL)
+    cc_library(processgroup_hccl SRCS ProcessGroupHCCL.cc DEPS place npu_stream enforce collective_helper device_context phi phi_api eager_api)
+endif()
diff --git a/paddle/fluid/distributed/collective/HCCLTools.h b/paddle/fluid/distributed/collective/HCCLTools.h
new file mode 100644
index 0000000000000000000000000000000000000000..09789bd4d378630f548f931bcac00fda89ef33be
--- /dev/null
+++ b/paddle/fluid/distributed/collective/HCCLTools.h
@@ -0,0 +1,174 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <error.h>
+#include <string>
+
+#include "boost/variant.hpp"
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/variable.h"
+#include "paddle/fluid/platform/collective_helper.h"
+#include "paddle/fluid/platform/device/npu/enforce_npu.h"
+#include "paddle/fluid/platform/device/npu/npu_info.h"
+#include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace distributed {
+
+class NPUEventManager {
+ public:
+  NPUEventManager() = default;
+
+  ~NPUEventManager() {
+    if (is_created_) {
+      platform::NPUDeviceGuard guard(device_index_);
+      platform::NPUEventDestroy(event_);
+    }
+  }
+
+  NPUEventManager(const NPUEventManager&) = delete;
+  NPUEventManager& operator=(const NPUEventManager&) = delete;
+
+  NPUEventManager(NPUEventManager&& other) {
+    std::swap(is_created_, other.is_created_);
+    std::swap(device_index_, other.device_index_);
+    std::swap(event_, other.event_);
+  }
+
+  NPUEventManager& operator=(NPUEventManager&& other) {
+    std::swap(is_created_, other.is_created_);
+    std::swap(device_index_, other.device_index_);
+    std::swap(event_, other.event_);
+    return *this;
+  }
+
+  bool IsCreated() const { return is_created_; }
+  bool DeviceId() const { return device_index_; }
+  aclrtEvent GetRawNPUEvent() const { return event_; }
+
+  void Record(const paddle::platform::NPUDeviceContext& ctx) {
+    auto device_index = ctx.GetPlace().device;
+    if (!is_created_) {
+      CreateEvent(device_index);
+    }
+    PADDLE_ENFORCE_EQ(device_index, device_index_,
+                      platform::errors::PreconditionNotMet(
+                          "NPUDeviceContext's device %d does not match"
+                          "Event's device %d",
+                          device_index, device_index_));
+
+    platform::NPUDeviceGuard guard(device_index_);
+    platform::NPUEventRecord(event_, ctx.stream());
+  }
+
+  bool Query() const {
+    aclrtEventStatus status = ACL_EVENT_STATUS_COMPLETE;
+    platform::NPUEventQuery(event_, &status);
+    if (status == ACL_EVENT_STATUS_COMPLETE) {
+      return true;
+    }
+    return false;
+  }
+
+  void Block(const paddle::platform::NPUDeviceContext& ctx) const {
+    if (is_created_) {
+      auto device_index = ctx.GetPlace().device;
+      PADDLE_ENFORCE_EQ(device_index, device_index_,
+                        platform::errors::PreconditionNotMet(
+                            "CUDADeviceContext's device %d does not match"
+                            "Event's device %d",
+                            device_index, device_index_));
+      platform::NPUDeviceGuard guard(device_index_);
+      platform::NPUStreamWaitEvent(ctx.stream(), event_);
+    }
+  }
+
+ private:
+  bool is_created_{false};
+  aclrtEvent event_{};
+  int8_t device_index_{0};
+
+ private:
+  void CreateEvent(int device_index) {
+    device_index_ = device_index;
+    platform::NPUDeviceGuard guard(device_index);
+    platform::NPUEventCreate(&event_);
+    is_created_ = true;
+  }
+};
+
+class HCCLCommManager {
+ public:
+  explicit HCCLCommManager(HcclComm hcclComm) : hccl_comm_(hcclComm) {}
+
+  HCCLCommManager() : HCCLCommManager(nullptr) {}
+
+  ~HCCLCommManager() noexcept {
+    std::unique_lock<std::mutex> lock(mutex_);
+    if (hccl_comm_) {
+      platform::dynload::HcclCommDestroy(hccl_comm_);
+    }
+  }
+
+  static std::shared_ptr<HCCLCommManager> Create(int num_ranks, int rank,
+                                                 HcclRootInfo* comm_id,
+                                                 HcclComm hccl_comm) {
+    auto hccl_manager = std::make_shared<HCCLCommManager>();
+    auto ret = platform::dynload::HcclCommInitRootInfo(num_ranks, comm_id, rank,
+                                                       &hccl_comm);
+    using __NPU_STATUS_TYPE__ = decltype(ret);
+    constexpr auto __success_type__ =
+        platform::details::NPUStatusType<__NPU_STATUS_TYPE__>::kSuccess;
+    if (UNLIKELY(ret != __success_type__)) {
+      VLOG(0) << "Error: create hccl_id error.";
+      exit(-1);
+    }
+
+    hccl_manager->hccl_id_ = comm_id;
+    hccl_manager->rank_ = rank;
+    hccl_manager->hccl_comm_ = hccl_comm;
+    return hccl_manager;
+  }
+
+  HcclRootInfo* GetHcclId() const {
+    std::unique_lock<std::mutex> lock(mutex_);
+    return hccl_id_;
+  }
+
+  HcclComm GetHcclComm() const {
+    std::unique_lock<std::mutex> lock(mutex_);
+    return hccl_comm_;
+  }
+
+  HCCLCommManager(const HCCLCommManager&) = delete;
+  HCCLCommManager& operator=(const HCCLCommManager&) = delete;
+  HCCLCommManager& operator=(HCCLCommManager&& other) = delete;
+
+  HCCLCommManager(HCCLCommManager&& other) {
+    std::unique_lock<std::mutex> lock(other.mutex_);
+    std::swap(hccl_comm_, other.hccl_comm_);
+  }
+
+ protected:
+  HcclComm hccl_comm_;
+  HcclRootInfo* hccl_id_;
+  int rank_;
+  mutable std::mutex mutex_;
+};
+
+}  // namespace distributed
+}  // namespace paddle
diff --git a/paddle/fluid/distributed/collective/ProcessGroup.h b/paddle/fluid/distributed/collective/ProcessGroup.h
index e4f272052024245bf7df7fc841d5e3b18978faf7..e43d0e8c183c7005f31b66c4c29dfc95361485e4 100644
--- a/paddle/fluid/distributed/collective/ProcessGroup.h
+++ b/paddle/fluid/distributed/collective/ProcessGroup.h
@@ -117,6 +117,35 @@ class ProcessGroup {
         "ProcessGroup%s does not support receive", GetBackendName()));
   }
 
+  virtual std::shared_ptr<ProcessGroup::Task> AllGather(
+      std::vector<Tensor>& in_tensors /* tensors */,     // NOLINT
+      std::vector<Tensor>& out_tensors /* tensors */) {  // NOLINT
+    PADDLE_THROW(platform::errors::InvalidArgument(
+        "ProcessGroup%s does not support AllGather", GetBackendName()));
+  }
+
+  virtual std::shared_ptr<ProcessGroup::Task> AllToAll(
+      std::vector<Tensor>& in /* tensors */,     // NOLINT
+      std::vector<Tensor>& out /* tensors */) {  // NOLINT
+    PADDLE_THROW(platform::errors::InvalidArgument(
+        "ProcessGroup%s does not support AllToAll", GetBackendName()));
+  }
+
+  virtual std::shared_ptr<ProcessGroup::Task> Reduce(
+      std::vector<Tensor>& tensors /* tensors */,  // NOLINT
+      const ReduceOptions& opts) {                 // NOLINT
+    PADDLE_THROW(platform::errors::InvalidArgument(
+        "ProcessGroup%s does not support Reduce", GetBackendName()));
+  }
+
+  virtual std::shared_ptr<ProcessGroup::Task> Scatter(
+      std::vector<Tensor>& in_tensors /* tensors */,   // NOLINT
+      std::vector<Tensor>& out_tensors /* tensors */,  // NOLINT
+      const ScatterOptions&) {                         // NOLINT
+    PADDLE_THROW(platform::errors::InvalidArgument(
+        "ProcessGroup%s does not support Scatter", GetBackendName()));
+  }
+
  protected:
   const int rank_;
   const int size_;
diff --git a/paddle/fluid/distributed/collective/ProcessGroupGloo.cc b/paddle/fluid/distributed/collective/ProcessGroupGloo.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5dc43af117825bf95407255e93e1e4600e8ddd9a
--- /dev/null
+++ b/paddle/fluid/distributed/collective/ProcessGroupGloo.cc
@@ -0,0 +1,502 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <iostream>
+
+#ifdef _WIN32
+#include <gloo/common/win.h>
+#include <winsock2.h>
+#include <ws2tcpip.h>
+#else
+#include <netdb.h>
+#include <sys/socket.h>
+#include <unistd.h>
+#endif
+
+#include <gloo/broadcast.h>
+#include <gloo/reduce.h>
+#include <gloo/scatter.h>
+#include "paddle/fluid/distributed/collective/ProcessGroupGloo.h"
+#include "paddle/fluid/framework/fleet/gloo_wrapper.h"
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace distributed {
+
+#ifdef _WIN32
+#define GENERATE_FUNC(type, func, ...)       \
+  switch (type) {                            \
+    case experimental::DataType::FLOAT32:    \
+      func<float>(__VA_ARGS__);              \
+      break;                                 \
+    case experimental::DataType::FLOAT64:    \
+      func<double>(__VA_ARGS__);             \
+      break;                                 \
+    case experimental::DataType::FLOAT16:    \
+      func<gloo::float16>(__VA_ARGS__);      \
+      break;                                 \
+    case experimental::DataType::INT32:      \
+      func<int32_t>(__VA_ARGS__);            \
+      break;                                 \
+    case experimental::DataType::INT64:      \
+      func<int64_t>(__VA_ARGS__);            \
+      break;                                 \
+    default:                                 \
+      VLOG(0) << "Error: Unknown DataType."; \
+      exit(-1);                              \
+  }
+
+#define HOST_NAME_MAX 256
+
+#else
+#define GENERATE_FUNC(type, func, args...)   \
+  switch (type) {                            \
+    case experimental::DataType::FLOAT32:    \
+      func<float>(args);                     \
+      break;                                 \
+    case experimental::DataType::FLOAT64:    \
+      func<double>(args);                    \
+      break;                                 \
+    case experimental::DataType::FLOAT16:    \
+      func<gloo::float16>(args);             \
+      break;                                 \
+    case experimental::DataType::INT32:      \
+      func<int32_t>(args);                   \
+      break;                                 \
+    case experimental::DataType::INT64:      \
+      func<int64_t>(args);                   \
+      break;                                 \
+    default:                                 \
+      VLOG(0) << "Error: Unknown DataType."; \
+      exit(-1);                              \
+  }
+#endif
+
+typedef void (*reduce_func)(void*, const void*, const void*, size_t);
+
+template <typename T>
+reduce_func get_function(const ReduceOp& r) {
+  switch (r) {
+    case ReduceOp::SUM:
+      return reduce_func(&::gloo::sum<T>);
+    case ReduceOp::PRODUCT:
+      return reduce_func(&::gloo::product<T>);
+    case ReduceOp::MIN:
+      return reduce_func(&::gloo::min<T>);
+    case ReduceOp::MAX:
+      return reduce_func(&::gloo::max<T>);
+    case ReduceOp::AVG:
+      VLOG(0) << "Error: Unsupported ReduceOp::AVG.";
+      exit(-1);
+  }
+
+  VLOG(0) << "Error: Unknown ReduceOp.";
+  exit(-1);
+}
+
+bool CheckTensorsInCPUPlace(const std::vector<Tensor>& tensors) {
+  return std::all_of(tensors.cbegin(), tensors.cend(), [&](const Tensor& t) {
+    return t.place() == PlaceType::kCPU;
+  });
+}
+
+template <typename T>
+T* get_data(const Tensor& tensor) {
+  auto raw_tensor = std::dynamic_pointer_cast<phi::DenseTensor>(tensor.impl());
+  return static_cast<T*>(raw_tensor->data());
+}
+
+template <typename T>
+std::vector<T*> get_multi_data(const std::vector<Tensor>& tensors) {
+  std::vector<T*> ret(tensors.size());
+  for (size_t i = 0; i < tensors.size(); i++) {
+    ret[i] = get_data<T>(tensors[i]);
+  }
+  return ret;
+}
+
+template <typename T, typename P>
+void set_output(P& opts, const Tensor& tensor) {  // NOLINT
+  opts.setOutput(get_data<T>(tensor), tensor.numel());
+}
+
+template <typename T, typename P>
+void set_input(P& opts, const Tensor& tensor) {  // NOLINT
+  opts.setInput(get_data<T>(tensor), tensor.numel());
+}
+
+template <typename T, typename P>
+void set_outputs(P& opts, const std::vector<Tensor>& tensors) {  // NOLINT
+  opts.setOutputs(get_multi_data<T>(tensors), tensors[0].numel());
+}
+
+template <typename T, typename P>
+void set_inputs(P& opts, const std::vector<Tensor>& tensors) {  // NOLINT
+  opts.setInputs(get_multi_data<T>(tensors), tensors[0].numel());
+}
+
+template <typename T, typename P>
+void set_inputs_for_scatter(P& opts,                             // NOLINT
+                            const std::vector<Tensor>& tensors,  // NOLINT
+                            int nranks) {
+  std::vector<T*> ret(nranks);
+  auto raw_tensor =
+      std::dynamic_pointer_cast<phi::DenseTensor>(tensors[0].impl());
+  T* raw_pointer = reinterpret_cast<T*>(raw_tensor->data());
+  size_t offset = 0;
+  for (int i = 0; i < nranks; i++) {
+    ret[i] = raw_pointer + offset;
+    offset += tensors[0].numel() / nranks;
+  }
+  opts.setInputs(ret, tensors[0].numel() / nranks);
+}
+
+ProcessGroupGloo::GlooTask::GlooTask(int rank,
+                                     const std::vector<Tensor>& inputs,
+                                     CommType comm_type)
+    : ProcessGroup::Task(rank, inputs, comm_type) {
+  PADDLE_ENFORCE_EQ(CheckTensorsInCPUPlace(inputs), true,
+                    platform::errors::Fatal(
+                        "Only CPU place is supported for ProcessGroupGloo."));
+}
+
+ProcessGroupGloo::ProcessGroupGloo(const std::shared_ptr<GlooStore>& store,
+                                   int rank, int world_size,
+                                   const std::shared_ptr<GlooOptions> options)
+    : ProcessGroup(rank, world_size), _tag(0), _store(store) {
+  _context = std::make_shared<gloo::rendezvous::Context>(rank, world_size);
+  auto prefix_store =
+      ::gloo::rendezvous::PrefixStore(std::to_string(0), *_store);
+  _context->connectFullMesh(prefix_store, options->device);
+}
+
+class BroadcastGlooTask : public ProcessGroupGloo::GlooTask {
+ public:
+  BroadcastGlooTask(const std::shared_ptr<gloo::Context>& context,
+                    const std::vector<Tensor>& inputs, int rank, int root,
+                    uint32_t tag)
+      : ProcessGroupGloo::GlooTask(rank, inputs, CommType::BROADCAST),
+        _context(context),
+        _root(root),
+        _inputs(inputs),
+        _tag(tag) {}
+
+  void Run() override { _do_broadcast(_inputs[0]); }
+
+ private:
+  std::shared_ptr<gloo::Context> _context;
+  const int _root;
+  std::vector<Tensor> _inputs{};
+  const uint32_t _tag;
+
+  void _do_broadcast(const Tensor& tensor) {
+    gloo::BroadcastOptions opts(_context);
+    const auto& dtype = tensor.type();
+    GENERATE_FUNC(dtype, set_output, opts, tensor);
+    opts.setRoot(_root);
+    opts.setTag(_tag);
+    gloo::broadcast(opts);
+  }
+};
+
+std::shared_ptr<ProcessGroup::Task> ProcessGroupGloo::Broadcast(
+    std::vector<Tensor>& inputs, const BroadcastOptions& opts) {
+  auto root = opts.source_rank;
+  std::unique_ptr<BroadcastGlooTask> task;
+  auto tag = next_tag();
+  auto context = get_context();
+  task = std::make_unique<BroadcastGlooTask>(context, inputs, rank_, root, tag);
+  task->Run();
+  return task;
+}
+
+class AllreduceGlooTask : public ProcessGroupGloo::GlooTask {
+ public:
+  AllreduceGlooTask(int rank, const std::shared_ptr<gloo::Context>& context,
+                    std::vector<Tensor>& inputs, ReduceOp reduce_op,  // NOLINT
+                    uint32_t tag)
+      : ProcessGroupGloo::GlooTask(rank, inputs, CommType::ALLREDUCE),
+        _context(context),
+        _inputs(inputs),
+        _reduce_op(reduce_op),
+        _tag(tag) {}
+
+  void Run() override { _do_allreduce(_inputs); }
+
+ private:
+  std::shared_ptr<gloo::Context> _context;
+  std::vector<Tensor> _inputs;
+  const ReduceOp _reduce_op;
+  uint32_t _tag;
+
+  gloo::AllreduceOptions::Func _get_function(const experimental::DataType type,
+                                             const ReduceOp op) {
+    gloo::AllreduceOptions::Func fn;
+    GENERATE_FUNC(type, _get_function_impl, fn, op);
+    return fn;
+  }
+
+  template <typename T>
+  void _get_function_impl(gloo::AllreduceOptions::Func& fn,  // NOLINT
+                          const ReduceOp op) {
+    fn = get_function<T>(op);
+  }
+
+  void _do_allreduce(std::vector<Tensor>& tensors) {  // NOLINT
+    const auto& dtype = tensors[0].type();
+    gloo::AllreduceOptions opts(_context);
+    GENERATE_FUNC(dtype, set_inputs, opts, tensors);
+    GENERATE_FUNC(dtype, set_outputs, opts, tensors);
+    opts.setReduceFunction(_get_function(dtype, _reduce_op));
+    opts.setTag(_tag);
+    gloo::allreduce(opts);
+  }
+};
+
+std::shared_ptr<ProcessGroup::Task> ProcessGroupGloo::AllReduce(
+    std::vector<Tensor>& inputs, const AllreduceOptions& opts) {
+  auto tag = next_tag();
+  std::shared_ptr<GlooTask> task;
+  auto context = get_context();
+  task = std::make_shared<AllreduceGlooTask>(rank_, context, inputs,
+                                             opts.reduce_op, tag);
+  task->Run();
+  return task;
+}
+
+class BarrierGlooTask : public ProcessGroupGloo::GlooTask {
+ public:
+  BarrierGlooTask(int rank, const std::shared_ptr<gloo::Context>& context)
+      : ProcessGroupGloo::GlooTask(rank, std::vector<Tensor>{},
+                                   CommType::BARRIER),
+        _context(context) {}
+
+  void Run() override { _do_barrier(); }
+
+ private:
+  std::shared_ptr<gloo::Context> _context;
+
+  void _do_barrier() {
+    gloo::BarrierOptions opts(_context);
+    gloo::barrier(opts);
+  }
+};
+
+std::shared_ptr<ProcessGroup::Task> ProcessGroupGloo::Barrier(
+    const BarrierOptions& opts) {
+  std::shared_ptr<BarrierGlooTask> task;
+  auto context = get_context();
+  task = std::make_shared<BarrierGlooTask>(rank_, context);
+  task->Run();
+  return task;
+}
+
+class AllgatherGlooTask : public ProcessGroupGloo::GlooTask {
+ public:
+  AllgatherGlooTask(int rank, const std::shared_ptr<gloo::Context>& context,
+                    std::vector<Tensor>& inputs,   // NOLINT
+                    std::vector<Tensor>& outputs,  // NOLINT
+                    uint32_t tag)
+      : ProcessGroupGloo::GlooTask(rank, inputs, CommType::ALLGATHER),
+        _context(context),
+        _inputs(inputs),
+        _outputs(outputs),
+        _tag(tag) {}
+
+  void Run() override { _do_allgather(_inputs, _outputs); }
+
+ private:
+  std::shared_ptr<gloo::Context> _context;
+  std::vector<Tensor> _inputs;
+  std::vector<Tensor> _outputs;
+  uint32_t _tag;
+
+  void _do_allgather(std::vector<Tensor>& in,     // NOLINT
+                     std::vector<Tensor>& out) {  // NOLINT
+    const auto& dtype = in[0].type();
+    gloo::AllgatherOptions opts(_context);
+    GENERATE_FUNC(dtype, set_input, opts, in[0]);
+    GENERATE_FUNC(dtype, set_output, opts, out[0]);
+    opts.setTag(_tag);
+    gloo::allgather(opts);
+  }
+};
+
+std::shared_ptr<ProcessGroup::Task> ProcessGroupGloo::AllGather(
+    std::vector<Tensor>& in_tensors, std::vector<Tensor>& out_tensors) {
+  std::shared_ptr<AllgatherGlooTask> task;
+  auto tag = next_tag();
+  auto context = get_context();
+  task = std::make_shared<AllgatherGlooTask>(rank_, context, in_tensors,
+                                             out_tensors, tag);
+  task->Run();
+  return task;
+}
+
+class ReduceGlooTask : public ProcessGroupGloo::GlooTask {
+ public:
+  ReduceGlooTask(int rank, const std::shared_ptr<gloo::Context>& context,
+                 std::vector<Tensor>& in, ReduceOp reduce_op,  // NOLINT
+                 int dst, uint32_t tag)
+      : ProcessGroupGloo::GlooTask(rank, in, CommType::REDUCE),
+        _context(context),
+        _inputs(in),
+        _reduce_op(reduce_op),
+        _dst(dst),
+        _tag(tag) {}
+
+  void Run() override { _do_reduce(_inputs, _dst); }
+
+ private:
+  std::shared_ptr<gloo::Context> _context;
+  std::vector<Tensor> _inputs;
+  const ReduceOp _reduce_op;
+  int _dst;
+  uint32_t _tag;
+
+  gloo::ReduceOptions::Func _get_function(const experimental::DataType type,
+                                          const ReduceOp op) {
+    gloo::ReduceOptions::Func fn;
+    GENERATE_FUNC(type, _get_function_impl, fn, op);
+    return fn;
+  }
+
+  template <typename T>
+  void _get_function_impl(gloo::ReduceOptions::Func& fn,  // NOLINT
+                          const ReduceOp op) {
+    fn = get_function<T>(op);
+  }
+
+  void _do_reduce(std::vector<Tensor>& tensors, int dst) {  // NOLINT
+    const auto& dtype = tensors[0].type();
+    gloo::ReduceOptions opts(_context);
+    GENERATE_FUNC(dtype, set_input, opts, tensors[0]);
+    GENERATE_FUNC(dtype, set_output, opts, tensors[0]);
+    opts.setReduceFunction(_get_function(dtype, _reduce_op));
+    opts.setTag(_tag);
+    opts.setRoot(dst);
+    gloo::reduce(opts);
+  }
+};
+
+std::shared_ptr<ProcessGroup::Task> ProcessGroupGloo::Reduce(
+    std::vector<Tensor>& tensors, const ReduceOptions& opts) {
+  std::shared_ptr<ReduceGlooTask> task;
+  auto tag = next_tag();
+  auto context = get_context();
+  task = std::make_shared<ReduceGlooTask>(rank_, context, tensors,
+                                          opts.reduce_op, opts.root_rank, tag);
+  task->Run();
+  return task;
+}
+
+class ScatterGlooTask : public ProcessGroupGloo::GlooTask {
+ public:
+  ScatterGlooTask(int rank, const std::shared_ptr<gloo::Context>& context,
+                  std::vector<Tensor>& inputs,   // NOLINT
+                  std::vector<Tensor>& outputs,  // NOLINT
+                  int src, int size, uint32_t tag)
+      : ProcessGroupGloo::GlooTask(rank, inputs, CommType::SCATTER),
+        _context(context),
+        _inputs(inputs),
+        _outputs(outputs),
+        _src(src),
+        _size(size),
+        _tag(tag) {}
+
+  void Run() override { _do_scatter(_inputs, _outputs, _src); }
+
+ private:
+  std::shared_ptr<gloo::Context> _context;
+  std::vector<Tensor> _inputs;
+  std::vector<Tensor> _outputs;
+  int _src;
+  int _size;
+  uint32_t _tag;
+
+  void _do_scatter(std::vector<Tensor>& in, std::vector<Tensor>& out,  // NOLINT
+                   int src) {
+    const auto& dtype = in[0].type();
+    gloo::ScatterOptions opts(_context);
+    if (rank_ == src) {
+      GENERATE_FUNC(dtype, set_inputs_for_scatter, opts, in, _size);
+    }
+    GENERATE_FUNC(dtype, set_output, opts, out[0]);
+    opts.setRoot(src);
+    opts.setTag(_tag);
+    gloo::scatter(opts);
+  }
+};
+
+std::shared_ptr<ProcessGroup::Task> ProcessGroupGloo::Scatter(
+    std::vector<Tensor>& in_tensors, std::vector<Tensor>& out_tensors,
+    const ScatterOptions& opts) {
+  std::shared_ptr<ScatterGlooTask> task;
+  auto tag = next_tag();
+  auto context = get_context();
+  task = std::make_shared<ScatterGlooTask>(
+      rank_, context, in_tensors, out_tensors, opts.root_rank, size_, tag);
+  task->Run();
+  return task;
+}
+
+std::shared_ptr<::gloo::transport::Device>
+ProcessGroupGloo::createDeviceForInterface(const std::string& ifname) {
+  ::gloo::transport::tcp::attr attr;
+  attr.iface = ifname;
+  return ::gloo::transport::tcp::CreateDevice(attr);
+}
+
+std::shared_ptr<::gloo::transport::Device>
+ProcessGroupGloo::createDeviceForHostname(const std::string& hostname) {
+  ::gloo::transport::tcp::attr attr;
+  attr.hostname = hostname;
+  return ::gloo::transport::tcp::CreateDevice(attr);
+}
+
+std::shared_ptr<::gloo::transport::Device>
+ProcessGroupGloo::createDefaultDevice() {
+  std::array<char, HOST_NAME_MAX> hostname{};
+  auto ret = ::gethostname(hostname.data(), HOST_NAME_MAX);
+  PADDLE_ENFORCE_EQ(ret, 0, platform::errors::Fatal(
+                                "Get hostname error for createDefaultDevice."));
+  ::addrinfo* result;
+  result = tcputils::get_addr_info(hostname.data(), "", 0, AF_UNSPEC);
+  ::addrinfo* cur;
+  for (cur = result; cur != nullptr; cur = cur->ai_next) {
+    SocketType socket =
+        ::socket(cur->ai_family, cur->ai_socktype, cur->ai_protocol);
+    if (socket == -1) {
+      continue;
+    }
+    ret = ::bind(socket, cur->ai_addr, cur->ai_addrlen);
+#ifdef _WIN32
+    closesocket(socket);
+#else
+    close(socket);
+#endif
+    if (ret == -1) {
+      continue;
+    }
+    break;
+  }
+  freeaddrinfo(result);
+  if (cur != nullptr) {
+    return createDeviceForHostname(hostname.data());
+  }
+  return createDeviceForHostname("127.0.0.1");
+}
+
+}  // namespace distributed
+}  // namespace paddle
diff --git a/paddle/fluid/distributed/collective/ProcessGroupGloo.h b/paddle/fluid/distributed/collective/ProcessGroupGloo.h
new file mode 100644
index 0000000000000000000000000000000000000000..24f156571a427128f09cd28e632212f47fa4cd47
--- /dev/null
+++ b/paddle/fluid/distributed/collective/ProcessGroupGloo.h
@@ -0,0 +1,152 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <future>
+#include <mutex>
+
+#include "paddle/fluid/distributed/collective/ProcessGroup.h"
+
+#ifdef PADDLE_WITH_GLOO
+#include "paddle/fluid/framework/fleet/gloo_wrapper.h"
+#endif
+
+#include "paddle/fluid/distributed/store/store.h"
+#include "paddle/fluid/distributed/store/tcp_store.h"
+
+constexpr const char* GLOO_BACKEND_NAME = "GLOO";
+
+namespace paddle {
+namespace distributed {
+
+class ProcessGroupGloo : public ProcessGroup {
+ public:
+  class GlooTask : public ProcessGroup::Task,
+                   public std::enable_shared_from_this<GlooTask> {
+   public:
+    explicit GlooTask(int rank, const std::vector<Tensor>& input_tensors,
+                      CommType comm_type);
+
+    ~GlooTask() = default;
+
+    virtual void Run() = 0;
+    bool Wait(std::chrono::milliseconds timeout) override { return true; }
+    bool IsCompleted() override { return true; }
+    void Synchronize() override {}
+
+   protected:
+    friend class ProcessGroupGloo;
+  };
+
+  class GlooStore : public ::gloo::rendezvous::Store {
+   public:
+    explicit GlooStore(
+        const std::shared_ptr<paddle::distributed::TCPStore>& store)
+        : _store(store) {}
+
+    ~GlooStore() = default;
+
+    std::vector<char> get(const std::string& key) override {
+      VLOG(3) << "GlooStore::get";
+      auto value = _store->get(key);
+      return std::vector<char>(value.begin(), value.end());
+    }
+
+    void wait(const std::vector<std::string>& keys) override {
+      VLOG(3) << "GlooStore::wait";
+      for (auto& key : keys) {
+        _store->wait(key);
+      }
+    }
+
+    void set(const std::string& key, const std::vector<char>& value) override {
+      VLOG(3) << "GlooStore::set";
+      std::vector<uint8_t> tmp(value.begin(), value.end());
+      _store->set(key, tmp);
+    }
+
+    void wait(const std::vector<std::string>& keys,
+              const std::chrono::milliseconds& timeout) override {
+      VLOG(3) << "GlooStore::wait";
+      for (auto& key : keys) {
+        _store->wait(key);
+      }
+      // wait(keys);
+    }
+
+   protected:
+    std::shared_ptr<paddle::distributed::TCPStore> _store;
+  };
+
+  class GlooOptions {
+   public:
+    GlooOptions() = default;
+    ~GlooOptions() = default;
+    static std::shared_ptr<GlooOptions> create() {
+      return std::make_shared<GlooOptions>();
+    }
+    std::shared_ptr<::gloo::transport::Device> device;
+  };
+
+  explicit ProcessGroupGloo(const std::shared_ptr<GlooStore>& store, int rank,
+                            int world_size,
+                            std::shared_ptr<GlooOptions> options);
+
+  ~ProcessGroupGloo() = default;
+
+  std::shared_ptr<ProcessGroup::Task> Broadcast(
+      std::vector<Tensor>& inputs,
+      const BroadcastOptions& = BroadcastOptions()) override;
+
+  std::shared_ptr<ProcessGroup::Task> AllReduce(
+      std::vector<Tensor>& inputs,
+      const AllreduceOptions& opts = AllreduceOptions()) override;
+
+  std::shared_ptr<ProcessGroup::Task> Barrier(
+      const BarrierOptions& = BarrierOptions()) override;
+
+  std::shared_ptr<ProcessGroup::Task> AllGather(
+      std::vector<Tensor>& in_tensors,
+      std::vector<Tensor>& out_tensors) override;
+
+  std::shared_ptr<ProcessGroup::Task> Reduce(
+      std::vector<Tensor>& tensors, const ReduceOptions& opts) override;
+
+  std::shared_ptr<ProcessGroup::Task> Scatter(std::vector<Tensor>& in_tensors,
+                                              std::vector<Tensor>& out_tensors,
+                                              const ScatterOptions&) override;
+
+  std::shared_ptr<::gloo::Context> get_context() { return _context; }
+  uint64_t next_tag() { return _tag++; }
+
+  const std::string GetBackendName() const override {
+    return GLOO_BACKEND_NAME;
+  }
+
+  // Helper functions for Gloo.
+  static std::shared_ptr<::gloo::transport::Device> createDeviceForHostname(
+      const std::string& hostname);
+  static std::shared_ptr<::gloo::transport::Device> createDeviceForInterface(
+      const std::string& ifname);
+  static std::shared_ptr<::gloo::transport::Device> createDefaultDevice();
+
+ protected:
+  uint32_t _tag;
+  std::shared_ptr<gloo::rendezvous::Context> _context;
+  std::shared_ptr<GlooStore> _store;
+};
+
+}  // namespace distributed
+}  // namespace paddle
diff --git a/paddle/fluid/distributed/collective/ProcessGroupHCCL.cc b/paddle/fluid/distributed/collective/ProcessGroupHCCL.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2deeb7ca03003d0b6c8fa0948afa0a3394639f8b
--- /dev/null
+++ b/paddle/fluid/distributed/collective/ProcessGroupHCCL.cc
@@ -0,0 +1,354 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/distributed/collective/ProcessGroupHCCL.h"
+#include "paddle/fluid/memory/malloc.h"
+#include "paddle/fluid/platform/device/npu/hccl_helper.h"
+#include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/place.h"
+#include "paddle/phi/api/include/api.h"
+#include "paddle/phi/common/place.h"
+
+DECLARE_bool(hccl_blocking_wait);
+// DECLARE_bool(use_stream_safe_npu_allocator);
+
+constexpr int64_t kWaitBlockTImeout = 10;
+
+namespace paddle {
+namespace distributed {
+
+static HcclReduceOp ToHCCLRedType(ReduceOp reduction) {
+  static const std::map<ReduceOp, HcclReduceOp> red_type = {
+      {ReduceOp::MIN, HCCL_REDUCE_MIN},
+      {ReduceOp::MAX, HCCL_REDUCE_MAX},
+      {ReduceOp::SUM, HCCL_REDUCE_SUM},
+      {ReduceOp::PRODUCT, HCCL_REDUCE_PROD},
+  };
+  auto it = red_type.find(reduction);
+  PADDLE_ENFORCE_EQ(
+      it != red_type.end(), true,
+      platform::errors::InvalidArgument("Invalid hccl reduction. "
+                                        "Must be Min | Max | Prod | Sum"));
+  return it->second;
+}
+
+std::string SerializeHCCLUniqueId(const HcclRootInfo& hcclID) {
+  const uint8_t* bytes = reinterpret_cast<const uint8_t*>(&hcclID);
+  std::ostringstream oss;
+  for (size_t i = 0; i < sizeof(hcclID); ++i) {
+    oss << std::hex << static_cast<int>(bytes[i]);
+  }
+  return oss.str();
+}
+
+// Get the list of devices from list of tensors
+std::vector<Place> GetPlaceList(const std::vector<Tensor>& tensors) {
+  std::vector<Place> places;
+  places.reserve(tensors.size());
+  for (auto& tensor : tensors) {
+    places.push_back(tensor.inner_place());
+  }
+  return places;
+}
+
+// Get the deviceList String from the list of devices
+std::string GetKeyFromPlaces(const std::vector<Place>& places) {
+  std::string placeList;
+  for (auto& place : places) {
+    std::stringstream tmp;
+    tmp << place;
+    if (placeList.empty()) {
+      placeList += tmp.str();
+    } else {
+      placeList += "," + tmp.str();
+    }
+  }
+  return placeList;
+}
+
+// bool CheckTensorsInNPUPlace(const std::vector<Tensor>& tensors) {
+//   return std::all_of(tensors.cbegin(), tensors.cend(), [&](const Tensor& t) {
+//     return t.place() == platform::DeviceType::NPU;
+//   });
+// }
+
+void SyncDefaultStream(
+    const std::vector<Place>& places,
+    std::vector<NPUEventManager>& hcclEvents,                   // NOLINT
+    std::vector<std::unique_ptr<NPUDeviceContext>>& dev_ctx) {  // NOLINT
+  for (size_t i = 0; i < places.size(); ++i) {
+    auto* default_ctx = static_cast<platform::NPUDeviceContext*>(
+        platform::DeviceContextPool::Instance().Get(places[i]));
+    hcclEvents[i].Record(*dev_ctx[i]);
+    hcclEvents[i].Block(*default_ctx);
+  }
+}
+
+std::shared_ptr<ProcessGroupHCCL::HCCLTask> ProcessGroupHCCL::CreateTask(
+    std::vector<Place> places, int rank, CommType comm_type,
+    const std::vector<Tensor>& inputs) {
+  return std::make_shared<ProcessGroupHCCL::HCCLTask>(places, rank, comm_type,
+                                                      inputs);
+}
+
+ProcessGroupHCCL::HCCLTask::HCCLTask(const std::vector<Place>& places, int rank,
+                                     CommType CommType,
+                                     const std::vector<Tensor>& inputs)
+    : Task(rank, inputs, CommType), places_(places) {
+  control_events_.resize(places.size());
+  hcclComms_.resize(places.size());
+}
+
+ProcessGroupHCCL::HCCLTask::~HCCLTask() {}
+
+void ProcessGroupHCCL::HCCLTask::SetOutputs(
+    std::vector<Tensor>& outputs) {  // NOLINT
+  outputs_ = std::make_shared<std::vector<Tensor>>(outputs);
+}
+
+void ProcessGroupHCCL::HCCLTask::SynchronizeStreams() {
+  for (size_t i = 0; i < places_.size(); ++i) {
+    auto* default_ctx = static_cast<platform::NPUDeviceContext*>(
+        platform::DeviceContextPool::Instance().Get(places_[i]));
+    platform::NPUStreamWaitEvent(default_ctx->stream(),
+                                 control_events_[i].GetRawNPUEvent());
+  }
+}
+
+bool ProcessGroupHCCL::HCCLTask::IsCompleted() {
+  for (size_t i = 0; i < places_.size(); ++i) {
+    if (!control_events_[i].Query()) {
+      return false;
+    }
+  }
+
+  return true;
+}
+
+// TODO(sandyhouse): Add timeout for wait, now timeout unused
+bool ProcessGroupHCCL::HCCLTask::Wait(std::chrono::milliseconds timeout) {
+  SynchronizeStreams();
+  // NOTE(sandyhouse): It will block host for sync
+  while (!IsCompleted()) {
+    std::this_thread::sleep_for(std::chrono::milliseconds(kWaitBlockTImeout));
+  }
+  return true;
+}
+
+// Same as Wait
+void ProcessGroupHCCL::HCCLTask::Synchronize() { Wait(kWaitTimeout); }
+
+ProcessGroupHCCL::ProcessGroupHCCL(const std::shared_ptr<Store>& store,
+                                   int rank, int size)
+    : ProcessGroup(rank, size), store_(store) {}
+
+void ProcessGroupHCCL::BroadcastUniqueHCCLID(
+    std::vector<HcclRootInfo>& hccl_ids) {  // NOLINT
+  if (rank_ == 0) {
+    for (size_t i = 0; i < hccl_ids.size(); i++) {
+      auto key = "ProcessGroupHCCL/hccl_ids/" + std::to_string(i);
+      auto hccl_id = std::vector<uint8_t>(
+          reinterpret_cast<uint8_t*>(&hccl_ids[i]),
+          reinterpret_cast<uint8_t*>(&hccl_ids[i]) + sizeof(HcclRootInfo));
+      store_->set(key, hccl_id);
+    }
+  } else {
+    for (size_t i = 0; i < hccl_ids.size(); i++) {
+      auto key = "ProcessGroupHCCL/hccl_ids/" + std::to_string(i);
+      auto ret = store_->get(key);
+      std::memcpy(&hccl_ids[i], ret.data(), ret.size());
+    }
+  }
+}
+
+// create HCCLManager cache for places_key
+void ProcessGroupHCCL::CreateHCCLManagerCache(
+    const std::string& places_key, const std::vector<Place>& places) {
+  PADDLE_ENFORCE_EQ(places_key.empty(), false,
+                    platform::errors::PreconditionNotMet(
+                        "Not able to create/get the HCCL Communicator since "
+                        "the NPU place are not known"));
+
+  std::vector<std::shared_ptr<HCCLCommManager>> hccl_comms;
+  hccl_comms.resize(places.size());
+
+  // using vector just for broadcast
+  std::vector<HcclRootInfo> hccl_ids;
+  hccl_ids.resize(1);
+  auto& hccl_id = hccl_ids.front();
+
+  if (rank_ == 0) {
+    PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclGetRootInfo(&hccl_id));
+  }
+  BroadcastUniqueHCCLID(hccl_ids);
+
+  VLOG(3) << "init hccl rank: " << rank_ << ", nranks: " << size_
+          << ", place: " << places_key
+          << ", hccl uniqueid: " << SerializeHCCLUniqueId(hccl_id);
+
+  std::vector<std::unique_ptr<NPUDeviceContext>> dev_ctx;
+  dev_ctx.resize(places.size());
+
+  std::unique_ptr<HcclComm[]> comms(new HcclComm[places.size()]);
+  for (size_t i = 0; i < places.size(); ++i) {
+    platform::NPUDeviceGuard guard(places[i].GetDeviceId());
+    hccl_comms[i] = HCCLCommManager::Create(GetSize(), GetRank(), &hccl_id,
+                                            comms.get() + i);
+    dev_ctx[i].reset(new NPUDeviceContext(places[i]));
+  }
+
+  std::vector<NPUEventManager> events;
+  events.resize(places.size());
+
+  // These caches will be useful to process sync/wait/communicate
+  places_to_events_.emplace(places_key, std::move(events));
+  places_to_hcclcomm_.emplace(places_key, std::move(hccl_comms));
+  places_to_ctx_.emplace(places_key, std::move(dev_ctx));
+}
+
+template <typename Fn>
+std::shared_ptr<ProcessGroup::Task> ProcessGroupHCCL::Collective(
+    std::vector<Tensor>& inputs, std::vector<Tensor>& outputs, Fn fn,
+    CommType op_type) {
+  const auto places = GetPlaceList(inputs);
+  const auto key = GetKeyFromPlaces(places);
+
+  {
+    std::lock_guard<std::mutex> lock(mutex_);
+    if (places_to_hcclcomm_.find(key) == places_to_hcclcomm_.end()) {
+      CreateHCCLManagerCache(key, places);
+    }
+  }
+
+  auto& hccl_comms = places_to_hcclcomm_[key];
+
+  SyncDefaultStream(places, places_to_events_[key], places_to_ctx_[key]);
+
+  auto task = CreateTask(places, rank_, op_type, inputs);
+  task->SetOutputs(outputs);
+
+  // if (FLAGS_use_stream_safe_npu_allocator) {
+  //   for (size_t i = 0; i < inputs.size(); ++i) {
+  //     platform::NPUDeviceGuard guard(places[i].GetDeviceId());
+  //     auto dense_tensor =
+  //         std::dynamic_pointer_cast<phi::DenseTensor>(inputs[i].impl());
+  //     memory::RecordStream(dense_tensor->Holder(),
+  //                          places_to_ctx_[key][i]->stream());
+  //   }
+  // }
+
+  for (size_t i = 0; i < inputs.size(); ++i) {
+    platform::NPUDeviceGuard guard(places[i].GetDeviceId());
+    const auto& hccl_stream = places_to_ctx_[key][i]->stream();
+    fn(inputs[i], outputs[i], hccl_comms[i]->GetHcclComm(), hccl_stream);
+  }
+
+  for (size_t i = 0; i < inputs.size(); ++i) {
+    platform::NPUDeviceGuard guard(places[i].GetDeviceId());
+    task->control_events_[i].Record(*places_to_ctx_[key][i]);
+  }
+  return task;
+}
+
+template <typename Fn>
+std::shared_ptr<ProcessGroup::Task> ProcessGroupHCCL::PointToPoint(
+    std::vector<Tensor>& tensors, Fn fn, int dst_rank, CommType op_type) {
+  const auto places = GetPlaceList(tensors);
+  const auto key = GetKeyFromPlaces(places);
+
+  {
+    std::lock_guard<std::mutex> lock(mutex_);
+    if (places_to_hcclcomm_.find(key) == places_to_hcclcomm_.end()) {
+      CreateHCCLManagerCache(key, places);
+    }
+  }
+
+  auto& hccl_comms = places_to_hcclcomm_[key];
+
+  SyncDefaultStream(places, places_to_events_[key], places_to_ctx_[key]);
+
+  auto task = CreateTask(places, rank_, op_type, tensors);
+
+  // construct uninitialize guard for device
+
+  // if (FLAGS_use_stream_safe_npu_allocator) {
+  //   for (size_t i = 0; i < tensors.size(); ++i) {
+  //     platform::NPUDeviceGuard guard(places[i].GetDeviceId());
+  //     auto dense_tensor =
+  //         std::dynamic_pointer_cast<phi::DenseTensor>(tensors[i].impl());
+  //     memory::RecordStream(dense_tensor->Holder(),
+  //                          places_to_ctx_[key][i]->stream());
+  //   }
+  // }
+
+  for (size_t i = 0; i < tensors.size(); ++i) {
+    platform::NPUDeviceGuard guard(places[i].GetDeviceId());
+    const auto& hccl_stream = places_to_ctx_[key][i]->stream();
+    fn(tensors[i], hccl_comms[i]->GetHcclComm(), hccl_stream, dst_rank);
+  }
+
+  for (size_t i = 0; i < tensors.size(); ++i) {
+    platform::NPUDeviceGuard guard(places[i].GetDeviceId());
+    task->control_events_[i].Record(*places_to_ctx_[key][i]);
+  }
+  return task;
+}
+
+std::shared_ptr<ProcessGroup::Task> ProcessGroupHCCL::AllReduce(
+    std::vector<Tensor>& tensors, const AllreduceOptions& opts) {
+  // PADDLE_ENFORCE_EQ(
+  //     CheckTensorsInNPUPlace(tensors), true,
+  //     platform::errors::InvalidArgument("All inputs should be in
+  //     NPUPlace."));
+  return Collective(
+      tensors, tensors,
+      [&](const Tensor& input, Tensor& output, HcclComm comm,
+          const aclrtStream& stream) {
+        auto input_tensor =
+            std::dynamic_pointer_cast<phi::DenseTensor>(input.impl());
+        auto output_tensor =
+            std::dynamic_pointer_cast<phi::DenseTensor>(output.impl());
+        return platform::dynload::HcclAllReduce(
+            input_tensor->data(), output_tensor->data(), input_tensor->numel(),
+            platform::ToHCCLDataType(input.type()),
+            ToHCCLRedType(opts.reduce_op), comm, stream);
+      },
+      CommType::ALLREDUCE);
+}
+
+std::shared_ptr<ProcessGroup::Task> ProcessGroupHCCL::Broadcast(
+    std::vector<Tensor>& tensors, const BroadcastOptions& opts) {
+  // PADDLE_ENFORCE_EQ(
+  //     CheckTensorsInNPUPlace(tensors), true,
+  //     platform::errors::InvalidArgument("All inputs should be in
+  //     CudaPlace."));
+
+  return Collective(
+      tensors, tensors,
+      [&](Tensor& input, Tensor& output, HcclComm comm,
+          const aclrtStream& stream) {
+        const auto root = opts.source_rank * tensors.size() + opts.source_root;
+        auto input_tensor =
+            std::dynamic_pointer_cast<phi::DenseTensor>(input.impl());
+        auto output_tensor =
+            std::dynamic_pointer_cast<phi::DenseTensor>(output.impl());
+        return platform::dynload::HcclBroadcast(
+            input_tensor->data(), input_tensor->numel(),
+            platform::ToHCCLDataType(input.type()), root, comm, stream);
+      },
+      CommType::BROADCAST);
+}
+
+}  //  namespace distributed
+}  //  namespace paddle
diff --git a/paddle/fluid/distributed/collective/ProcessGroupHCCL.h b/paddle/fluid/distributed/collective/ProcessGroupHCCL.h
new file mode 100644
index 0000000000000000000000000000000000000000..83d509be2cdd7b79faf4e2a2f510c34361b94157
--- /dev/null
+++ b/paddle/fluid/distributed/collective/ProcessGroupHCCL.h
@@ -0,0 +1,129 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <chrono>
+#include <map>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "paddle/fluid/distributed/collective/ProcessGroup.h"
+#include "paddle/fluid/platform/device/npu/npu_stream.h"
+#include "paddle/fluid/platform/device_context.h"
+
+#include "paddle/fluid/distributed/collective/HCCLTools.h"
+#include "paddle/fluid/distributed/store/store.h"
+#include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/platform/gen_comm_id_helper.h"
+#include "paddle/fluid/platform/place.h"
+
+constexpr const char* HCCL_BACKEND_NAME = "HCCL";
+
+namespace paddle {
+namespace distributed {
+
+using Place = paddle::platform::Place;
+using NPUStream = platform::stream::NPUStream;
+using NPUDeviceContext = paddle::platform::NPUDeviceContext;
+
+class ProcessGroupHCCL : public ProcessGroup {
+ public:
+  class HCCLTask : public ProcessGroup::Task,
+                   public std::enable_shared_from_this<HCCLTask> {
+   public:
+    HCCLTask(const std::vector<Place>& places, int rank, CommType CommType,
+             const std::vector<Tensor>& inputs);
+
+    bool IsCompleted();
+
+    void SynchronizeStreams();
+
+    bool Wait(std::chrono::milliseconds timeout = kWaitTimeout);
+
+    void Synchronize();
+
+    void SetOutputs(std::vector<Tensor>& outputs);  // NOLINT
+
+    virtual ~HCCLTask();
+
+    std::vector<NPUEventManager> control_events_;
+
+   protected:
+    std::vector<Place> places_;
+    std::vector<std::shared_ptr<HCCLCommManager>> hcclComms_;
+    std::shared_ptr<std::vector<Tensor>> outputs_;
+
+   private:
+  };
+
+  ProcessGroupHCCL(const std::shared_ptr<Store>& store, int rank, int size);
+
+  const std::string GetBackendName() const override {
+    return std::string(HCCL_BACKEND_NAME);
+  }
+
+  std::shared_ptr<ProcessGroup::Task> AllReduce(
+      std::vector<Tensor>& tensors,
+      const AllreduceOptions& = AllreduceOptions()) override;
+
+  std::shared_ptr<ProcessGroup::Task> Broadcast(
+      std::vector<Tensor>& tensors,
+      const BroadcastOptions& = BroadcastOptions()) override;
+
+ protected:
+  virtual std::shared_ptr<ProcessGroupHCCL::HCCLTask> CreateTask(
+      std::vector<Place> places, int rank, CommType opType,
+      const std::vector<Tensor>& inputs);
+
+  std::shared_ptr<Store> store_;
+  std::shared_ptr<HCCLCommManager> hccl_comm_;
+  std::mutex mutex_;
+  std::unordered_map<std::string, std::vector<std::shared_ptr<HCCLCommManager>>>
+      places_to_hcclcomm_;
+
+  std::unordered_map<std::string, std::vector<NPUEventManager>>
+      places_to_events_;
+
+  std::unordered_map<std::string,
+                     std::vector<std::unique_ptr<NPUDeviceContext>>>
+      places_to_ctx_;
+
+  std::set<int> used_place_ids_;
+
+ private:
+  void BcastHCCLId(std::vector<HcclRootInfo>& hccl_ids, int root,  // NOLINT
+                   int server_fd);
+
+  void BroadcastUniqueHCCLID(std::vector<HcclRootInfo>& hccl_ids);  // NOLINT
+
+  template <typename Fn>
+  std::shared_ptr<ProcessGroup::Task> Collective(
+      std::vector<Tensor>& inputs,   // NOLINT
+      std::vector<Tensor>& outputs,  // NOLINT
+      Fn fn, CommType op_type);
+
+  template <typename Fn>
+  std::shared_ptr<ProcessGroup::Task> PointToPoint(
+      std::vector<Tensor>& tensors,  // NOLINT
+      Fn fn, int dst_rank, CommType op_type);
+
+  void CreateHCCLManagerCache(const std::string& places_key,
+                              const std::vector<Place>& places);
+};
+
+}  //  namespace distributed
+}  //  namespace paddle
diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc
index 5d96e730aa4b1aeae3fc242ca43f63d909325a4e..7f21bcee87ab705097d3c2beaf799e5f2d93b833 100644
--- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc
+++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc
@@ -88,8 +88,8 @@ void SyncDefaultStream(
   for (size_t i = 0; i < places.size(); ++i) {
     auto* default_ctx = static_cast<platform::CUDADeviceContext*>(
         platform::DeviceContextPool::Instance().Get(places[i]));
-    ncclEvents[i].Record(*dev_ctx[i]);
-    ncclEvents[i].Block(*default_ctx);
+    ncclEvents[i].Record(*default_ctx);
+    ncclEvents[i].Block(*dev_ctx[i]);
   }
 }
 
@@ -156,36 +156,27 @@ bool ProcessGroupNCCL::NCCLTask::Wait(std::chrono::milliseconds timeout) {
 // Same as Wait
 void ProcessGroupNCCL::NCCLTask::Synchronize() { Wait(kWaitTimeout); }
 
-ProcessGroupNCCL::ProcessGroupNCCL(const ProcessGroupStrategy& strategy,
+ProcessGroupNCCL::ProcessGroupNCCL(const std::shared_ptr<Store>& store,
                                    int rank, int size)
-    : ProcessGroup(rank, size), strategy_(strategy) {}
-
-void ProcessGroupNCCL::BcastNCCLId(
-    std::vector<ncclUniqueId>& nccl_ids,  // NOLINT
-    int root, int server_fd) {
-  if (strategy_.local_rank_ == root) {
-    std::vector<std::string> other_trainers;
-    for (auto& ep : strategy_.trainer_endpoints_) {
-      if (ep != strategy_.current_endpoint_) {
-        other_trainers.push_back(ep);
-      }
-    }
-    platform::SendBroadCastCommID(other_trainers, &nccl_ids);
-  } else {
-    platform::RecvBroadCastCommID(server_fd, strategy_.current_endpoint_,
-                                  &nccl_ids);
-  }
-}
+    : ProcessGroup(rank, size), store_(store) {}
 
 void ProcessGroupNCCL::BroadcastUniqueNCCLID(
     std::vector<ncclUniqueId>& nccl_ids) {  // NOLINT
-
-  int server_fd = -1;
-  if (rank_ != 0) {
-    server_fd = platform::SocketServer::GetInstance(strategy_.current_endpoint_)
-                    .socket();
+  if (rank_ == 0) {
+    for (size_t i = 0; i < nccl_ids.size(); i++) {
+      auto key = "ProcessGroupNCCL/nccl_ids/" + std::to_string(i);
+      auto nccl_id = std::vector<uint8_t>(
+          reinterpret_cast<uint8_t*>(&nccl_ids[i]),
+          reinterpret_cast<uint8_t*>(&nccl_ids[i]) + NCCL_UNIQUE_ID_BYTES);
+      store_->set(key, nccl_id);
+    }
+  } else {
+    for (size_t i = 0; i < nccl_ids.size(); i++) {
+      auto key = "ProcessGroupNCCL/nccl_ids/" + std::to_string(i);
+      auto ret = store_->get(key);
+      std::memcpy(&nccl_ids[i], ret.data(), ret.size());
+    }
   }
-  BcastNCCLId(nccl_ids, 0, server_fd);
 }
 
 // create NCCLManager cache for places_key
@@ -213,8 +204,8 @@ void ProcessGroupNCCL::CreateNCCLManagerCache(
   }
   BroadcastUniqueNCCLID(nccl_ids);
 
-  VLOG(3) << "init nccl rank: " << strategy_.local_rank_
-          << ", nranks: " << strategy_.nranks_ << ", place: " << places_key
+  VLOG(3) << "init nccl rank: " << rank_ << ", nranks: " << size_
+          << ", place: " << places_key
           << ", nccl uniqueid: " << SerializeNCCLUniqueId(nccl_id);
 
   std::vector<std::unique_ptr<CUDADeviceContext>> dev_ctx;
@@ -473,5 +464,148 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Recv(
   return task;
 }
 
+std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::AllGather(
+    std::vector<Tensor>& in_tensors, std::vector<Tensor>& out_tensors) {
+  PADDLE_ENFORCE_EQ(
+      CheckTensorsInCudaPlace(in_tensors), true,
+      platform::errors::InvalidArgument("All inputs should be in CudaPlace."));
+  PADDLE_ENFORCE_EQ(
+      CheckTensorsInCudaPlace(out_tensors), true,
+      platform::errors::InvalidArgument("All outputs should be in CudaPlace."));
+  return Collective(
+      in_tensors, out_tensors,
+      [&](const Tensor& input, Tensor& output, ncclComm_t comm,
+          const gpuStream_t& stream) {
+        auto input_tensor =
+            std::dynamic_pointer_cast<phi::DenseTensor>(input.impl());
+        auto output_tensor =
+            std::dynamic_pointer_cast<phi::DenseTensor>(output.impl());
+        return platform::dynload::ncclAllGather(
+            input_tensor->data(), output_tensor->data(), input_tensor->numel(),
+            platform::ToNCCLDataType(input.type()), comm, stream);
+      },
+      CommType::ALLGATHER);
+}
+
+void* GetPointerByOffset(void* raw_pointer, size_t offset,
+                         experimental::DataType type) {
+  if (type == experimental::DataType::FLOAT32) {
+    return reinterpret_cast<void*>(reinterpret_cast<float*>(raw_pointer) +
+                                   offset);
+  } else if (type == experimental::DataType::FLOAT64) {
+    return reinterpret_cast<void*>(reinterpret_cast<double*>(raw_pointer) +
+                                   offset);
+  } else if (type == experimental::DataType::INT32) {
+    return reinterpret_cast<void*>(reinterpret_cast<int32_t*>(raw_pointer) +
+                                   offset);
+  } else if (type == experimental::DataType::INT64) {
+    return reinterpret_cast<void*>(reinterpret_cast<int64_t*>(raw_pointer) +
+                                   offset);
+  } else if (type == experimental::DataType::FLOAT16) {
+    return reinterpret_cast<void*>(reinterpret_cast<int16_t*>(raw_pointer) +
+                                   offset);
+  } else {
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "This datatype in nccl is not supported."));
+  }
+}
+
+std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::AllToAll(
+    std::vector<Tensor>& in_tensors, std::vector<Tensor>& out_tensors) {
+  PADDLE_ENFORCE_EQ(
+      CheckTensorsInCudaPlace(in_tensors), true,
+      platform::errors::InvalidArgument("All inputs should be in CudaPlace."));
+  PADDLE_ENFORCE_EQ(
+      CheckTensorsInCudaPlace(out_tensors), true,
+      platform::errors::InvalidArgument("All inputs should be in CudaPlace."));
+  return Collective(
+      in_tensors, out_tensors,
+      [&](const Tensor& input, Tensor& output, ncclComm_t comm,
+          const gpuStream_t& stream) {
+        auto input_tensor =
+            std::dynamic_pointer_cast<phi::DenseTensor>(input.impl());
+        auto output_tensor =
+            std::dynamic_pointer_cast<phi::DenseTensor>(output.impl());
+        size_t offset = 0;
+        PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart());
+        for (auto i = 0; i < size_; i++) {
+          PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclSend(
+              GetPointerByOffset(input_tensor->data(), offset, input.type()),
+              input_tensor->numel() / size_,
+              platform::ToNCCLDataType(input.type()), i, comm, stream));
+          PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv(
+              GetPointerByOffset(output_tensor->data(), offset, input.type()),
+              input_tensor->numel() / size_,
+              platform::ToNCCLDataType(input.type()), i, comm, stream));
+          offset += input_tensor->numel() / size_;
+        }
+        PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd());
+      },
+      CommType::ALLREDUCE);
+}
+
+std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Reduce(
+    std::vector<Tensor>& tensors, const ReduceOptions& opts) {
+  PADDLE_ENFORCE_EQ(
+      CheckTensorsInCudaPlace(tensors), true,
+      platform::errors::InvalidArgument("All inputs should be in CudaPlace."));
+  return Collective(
+      tensors, tensors,
+      [&](const Tensor& input, Tensor& output, ncclComm_t comm,
+          const gpuStream_t& stream) {
+        auto input_tensor =
+            std::dynamic_pointer_cast<phi::DenseTensor>(input.impl());
+        auto output_tensor =
+            std::dynamic_pointer_cast<phi::DenseTensor>(output.impl());
+        PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclReduce(
+            input_tensor->data(), output_tensor->data(), input.numel(),
+            platform::ToNCCLDataType(input.type()),
+            ToNCCLRedType(opts.reduce_op), opts.root_rank, comm, stream));
+      },
+      CommType::REDUCE);
+}
+
+std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Scatter(
+    std::vector<Tensor>& in_tensors, std::vector<Tensor>& out_tensors,
+    const ScatterOptions& opts) {
+  PADDLE_ENFORCE_EQ(
+      CheckTensorsInCudaPlace(in_tensors), true,
+      platform::errors::InvalidArgument("All inputs should be in CudaPlace."));
+  PADDLE_ENFORCE_EQ(
+      CheckTensorsInCudaPlace(out_tensors), true,
+      platform::errors::InvalidArgument("All inputs should be in CudaPlace."));
+  return Collective(
+      in_tensors, out_tensors,
+      [&](const Tensor& input, Tensor& output, ncclComm_t comm,
+          const gpuStream_t& stream) {
+        auto input_tensor =
+            std::dynamic_pointer_cast<phi::DenseTensor>(input.impl());
+        auto output_tensor =
+            std::dynamic_pointer_cast<phi::DenseTensor>(output.impl());
+        size_t offset = 0;
+        if (rank_ == opts.root_rank) {
+          PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart());
+          for (auto i = 0; i < size_; i++) {
+            PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclSend(
+                GetPointerByOffset(input_tensor->data(), offset, input.type()),
+                input_tensor->numel() / size_,
+                platform::ToNCCLDataType(input.type()), i, comm, stream));
+            offset += input_tensor->numel() / size_;
+          }
+          PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv(
+              output_tensor->data(), input_tensor->numel() / size_,
+              platform::ToNCCLDataType(input.type()), opts.root_rank, comm,
+              stream));
+          PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd());
+        } else {
+          PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv(
+              output_tensor->data(), input_tensor->numel() / size_,
+              platform::ToNCCLDataType(input.type()), opts.root_rank, comm,
+              stream));
+        }
+      },
+      CommType::SCATTER);
+}
+
 }  //  namespace distributed
 }  //  namespace paddle
diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.h b/paddle/fluid/distributed/collective/ProcessGroupNCCL.h
index cfeb6467f0dbf21f116b1880f8b64a55bb2314a1..aa2a2b8fa2088cd30729ba5e6184ef7a9c507bf3 100644
--- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.h
+++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.h
@@ -25,6 +25,7 @@
 #include "paddle/fluid/platform/cuda_device_guard.h"
 #include "paddle/fluid/platform/device_context.h"
 
+#include "paddle/fluid/distributed/store/store.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/gen_comm_id_helper.h"
 #include "paddle/fluid/platform/place.h"
@@ -75,7 +76,7 @@ class ProcessGroupNCCL : public ProcessGroup {
    private:
   };
 
-  ProcessGroupNCCL(const ProcessGroupStrategy& strategy, int rank, int size);
+  ProcessGroupNCCL(const std::shared_ptr<Store>& store, int rank, int size);
 
   const std::string GetBackendName() const override {
     return std::string(NCCL_BACKEND_NAME);
@@ -98,13 +99,27 @@ class ProcessGroupNCCL : public ProcessGroup {
   std::shared_ptr<ProcessGroup::Task> Recv(std::vector<Tensor>& tensors,
                                            int src_rank) override;
 
+  std::shared_ptr<ProcessGroup::Task> AllGather(
+      std::vector<Tensor>& in_tensors,
+      std::vector<Tensor>& out_tensors) override;
+
+  std::shared_ptr<ProcessGroup::Task> AllToAll(
+      std::vector<Tensor>& in, std::vector<Tensor>& out) override;
+
+  std::shared_ptr<ProcessGroup::Task> Reduce(
+      std::vector<Tensor>& tensors, const ReduceOptions& opts) override;
+
+  std::shared_ptr<ProcessGroup::Task> Scatter(std::vector<Tensor>& in_tensors,
+                                              std::vector<Tensor>& out_tensors,
+                                              const ScatterOptions&) override;
+
  protected:
   virtual std::shared_ptr<ProcessGroupNCCL::NCCLTask> CreateTask(
       std::vector<Place> places, int rank, CommType opType,
       const std::vector<Tensor>& inputs);
 
  protected:
-  ProcessGroupStrategy strategy_;
+  std::shared_ptr<Store> store_;
   std::shared_ptr<NCCLCommManager> nccl_comm_;
   std::mutex mutex_;
   std::unordered_map<std::string, std::vector<std::shared_ptr<NCCLCommManager>>>
diff --git a/paddle/fluid/distributed/collective/Types.h b/paddle/fluid/distributed/collective/Types.h
index 699222ac452dbcc2f0b1b41c70c6036dc915a427..973f7c643542757c0bce68f8ccdefeadc97f15d4 100644
--- a/paddle/fluid/distributed/collective/Types.h
+++ b/paddle/fluid/distributed/collective/Types.h
@@ -36,5 +36,14 @@ struct BarrierOptions {
   std::vector<int> place_ids;
 };
 
+struct ReduceOptions {
+  ReduceOp reduce_op = ReduceOp::SUM;
+  int root_rank = 0;
+};
+
+struct ScatterOptions {
+  int root_rank = 0;
+};
+
 }  //  namespace distributed
 }  //  namespace paddle
diff --git a/paddle/fluid/distributed/collective/reducer.cc b/paddle/fluid/distributed/collective/reducer.cc
index 59f3ea3b0a7d85651e7780b4b11875f19b70931e..5533f3f4cbf4b136c52b35cb74afefb86cbe73d7 100644
--- a/paddle/fluid/distributed/collective/reducer.cc
+++ b/paddle/fluid/distributed/collective/reducer.cc
@@ -13,7 +13,6 @@
 // limitations under the License.
 
 #include "paddle/fluid/distributed/collective/reducer.h"
-#include "paddle/phi/common/data_type.h"
 
 namespace paddle {
 namespace distributed {
@@ -127,5 +126,430 @@ std::vector<std::vector<size_t>> Eager_AssignGroupBySize(
   return res;
 }
 
+template <typename DeviceContext, typename T>
+static void ConcatTensorsForAllReduce(
+    const DeviceContext &context,
+    const std::vector<phi::DenseTensor> &dense_tensors_,
+    Tensor *p_dense_contents) {
+  operators::math::ConcatFunctor<DeviceContext, T> concat_functor_;
+  concat_functor_(
+      context, dense_tensors_, 0,
+      std::dynamic_pointer_cast<phi::DenseTensor>(p_dense_contents->impl())
+          .get());
+}
+
+template <typename DeviceContext, typename T>
+static void SplitTensorsForAllReduce(
+    const DeviceContext &context, Tensor *p_dense_contents,
+    std::vector<phi::DenseTensor> *p_dense_tensors) {
+  auto *in =
+      std::dynamic_pointer_cast<phi::DenseTensor>(p_dense_contents->impl())
+          .get();
+  std::vector<phi::DenseTensor *> outs;
+  std::vector<const phi::DenseTensor *> shape_refer;
+
+  outs.reserve(p_dense_tensors->size());
+  shape_refer.reserve(p_dense_tensors->size());
+
+  for (auto &tensor : *p_dense_tensors) {
+    outs.emplace_back(&tensor);
+    shape_refer.emplace_back(&tensor);
+  }
+
+  operators::math::SplitFunctor<DeviceContext, T> split_functor_;
+  split_functor_(context, *in, shape_refer, 0, &outs);
+}
+
+// context is used to select the stream for concat
+template <typename DeviceContext>
+static void ConcatTensorsWithType(
+    const DeviceContext &context,
+    const std::vector<phi::DenseTensor> &dense_tensors_,
+    Tensor *p_dense_contents, phi::DataType type) {
+  switch (type) {
+    case phi::DataType::FLOAT16:
+      ConcatTensorsForAllReduce<DeviceContext, platform::float16>(
+          context, dense_tensors_, p_dense_contents);
+      break;
+    case phi::DataType::FLOAT32:
+      ConcatTensorsForAllReduce<DeviceContext, float>(context, dense_tensors_,
+                                                      p_dense_contents);
+      break;
+    case phi::DataType::FLOAT64:
+      ConcatTensorsForAllReduce<DeviceContext, double>(context, dense_tensors_,
+                                                       p_dense_contents);
+      break;
+    default:
+      PADDLE_THROW(platform::errors::Unimplemented(
+          "Data type (%s) is not supported when it concats tensors for "
+          "allreduce.",
+          type));
+  }
+}
+
+// context is used to select the stream for split
+template <typename DeviceContext>
+static void SplitTensorsWithType(const DeviceContext &context,
+                                 Tensor *p_dense_contents,
+                                 std::vector<phi::DenseTensor> *p_dense_tensors,
+                                 phi::DataType type) {
+  switch (type) {
+    case phi::DataType::FLOAT16:
+      SplitTensorsForAllReduce<DeviceContext, platform::float16>(
+          context, p_dense_contents, p_dense_tensors);
+      break;
+    case phi::DataType::FLOAT32:
+      SplitTensorsForAllReduce<DeviceContext, float>(context, p_dense_contents,
+                                                     p_dense_tensors);
+      break;
+    case phi::DataType::FLOAT64:
+      SplitTensorsForAllReduce<DeviceContext, double>(context, p_dense_contents,
+                                                      p_dense_tensors);
+      break;
+    default:
+      PADDLE_THROW(platform::errors::Unimplemented(
+          "Data type (%s) is not supported when it splits tensors for "
+          "allreduce.",
+          type));
+  }
+}
+
+void EagerGroup::ConcatTensors(const platform::Place &place) {
+  if (platform::is_gpu_place(place)) {
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
+    auto *default_ctx = static_cast<platform::CUDADeviceContext *>(
+        platform::DeviceContextPool::Instance().Get(place));
+    ConcatTensorsWithType(*default_ctx, dense_tensors_, &dense_contents_,
+                          dtype_);
+#else
+    PADDLE_THROW(platform::errors::PermissionDenied(
+        "Paddle can't concat grad tensors since it's not compiled with NCCL,"
+        "Please recompile or reinstall Paddle with NCCL support."));
+#endif
+  } else if (platform::is_cpu_place(place)) {
+    auto *default_ctx = static_cast<platform::CPUDeviceContext *>(
+        platform::DeviceContextPool::Instance().Get(place));
+    ConcatTensorsWithType(*default_ctx, dense_tensors_, &dense_contents_,
+                          dtype_);
+  } else {
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "Concat grad tensor not supported on place (%s)", place));
+  }
+}
+
+void EagerGroup::SplitTensors(const platform::Place &place) {
+  if (platform::is_gpu_place(place)) {
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
+    auto *default_ctx = static_cast<platform::CUDADeviceContext *>(
+        platform::DeviceContextPool::Instance().Get(place));
+    SplitTensorsWithType(*default_ctx, &dense_contents_, &dense_tensors_,
+                         dtype_);
+#else
+    PADDLE_THROW(platform::errors::PermissionDenied(
+        "Paddle can't split grad tensor since it's not compiled with NCCL,"
+        "Please recompile or reinstall Paddle with NCCL support."));
+#endif
+  } else if (platform::is_cpu_place(place)) {
+    auto *default_ctx = static_cast<platform::CPUDeviceContext *>(
+        platform::DeviceContextPool::Instance().Get(place));
+    SplitTensorsWithType(*default_ctx, &dense_contents_, &dense_tensors_,
+                         dtype_);
+  } else {
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "Split grad tensor not supported on place (%s)", place));
+  }
+}
+
+EagerReducer::EagerReducer(
+    const std::vector<Tensor> tensors,
+    const std::vector<std::vector<size_t>> &group_indices,
+    const std::vector<bool> &is_sparse_gradient,
+    std::shared_ptr<distributed::ProcessGroup> process_group,
+    const std::vector<size_t> &group_size_limits, bool find_unused_parameters)
+    : tensors_(tensors),
+      group_indices_(group_indices),
+      is_sparse_gradient_(is_sparse_gradient),
+      process_group_(process_group),
+      group_size_limits_(group_size_limits),
+      find_unused_vars_each_step_(find_unused_parameters) {
+  VLOG(3) << "Start construct the Reducer ...";
+
+  nranks_ = process_group_->GetSize();
+
+  // initialize groups
+  InitializeGroups(group_indices);
+
+  for (size_t global_var_index = 0; global_var_index < tensors_.size();
+       ++global_var_index) {
+    auto tensor = tensors_[global_var_index];
+    auto reduce_hook = [=](void) -> void {
+      this->AddDistHook(global_var_index);
+    };
+
+    const auto &grad_node = GetGradNodeFromTensor(&tensor);
+
+    PADDLE_ENFORCE(
+        grad_node.get() != nullptr,
+        paddle::platform::errors::Fatal("Detected NULL grad_node,"
+                                        "Leaf tensor should have had grad_node "
+                                        "with type: GradNodeAccumulation"));
+    const auto &accumulation_grad_node =
+        std::dynamic_pointer_cast<egr::GradNodeAccumulation>(grad_node);
+    accumulation_grad_node->RegisterReduceHook(
+        std::make_shared<egr::CppTensorVoidHook>(reduce_hook));
+  }
+
+  vars_marked_ready_.resize(tensors_.size(), false);
+  local_used_vars_.resize(tensors_.size(), 0);
+}
+
+std::shared_ptr<egr::GradNodeBase> EagerReducer::GetGradNodeFromTensor(
+    Tensor *tensor) {
+  auto *autograd_meta = tensor->get_autograd_meta();
+  const auto &grad_node =
+      static_cast<egr::AutogradMeta *>(autograd_meta)->GetMutableGradNode();
+  return grad_node;
+}
+
+void EagerReducer::InitializeGroups(
+    const std::vector<std::vector<size_t>> &group_indices) {
+  VLOG(3) << "Start initialize groups ..";
+
+  // clear the group
+  groups_.clear();
+  groups_.reserve(group_indices.size());
+
+  variable_locators_.clear();
+  variable_locators_.resize(tensors_.size());
+
+  auto group_nums = group_indices.size();
+  for (size_t group_index = 0; group_index < group_nums; ++group_index) {
+    const auto &tensor_indices_ = group_indices[group_index];
+    PADDLE_ENFORCE_GT(
+        tensor_indices_.size(), 0,
+        platform::errors::PreconditionNotMet(
+            "The number of group[%d]'s elements is 0.", group_index));
+
+    EagerGroup group;
+
+    // It's just for check the sparse or dense
+    auto first_var = tensors_[tensor_indices_.front()];
+    if (tensor_indices_.size() == 1 &&
+        is_sparse_gradient_[tensor_indices_.front()]) {
+      // process the sparse gradient. one sparse, one group
+      group.dtype_ = first_var.dtype();
+    } else {
+      // process the dense gradient.
+      InitializeDenseGroups(tensor_indices_, &group);
+      experimental::Backend backend;
+      switch (inner_place_.GetType()) {
+        case phi::AllocationType::GPU:
+          backend = experimental::Backend::GPU;
+          break;
+        case phi::AllocationType::CPU:
+          backend = experimental::Backend::CPU;
+          break;
+        default:
+          PADDLE_THROW(platform::errors::Unimplemented(
+              "Place type (%s) is not supported. ", inner_place_));
+          break;
+      }
+      group.dense_contents_ = paddle::experimental::empty(
+          ScalarArray({group.all_length_}), group.dtype_, backend);
+    }
+
+    // map tensors to this group by VariableLocator
+    size_t inside_group_index = 0;
+    for (const auto var_index : tensor_indices_) {
+      TensorLocator tensor_locator;
+      tensor_locator.group_index = group_index;
+      tensor_locator.inside_group_index = inside_group_index++;
+      variable_locators_[var_index] = tensor_locator;
+    }
+    group.tensor_indices_ = std::move(tensor_indices_);
+    groups_.emplace_back(std::move(group));
+
+    VLOG(3) << "The Group[" << group_index << "]:" << groups_.back();
+  }
+}
+
+void EagerReducer::InitializeDenseGroups(
+    const std::vector<size_t> &tensor_indices_, EagerGroup *p_group) {
+  VLOG(3) << "InitializeDenseGroups.";
+  int64_t all_length = 0;
+  for (size_t index = 0; index < tensor_indices_.size(); ++index) {
+    auto tensor_index = tensor_indices_[index];
+    auto &tensor = tensors_[tensor_index];
+    auto &tensor_name = tensor.name();
+
+    PADDLE_ENFORCE_EQ(tensor.is_initialized(), true,
+                      platform::errors::PreconditionNotMet(
+                          "Tensor %s is not initialized.", tensor_name));
+    const auto size = tensor.numel();
+    PADDLE_ENFORCE_GT(
+        size, 0, platform::errors::PreconditionNotMet(
+                     "The number of tensor %s's elements is 0.", tensor_name));
+    all_length += size;
+
+    p_group->length_.push_back(size);
+
+    // for concat operator
+    p_group->origin_shapes_.push_back(ScalarArray(tensor.shape()));
+    p_group->dense_tensors_.push_back(phi::DenseTensor());
+
+    const auto &dtype = tensor.dtype();
+    const auto &place = tensor.place();
+    const auto &inner_place = tensor.impl()->place();
+    if (index > 0) {
+      PADDLE_ENFORCE_EQ(dtype, p_group->dtype_,
+                        platform::errors::PreconditionNotMet(
+                            "Tensor %s has unexpected dtype.", tensor_name));
+      PADDLE_ENFORCE_EQ(place, place_,
+                        platform::errors::PreconditionNotMet(
+                            "Tensor %s has different place. Expected place is "
+                            "%s, but actual place is %s",
+                            tensor_name, inner_place_, inner_place));
+    } else {
+      p_group->dtype_ = dtype;
+      place_ = place;
+      inner_place_ = inner_place;
+    }
+  }
+  p_group->all_length_ = all_length;
+}
+
+void EagerReducer::PrepareForBackward(const std::vector<Tensor> &outputs) {
+  VLOG(3) << "after forward, then reset count for backward.";
+  grad_need_hooks_ = true;
+  next_group_ = 0;
+  std::for_each(groups_.begin(), groups_.end(), [](EagerGroup &group) {
+    group.pending_ = group.tensor_indices_.size();
+  });
+
+  // reinitialize vars_marked_ready_ for next iteration
+  vars_marked_ready_.clear();
+  vars_marked_ready_.resize(tensors_.size(), false);
+}
+
+void EagerReducer::AddDistHook(size_t var_index) {
+  PADDLE_ENFORCE_LT(var_index, variable_locators_.size(),
+                    platform::errors::OutOfRange(
+                        "Out of bounds variable index. it must be less"
+                        "than %d, but it is %d",
+                        variable_locators_.size(), var_index));
+
+  // gradient synchronization is not required when grad_need_hooks_ is false.
+  if (!grad_need_hooks_) {
+    return;
+  }
+
+  auto &tensor = tensors_[var_index];
+  const auto &grad_node = GetGradNodeFromTensor(&tensor);
+
+  VLOG(3) << "Var[" << var_index << "] [" << (*grad_node).name()
+          << "] arrived and triggered disthook";
+
+  local_used_vars_[var_index] = 1;
+
+  MarkVarReady(var_index, true);
+}
+
+void EagerReducer::MarkVarReady(const size_t var_index,
+                                const bool is_used_var) {
+  const auto &var_locator = variable_locators_[var_index];
+  const auto group_index = var_locator.group_index;
+  const auto inside_group_index = var_locator.inside_group_index;
+
+  auto &group = groups_[group_index];
+  auto &group_tensor = group.dense_tensors_[inside_group_index];
+  auto *autograd_meta = tensors_[var_index].get_autograd_meta();
+  auto &grad_tensor = static_cast<egr::AutogradMeta *>(autograd_meta)->Grad();
+
+  group_tensor
+      .ShareDataWith(
+          *(std::dynamic_pointer_cast<phi::DenseTensor>(grad_tensor.impl())))
+      .Resize({grad_tensor.numel()});
+
+  vars_marked_ready_[var_index] = true;
+
+  if (--group.pending_ == 0) {
+    // can start allreduce
+    MarkGroupReady(group_index);
+  }
+}
+
+void EagerReducer::MarkGroupReady(size_t group_index) {
+  VLOG(3) << "Group[" << group_index << "] is ready";
+
+  PADDLE_ENFORCE_GE(
+      group_index, next_group_,
+      platform::errors::PreconditionNotMet(
+          "The index of the incoming group must be greater "
+          "than or equal to the previously synchronized group index, "
+          "expect it to greater than or equal to %d, but got %d.",
+          next_group_, group_index));
+
+  if (group_index > next_group_) {
+    VLOG(3) << "It will adjust the order of group in next batch automatically";
+    return;
+  }
+
+  for (; next_group_ < groups_.size() && groups_[next_group_].pending_ == 0;
+       ++next_group_) {
+    UNUSED auto &group = groups_[next_group_];
+    FusedAllReduceSchedule(&group, next_group_);
+  }
+}
+
+void EagerReducer::FusedAllReduceSchedule(EagerGroup *group,
+                                          const int curr_group_index) {
+  // The overall timeline: concat > div_nranks > allreduce > split
+  distributed::AllreduceOptions opts;
+  opts.reduce_op = ReduceOp::SUM;
+
+  VLOG(3) << "group [" << curr_group_index << "] start fused_allreduce.";
+
+  // concat tensors
+  group->ConcatTensors(inner_place_);
+
+  // div nranks
+  double scaling = 1.0 / nranks_;
+  paddle::experimental::scale_(group->dense_contents_, scaling, 0.0, false);
+
+  // all_reduce
+  std::vector<Tensor> reduce_tensors = {group->dense_contents_};
+  tasks_.push_back(process_group_->AllReduce(reduce_tensors, opts));
+
+  if (tasks_.size() == groups_.size()) {
+    for (size_t index = 0; index < tasks_.size(); index++) {
+      auto &task = tasks_.back();
+      task->Synchronize();
+      tasks_.pop_back();
+    }
+    for (size_t index = 0; index < groups_.size(); index++) {
+      auto &group = groups_[index];
+      group.SplitTensors(inner_place_);
+    }
+  }
+}
+
+std::ostream &operator<<(std::ostream &out, const EagerGroup &group) {
+  const auto &tensors_ = group.tensor_indices_;
+  out << "numel: " << group.all_length_ << " ;var number: " << tensors_.size()
+      << "\n";
+  auto begin = tensors_.begin();
+  auto end = tensors_.end();
+  out << "[";
+  for (int i = 0; begin != end && i < 100; ++i, ++begin) {
+    if (i > 0) out << ' ';
+    out << *begin;
+  }
+  if (begin != end) {
+    out << " ...";
+  }
+  out << "]\n";
+  return out;
+}
+
 }  //  namespace distributed
 }  //  namespace paddle
diff --git a/paddle/fluid/distributed/collective/reducer.h b/paddle/fluid/distributed/collective/reducer.h
index f8c75385ef8bd6891df8eda6faa93c73091c37f5..ac6f3fbe5956cd47d4385343509d41afec0b69a4 100644
--- a/paddle/fluid/distributed/collective/reducer.h
+++ b/paddle/fluid/distributed/collective/reducer.h
@@ -17,16 +17,109 @@
 #include <map>
 #include <vector>
 #include "paddle/fluid/distributed/collective/ProcessGroup.h"
+#include "paddle/fluid/eager/accumulation/accumulation_node.h"
+#include "paddle/fluid/eager/api/utils/hook_utils.h"
 #include "paddle/fluid/eager/api/utils/tensor_utils.h"
+#include "paddle/fluid/eager/autograd_meta.h"
+#include "paddle/fluid/eager/utils.h"
+#include "paddle/fluid/operators/math/concat_and_split.h"
+#include "paddle/fluid/platform/device/gpu/gpu_info.h"
+#include "paddle/phi/api/include/api.h"
+#include "paddle/phi/api/include/tensor.h"
+#include "paddle/phi/api/lib/ext_compat_utils.h"
+#include "paddle/phi/common/data_type.h"
 
 namespace paddle {
 namespace distributed {
 using Tensor = paddle::experimental::Tensor;
+using Scalar = paddle::experimental::ScalarBase<paddle::experimental::Tensor>;
+using ScalarArray =
+    paddle::experimental::ScalarArrayBase<paddle::experimental::Tensor>;
 
 std::vector<std::vector<size_t>> Eager_AssignGroupBySize(
-    const std::vector<Tensor>, const std::vector<bool>& is_sparse_gradient,
-    const std::vector<size_t>& group_size_limits,
-    const std::vector<int64_t>& tensor_indices = {});
+    const std::vector<Tensor>, const std::vector<bool> &is_sparse_gradient,
+    const std::vector<size_t> &group_size_limits,
+    const std::vector<int64_t> &tensor_indices = {});
+
+class EagerGroup {
+ public:
+  Tensor dense_contents_;
+
+  // for concat kernel
+  std::vector<phi::DenseTensor> dense_tensors_;
+  std::vector<int64_t> length_;
+  int64_t all_length_{0};
+  std::vector<ScalarArray> origin_shapes_;
+
+  // Global indices of participating tensors in the group
+  std::vector<size_t> tensor_indices_;
+
+  // Number of params that haven't been ready. When it is 0, it means
+  // the group is ready.
+  size_t pending_ = -1;
+
+  // external message of group
+  phi::DataType dtype_;
+
+  // context is used to select the stream for concat
+  void ConcatTensors(const platform::Place &);
+
+  // context is used to select the stream for split
+  void SplitTensors(const platform::Place &);
+
+  friend std::ostream &operator<<(std::ostream &, const EagerGroup &);
+};
+
+struct TensorLocator {
+  // record the index in groups_
+  size_t group_index;
+  size_t inside_group_index;
+};
+
+class EagerReducer {
+ public:
+  explicit EagerReducer(
+      const std::vector<Tensor> tensors,
+      const std::vector<std::vector<size_t>> &group_indices,
+      const std::vector<bool> &is_sparse_gradient,
+      std::shared_ptr<distributed::ProcessGroup> process_group,
+      const std::vector<size_t> &group_size_limits,
+      bool find_unused_parameters);
+
+  virtual ~EagerReducer() {}
+
+  std::shared_ptr<egr::GradNodeBase> GetGradNodeFromTensor(Tensor *tensor);
+
+  void InitializeGroups(const std::vector<std::vector<size_t>> &group_indices);
+  void InitializeDenseGroups(const std::vector<size_t> &tensor_indices_,
+                             EagerGroup *p_group);
+  void PrepareForBackward(const std::vector<Tensor> &outputs);
+  void AddDistHook(size_t var_index);
+  void MarkVarReady(const size_t var_index, const bool is_used_var);
+  void MarkGroupReady(const size_t group_index);
+  void FusedAllReduceSchedule(EagerGroup *group, const int curr_group_index);
+
+ private:
+  std::vector<Tensor> tensors_;
+  std::vector<std::vector<size_t>> group_indices_;
+  std::vector<bool> is_sparse_gradient_;
+  std::shared_ptr<distributed::ProcessGroup> process_group_;
+  std::vector<size_t> group_size_limits_;
+  bool find_unused_vars_each_step_;
+
+  std::vector<EagerGroup> groups_;
+  std::vector<TensorLocator> variable_locators_;
+  PlaceType place_;
+  platform::Place inner_place_;
+  size_t next_group_ = 0;
+  int64_t nranks_ = -1;
+  std::vector<std::shared_ptr<paddle::distributed::ProcessGroup::Task>> tasks_;
+
+  bool grad_need_hooks_{false};
+
+  std::vector<bool> vars_marked_ready_;
+  std::vector<int> local_used_vars_;
+};
 
 }  //  namespace distributed
 }  //  namespace paddle
diff --git a/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_run_op_test.cc b/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_run_op_test.cc
index 18920d06f38543cc3f7aeb045e7c3058143e006e..ba039385a74ba45aa1f33ba38138d8e5213f2e00 100644
--- a/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_run_op_test.cc
+++ b/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_run_op_test.cc
@@ -24,10 +24,14 @@ limitations under the License. */
 #include "paddle/fluid/distributed/fleet_executor/task_node.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/program_desc.h"
+#include "paddle/phi/core/kernel_registry.h"
 
 USE_OP_ITSELF(elementwise_add);
 USE_OP_ITSELF(fill_constant);
 
+PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT);
+
 namespace paddle {
 namespace distributed {
 
diff --git a/paddle/fluid/distributed/ps/table/depends/initializers.h b/paddle/fluid/distributed/ps/table/depends/initializers.h
index f46e659a88babb07918d02f1e05859829895f2bf..5ac0c08f97d76f6bc1cb77f1f6cd0da77be2385f 100644
--- a/paddle/fluid/distributed/ps/table/depends/initializers.h
+++ b/paddle/fluid/distributed/ps/table/depends/initializers.h
@@ -23,7 +23,6 @@
 #include "gflags/gflags.h"
 
 #include "paddle/fluid/framework/generator.h"
-
 #include "paddle/fluid/operators/truncated_gaussian_random_op.h"
 
 namespace paddle {
@@ -118,9 +117,13 @@ class TruncatedGaussianInitializer : public Initializer {
     seed_ = static_cast<unsigned int>(std::stoi(attrs[1]));
     mean_ = std::stof(attrs[2]);
     std_ = std::stof(attrs[3]);
-
-    std::uniform_real_distribution<float> dist_(
-        std::numeric_limits<float>::min(), 1.0);
+    auto normal_cdf = [](float x) {
+      return (1.0 + std::erf(x / std::sqrt(2.0))) / 2.0;
+    };
+    float a_normal_cdf = normal_cdf((-2.0 - mean_) / std_);
+    float b_normal_cdf = normal_cdf((2.0 - mean_) / std_);
+    std::uniform_real_distribution<float> dist_(2.0 * a_normal_cdf - 1.0,
+                                                2.0 * b_normal_cdf - 1.0);
     random_engine_ = framework::GetCPURandomEngine(seed_);
   }
 
diff --git a/paddle/fluid/distributed/store/store.h b/paddle/fluid/distributed/store/store.h
index 2673314d222d2b32e42c42a3a94df71a1887914a..7b4ae7e70ff6f033e038f1c5214f46e0876257d2 100644
--- a/paddle/fluid/distributed/store/store.h
+++ b/paddle/fluid/distributed/store/store.h
@@ -25,13 +25,26 @@ namespace distributed {
 
 class Store {
  public:
-  Store() = delete;
+  Store() : _timeout(tcputils::kNoTimeout) {}
   explicit Store(const std::chrono::seconds& timeout) : _timeout(timeout) {}
   virtual ~Store() = default;
 
-  virtual int64_t add(const std::string& key, int64_t value) = 0;
-  virtual std::vector<uint8_t> get(const std::string& key) = 0;
-  virtual void wait(const std::string& key) = 0;
+  virtual int64_t add(const std::string& key, int64_t value) {
+    PADDLE_THROW(platform::errors::InvalidArgument(
+        "Implement the add method in the subclass."));
+  }
+  virtual std::vector<uint8_t> get(const std::string& key) {
+    PADDLE_THROW(platform::errors::InvalidArgument(
+        "Implement the add method in the subclass."));
+  }
+  virtual void wait(const std::string& key) {
+    PADDLE_THROW(platform::errors::InvalidArgument(
+        "Implement the add method in the subclass."));
+  }
+  virtual void set(const std::string& key, const std::vector<uint8_t>& value) {
+    PADDLE_THROW(platform::errors::InvalidArgument(
+        "Implement the add method in the subclass."));
+  }
 
   virtual const std::chrono::seconds& timeout() const { return _timeout; }
 
diff --git a/paddle/fluid/distributed/store/tcp_store.cc b/paddle/fluid/distributed/store/tcp_store.cc
index de85ac0d910e93257a308052ca1fcf193680a183..b0d5add49565ffb19762778ddd44a388b140c0ee 100644
--- a/paddle/fluid/distributed/store/tcp_store.cc
+++ b/paddle/fluid/distributed/store/tcp_store.cc
@@ -27,11 +27,13 @@ namespace detail {
 
 constexpr int INFTIME = -1;
 
-std::unique_ptr<MasterDaemon> MasterDaemon::start(SocketType socket) {
-  return std::make_unique<MasterDaemon>(socket);
+std::unique_ptr<MasterDaemon> MasterDaemon::start(SocketType socket,
+                                                  int nranks) {
+  return std::make_unique<MasterDaemon>(socket, nranks);
 }
 
-MasterDaemon::MasterDaemon(SocketType socket) : _listen_socket(socket) {
+MasterDaemon::MasterDaemon(SocketType socket, int nranks)
+    : _listen_socket(socket), _nranks(nranks) {
   _background_thread = std::thread{&MasterDaemon::run, this};
 }
 
@@ -64,27 +66,35 @@ void MasterDaemon::_do_add(SocketType socket) {
   tcputils::send_value<int64_t>(socket, new_value);
 }
 
+void MasterDaemon::_do_set(SocketType socket) {
+  VLOG(3) << "MasterDaemon::_do_set";
+  std::string key = tcputils::receive_string(socket);
+  auto value = tcputils::receive_vector<uint8_t>(socket);
+  _store[key] = value;
+}
+
 void MasterDaemon::_do_get(SocketType socket) {
+  VLOG(3) << "MasterDaemon::_do_get";
   std::string key = tcputils::receive_string(socket);
   auto iter = _store.find(key);
   PADDLE_ENFORCE_NE(
       iter, _store.end(),
       platform::errors::InvalidArgument("Key %s not found in TCPStore.", key));
   std::vector<uint8_t> value = iter->second;
-  VLOG(3) << "TCPStore: value ("
-          << std::stoll(std::string(reinterpret_cast<char*>(value.data()),
-                                    value.size()))
-          << ") for key (" << key << ").";
   tcputils::send_vector<uint8_t>(socket, value);
 }
 
 void MasterDaemon::_do_stop(SocketType socket) {
+  VLOG(3) << "MasterDaemon::_do_stop";
   ReplyType value = ReplyType::STOP_WAIT;
-  _stop = true;
   tcputils::send_value<ReplyType>(socket, value);
+  if (--_nranks == 0) {
+    _stop = true;
+  }
 }
 
 void MasterDaemon::_do_wait(SocketType socket) {
+  VLOG(3) << "MasterDaemon::_do_wait";
   std::string key = tcputils::receive_string(socket);
   auto iter = _store.find(key);
   auto reply = ReplyType::STOP_WAIT;
@@ -126,35 +136,47 @@ void MasterDaemon::run() {
     }
 
     for (size_t i = 1; i < fds.size(); i++) {
-      if (fds[i].revents == 0) {
-        continue;
-      }
-
-      Command command = tcputils::receive_value<Command>(fds[i].fd);
-      VLOG(3) << "TCPStore: recv command: " << static_cast<int>(command) << ".";
-
-      switch (command) {
-        case Command::ADD:
-          _do_add(fds[i].fd);
-          break;
-        case Command::GET:
-          _do_get(fds[i].fd);
-          break;
-        case Command::WAIT:
-          _do_wait(fds[i].fd);
-          break;
-        case Command::STOP:
-          _do_stop(fds[i].fd);
-          break;
+      try {
+        if (fds[i].revents == 0) {
+          continue;
+        }
+
+        Command command = tcputils::receive_value<Command>(fds[i].fd);
+        VLOG(3) << "TCPStore: recv command: " << static_cast<int>(command)
+                << ".";
+
+        switch (command) {
+          case Command::ADD:
+            _do_add(fds[i].fd);
+            break;
+          case Command::GET:
+            _do_get(fds[i].fd);
+            break;
+          case Command::SET:
+            _do_set(fds[i].fd);
+            break;
+          case Command::WAIT:
+            _do_wait(fds[i].fd);
+            break;
+          case Command::STOP:
+            _do_stop(fds[i].fd);
+            break;
+          default:
+            VLOG(0) << "Unknow command: " << static_cast<int>(command);
+            exit(-1);
+        }
+      } catch (...) {
+        fds.erase(fds.begin() + i);
+        _sockets.erase(_sockets.begin() + i - 1);
       }
     }
   }
 }
 
-std::unique_ptr<TCPServer> TCPServer::create(uint16_t port) {
+std::unique_ptr<TCPServer> TCPServer::create(uint16_t port, int nranks) {
   int socket = tcputils::tcp_listen("", std::to_string(port), AF_INET);
   auto server = std::make_unique<TCPServer>();
-  server->_master_daemon = MasterDaemon::start(socket);
+  server->_master_daemon = MasterDaemon::start(socket, nranks);
   return server;
 }
 
@@ -200,7 +222,7 @@ TCPStore::TCPStore(std::string host, uint16_t port, bool is_master,
                    size_t num_workers, std::chrono::seconds timeout)
     : Store(timeout), _is_master(is_master), _num_workers(num_workers) {
   if (_is_master) {
-    _server = detail::TCPServer::create(port);
+    _server = detail::TCPServer::create(port, num_workers);
   }
 
   _client = detail::TCPClient::connect(host, port);
@@ -213,36 +235,41 @@ void TCPStore::waitWorkers() {
   }
   add(_init_key, 1);
 
-  if (_server) {
-    auto begin = std::chrono::steady_clock::now();
-    do {
-      auto value = get(_init_key);
-      int completed = std::stoi(std::string(value.begin(), value.end()));
-      VLOG(3) << completed << " worker ready, total " << _num_workers;
-      if (completed >= _num_workers) {
-        break;
-      }
-      const auto elapsed = std::chrono::duration_cast<std::chrono::seconds>(
-          std::chrono::steady_clock::now() - begin);
-
-      std::this_thread::sleep_for(std::chrono::milliseconds(100));
-      if (_timeout != tcputils::kNoTimeout && elapsed > _timeout) {
-        PADDLE_ENFORCE_EQ(
-            completed, _num_workers,
-            platform::errors::InvalidArgument(
-                "TCPStore timeouted and not all workers got ready."));
-      }
-    } while (true);
-  }
+  auto begin = std::chrono::steady_clock::now();
+  do {
+    auto value = get(_init_key);
+    int completed = std::stoi(std::string(value.begin(), value.end()));
+    VLOG(3) << completed << " worker ready, total " << _num_workers;
+    if (completed >= _num_workers) {
+      break;
+    }
+    const auto elapsed = std::chrono::duration_cast<std::chrono::seconds>(
+        std::chrono::steady_clock::now() - begin);
+
+    std::this_thread::sleep_for(std::chrono::milliseconds(100));
+    if (_timeout != tcputils::kNoTimeout && elapsed > _timeout) {
+      PADDLE_ENFORCE_EQ(
+          completed, _num_workers,
+          platform::errors::InvalidArgument(
+              "TCPStore timeouted and not all workers got ready."));
+    }
+  } while (true);
   VLOG(3) << "TCPStore initialized.";
 }
 
 int64_t TCPStore::add(const std::string& key, int64_t value) {
+  VLOG(3) << "TCPStore add.";
   _client->send_command_for_key(Command::ADD, _key_prefix + key);
   _client->send_value<std::int64_t>(value);
   return _client->receive_value<std::int64_t>();
 }
 
+void TCPStore::set(const std::string& key, const std::vector<uint8_t>& value) {
+  VLOG(3) << "TCPStore set.";
+  _client->send_command_for_key(Command::SET, _key_prefix + key);
+  _client->send_vector<std::uint8_t>(value);
+}
+
 std::vector<uint8_t> TCPStore::get(const std::string& key) {
   wait(key);
   _client->send_command_for_key(Command::GET, _key_prefix + key);
@@ -252,6 +279,7 @@ std::vector<uint8_t> TCPStore::get(const std::string& key) {
 
 void TCPStore::wait(const std::string& key) {
   ReplyType reply;
+  VLOG(3) << "TCPStore wait.";
   do {
     _client->send_command_for_key(Command::WAIT, _key_prefix + key);
 
@@ -261,6 +289,7 @@ void TCPStore::wait(const std::string& key) {
 }
 
 TCPStore::~TCPStore() {
+  VLOG(3) << "~TCPStore";
   _client->send_command_for_key(Command::STOP, "");
   ReplyType ret = _client->receive_value<ReplyType>();
   PADDLE_ENFORCE_EQ(ret, ReplyType::STOP_WAIT,
diff --git a/paddle/fluid/distributed/store/tcp_store.h b/paddle/fluid/distributed/store/tcp_store.h
index cd706dd6640acf5e0b5b3714175dac7a6cecb25a..17c1d8ea30a421f04d054d59ac93c8c60406ef68 100644
--- a/paddle/fluid/distributed/store/tcp_store.h
+++ b/paddle/fluid/distributed/store/tcp_store.h
@@ -27,15 +27,16 @@ namespace paddle {
 namespace distributed {
 
 enum class ReplyType { WAITING, STOP_WAIT };
-enum class Command { ADD, GET, WAIT, STOP };
+enum class Command { ADD, GET, SET, WAIT, STOP };
 
 namespace detail {
 
 class MasterDaemon {
  public:
-  static std::unique_ptr<MasterDaemon> start(SocketType listen_socket);
+  static std::unique_ptr<MasterDaemon> start(SocketType listen_socket,
+                                             int nranks);
   MasterDaemon() = delete;
-  explicit MasterDaemon(SocketType listen_socket);
+  explicit MasterDaemon(SocketType listen_socket, int nranks);
   ~MasterDaemon();
 
  private:
@@ -43,18 +44,20 @@ class MasterDaemon {
   void _do_add(SocketType socket);
   void _do_wait(SocketType socket);
   void _do_get(SocketType socket);
+  void _do_set(SocketType socket);
   void _do_stop(SocketType socket);
   SocketType _listen_socket;
   std::vector<SocketType> _sockets;
   std::unordered_map<std::string, std::vector<uint8_t>> _store;
   std::thread _background_thread{};
+  int _nranks;
   bool _stop = false;
 };
 
 class TCPServer {
  public:
   TCPServer() = default;
-  static std::unique_ptr<TCPServer> create(std::uint16_t port);
+  static std::unique_ptr<TCPServer> create(std::uint16_t port, int nranks);
 
  private:
   std::unique_ptr<MasterDaemon> _master_daemon;
@@ -97,6 +100,7 @@ class TCPStore : public Store {
   int64_t add(const std::string& key, int64_t value) override;
   std::vector<uint8_t> get(const std::string& key) override;
   void wait(const std::string& key) override;
+  void set(const std::string& key, const std::vector<uint8_t>& value) override;
 
  private:
   void waitWorkers();
diff --git a/paddle/fluid/distributed/store/tcp_utils.cc b/paddle/fluid/distributed/store/tcp_utils.cc
index d0561d0b9a9c5b01c32620e72d21ed562e42637e..a28cba288333d7f1c2a705049c29b59f43a70cc5 100644
--- a/paddle/fluid/distributed/store/tcp_utils.cc
+++ b/paddle/fluid/distributed/store/tcp_utils.cc
@@ -46,9 +46,10 @@ void close_socket(SocketType socket) {
   hints.ai_socktype = SOCK_STREAM;
 
   const char* node = host.empty() ? nullptr : host.c_str();
+  const char* port_cstr = port.empty() ? nullptr : port.c_str();
 
   int n;
-  n = ::getaddrinfo(node, port.c_str(), &hints, &res);
+  n = ::getaddrinfo(node, port_cstr, &hints, &res);
   const char* gai_err = ::gai_strerror(n);
   const char* proto =
       (family == AF_INET ? "IPv4" : family == AF_INET6 ? "IPv6" : "");
diff --git a/paddle/fluid/eager/CMakeLists.txt b/paddle/fluid/eager/CMakeLists.txt
index 8cb69caf66369655ce751163420b3fcec80dd833..691a381405e9a792d1ee0f256647405a3739e9d8 100644
--- a/paddle/fluid/eager/CMakeLists.txt
+++ b/paddle/fluid/eager/CMakeLists.txt
@@ -1,6 +1,7 @@
-set(eager_deps phi phi_api hook_utils tensor_utils utils global_utils backward phi_tensor tracer layer autograd_meta grad_node_info grad_tensor_holder accumulation_node)
+set(eager_deps phi_api hook_utils tensor_utils utils global_utils backward phi_tensor tracer layer autograd_meta grad_node_info grad_tensor_holder accumulation_node custom_operator_node)
+
 set(fluid_deps tracer layer proto_desc operator op_registry variable_helper memcpy)
-set(generated_deps dygraph_function dygraph_node)
+set(generated_deps final_dygraph_function final_dygraph_node dygraph_function dygraph_node)
 
 if(NOT ((NOT WITH_PYTHON) AND ON_INFER))
     message("Performing Eager Dygraph Auto Code Generation")
@@ -9,12 +10,14 @@ endif()
 
 add_subdirectory(api)
 add_subdirectory(accumulation)
+add_subdirectory(custom_operator)
+
 
-cc_library(grad_node_info SRCS grad_node_info.cc DEPS phi phi_api)
+cc_library(grad_node_info SRCS grad_node_info.cc DEPS phi_api phi_tensor)
 cc_library(grad_tensor_holder SRCS grad_tensor_holder.cc DEPS grad_node_info gradient_accumulator)
 
-cc_library(autograd_meta SRCS autograd_meta.cc DEPS phi phi_api)
-cc_library(utils SRCS utils.cc DEPS phi phi_api global_utils layer proto_desc operator op_registry variable_helper memcpy scale_op autograd_meta hook_utils)
+cc_library(autograd_meta SRCS autograd_meta.cc DEPS phi_api phi_tensor)
+cc_library(utils SRCS utils.cc DEPS phi_api phi_tensor global_utils layer proto_desc operator op_registry variable_helper memcpy scale_op autograd_meta hook_utils)
 cc_library(backward SRCS backward.cc DEPS grad_tensor_holder utils autograd_meta grad_node_info)
 
 add_subdirectory(tests)
diff --git a/paddle/fluid/eager/accumulation/accumulation_node.h b/paddle/fluid/eager/accumulation/accumulation_node.h
index 734cabdc3dc914349e2ad30b657bfb6542a7472a..07fa40165167ce2352018c0e1b1cb08222d5a181 100644
--- a/paddle/fluid/eager/accumulation/accumulation_node.h
+++ b/paddle/fluid/eager/accumulation/accumulation_node.h
@@ -24,11 +24,14 @@ class GradNodeAccumulation : public GradNodeBase {
  public:
   // Constructor: configure fwd input tensors to grad node
   explicit GradNodeAccumulation(AutogradMeta* meta) : GradNodeBase(1, 1) {
+    VLOG(6) << "Construct GradNodeAccumulation";
     weak_grad_ = meta->WeakGrad();
     SetDefaultGradInOutMeta();
   }
 
-  ~GradNodeAccumulation() override = default;
+  ~GradNodeAccumulation() override {
+    VLOG(6) << "Destruct GradNodeAccumulation";
+  }
 
   // Functor: perform backward computations
   virtual std::vector<std::vector<paddle::experimental::Tensor>> operator()(
diff --git a/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h b/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h
index c0150a1730d52b3410ba4ea0d31674fbfed596ae..247fde6ed1f869542969b068cdae9f59cedd732a 100644
--- a/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h
+++ b/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h
@@ -46,7 +46,7 @@ class GradNodeScale : public GradNodeBase {
       const std::vector<paddle::experimental::Tensor>& tensors);
 
   void SetAttributes_scale(float scale);
-
+  std::string name() override { return ""; }
   // Members: define fwd input tensors
   // For Scale there is no fwd input tensor needed
  private:
diff --git a/paddle/fluid/eager/api/utils/global_utils.h b/paddle/fluid/eager/api/utils/global_utils.h
index 00578d9a359a3b8d57148efc959de553e811f541..a9a62fcd50e7a0648e695d1f60d52d3f936c53ed 100644
--- a/paddle/fluid/eager/api/utils/global_utils.h
+++ b/paddle/fluid/eager/api/utils/global_utils.h
@@ -18,7 +18,7 @@
 #include <atomic>
 #include <memory>
 #include "paddle/fluid/imperative/tracer.h"
-
+#include "paddle/phi/api/ext/op_meta_info.h"
 namespace egr {
 
 class UniqueNameGenerator {
@@ -70,6 +70,21 @@ class Controller {
 
   void SetInEagerMode(bool in_eager_mode) { in_eager_mode_ = in_eager_mode; }
 
+  const std::unordered_map<std::string, std::vector<paddle::OpMetaInfo>>&
+  GetOpMetaInfoMap() {
+    return op_meta_info_map_;
+  }
+
+  void MergeOpMetaInfoMap(const std::unordered_map<
+                          std::string, std::vector<paddle::OpMetaInfo>>& map) {
+    op_meta_info_map_.insert(map.begin(), map.end());
+  }
+
+  std::unordered_map<std::string, std::vector<std::unordered_map<int, int>>>&
+  GetCustomEdgesSlotMap() {
+    return custom_edges_slot_map_;
+  }
+
  private:
   Controller() = default;
   static Controller* controller_;
@@ -77,6 +92,11 @@ class Controller {
       new paddle::imperative::Tracer()};
   // TODO(jiabin): remove when we don't need imperative.
   bool in_eager_mode_{false};
+  std::unordered_map<std::string, std::vector<paddle::OpMetaInfo>>
+      op_meta_info_map_;
+  /* op_type : {{grad_outputs}, {grad_inputs}, {input}, {output}, {attrs}}*/
+  std::unordered_map<std::string, std::vector<std::unordered_map<int, int>>>
+      custom_edges_slot_map_;
   DISABLE_COPY_AND_ASSIGN(Controller);
 };
 
diff --git a/paddle/fluid/eager/auto_code_generator/eager_generator.cc b/paddle/fluid/eager/auto_code_generator/eager_generator.cc
index 102fad56373803a19f07afc7dda72e9704ac83d5..6a2e5e7ac6cd75068bba4e9b675ab67588c38366 100644
--- a/paddle/fluid/eager/auto_code_generator/eager_generator.cc
+++ b/paddle/fluid/eager/auto_code_generator/eager_generator.cc
@@ -47,6 +47,9 @@ std::unordered_map<std::string, std::vector<std::string>>
 static std::unordered_map<std::string, paddle::framework::AttributeMap>
     operators_with_attrs = {};
 
+/* --- Black Ops list that's NO NEED to apply code generation --- */
+static std::unordered_set<std::string> black_ops_list = {"run_program"};
+
 static std::string LegalizeVariableName(const std::string& var_name) {
   std::string ret = var_name;
   std::replace(ret.begin(), ret.end(), '-', '_');  // replace all '-' to '_'
@@ -73,12 +76,6 @@ static bool IgnoreGradAttribute(const std::string& op_type,
 }
 
 static void PrepareAttrMapForOps() {
-  // Handle "run_program_op"
-  static framework::ProgramDesc fake_prog;
-  operators_with_attrs["run_program"] = {};
-  operators_with_attrs["run_program"]["global_block"] =
-      fake_prog.MutableBlock(0);
-
   // Handle "fused_elemwise_add_activation"
   std::vector<std::string> functor_list = {"a", "b"};
   operators_with_attrs["fused_elemwise_add_activation"] = {};
@@ -996,6 +993,29 @@ static std::string GenerateGradNodeCreationContent(
   // then generate: "egr::AutogradMeta* p_autograd_out =
   // egr::EagerUtils::autograd_meta("op_proto->outputs()[0].name()")"
   std::string get_autograd_meta_str = "  // Prepare Autograd Meta \n";
+  // If single output slotname and not duplicable,
+  // then generate: "egr::AutogradMeta* p_autograd_out =
+  // egr::EagerUtils::autograd_meta("op_proto.outputs()[0].name()")"
+  for (const proto::OpProto::Var& output : out_vars) {
+    const std::string& output_name = output.name();
+    const std::string& output_autograd_name = "p_autograd_" + output_name;
+
+    if (output.duplicable()) {
+      const char* GET_MULTI_AUTOGRAD_META_TEMPLATE =
+          "  std::vector<egr::AutogradMeta*> %s = "
+          "egr::EagerUtils::autograd_meta(&%s);\n";
+      get_autograd_meta_str += paddle::string::Sprintf(
+          GET_MULTI_AUTOGRAD_META_TEMPLATE, output_autograd_name, output_name);
+    } else {
+      const char* GET_SINGLE_AUTOGRAD_META_TEMPLATE =
+          "  egr::AutogradMeta* %s = "
+          "egr::EagerUtils::autograd_meta(&%s);\n";
+      get_autograd_meta_str += paddle::string::Sprintf(
+          GET_SINGLE_AUTOGRAD_META_TEMPLATE, output_autograd_name, output_name);
+    }
+  }
+  VLOG(6) << "Generated outputs autograd_meta";
+
   for (const proto::OpProto::Var& input : in_vars) {
     const std::string& input_name = input.name();
     const std::string& input_autograd_name = "p_autograd_" + input_name;
@@ -1024,31 +1044,6 @@ static std::string GenerateGradNodeCreationContent(
   }
   VLOG(6) << "Generated inputs autograd_meta";
 
-  // If single output slotname and not duplicable,
-  // then generate: "egr::AutogradMeta* p_autograd_out =
-  // egr::EagerUtils::autograd_meta("op_proto.outputs()[0].name()")"
-  for (const proto::OpProto::Var& output : out_vars) {
-    const std::string& output_name = output.name();
-    const std::string& output_autograd_name = "p_autograd_" + output_name;
-
-    // Skip Intermediate Tensor
-
-    if (output.duplicable()) {
-      const char* GET_MULTI_AUTOGRAD_META_TEMPLATE =
-          "  std::vector<egr::AutogradMeta*> %s = "
-          "egr::EagerUtils::autograd_meta(&%s);\n";
-      get_autograd_meta_str += paddle::string::Sprintf(
-          GET_MULTI_AUTOGRAD_META_TEMPLATE, output_autograd_name, output_name);
-    } else {
-      const char* GET_SINGLE_AUTOGRAD_META_TEMPLATE =
-          "  egr::AutogradMeta* %s = "
-          "egr::EagerUtils::autograd_meta(&%s);\n";
-      get_autograd_meta_str += paddle::string::Sprintf(
-          GET_SINGLE_AUTOGRAD_META_TEMPLATE, output_autograd_name, output_name);
-    }
-  }
-  VLOG(6) << "Generated outputs autograd_meta";
-
   std::string prepare_autograd_meta_str = "";
   prepare_autograd_meta_str += get_autograd_meta_str;
   prepare_autograd_meta_str += "\n";
@@ -1204,11 +1199,12 @@ static std::string GenerateGradNodeCreationContent(
       "  %s"
       "  bool require_any_grad = egr::EagerUtils::ComputeRequireGrad(%s);\n"
       "  if(require_any_grad) {\n"
+      "    VLOG(6) << \" Construct Grad for %s \"; \n"
       "    egr::EagerUtils::PassStopGradient(%s);\n"
       "%s\n  }";
   std::string grad_node_creation_body_str = paddle::string::Sprintf(
       GRAD_NODE_CREATION_TEMPLATE, prepare_autograd_meta_str,
-      compute_require_grad_args, pass_stop_gradient_args,
+      compute_require_grad_args, op_type, pass_stop_gradient_args,
       grad_node_creation_str);
 
   return grad_node_creation_body_str;
@@ -1557,9 +1553,23 @@ static std::pair<std::string, std::string> GenerateForwardFunctionContents(
   core_ops_returns_info[op_type] = return_contents;
 
   // [Generation] ComputeRequireGrad -> GradNodeCreation
+
   if (!bwd_info.GenerateForwardOnly()) {
     std::string grad_node_creation_body_str =
         GenerateGradNodeCreationContent(fwd_info, bwd_info);
+
+    // Add event record
+    std::string event_name = op_type + " node_creation";
+    const char* NODE_CREATION_TEMPLATE =
+        "{\n"
+        "   paddle::platform::RecordEvent node_creation_record_event(\"%s\", "
+        "paddle::platform::TracerEventType::Operator, 1);\n"
+        "   %s\n"
+        "}";
+
+    grad_node_creation_body_str = paddle::string::Sprintf(
+        NODE_CREATION_TEMPLATE, event_name, grad_node_creation_body_str);
+
     generated_function_body += grad_node_creation_body_str;
     generated_function_body += "\n";
 
@@ -1618,10 +1628,20 @@ static std::pair<std::string, std::string> GenerateForwardFunctionContents(
     if ((*iter) == ',') dygraph_function_args_str.erase(iter);
   }
 
-  const char* FWD_FUNCTION_TEMPLATE = "%s %s(%s) {\n\n%s\n}\n\n";
+  const char* DYGRAPH_FUNCTION_EVENT_RECORD_FUNCTION_TEMPLATE =
+      "paddle::platform::RecordEvent dygraph_entrance_record_event(\"%s\", "
+      "paddle::platform::TracerEventType::Operator, 1);";
+  std::string event_name = op_type + " dygraph";
+  std::string fwd_record_event_str = paddle::string::Sprintf(
+      DYGRAPH_FUNCTION_EVENT_RECORD_FUNCTION_TEMPLATE, event_name);
+  const char* FWD_FUNCTION_TEMPLATE =
+      "%s %s(%s) {\n\n"
+      " %s\n"
+      " %s\n"
+      "}\n\n";
   std::string fwd_function_str = paddle::string::Sprintf(
       FWD_FUNCTION_TEMPLATE, function_proto_return_type_str, function_name,
-      dygraph_function_args_str, generated_function_body);
+      dygraph_function_args_str, fwd_record_event_str, generated_function_body);
 
   // [Generation] Generate forward functions header
   const char* FWD_HEADER_TEMPLATE = "%s %s(%s);\n";
@@ -2083,22 +2103,24 @@ static std::string GenerateGradNodeHeaderContents(
   const char* GRAD_NODE_TEMPLATE =
       "class GradNode%s : public egr::GradNodeBase {\n"
       " public:\n"
-      "  GradNode%s() : egr::GradNodeBase() {}\n"
+      "  GradNode%s() : egr::GradNodeBase() { VLOG(7) << \" Construct "
+      "GradNode%s \"; }\n"
       "  GradNode%s(size_t bwd_in_slot_num, size_t bwd_out_slot_num) : "
-      "egr::GradNodeBase(bwd_in_slot_num, bwd_out_slot_num) {}\n"
-      "  ~GradNode%s() override = default;\n"
+      "egr::GradNodeBase(bwd_in_slot_num, bwd_out_slot_num) { VLOG(7) << \" "
+      "Construct GradNode%s \"; }\n"
+      "  ~GradNode%s() override { VLOG(6) << \" Destruct GradNode%s \"; }\n"
       "\n"
       "  virtual std::vector<std::vector<paddle::experimental::Tensor>> "
       "operator()(const "
       "std::vector<std::vector<paddle::experimental::Tensor>>& grads) "
       "override;\n"
       "\n"
+      "  std::string name() override { return \" GradNode%s \"; } \n "
+      "\n"
       "  // SetX, SetY, ...\n"
       "%s\n"
       "  // SetAttrMap\n"
       "%s\n"
-      "  std::string name() { return \"GradNode%s\"; }\n"
-      "\n"
       " private:\n"
       "   // TensorWrappers\n"
       "%s\n"
@@ -2195,8 +2217,8 @@ static std::string GenerateGradNodeHeaderContents(
   VLOG(6) << "Generated TensorWrapper";
 
   std::string grad_node_str = paddle::string::Sprintf(
-      GRAD_NODE_TEMPLATE, op_type, op_type, op_type, op_type,
-      set_tensor_wrappers_str, set_attr_map_str, op_type,
+      GRAD_NODE_TEMPLATE, op_type, op_type, op_type, op_type, op_type, op_type,
+      op_type, op_type, set_tensor_wrappers_str, set_attr_map_str,
       tensor_wrapper_members_str, attr_members_str);
 
   return grad_node_str;
@@ -2242,8 +2264,9 @@ static void GenerateForwardDygraphFile(const std::string& forward_cc_path,
       "\"paddle/fluid/eager/api/generated/fluid_generated/"
       "dygraph_forward_api.h\"\n"
       "#include "
-      "\"paddle/fluid/eager/api/generated/fluid_generated/nodes/nodes.h\"\n\n"
-      "#include \"paddle/fluid/eager/api/utils/global_utils.h\"\n";
+      "\"paddle/fluid/eager/api/generated/fluid_generated/nodes/nodes.h\"\n"
+      "#include \"paddle/fluid/eager/api/utils/global_utils.h\"\n"
+      "#include \"paddle/fluid/platform/profiler/event_tracing.h\"\n\n";
   std::string forward_cc_include_str =
       paddle::string::Sprintf(FORWARD_INCLUDE_TEMPLATE);
   std::ofstream forward_cc_stream(forward_cc_path, std::ios::out);
@@ -2348,6 +2371,9 @@ static void DygraphCodeGeneration(const std::string& output_dir) {
 
     if (!CheckOpProto(op_proto)) continue;
     const std::string& op_type = op_proto->type();
+    if (black_ops_list.count(op_type)) {
+      continue;
+    }
 
     /* ----------------------------- */
     /* ---- Collect Information ---- */
diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/CMakeLists.txt b/paddle/fluid/eager/auto_code_generator/final_state_generator/CMakeLists.txt
index c6bca01205e19c58d5924f4e9d60bb76164fee2b..53af6c1048d2454b1e9f375b837103930026ae54 100644
--- a/paddle/fluid/eager/auto_code_generator/final_state_generator/CMakeLists.txt
+++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/CMakeLists.txt
@@ -1,5 +1,5 @@
-set(api_yaml_path "${PADDLE_SOURCE_DIR}/python/paddle/utils/code_gen/api.yaml")
-set(backward_yaml_path "${PADDLE_SOURCE_DIR}/python/paddle/utils/code_gen/backward.yaml")
+set(api_yaml_path "${PADDLE_SOURCE_DIR}/python/paddle/utils/code_gen/api.yaml,${PADDLE_SOURCE_DIR}/python/paddle/utils/code_gen/sparse_api.yaml")
+set(backward_yaml_path "${PADDLE_SOURCE_DIR}/python/paddle/utils/code_gen/backward.yaml,${PADDLE_SOURCE_DIR}/python/paddle/utils/code_gen/sparse_bw_api.yaml")
 set(tmp_forwards_cc_path "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/eager_generated/forwards/tmp_dygraph_functions.cc")
 set(tmp_forwards_h_path "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/eager_generated/forwards/tmp_dygraph_functions.h")
 set(tmp_nodes_cc_path "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/eager_generated/backwards/tmp_nodes.cc")
diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py
index 02183e2ca5ce9f0996017eb7df59ee716b0f1ae2..656418a05ad6d04bc19838c97d86db9cda19c1c6 100644
--- a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py
+++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py
@@ -23,15 +23,18 @@ core_ops_returns_info = {}
 core_ops_args_info = {}
 core_ops_args_type_info = {}
 
+namespace = ""
 
 yaml_types_mapping = {
-    'int' : 'int', 'int32_t' : 'int32_t', 'int64_t' : 'int64_t',  'size_t' : 'size_t', \
-  'float' : 'float', 'double' : 'double', 'bool' : 'bool', \
-  'Backend' : 'Backend', 'DataLayout' : 'DataLayout', 'DataType' : 'DataType', \
-  'int64_t[]' : 'std::vector<int64_t>', 'int[]' : 'std::vector<int>',
+    'int' : 'int', 'int32' : 'int32_t', 'int64' : 'int64_t',  'size_t' : 'size_t', \
+    'float' : 'float', 'double' : 'double', 'bool' : 'bool', \
+    'Backend' : 'paddle::experimental::Backend', 'DataLayout' : 'paddle::experimental::DataLayout', 'DataType' : 'paddle::experimental::DataType', \
+    'int64[]' : 'std::vector<int64_t>', 'int[]' : 'std::vector<int>',
     'Tensor' : 'Tensor',
     'Tensor[]' : 'std::vector<Tensor>',
-    'Tensor[Tensor[]]' : 'std::vector<std::vector<Tensor>>'
+    'Tensor[Tensor[]]' : 'std::vector<std::vector<Tensor>>',
+    'Scalar' : 'paddle::experimental::Scalar',
+    'ScalarArray' : 'paddle::experimental::ScalarArray'
 }
 
 
@@ -123,6 +126,7 @@ def GetAutoGradMetaVectorName(string):
 def ReadFwdFile(filepath):
     f = open(filepath, 'r')
     contents = yaml.load(f, Loader=yaml.FullLoader)
+    f.close()
     return contents
 
 
@@ -131,15 +135,25 @@ def ReadBwdFile(filepath):
     contents = yaml.load(f, Loader=yaml.FullLoader)
     ret = {}
     for content in contents:
-        assert 'backward_api' in content.keys()
-        api_name = content['backward_api']
+        if 'backward_api' in content.keys():
+            api_name = content['backward_api']
+        else:
+            assert False
+
         ret[api_name] = content
+    f.close()
     return ret
 
 
 ######################
 ###  Yaml Parsers  ###
 ######################
+def RemoveSpecialSymbolsInName(string):
+    # Remove any name after '@'
+    ret = string.split("@")[0]
+    return ret
+
+
 def IntermediateValidationCheck(intermediate_outputs, forward_returns_list):
     # intermediate_outputs : [name0, name1, ...]
     # forward_returns_list : [[ret_name, type, orig_pos], ...]
@@ -158,15 +172,19 @@ def IntermediateValidationCheck(intermediate_outputs, forward_returns_list):
 
 def ParseDispensable(string):
     # string: "X, Y"
+    string = RemoveSpecialSymbolsInName(string)
     return [v.strip() for v in string.split(",")]
 
 
 def ParseIntermediate(string):
+    string = RemoveSpecialSymbolsInName(string)
     return [v.strip() for v in string.split(",")]
 
 
 def ParseNoNeedBuffer(string):
     # string: "x, y"
+    string = RemoveSpecialSymbolsInName(string)
+
     no_need_buffer_set = set()
     for name in string.split(","):
         no_need_buffer_set.add(name.strip())
@@ -196,6 +214,8 @@ def ParseYamlArgs(string):
 
         assert arg_type in yaml_types_mapping.keys()
         arg_type = yaml_types_mapping[arg_type]
+
+        arg_name = RemoveSpecialSymbolsInName(arg_name)
         if "Tensor" in arg_type:
             assert default_value is None
             inputs_list.append([arg_name, arg_type, i])
@@ -206,40 +226,32 @@ def ParseYamlArgs(string):
 
 
 def ParseYamlReturns(string):
-    # Example: Tensor, Tensor
-
-    # list = [ ["", ret_type, orig_position], ...]
-    returns_list = []
-
-    returns = [x.strip() for x in string.strip().split(",")]
-    for i in range(len(returns)):
-        ret = returns[i]
-        returns_list.append(["", ret, i])
-
-    return returns_list
-
-
-def ParseYamlReturnsWithName(string):
-    # Example: Tensor(out), Tensor(out1)
+    # Example0: Tensor(out), Tensor(out1)
+    # Example1: Tensor, Tensor
+    # Example2: Tensor[](out), Tensor
 
     # list = [ [ret_name, ret_type, orig_position], ...]
     returns_list = []
 
     returns = [x.strip() for x in string.strip().split(",")]
 
-    atype = r'(.*?)'
-    aname = r'(.*?)'
-    pattern = f'{atype}\({aname}\)'
     for i in range(len(returns)):
         ret = returns[i]
-        m = re.search(pattern, ret)
-        ret_type = m.group(1)
-        ret_name = m.group(2)
+
+        ret_name = ""
+        if "(" in ret and ")" in ret:
+            # Remove trailing ')'
+            ret = ret[:-1]
+            ret_type = ret.split("(")[0].strip()
+            ret_name = ret.split("(")[1].strip()
+        else:
+            ret_type = ret.strip()
 
         assert ret_type in yaml_types_mapping.keys()
         ret_type = yaml_types_mapping[ret_type]
 
         assert "Tensor" in ret_type
+        ret_name = RemoveSpecialSymbolsInName(ret_name)
         returns_list.append([ret_name, ret_type, i])
 
     return returns_list
@@ -260,7 +272,7 @@ def ParseYamlForwardFromBackward(string):
     function_returns = m.group(3)
 
     forward_inputs_list, forward_attrs_list = ParseYamlArgs(function_args)
-    forward_returns_list = ParseYamlReturnsWithName(function_returns)
+    forward_returns_list = ParseYamlReturns(function_returns)
 
     return forward_inputs_list, forward_attrs_list, forward_returns_list
 
@@ -290,7 +302,7 @@ def ParseYamlBackward(args_str, returns_str):
     args_str = re.search(args_pattern, args_str).group(1)
 
     inputs_list, attrs_list = ParseYamlArgs(args_str)
-    returns_list = ParseYamlReturnsWithName(returns_str)
+    returns_list = ParseYamlReturns(returns_str)
 
     return inputs_list, attrs_list, returns_list
 
@@ -516,11 +528,18 @@ def GenerateNodeDeclaration(fwd_api_name, backward_fwd_input_map,
         set_attribute_methods_str += SET_ATTR_METHOD_TEMPLATE.format(
             aname, GetConstReference(atype), aname, saved_attr_name, aname)
 
-        ATTRIBUTE_MEMBER_TEMPLATE = """
-   {} {} = {};
-"""
-        attribute_members_str += ATTRIBUTE_MEMBER_TEMPLATE.format(
-            RemoveConstAndReference(atype), saved_attr_name, default_val)
+        if default_val:
+            ATTRIBUTE_MEMBER_TEMPLATE = """
+       {} {} = {};
+    """
+            attribute_members_str += ATTRIBUTE_MEMBER_TEMPLATE.format(
+                RemoveConstAndReference(atype), saved_attr_name, default_val)
+        else:
+            ATTRIBUTE_MEMBER_TEMPLATE = """
+       {} {};
+    """
+            attribute_members_str += ATTRIBUTE_MEMBER_TEMPLATE.format(
+                RemoveConstAndReference(atype), saved_attr_name)
     # End: SetAttributes & Attribute Members
 
     grad_node_name = GetGradNodeName(fwd_api_name)
@@ -534,7 +553,7 @@ class {} : public egr::GradNodeBase {{
 
   virtual std::vector<std::vector<paddle::experimental::Tensor>> operator()(
       const std::vector<std::vector<paddle::experimental::Tensor>>& grads) override;
-  
+  std::string name() override {{ return \" {} \"; }}
   // SetTensorWrapperX, SetTensorWrapperY, ...
   {}
   // SetAttributes
@@ -549,8 +568,9 @@ class {} : public egr::GradNodeBase {{
 """
     node_declaration_str = NODE_DECLARATION_TEMPLATE.format(
         grad_node_name, grad_node_name, grad_node_name, grad_node_name,
-        set_tensor_wrapper_methods_str, set_attribute_methods_str,
-        tensor_wrapper_members_str, attribute_members_str)
+        grad_node_name, set_tensor_wrapper_methods_str,
+        set_attribute_methods_str, tensor_wrapper_members_str,
+        attribute_members_str)
 
     return node_declaration_str
 
@@ -607,16 +627,23 @@ def GenerateNodeDefinition(fwd_api_name, bwd_api_name, backward_fwd_input_map,
     returns_str += f"return returns;\n"
 
     grad_node_name = GetGradNodeName(fwd_api_name)
+
+    if len(namespace) > 0:
+        grad_api_namespace = f"paddle::experimental::{namespace}"
+    else:
+        grad_api_namespace = f"paddle::experimental"
+
     FUNCTION_TEMPLATE = """
 std::vector<std::vector<paddle::experimental::Tensor>> {}::operator()(const std::vector<std::vector<paddle::experimental::Tensor>>& grads) {{
     // Call grad_api function
-    auto grad_api_returns = paddle::experimental::{}({});
+    auto grad_api_returns = {}::{}({});
     {}
 }}
   """
 
     node_definition_str = FUNCTION_TEMPLATE.format(
-        grad_node_name, bwd_api_name, grad_api_args_str, returns_str)
+        grad_node_name, grad_api_namespace, bwd_api_name, grad_api_args_str,
+        returns_str)
 
     return node_definition_str
 
@@ -670,7 +697,7 @@ def GenerateNodeCreationCodes(
         else:
             # Tuple api_result
             if IsPlainTensorType(rtype):
-                outputs_autograd_meta = f"    egr::AutogradMeta* {output_autograd_meta_name} = egr::EagerUtils::autograd_meta(&api_result[{pos}]);"
+                output_autograd_meta = f"    egr::AutogradMeta* {output_autograd_meta_name} = egr::EagerUtils::autograd_meta(&api_result[{pos}]);"
             else:
                 assert IsVectorTensorType(rtype)
                 output_autograd_meta = f"    std::vector<egr::AutogradMeta*> {output_autograd_meta_vec_name} = egr::EagerUtils::autograd_meta(&api_result[{pos}]);\n"
@@ -698,18 +725,24 @@ def GenerateNodeCreationCodes(
 
     # SetTensorWrappers
     set_tensor_wrappers_list = []
-    for name, (_, is_fwd_input, _) in backward_fwd_input_map.items():
+    for name, (atype, is_fwd_input, pos) in backward_fwd_input_map.items():
         is_optional = (name in optional_inputs)
+
         if is_fwd_input:
             if is_optional:
                 set_tensor_wrappers = f"        if({name}.is_initialized()) grad_node->SetTensorWrapper{name}({name}, true);"
             else:
                 set_tensor_wrappers = f"        grad_node->SetTensorWrapper{name}({name}, true);"
         else:
+            if IsVectorTensorType(atype):
+                tw_name = f"api_result[{pos}]"
+            else:
+                tw_name = f"api_result"
+
             if is_optional:
-                set_tensor_wrappers = f"        if({name}.is_initialized()) grad_node->SetTensorWrapper{name}({name}, false);"
+                set_tensor_wrappers = f"        if({tw_name}.is_initialized()) grad_node->SetTensorWrapper{name}({tw_name}, false);"
             else:
-                set_tensor_wrappers = f"        grad_node->SetTensorWrapper{name}({name}, false);"
+                set_tensor_wrappers = f"        grad_node->SetTensorWrapper{name}({tw_name}, false);"
         set_tensor_wrappers_list.append(set_tensor_wrappers)
     set_tensor_wrappers_str = "\n".join(set_tensor_wrappers_list)
 
@@ -849,7 +882,11 @@ def GenerateForwardDefinition(fwd_api_name, bwd_api_name,
         function_name = fwd_api_name
     else:
         function_name = fwd_api_name + "_intermediate"
-    forward_call_str = f"auto api_result = paddle::experimental::{function_name}({inputs_call_args_str});"
+
+    if len(namespace) > 0:
+        forward_call_str = f"auto api_result = paddle::experimental::{namespace}::{function_name}({inputs_call_args_str});"
+    else:
+        forward_call_str = f"auto api_result = paddle::experimental::{function_name}({inputs_call_args_str});"
 
     # Get return type list & outputs
     num_outputs = len(forward_outputs_position_map.keys()) - len(
@@ -886,8 +923,20 @@ def GenerateForwardDefinition(fwd_api_name, bwd_api_name,
         backward_fwd_input_map, backward_grad_input_map,
         backward_grad_output_map, backward_attrs_list, optional_inputs)
 
+    node_event_name = fwd_api_name + " node_creation"
+    NODE_CREATION_TEMPLATE = """{{\n
+           paddle::platform::RecordEvent node_creation_record_event(\"{}\", paddle::platform::TracerEventType::Operator, 1);\n
+           {}\n
+        }}"""
+    node_creation_str = NODE_CREATION_TEMPLATE.format(node_event_name,
+                                                      node_creation_str)
+
+    dygraph_event_str = f"paddle::platform::RecordEvent dygraph_entrance_record_event(\"{fwd_api_name} dygraph\", paddle::platform::TracerEventType::Operator, 1);"
+
     FORWARD_FUNCTION_TEMPLATE = """
 {} {}({}) {{
+    {}
+
     // Forward API Call
     {}
     
@@ -901,7 +950,7 @@ def GenerateForwardDefinition(fwd_api_name, bwd_api_name,
     forward_function_name = GetForwardFunctionName(fwd_api_name)
     forward_function_str = FORWARD_FUNCTION_TEMPLATE.format(
         returns_type_str, forward_function_name, inputs_args_definition_str,
-        forward_call_str, node_creation_str, returns_str)
+        dygraph_event_str, forward_call_str, node_creation_str, returns_str)
     forward_function_declaration_str = f"{returns_type_str} {forward_function_name}({inputs_args_declaration_str});"
 
     return forward_function_str, forward_function_declaration_str
@@ -999,7 +1048,9 @@ def GenerateNodeCCFile(filepath, node_definition_str):
 #include "paddle/fluid/eager/utils.h"
 #include "paddle/fluid/eager/api/utils/global_utils.h"
 #include "paddle/fluid/eager/api/generated/eager_generated/backwards/nodes.h"
+#include "paddle/fluid/eager/to_static/run_program_op_node.h"
 
+#include "paddle/phi/api/include/sparse_api.h"
 """
     file_contents += node_definition_str
     with open(filepath, 'a') as f:
@@ -1020,10 +1071,13 @@ def GenerateNodeHFile(filepath, node_declaration_str):
 
 def GenerateForwardCCFile(filepath, forward_definition_str):
     file_contents = """
+#include "paddle/phi/api/lib/dygraph_api.h"
 #include "paddle/fluid/eager/api/generated/eager_generated/forwards/dygraph_functions.h"
 #include "paddle/fluid/eager/api/generated/eager_generated/backwards/nodes.h"
 
+#include "paddle/phi/api/include/sparse_api.h"
 #include "paddle/fluid/eager/api/utils/global_utils.h"
+#include "paddle/fluid/platform/profiler/event_tracing.h"
 
 """
 
@@ -1041,6 +1095,7 @@ def GenerateForwardHFile(filepath, forward_function_declaration_str):
 #include "paddle/phi/api/all.h"
 #include "paddle/fluid/eager/utils.h"
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/eager/to_static/run_program_op_func.h"
 
 """
     file_contents += GenerateCoreOpInfoDeclaration()
@@ -1052,134 +1107,184 @@ def GenerateForwardHFile(filepath, forward_function_declaration_str):
 if __name__ == "__main__":
     args = ParseArguments()
 
-    api_yaml_path = args.api_yaml_path
-    backward_yaml_path = args.backward_yaml_path
-
-    fwd_api_list = ReadFwdFile(api_yaml_path)
-    grad_api_dict = ReadBwdFile(backward_yaml_path)
+    api_yaml_paths = args.api_yaml_path.split(",")
+    backward_yaml_paths = args.backward_yaml_path.split(",")
 
     # Generate per Dygraph API
     node_declaration_str = ""
     node_definition_str = ""
     forward_definition_str = ""
     forward_declaration_str = ""
-    for fwd_api in fwd_api_list:
-        # We only generate Ops with grad
-        if 'backward' not in fwd_api.keys():
-            continue
 
-        assert 'api' in fwd_api.keys()
-        assert 'args' in fwd_api.keys()
-        assert 'output' in fwd_api.keys()
-        assert 'backward' in fwd_api.keys()
-
-        no_need_buffer_set = set()
-        if 'no_need_buffer' in fwd_api.keys():
-            no_need_buffer_set = ParseNoNeedBuffer(fwd_api['no_need_buffer'])
-
-        fwd_api_name = fwd_api['api']
-        fwd_args_str = fwd_api['args']
-        fwd_returns_str = fwd_api['output']
-
-        bwd_api_name = fwd_api['backward']
-        assert bwd_api_name in grad_api_dict.keys()
-        bwd_api = grad_api_dict[bwd_api_name]
-
-        assert 'args' in bwd_api.keys()
-        assert 'output' in bwd_api.keys()
-        assert 'forward' in bwd_api.keys()
-
-        # Parse Dispensable Inputs
-        optional_inputs = []
-        if 'optional' in fwd_api.keys():
-            optional_inputs = ParseDispensable(fwd_api['optional'])
-
-        bwd_forward_str = bwd_api['forward']
-        bwd_args_str = bwd_api['args']
-        bwd_returns_str = bwd_api['output']
-
-        # Collect Forward Inputs/Outputs
-        forward_inputs_list, forward_attrs_list, forward_returns_list = ParseYamlForwardFromBackward(
-            bwd_forward_str)
-        print("Parsed Forward Inputs List: ", forward_inputs_list)
-        print("Prased Forward Attrs List: ", forward_attrs_list)
-        print("Parsed Forward Returns List: ", forward_returns_list)
-
-        intermediate_outputs = []
-        if 'intermediate' in fwd_api.keys():
-            intermediate_outputs = ParseIntermediate(fwd_api['intermediate'])
-
-        IntermediateValidationCheck(intermediate_outputs, forward_returns_list)
-
-        # Collect Original Forward Inputs/Outputs and then perform validation checks
-        orig_forward_inputs_list, orig_forward_attrs_list, orig_forward_returns_list = ParseYamlForward(
-            fwd_args_str, fwd_returns_str)
-        print("Parsed Original Forward Inputs List: ", orig_forward_inputs_list)
-        print("Prased Original Forward Attrs List: ", orig_forward_attrs_list)
-        print("Parsed Original Forward Returns List: ",
-              orig_forward_returns_list)
-
-        # Forward Validation Checks
-        ForwardsValidationCheck(forward_inputs_list, forward_attrs_list,
-                                forward_returns_list, orig_forward_inputs_list,
-                                orig_forward_attrs_list,
-                                orig_forward_returns_list)
-
-        # Parse Backward Inputs/Outputs
-        backward_inputs_list, backward_attrs_list, backward_returns_list = ParseYamlBackward(
-            bwd_args_str, bwd_returns_str)
-        print("Parsed Backward Inputs List: ", backward_inputs_list)
-        print("Prased Backward Attrs List: ", backward_attrs_list)
-        print("Parsed Backward Returns List: ", backward_returns_list)
-
-        # Determine Forward Inputs/Outputs Position
-        forward_inputs_position_map, forward_outputs_position_map = DetermineForwardPositionMap(
-            forward_inputs_list, forward_returns_list)
-        print("Generated Forward Input Position Map: ",
-              forward_inputs_position_map)
-        print("Generated Forward Output Position Map: ",
-              forward_outputs_position_map)
-
-        # SlotName Matching
-        backward_fwd_input_map, backward_grad_input_map, backward_grad_output_map = SlotNameMatching(
-            backward_inputs_list, backward_returns_list,
-            forward_inputs_position_map, forward_outputs_position_map)
-        print("Generated Backward Fwd Input Map: ", backward_fwd_input_map)
-        print("Generated Backward Grad Input Map: ", backward_grad_input_map)
-        print("Generated Backward Grad Output Map: ", backward_grad_output_map)
-
-        # Backward Validation Check
-        BackwardValidationCheck(backward_fwd_input_map, backward_grad_input_map,
-                                backward_attrs_list)
-
-        # Node Declaration Generation
-        node_declaration_str += GenerateNodeDeclaration(
-            fwd_api_name, backward_fwd_input_map, backward_attrs_list,
-            no_need_buffer_set)
-        print("Generated Node Declaration: ", node_declaration_str)
-
-        node_definition_str += GenerateNodeDefinition(
-            fwd_api_name, bwd_api_name, backward_fwd_input_map,
-            backward_grad_input_map, backward_grad_output_map,
-            backward_attrs_list)
-        print("Generated Node Definition: ", node_definition_str)
-
-        # Node Definition Generation
-        definition_declaration_pair = GenerateForwardDefinition(
-            fwd_api_name, bwd_api_name, forward_inputs_position_map,
-            forward_outputs_position_map, forward_attrs_list,
-            backward_fwd_input_map, backward_grad_input_map,
-            backward_grad_output_map, backward_attrs_list, optional_inputs,
-            intermediate_outputs)
-        print("Generated Forward Definition: ", forward_definition_str)
-        print("Generated Forward Declaration: ", forward_declaration_str)
-        forward_definition_str += definition_declaration_pair[0]
-        forward_declaration_str += definition_declaration_pair[1]
-
-        # For python-level API dispatch
-        CollectCoreOpsInformation(fwd_api_name, forward_inputs_position_map,
-                                  forward_outputs_position_map,
-                                  forward_attrs_list)
+    for i in range(len(api_yaml_paths)):
+        api_yaml_path = api_yaml_paths[i]
+        backward_yaml_path = backward_yaml_paths[i]
+
+        if "sparse" in api_yaml_path:
+            assert "sparse" in backward_yaml_path
+            namespace = "sparse"
+        else:
+            namespace = ""
+
+        fwd_api_list = ReadFwdFile(api_yaml_path)
+        grad_api_dict = ReadBwdFile(backward_yaml_path)
+
+        yaml_forward_definition_str = ""
+        yaml_forward_declaration_str = ""
+        yaml_node_declaration_str = ""
+        yaml_node_definition_str = ""
+        for fwd_api in fwd_api_list:
+            # We only generate Ops with grad
+            if 'backward' not in fwd_api.keys():
+                continue
+
+            assert 'api' in fwd_api.keys()
+            assert 'args' in fwd_api.keys()
+            assert 'output' in fwd_api.keys()
+            assert 'backward' in fwd_api.keys()
+
+            no_need_buffer_set = set()
+            if 'no_need_buffer' in fwd_api.keys():
+                no_need_buffer_set = ParseNoNeedBuffer(fwd_api[
+                    'no_need_buffer'])
+
+            fwd_api_name = fwd_api['api']
+            fwd_args_str = fwd_api['args']
+            fwd_returns_str = fwd_api['output']
+
+            bwd_api_name = fwd_api['backward']
+            assert bwd_api_name in grad_api_dict.keys()
+            bwd_api = grad_api_dict[bwd_api_name]
+
+            assert 'args' in bwd_api.keys()
+            assert 'output' in bwd_api.keys()
+            assert 'forward' in bwd_api.keys()
+
+            # Parse Dispensable Inputs
+            optional_inputs = []
+            if 'optional' in fwd_api.keys():
+                optional_inputs = ParseDispensable(fwd_api['optional'])
+
+            bwd_forward_str = bwd_api['forward']
+            bwd_args_str = bwd_api['args']
+            bwd_returns_str = bwd_api['output']
+
+            # Collect Forward Inputs/Outputs
+            forward_inputs_list, forward_attrs_list, forward_returns_list = ParseYamlForwardFromBackward(
+                bwd_forward_str)
+            print("Parsed Forward Inputs List: ", forward_inputs_list)
+            print("Prased Forward Attrs List: ", forward_attrs_list)
+            print("Parsed Forward Returns List: ", forward_returns_list)
+
+            intermediate_outputs = []
+            if 'intermediate' in fwd_api.keys():
+                intermediate_outputs = ParseIntermediate(fwd_api[
+                    'intermediate'])
+
+            IntermediateValidationCheck(intermediate_outputs,
+                                        forward_returns_list)
+
+            # Collect Original Forward Inputs/Outputs and then perform validation checks
+            orig_forward_inputs_list, orig_forward_attrs_list, orig_forward_returns_list = ParseYamlForward(
+                fwd_args_str, fwd_returns_str)
+            print("Parsed Original Forward Inputs List: ",
+                  orig_forward_inputs_list)
+            print("Prased Original Forward Attrs List: ",
+                  orig_forward_attrs_list)
+            print("Parsed Original Forward Returns List: ",
+                  orig_forward_returns_list)
+
+            # Forward Validation Checks
+            ForwardsValidationCheck(
+                forward_inputs_list, forward_attrs_list, forward_returns_list,
+                orig_forward_inputs_list, orig_forward_attrs_list,
+                orig_forward_returns_list)
+
+            # Parse Backward Inputs/Outputs
+            backward_inputs_list, backward_attrs_list, backward_returns_list = ParseYamlBackward(
+                bwd_args_str, bwd_returns_str)
+            print("Parsed Backward Inputs List: ", backward_inputs_list)
+            print("Prased Backward Attrs List: ", backward_attrs_list)
+            print("Parsed Backward Returns List: ", backward_returns_list)
+
+            # Determine Forward Inputs/Outputs Position
+            forward_inputs_position_map, forward_outputs_position_map = DetermineForwardPositionMap(
+                forward_inputs_list, forward_returns_list)
+            print("Generated Forward Input Position Map: ",
+                  forward_inputs_position_map)
+            print("Generated Forward Output Position Map: ",
+                  forward_outputs_position_map)
+
+            # SlotName Matching
+            backward_fwd_input_map, backward_grad_input_map, backward_grad_output_map = SlotNameMatching(
+                backward_inputs_list, backward_returns_list,
+                forward_inputs_position_map, forward_outputs_position_map)
+            print("Generated Backward Fwd Input Map: ", backward_fwd_input_map)
+            print("Generated Backward Grad Input Map: ",
+                  backward_grad_input_map)
+            print("Generated Backward Grad Output Map: ",
+                  backward_grad_output_map)
+
+            # Backward Validation Check
+            BackwardValidationCheck(backward_fwd_input_map,
+                                    backward_grad_input_map,
+                                    backward_attrs_list)
+
+            # Node Declaration Generation
+            yaml_node_declaration_str += GenerateNodeDeclaration(
+                fwd_api_name, backward_fwd_input_map, backward_attrs_list,
+                no_need_buffer_set)
+            print("Generated Node Declaration: ", node_declaration_str)
+
+            yaml_node_definition_str += GenerateNodeDefinition(
+                fwd_api_name, bwd_api_name, backward_fwd_input_map,
+                backward_grad_input_map, backward_grad_output_map,
+                backward_attrs_list)
+            print("Generated Node Definition: ", node_definition_str)
+
+            # Node Definition Generation
+            definition_declaration_pair = GenerateForwardDefinition(
+                fwd_api_name, bwd_api_name, forward_inputs_position_map,
+                forward_outputs_position_map, forward_attrs_list,
+                backward_fwd_input_map, backward_grad_input_map,
+                backward_grad_output_map, backward_attrs_list, optional_inputs,
+                intermediate_outputs)
+            print("Generated Forward Definition: ", forward_definition_str)
+            print("Generated Forward Declaration: ", forward_declaration_str)
+            yaml_forward_definition_str += definition_declaration_pair[0]
+            yaml_forward_declaration_str += definition_declaration_pair[1]
+
+            # For python-level API dispatch
+            CollectCoreOpsInformation(fwd_api_name, forward_inputs_position_map,
+                                      forward_outputs_position_map,
+                                      forward_attrs_list)
+
+        if len(namespace) > 0:
+            forward_definition_str += f"""namespace {namespace} {{
+    {yaml_forward_definition_str}
+}}
+"""
+
+            forward_declaration_str += f"""namespace {namespace} {{
+    {yaml_forward_declaration_str}
+}}
+"""
+
+            node_declaration_str += f"""namespace {namespace} {{
+    {yaml_node_declaration_str}
+}}
+"""
+
+            node_definition_str += f"""namespace {namespace} {{
+    {yaml_node_definition_str}
+}}
+"""
+
+        else:
+            forward_definition_str += yaml_forward_definition_str
+            forward_declaration_str += yaml_forward_declaration_str
+            node_declaration_str += yaml_node_declaration_str
+            node_definition_str += yaml_node_definition_str
 
     # Generate Files
     nodes_h_path = args.nodes_h_path
diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py
index 9329dc5ffc9dd0faa36b8ff6a8373387bc2678c7..9b77f0449e01d6555cd3a25f101e4867ccc6ffd3 100644
--- a/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py
+++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py
@@ -14,34 +14,28 @@
 
 import os
 import argparse
-from eager_gen import ReadFwdFile, ParseDispensable, IsVectorTensorType, GetForwardFunctionName, ParseYamlForward, DetermineForwardPositionMap
+from eager_gen import namespace, yaml_types_mapping, ReadFwdFile, ParseDispensable, IsVectorTensorType, GetForwardFunctionName, ParseYamlForward, DetermineForwardPositionMap
+
+skipped_fwd_api_names = set(["scale"])
 
 atype_to_parsing_function = {
     "bool": "CastPyArg2Boolean",
     "int": "CastPyArg2Int",
     "long": "CastPyArg2Long",
+    "int64_t": "CastPyArg2Long",
     "float": "CastPyArg2Float",
     "string": "CastPyArg2String",
-    "bool[]": "CastPyArg2Booleans",
-    "int[]": "CastPyArg2Ints",
-    "long[]": "CastPyArg2Longs",
-    "float[]": "CastPyArg2Floats",
-    "double[]": "CastPyArg2Float64s",
-    "string[]": "CastPyArg2Strings"
-}
-
-atype_to_cxx_type = {
-    "bool": "bool",
-    "int": "int",
-    "long": "long",
-    "float": "float",
-    "string": "std::string",
-    "bool[]": "std::vector<bool>",
-    "int[]": "std::vector<int>",
-    "long[]": "std::vector<long>",
-    "float[]": "std::vector<float>",
-    "double[]": "std::vector<double>",
-    "string[]": "std::vector<std::string>"
+    "std::vector<bool>": "CastPyArg2Booleans",
+    "std::vector<int>": "CastPyArg2Ints",
+    "std::vector<long>": "CastPyArg2Longs",
+    "std::vector<int64_t>": "CastPyArg2Longs",
+    "std::vector<float>": "CastPyArg2Floats",
+    "std::vector<double>": "CastPyArg2Float64s",
+    "std::vector<std::string>": "CastPyArg2Strings",
+    "paddle::experimental::Scalar": "CastPyArg2Scalar",
+    "paddle::experimental::ScalarArray": "CastPyArg2ScalarArray",
+    "paddle::experimental::Backend": "CastPyArg2Backend",
+    "paddle::experimental::DataType": "CastPyArg2DataType",
 }
 
 
@@ -55,15 +49,9 @@ def ParseArguments():
     return args
 
 
-def GetCxxType(atype):
-    if atype not in atype_to_cxx_type.keys():
-        assert False
-
-    return atype_to_cxx_type[atype]
-
-
 def FindParsingFunctionFromAttributeType(atype):
     if atype not in atype_to_parsing_function.keys():
+        print(f"Unable to find {atype} in atype_to_parsing_function.")
         assert False
 
     return atype_to_parsing_function[atype]
@@ -71,7 +59,7 @@ def FindParsingFunctionFromAttributeType(atype):
 
 def GeneratePythonCFunction(fwd_api_name, forward_inputs_position_map,
                             forward_attrs_list, forward_outputs_position_map,
-                            optional_inputs):
+                            optional_inputs, is_forward_only):
     # forward_inputs_position_map = { "name" : [type, fwd_position] }
     # forward_outputs_position_map = { "name" : [type, fwd_position] }
     # forward_attrs_list = [ [attr_name, attr_type, default_value, orig_position], ...]
@@ -98,18 +86,21 @@ def GeneratePythonCFunction(fwd_api_name, forward_inputs_position_map,
     # Get Attributes
     for name, atype, _, pos in forward_attrs_list:
         parsing_function = FindParsingFunctionFromAttributeType(atype)
-        cxx_type = GetCxxType(atype)
         key = f"{name}"
 
         parse_attributes_str += f"    PyObject* {name}_obj = PyTuple_GET_ITEM(args, {pos});\n"
-        parse_attributes_str += f"    {cxx_type} {name} = {parsing_function}({name}_obj, \"{fwd_api_name}\", {pos});\n"
+        parse_attributes_str += f"    {atype} {name} = {parsing_function}({name}_obj, \"{fwd_api_name}\", {pos});\n"
 
         dygraph_function_call_list[pos] = f"{name}"
     dygraph_function_call_str = ",".join(dygraph_function_call_list)
 
+    pythonc_event_str = f"paddle::platform::RecordEvent pythonc_record_event(\"{fwd_api_name} pybind_imperative_func\", paddle::platform::TracerEventType::Operator, 1);"
+
     PYTHON_C_FUNCTION_TEMPLATE = """
 static PyObject * eager_final_state_api_{}(PyObject *self, PyObject *args, PyObject *kwargs)
 {{
+  {}
+
   PyThreadState *tstate = nullptr;
   try
   {{
@@ -139,11 +130,20 @@ static PyObject * eager_final_state_api_{}(PyObject *self, PyObject *args, PyObj
 }}
 
 """
+    namespace_str = ""
+    if len(namespace) > 0:
+        namespace_str = f"{namespace}::"
+
+    if is_forward_only:
+        fwd_function_name = "paddle::experimental::" + namespace_str + fwd_api_name
+    else:
+        fwd_function_name = namespace_str + GetForwardFunctionName(fwd_api_name)
+
     python_c_function_str = PYTHON_C_FUNCTION_TEMPLATE.format(
-        fwd_api_name, fwd_api_name, get_eager_tensor_str, parse_attributes_str,
-        GetForwardFunctionName(fwd_api_name), dygraph_function_call_str)
+        fwd_api_name, pythonc_event_str, fwd_api_name, get_eager_tensor_str,
+        parse_attributes_str, fwd_function_name, dygraph_function_call_str)
 
-    python_c_function_reg_str = f"{{\"final_state_{fwd_api_name}\", (PyCFunction)(void(*)(void))eager_final_state_api_{fwd_api_name}, METH_VARARGS | METH_KEYWORDS, \"C++ interface function for {fwd_api_name} in dygraph.\"}}\n"
+    python_c_function_reg_str = f"{{\"final_state_{fwd_api_name}\", (PyCFunction)(void(*)(void)) {namespace_str}eager_final_state_api_{fwd_api_name}, METH_VARARGS | METH_KEYWORDS, \"C++ interface function for {fwd_api_name} in dygraph.\"}}\n"
 
     return python_c_function_str, python_c_function_reg_str
 
@@ -197,7 +197,7 @@ static PyObject * eager_get_final_state_core_ops_returns_info(PyObject *self) {
     """
 
     core_ops_infos_registry = """
-    ,{\"get_final_state_core_ops_args_info\",
+    {\"get_final_state_core_ops_args_info\",
     (PyCFunction)(void(*)(void))eager_get_final_state_core_ops_args_info, METH_NOARGS,
     \"C++ interface function for eager_get_final_state_core_ops_args_info.\"},
     {\"get_final_state_core_ops_args_type_info\",
@@ -225,9 +225,17 @@ def GeneratePythonCWrappers(python_c_function_str, python_c_function_reg_str):
 #pragma once
 
 #include  "pybind11/detail/common.h"
+#include  "paddle/phi/api/all.h"
+#include  "paddle/phi/api/lib/dygraph_api.h"
+#include  "paddle/phi/common/backend.h"
+#include  "paddle/phi/common/data_type.h"
+#include  "paddle/phi/common/scalar.h"
+#include  "paddle/phi/common/scalar_array.h"
+#include  "paddle/phi/api/include/sparse_api.h"
 #include  "paddle/fluid/pybind/op_function_common.h"
 #include  "paddle/fluid/eager/api/generated/eager_generated/forwards/dygraph_functions.h"
 #include  "paddle/fluid/pybind/exception.h"
+#include  "paddle/fluid/platform/profiler/event_tracing.h"
 #include  <Python.h>
 
 namespace paddle {{
@@ -257,53 +265,80 @@ def GeneratePythonCFile(filepath, python_c_str):
 if __name__ == "__main__":
     args = ParseArguments()
 
-    api_yaml_path = args.api_yaml_path
-    fwd_api_list = ReadFwdFile(api_yaml_path)
-
-    python_c_function_list = []
-    python_c_function_reg_list = []
-    for fwd_api in fwd_api_list:
-        # We only generate Ops with grad
-        if 'backward' not in fwd_api.keys():
-            continue
-
-        assert 'api' in fwd_api.keys()
-        assert 'args' in fwd_api.keys()
-        assert 'output' in fwd_api.keys()
-        assert 'backward' in fwd_api.keys()
-
-        fwd_api_name = fwd_api['api']
-        fwd_args_str = fwd_api['args']
-        fwd_returns_str = fwd_api['output']
-
-        # Parse Dispensable Inputs
-        optional_inputs = []
-        if 'optional' in fwd_api.keys():
-            optional_inputs = ParseDispensable(fwd_api['optional'])
-
-        # Collect Original Forward Inputs/Outputs and then perform validation checks
-        forward_inputs_list, forward_attrs_list, forward_returns_list = ParseYamlForward(
-            fwd_args_str, fwd_returns_str)
-        print("Parsed Original Forward Inputs List: ", forward_inputs_list)
-        print("Prased Original Forward Attrs List: ", forward_attrs_list)
-        print("Parsed Original Forward Returns List: ", forward_returns_list)
-
-        forward_inputs_position_map, forward_outputs_position_map = DetermineForwardPositionMap(
-            forward_inputs_list, forward_returns_list)
-        print("Generated Forward Input Position Map: ",
-              forward_inputs_position_map)
-        print("Generated Forward Output Position Map: ",
-              forward_outputs_position_map)
-
-        python_c_function_str, python_c_function_reg_str = GeneratePythonCFunction(
-            fwd_api_name, forward_inputs_position_map, forward_attrs_list,
-            forward_outputs_position_map, optional_inputs)
-        python_c_function_list.append(python_c_function_str)
-        python_c_function_reg_list.append(python_c_function_reg_str)
-        print("Generated Python-C Function: ", python_c_function_str)
-
-    python_c_functions_str = "\n".join(python_c_function_list)
-    python_c_functions_reg_str = ",\n".join(python_c_function_reg_list)
+    api_yaml_paths = args.api_yaml_path.split(",")
+
+    python_c_functions_reg_str = ""
+    python_c_functions_str = ""
+
+    for i in range(len(api_yaml_paths)):
+        api_yaml_path = api_yaml_paths[i]
+
+        if "sparse" in api_yaml_path:
+            namespace = "sparse"
+        else:
+            namespace = ""
+
+        fwd_api_list = ReadFwdFile(api_yaml_path)
+
+        python_c_function_list = []
+        python_c_function_reg_list = []
+        for fwd_api in fwd_api_list:
+
+            # We only generate Ops with grad
+            is_forward_only = False
+            if 'backward' not in fwd_api.keys():
+                is_forward_only = True
+
+            assert 'api' in fwd_api.keys()
+            assert 'args' in fwd_api.keys()
+            assert 'output' in fwd_api.keys()
+
+            fwd_api_name = fwd_api['api']
+            fwd_args_str = fwd_api['args']
+            fwd_returns_str = fwd_api['output']
+
+            if fwd_api_name in skipped_fwd_api_names:
+                continue
+
+            # Parse Dispensable Inputs
+            optional_inputs = []
+            if 'optional' in fwd_api.keys():
+                optional_inputs = ParseDispensable(fwd_api['optional'])
+
+            # Collect Original Forward Inputs/Outputs and then perform validation checks
+            forward_inputs_list, forward_attrs_list, forward_returns_list = ParseYamlForward(
+                fwd_args_str, fwd_returns_str)
+            print("Parsed Original Forward Inputs List: ", forward_inputs_list)
+            print("Prased Original Forward Attrs List: ", forward_attrs_list)
+            print("Parsed Original Forward Returns List: ",
+                  forward_returns_list)
+
+            forward_inputs_position_map, forward_outputs_position_map = DetermineForwardPositionMap(
+                forward_inputs_list, forward_returns_list)
+            print("Generated Forward Input Position Map: ",
+                  forward_inputs_position_map)
+            print("Generated Forward Output Position Map: ",
+                  forward_outputs_position_map)
+
+            python_c_function_str, python_c_function_reg_str = GeneratePythonCFunction(
+                fwd_api_name, forward_inputs_position_map, forward_attrs_list,
+                forward_outputs_position_map, optional_inputs, is_forward_only)
+            python_c_function_list.append(python_c_function_str)
+            python_c_function_reg_list.append(python_c_function_reg_str)
+            print("Generated Python-C Function: ", python_c_function_str)
+
+        # Append Namespace
+        python_c_functions_reg_str += ",\n".join(
+            python_c_function_reg_list) + ","
+        python_c_functions = "\n".join(python_c_function_list)
+        if len(namespace) > 0:
+            python_c_functions_str += f"""namespace {namespace} {{
+    {python_c_functions}
+}}
+"""
+
+        else:
+            python_c_functions_str += python_c_functions
 
     python_c_str = GeneratePythonCWrappers(python_c_functions_str,
                                            python_c_functions_reg_str)
diff --git a/paddle/fluid/eager/autograd_meta.h b/paddle/fluid/eager/autograd_meta.h
index 9e1dc4f2c8c6ba5c1c7d0c49e5d141d1a6c4c6d3..dca76d3b8a0db8c4284960005bfbad33ce23e20d 100644
--- a/paddle/fluid/eager/autograd_meta.h
+++ b/paddle/fluid/eager/autograd_meta.h
@@ -145,8 +145,7 @@ class AutogradMeta : public AbstractAutogradMeta {
  private:
   // TODO(jiabin) :Should we use pointer instead of object?
   std::shared_ptr<paddle::experimental::Tensor> grad_{
-      std::make_shared<paddle::experimental::Tensor>(
-          egr::Controller::Instance().GenerateUniqueName("@grad"))};
+      std::make_shared<paddle::experimental::Tensor>()};
 
   // GradNodeBase is base class of all grad op which is a
   // wrapper for grad op. This class will make grad op easy
diff --git a/paddle/fluid/eager/backward.cc b/paddle/fluid/eager/backward.cc
index 356fdcaf054277085be57491eb1525beeac8d792..1987d024d8f3e34121f54962c45f0f8c1e91b723 100644
--- a/paddle/fluid/eager/backward.cc
+++ b/paddle/fluid/eager/backward.cc
@@ -19,6 +19,8 @@
 #include "paddle/fluid/eager/grad_node_info.h"
 #include "paddle/fluid/eager/grad_tensor_holder.h"
 #include "paddle/fluid/eager/utils.h"
+#include "paddle/fluid/platform/profiler.h"
+#include "paddle/fluid/platform/profiler/event_tracing.h"
 
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/errors.h"
@@ -48,12 +50,16 @@ std::unordered_map<GradNodeBase*, int> getInDegreeMap(
     }
     visited.insert(node);
 
+    PADDLE_ENFORCE_NOT_NULL(
+        node,
+        paddle::platform::errors::Fatal(
+            "We got null node when we traverse the backward graph, and this "
+            "should not happened please check your code and contact us."));
     // Find and append next nodes
     const std::vector<std::vector<Edge>>& edges = node->GetEdges();
     for (const auto& edge_list : edges) {
       for (const Edge& edge : edge_list) {
         GradNodeBase* next_node = edge.GetMutableGradNode().get();
-
         // Next node could be nullptr if it is leaf tensor with no
         // AccumulationNode attached
         // Or it could also originated from dispensable inputs
@@ -67,13 +73,15 @@ std::unordered_map<GradNodeBase*, int> getInDegreeMap(
       }
     }
   }
-
   return node_in_degree_map;
 }
 
 void RunBackward(const std::vector<paddle::experimental::Tensor>& tensors,
                  const std::vector<paddle::experimental::Tensor>& grad_tensors,
                  bool retain_graph) {
+  paddle::platform::RecordEvent backward_record_event(
+      "backward", paddle::platform::TracerEventType::Operator, 1);
+
   VLOG(6) << "Start Backward";
   // *Gradient Hook should happen at node-level
   // *Inplace version check should perform at node-level
@@ -109,7 +117,8 @@ void RunBackward(const std::vector<paddle::experimental::Tensor>& tensors,
 
     // Prepare GradTensorHolder
     if (!node_input_buffers_dict.count(grad_node)) {
-      VLOG(6) << "Create Value for grad input tensor " << i;
+      VLOG(6) << "Create Value for grad input tensor " << i
+              << " of grad node: " << grad_node->name();
       node_input_buffers_dict[grad_node] =
           std::make_unique<GradTensorHolder>(grad_node->InputMeta());
     }
@@ -155,19 +164,27 @@ void RunBackward(const std::vector<paddle::experimental::Tensor>& tensors,
   VLOG(6) << "Run Backward";
   while (!queue.empty()) {
     GradNodeBase* node = queue.front();
-    queue.pop();
 
+    paddle::platform::RecordEvent node_record_event(
+        std::string(typeid(*node).name()) + " grad_node",
+        paddle::platform::TracerEventType::Operator, 1);
+
+    if (queue.size() > 1 && node_in_degree_map[node] != 0) {
+      queue.pop();
+      continue;
+    }
+    queue.pop();
     // Run node: This is where Hook happens
     PADDLE_ENFORCE(
         node_input_buffers_dict.count(node),
         paddle::platform::errors::Fatal(
-            "Unable to find next node in the InputBuufer"
+            "Unable to find next node in the GradTensorHolder \n"
             "Trying to run Node without configuring its GradTensorHolder"));
 
     std::unique_ptr<GradTensorHolder> node_input_buffer =
         std::move(node_input_buffers_dict[node]);
 
-    VLOG(6) << "Run Backward Kernel with input_buffer";
+    VLOG(6) << "Run Backward Kernel with GradTensorHolder";
     // Run Pre Backward Node and get outputs
     std::vector<std::vector<paddle::experimental::Tensor>> grad_output_tensors =
         (*node)(node_input_buffer->Buffers());
@@ -212,9 +229,8 @@ void RunBackward(const std::vector<paddle::experimental::Tensor>& tensors,
 
         if ((!grad_output_tensor.defined() ||
              !grad_output_tensor.initialized())) {
-          VLOG(6)
-              << "We get grad_output_tensor with slot: " << i << ", rank: " << j
-              << " as uninitialized or undefined in both tensor and variable";
+          VLOG(6) << "We get grad_output_tensor with slot: " << i
+                  << ", rank: " << j << " as uninitialized or undefined tensor";
         }
         VLOG(6) << "Get Edge and grad_output_tensor with slot: " << i
                 << ", rank: " << j
@@ -225,6 +241,8 @@ void RunBackward(const std::vector<paddle::experimental::Tensor>& tensors,
           const auto& input_meta = next_node->InputMeta();
           auto grad_tensor_holder =
               std::make_unique<GradTensorHolder>(input_meta);
+          VLOG(6) << "Construct GradTensorHolder for grad node: "
+                  << next_node->name();
           node_input_buffers_dict[next_node] = std::move(grad_tensor_holder);
         }
         VLOG(6) << "Sum grad inputs for edge slot: " << edge_rank.first
@@ -234,10 +252,12 @@ void RunBackward(const std::vector<paddle::experimental::Tensor>& tensors,
 
         // Update queue
         node_in_degree_map[next_node]--;
-        PADDLE_ENFORCE(node_in_degree_map[next_node] >= 0,
-                       paddle::platform::errors::Fatal(
-                           "Detected in-degree value smaller than zero."
-                           "Node's in-degree cannot be negative"));
+        PADDLE_ENFORCE(
+            node_in_degree_map[next_node] >= 0,
+            paddle::platform::errors::Fatal(
+                "Detected in-degree value smaller than zero. For Node: %s"
+                "Node's in-degree cannot be negative",
+                next_node->name()));
         if (node_in_degree_map[next_node] == 0) {
           queue.emplace(std::move(next_node));
         }
diff --git a/paddle/fluid/eager/custom_operator/CMakeLists.txt b/paddle/fluid/eager/custom_operator/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..ccc9a03a55660772b51dc27bbfa78b7531a369d3
--- /dev/null
+++ b/paddle/fluid/eager/custom_operator/CMakeLists.txt
@@ -0,0 +1 @@
+cc_library(custom_operator_node SRCS custom_operator_node.cc DEPS phi_tensor phi_api grad_node_info custom_operator op_meta_info)
diff --git a/paddle/fluid/eager/custom_operator/custom_operator_node.cc b/paddle/fluid/eager/custom_operator/custom_operator_node.cc
new file mode 100644
index 0000000000000000000000000000000000000000..48ac8c8358afd68cee9d22b8ea0a4e8fd7c3c92e
--- /dev/null
+++ b/paddle/fluid/eager/custom_operator/custom_operator_node.cc
@@ -0,0 +1,90 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/eager/custom_operator/custom_operator_node.h"
+#include "paddle/fluid/framework/custom_operator.h"
+#include "paddle/fluid/framework/op_meta_info_helper.h"
+#include "paddle/phi/api/ext/op_meta_info.h"
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace egr {
+std::vector<std::vector<paddle::experimental::Tensor>> RunCustomOpNode::
+operator()(
+    const std::vector<std::vector<paddle::experimental::Tensor>>& grads) {
+  paddle::CustomOpKernelContext ctx;
+  auto grad_inputs_name = paddle::framework::OpMetaInfoHelper::GetInputs(
+      egr::Controller::Instance().GetOpMetaInfoMap().at(op_type_)[1]);
+  auto grad_outputs_names = paddle::framework::OpMetaInfoHelper::GetOutputs(
+      egr::Controller::Instance().GetOpMetaInfoMap().at(op_type_)[1]);
+  auto map = egr::Controller::Instance().GetCustomEdgesSlotMap().at(op_type_);
+  auto kernel_map = egr::Controller::Instance().GetOpMetaInfoMap();
+
+  std::vector<std::vector<paddle::experimental::Tensor>> tmp_ins(
+      grad_inputs_name.size());
+  VLOG(7) << " Prepare Backward inputs of grads with size: " << grads.size()
+          << ", whose grad_inputs_name size is: " << grad_inputs_name.size();
+  for (size_t i = 0; i < grads.size(); i++) {
+    if (map[1].find(i) != map[1].end()) {
+      VLOG(7) << "Insert grad: " << i << " to grad_inputs: " << map[1][i];
+      tmp_ins[map[1][i]] = grads[i];
+    }
+  }
+
+  for (auto it : fwd_outs) {
+    VLOG(7) << "Insert fwd_outs to grad_inputs: " << it.first;
+    tmp_ins[it.first] = RunCustomOpNode::Recover(&(it.second));
+  }
+
+  for (auto it : fwd_ins) {
+    VLOG(7) << "Insert fwd_ins to grad_inputs: " << it.first;
+    tmp_ins[it.first] = RunCustomOpNode::Recover(&(it.second));
+  }
+
+  VLOG(6) << "Prepare Grad inputs";
+  for (const auto& in : tmp_ins) {
+    ctx.EmplaceBackInputs(in);
+  }
+  VLOG(6) << "Prepare Grad attrs";
+  ctx.EmplaceBackAttrs(attrs_);
+  std::vector<std::vector<paddle::experimental::Tensor>> outs(
+      GetEdges().size());
+  std::vector<std::vector<paddle::experimental::Tensor>> tmp_outs(
+      grad_outputs_names.size());
+  VLOG(6) << "Prepare Grad outputs for size: " << grad_outputs_names.size();
+  for (size_t i = 0; i < GetEdges().size(); i++) {
+    if (map[0].find(i) != map[0].end()) {
+      VLOG(7) << "Insert grad outputs: " << i
+              << " with size: " << GetEdges()[i].size()
+              << " to tmp_outputs: " << map[0][i];
+      for (size_t j = 0; j < GetEdges()[i].size(); j++) {
+        outs[i].emplace_back(/* init it incase of copy nullptr of shared_ptr */
+                             std::make_shared<phi::DenseTensor>(
+                                 phi::DataType::UNDEFINED),
+                             egr::Controller::Instance().GenerateUniqueName(
+                                 "custom_tmp_grad"));
+      }
+      tmp_outs[map[0][i]] = outs[i];
+    }
+  }
+  for (size_t i = 0; i < tmp_outs.size(); i++) {
+    VLOG(7) << "Prepare grad outputs size: " << tmp_outs[i].size();
+    ctx.EmplaceBackOutputs(tmp_outs[i]);
+  }
+  VLOG(7) << "Run Kernel of Grad Custom Op: " << op_type_;
+
+  (*paddle::framework::OpMetaInfoHelper::GetKernelFn(
+      kernel_map.at(op_type_)[1]))(&ctx);
+  return outs;
+}
+}  // namespace egr
diff --git a/paddle/fluid/eager/custom_operator/custom_operator_node.h b/paddle/fluid/eager/custom_operator/custom_operator_node.h
new file mode 100644
index 0000000000000000000000000000000000000000..e5ddef9c062149282d790a5fd6bf31b25a20cf5a
--- /dev/null
+++ b/paddle/fluid/eager/custom_operator/custom_operator_node.h
@@ -0,0 +1,77 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/fluid/eager/autograd_meta.h"
+#include "paddle/fluid/eager/grad_node_info.h"
+#include "paddle/fluid/eager/hooks.h"
+#include "paddle/fluid/eager/tensor_wrapper.h"
+#include "paddle/fluid/framework/custom_operator.h"
+#include "paddle/utils/any.h"
+
+namespace egr {
+class RunCustomOpNode : public GradNodeBase {
+ public:
+  // Constructor: configure fwd input tensors to grad node
+  explicit RunCustomOpNode(size_t bwd_in_slot_num, size_t bwd_out_slot_num,
+                           const std::string& op_type)
+      : GradNodeBase(bwd_in_slot_num, bwd_out_slot_num), op_type_(op_type) {
+    VLOG(6) << "Construct RunCustomOpNode for op: " << op_type;
+  }
+
+  ~RunCustomOpNode() override {
+    VLOG(6) << "Destruct RunCustomOpNode for op: " << op_type_;
+  }
+
+  // Functor: perform backward computations
+  virtual std::vector<std::vector<paddle::experimental::Tensor>> operator()(
+      const std::vector<std::vector<paddle::experimental::Tensor>>& grads)
+      override;
+
+  std::string name() {
+    return paddle::string::Sprintf("RunCustomOpNode: %s_grad", op_type_);
+  }
+
+  static std::vector<egr::TensorWrapper> ConstructTensorWrapper(
+      const std::vector<paddle::experimental::Tensor>& fwd_var) {
+    std::vector<egr::TensorWrapper> res;
+    for (auto const& var : fwd_var) {
+      res.emplace_back(var);
+    }
+    return res;
+  }
+
+  static std::vector<paddle::experimental::Tensor> Recover(
+      std::vector<egr::TensorWrapper>* fwd_var) {
+    std::vector<paddle::experimental::Tensor> res;
+    for (size_t i = 0; i < fwd_var->size(); i++) {
+      res.emplace_back(fwd_var->at(i).recover(nullptr));
+    }
+    return res;
+  }
+
+  void SetAttrs(const std::vector<paddle::any>& attr) { attrs_ = attr; }
+
+ public:
+  std::unordered_map<int, std::vector<egr::TensorWrapper>> fwd_outs;
+  std::unordered_map<int, std::vector<egr::TensorWrapper>> fwd_ins;
+  std::unordered_map<int, int> grads2grad_in_map;
+
+ private:
+  std::vector<paddle::any> attrs_;
+  std::string op_type_{""};
+};
+
+}  // namespace egr
diff --git a/paddle/fluid/eager/grad_node_info.cc b/paddle/fluid/eager/grad_node_info.cc
index b1189106b8f871ab618972ad93e9812ce443e55d..7eb2902d935c4fd8d5990c81fbf6bcf3fd6e6e66 100644
--- a/paddle/fluid/eager/grad_node_info.cc
+++ b/paddle/fluid/eager/grad_node_info.cc
@@ -25,11 +25,12 @@
 #include "glog/logging.h"
 
 /**
- * Implementation of GradNodeBase, Edge and InputBuffer.
+ * Implementation of GradNodeBase, Edge and GradTensorHolder.
 **/
 namespace egr {
 
 GradNodeBase::GradNodeBase(size_t bwd_in_slot_num, size_t bwd_out_slot_num) {
+  VLOG(6) << "Construct GradNodeBase";
   bwd_in_meta_.resize(bwd_in_slot_num);
   bwd_out_meta_.resize(bwd_out_slot_num);
   // adj_edges has the same num as backward outputs
@@ -49,11 +50,15 @@ void GradNodeBase::AddEdges(std::vector<AutogradMeta*>* metas, size_t slot_id) {
     // its pre-ops
     if (meta && !meta->StopGradient()) {
       auto node = meta->GetMutableGradNode();
-      if (node) {
+      if (node && node.get()) {
+        VLOG(6) << "Add Edges for slot: " << slot_id
+                << " which is: " << meta->GetMutableGradNode()->name();
         adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(),
                                          meta->OutRankInfo());
       } else {
         meta->SetGradNode(std::make_shared<egr::GradNodeAccumulation>(meta));
+        VLOG(6) << "Add Edges for slot: " << slot_id
+                << " which is: " << meta->GetMutableGradNode()->name();
         adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(),
                                          meta->OutRankInfo());
       }
@@ -70,7 +75,7 @@ void GradNodeBase::AddEdges(AutogradMeta* meta, size_t slot_id) {
           "inputs's slot num."));
   if (meta && !meta->StopGradient()) {
     auto node = meta->GetMutableGradNode();
-    if (node) {
+    if (node && node.get()) {
       VLOG(6) << "Add Edges for slot: " << slot_id << ", the Edge is from "
               << this->name() << " to " << meta->GetMutableGradNode()->name();
       adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(),
diff --git a/paddle/fluid/eager/grad_node_info.h b/paddle/fluid/eager/grad_node_info.h
index eeac1cca4acf33190ce30613e4a86e99a95b651b..16513f05e0777a8e57f54c925d68867dda656612 100644
--- a/paddle/fluid/eager/grad_node_info.h
+++ b/paddle/fluid/eager/grad_node_info.h
@@ -76,10 +76,10 @@ class GradSlotMeta {
 
 class GradNodeBase {
  public:
-  GradNodeBase() = default;
+  GradNodeBase() { VLOG(6) << "Construct GradNodeBase"; }
   GradNodeBase(size_t bwd_in_slot_num, size_t bwd_out_slot_num);
   // TODO(jiabin): Should we have other constructor here?
-  virtual ~GradNodeBase() = default;
+  virtual ~GradNodeBase() { VLOG(6) << "Destruct GradNodeBase"; }
 
   /**
    * operator() designed to contian the real backward execution logic, it should
diff --git a/paddle/fluid/eager/tests/data_structure_tests/grad_node_test.h b/paddle/fluid/eager/tests/data_structure_tests/grad_node_test.h
index bb84e2dda81bafe624fe7734a0a47391eeb0adfa..535c93ac53b1751d9634476e47f32dc0cbe22708 100644
--- a/paddle/fluid/eager/tests/data_structure_tests/grad_node_test.h
+++ b/paddle/fluid/eager/tests/data_structure_tests/grad_node_test.h
@@ -30,6 +30,7 @@ class GradTestNode : public egr::GradNodeBase {
   GradTestNode(float val, int in_num, int out_num)
       : GradNodeBase(in_num, out_num), val_(val) {}
   GradTestNode() : GradNodeBase() { val_ = 1.0; }
+  std::string name() override { return "GradTestNode"; }
   std::vector<std::vector<paddle::experimental::Tensor>> operator()(
       const std::vector<std::vector<paddle::experimental::Tensor>>& grads)
       override {
diff --git a/paddle/fluid/eager/tests/data_structure_tests/grad_tensor_holder_test.cc b/paddle/fluid/eager/tests/data_structure_tests/grad_tensor_holder_test.cc
index 8c6eeca9d3d5d80fd5bfe943ef87ba8640ada4f2..384fdcd6f97c4b318341db68cdd88b644d42d22a 100644
--- a/paddle/fluid/eager/tests/data_structure_tests/grad_tensor_holder_test.cc
+++ b/paddle/fluid/eager/tests/data_structure_tests/grad_tensor_holder_test.cc
@@ -24,6 +24,8 @@
 
 #include "paddle/phi/core/kernel_registry.h"
 
+PD_DECLARE_KERNEL(full_like, CPU, ALL_LAYOUT);
+
 // TODO(jiabin): remove nolint here!!!
 using namespace egr;  // NOLINT
 
diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cpu.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cpu.cc
index 6c4bf9a4f17e6f88503f0a1d6ec2f3029000b6f0..056c7102f663b93d215e494908d9c95be832068c 100644
--- a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cpu.cc
+++ b/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cpu.cc
@@ -33,6 +33,16 @@
 #include "gperftools/profiler.h"
 #endif
 
+#include "paddle/phi/core/kernel_registry.h"
+
+PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(matmul, CPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(matmul_grad, CPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(add_grad, CPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(sum, CPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(sum_grad, CPU, ALL_LAYOUT);
+
 using namespace egr;            // NOLINT
 using namespace egr_utils_api;  // NOLINT
 
@@ -72,6 +82,47 @@ TEST(Benchmark, EagerScaleCPU) {
   }
 }
 
+TEST(Benchmark, EagerMatmulCPU) {
+  // Prepare Device Contexts
+  eager_test::InitEnv(paddle::platform::CPUPlace());
+
+  for (const std::string& mode : {"Accuracy", "Performance"}) {
+    paddle::framework::DDim ddimX = phi::make_ddim({2, 2});
+    paddle::experimental::Tensor X = CreateTensorWithValue(
+        ddimX, paddle::platform::CPUPlace(), phi::DataType::FLOAT32,
+        phi::DataLayout::NCHW, 1.0, true);
+    RetainGradForTensor(X);
+
+    paddle::framework::DDim ddimY = phi::make_ddim({2, 2});
+    paddle::experimental::Tensor Y = CreateTensorWithValue(
+        ddimY, paddle::platform::CPUPlace(), phi::DataType::FLOAT32,
+        phi::DataLayout::NCHW, 2.0, true);
+    RetainGradForTensor(Y);
+
+    if (mode == "Accuracy") {
+      benchmark_eager_matmul(X, Y, true /* accuracy_check */);
+
+    } else if (mode == "Performance") {
+      auto t_start = std::chrono::high_resolution_clock::now();
+#ifdef WITH_GPERFTOOLS
+      ProfilerStart("eager_matmul_cpu.out");
+#endif
+      benchmark_eager_matmul(X, Y);
+
+#ifdef WITH_GPERFTOOLS
+      ProfilerStop();
+#endif
+      auto t_end = std::chrono::high_resolution_clock::now();
+      double elapsed_time_ms =
+          std::chrono::duration<double, std::milli>(t_end - t_start).count();
+      std::cout << "Duration: " << elapsed_time_ms << " ms" << std::endl;
+
+    } else {
+      PADDLE_THROW(paddle::platform::errors::Fatal("Unknown benchmark mode"));
+    }
+  }
+}
+
 TEST(Benchmark, EagerIntermediateMatmulCPU) {
   // Prepare Device Contexts
   eager_test::InitEnv(paddle::platform::CPUPlace());
diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc
index 14e7ce8cfcfb4dea0907cd128873223c8e5859a2..5e790389819f53b250db8797c7a8b3466818abfb 100644
--- a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc
+++ b/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc
@@ -32,11 +32,21 @@
 #include "gperftools/profiler.h"
 #endif
 
+#include "paddle/phi/core/kernel_registry.h"
+
 using namespace egr;            // NOLINT
 using namespace egr_utils_api;  // NOLINT
 
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 
+PD_DECLARE_KERNEL(full, GPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(matmul, GPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(matmul_grad, GPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(add, GPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(add_grad, GPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(sum, GPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(sum_grad, GPU, ALL_LAYOUT);
+
 TEST(Benchmark, EagerScaleCUDA) {
   eager_test::InitEnv(paddle::platform::CUDAPlace());
 
@@ -74,6 +84,50 @@ TEST(Benchmark, EagerScaleCUDA) {
   }
 }
 
+TEST(Benchmark, EagerMatmulCUDA) {
+  paddle::platform::CUDAPlace place;
+  eager_test::InitEnv(place);
+
+  for (const std::string& mode : {"Accuracy", "WarmUp", "Performance"}) {
+    paddle::framework::DDim ddimX = phi::make_ddim({2, 2});
+    paddle::experimental::Tensor X = CreateTensorWithValue(
+        ddimX, paddle::platform::CUDAPlace(), phi::DataType::FLOAT32,
+        phi::DataLayout::NCHW, 1.0, true);
+    RetainGradForTensor(X);
+
+    paddle::framework::DDim ddimY = phi::make_ddim({2, 2});
+    paddle::experimental::Tensor Y = CreateTensorWithValue(
+        ddimY, paddle::platform::CUDAPlace(), phi::DataType::FLOAT32,
+        phi::DataLayout::NCHW, 2.0, true);
+    RetainGradForTensor(Y);
+
+    if (mode == "Accuracy") {
+      benchmark_eager_matmul(X, Y, true /* accuracy_check */);
+
+    } else if (mode == "WarmUp") {
+      benchmark_eager_matmul(X, Y);
+
+    } else if (mode == "Performance") {
+      auto t_start = std::chrono::high_resolution_clock::now();
+#ifdef WITH_GPERFTOOLS
+      ProfilerStart("eager_matmul_cuda.out");
+#endif
+      benchmark_eager_matmul(X, Y);
+
+#ifdef WITH_GPERFTOOLS
+      ProfilerStop();
+#endif
+      auto t_end = std::chrono::high_resolution_clock::now();
+      double elapsed_time_ms =
+          std::chrono::duration<double, std::milli>(t_end - t_start).count();
+      std::cout << "Duration: " << elapsed_time_ms << " ms" << std::endl;
+
+    } else {
+      PADDLE_THROW(paddle::platform::errors::Fatal("Unknown benchmark mode"));
+    }
+  }
+}
+
 TEST(Benchmark, EagerIntermediateMatmulCUDA) {
   paddle::platform::CUDAPlace place;
   eager_test::InitEnv(place);
@@ -186,7 +240,7 @@ TEST(Benchmark, EagerIntermediateMLPCUDA) {
 USE_OP_ITSELF(scale);
 USE_OP_ITSELF(matmul_v2);
 USE_OP_ITSELF(reduce_sum);
-USE_OP(reduce_sum_grad);
+USE_OP_ITSELF(reduce_sum_grad);
 USE_OP_ITSELF(elementwise_add);
 
 #endif  // PADDLE_WITH_CUDA || PADDLE_WITH_HIP
diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cpu.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cpu.cc
index 3292de9363696dae30d853980eca6fb1ba1055cc..b4b47a85f66662347d5e087cd4391979fb6c4250 100644
--- a/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cpu.cc
+++ b/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cpu.cc
@@ -34,6 +34,16 @@
 #include "gperftools/profiler.h"
 #endif
 
+#include "paddle/phi/core/kernel_registry.h"
+
+PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(matmul, CPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(matmul_grad, CPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(add_grad, CPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(sum, CPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(sum_grad, CPU, ALL_LAYOUT);
+
 namespace paddle {
 namespace imperative {
 
diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc
index e9b7d10070dbf22f10e617d34f143992d19fb659..a3e393b039425e506066b485bc8a8688bff20d96 100644
--- a/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc
+++ b/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc
@@ -34,8 +34,18 @@
 #include "gperftools/profiler.h"
 #endif
 
+#include "paddle/phi/core/kernel_registry.h"
+
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 
+PD_DECLARE_KERNEL(full, GPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(matmul, GPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(matmul_grad, GPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(add, GPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(add_grad, GPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(sum, GPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(sum_grad, GPU, ALL_LAYOUT);
+
 namespace paddle {
 namespace imperative {
 
@@ -248,7 +258,7 @@ TEST(Benchmark, FluidMLPCUDA) {
 USE_OP_ITSELF(scale);
 USE_OP_ITSELF(matmul_v2);
 USE_OP_ITSELF(reduce_sum);
-USE_OP(reduce_sum_grad);
+USE_OP_ITSELF(reduce_sum_grad);
 USE_OP_ITSELF(elementwise_add);
 
 #endif  // PADDLE_WITH_CUDA || PADDLE_WITH_HIP
diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_utils.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_utils.cc
index 96126fa5466aace442dfb742f9902539916b853e..769bd7f687f4584d44bbfa30b73611a3128289bf 100644
--- a/paddle/fluid/eager/tests/performance_tests/benchmark_utils.cc
+++ b/paddle/fluid/eager/tests/performance_tests/benchmark_utils.cc
@@ -28,6 +28,7 @@
 #include "paddle/fluid/eager/utils.h"
 
 // Eager Generated
+#include "paddle/fluid/eager/api/generated/eager_generated/forwards/dygraph_functions.h"
 #include "paddle/fluid/eager/api/generated/fluid_generated/dygraph_forward_api.h"
 
 // Fluid
@@ -67,6 +68,29 @@ void benchmark_eager_scale(const paddle::experimental::Tensor& tensor,
   }
 }
 
+void benchmark_eager_matmul(const paddle::experimental::Tensor& X,
+                            const paddle::experimental::Tensor& Y,
+                            bool accuracy_check) {
+  paddle::experimental::Tensor input_tensor0 = X;
+
+  size_t max_num_runs = accuracy_check ? 2 : max_num_benchmark_runs;
+  for (size_t i = 0; i < max_num_runs; i++) {
+    input_tensor0 =
+        matmul_final_state_dygraph_function(input_tensor0, Y, false, false);
+  }
+
+  std::vector<paddle::experimental::Tensor> target_tensors = {input_tensor0};
+  RunBackward(target_tensors, {});
+
+  if (accuracy_check) {
+    // Examine Forward Grad (w.r.t max_num_runs = 2)
+    eager_test::CompareTensorWithValue<float>(input_tensor0, 16);
+    // Examine Backward Grad (w.r.t max_num_runs = 2)
+    eager_test::CompareGradTensorWithValue<float>(X, 16);
+    eager_test::CompareGradTensorWithValue<float>(Y, 16);
+  }
+}
+
 /* ----------------------------------- */
 /* ---- Eager Intermediate Matmul ---- */
 /* ----------------------------------- */
diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_utils.h b/paddle/fluid/eager/tests/performance_tests/benchmark_utils.h
index 0086b51b57e152c6da935eacba8d93c0d6ab1a71..86bf13707ed40b0c37ccb54695cca3d165768cb6 100644
--- a/paddle/fluid/eager/tests/performance_tests/benchmark_utils.h
+++ b/paddle/fluid/eager/tests/performance_tests/benchmark_utils.h
@@ -51,15 +51,10 @@ void benchmark_eager_scale(const paddle::experimental::Tensor& tensor,
                            bool accuracy_check = false);
 
 /* ---- Eager MatMul ---- */
-/*
-void benchmark_eager_matmul(const paddle::experimental::Tensor& X, const
-paddle::experimental::Tensor& Y,
+void benchmark_eager_matmul(const paddle::experimental::Tensor& X,
+                            const paddle::experimental::Tensor& Y,
                             bool accuracy_check = false);
-void benchmark_eager_mlp(const paddle::experimental::Tensor& X,
-                         const std::vector<paddle::experimental::Tensor>& Ws,
-                         const std::vector<paddle::experimental::Tensor>& Bs,
-                         bool accuracy_check = false);
-*/
+
 void benchmark_eager_intermediate_matmul(const paddle::experimental::Tensor& X,
                                          const paddle::experimental::Tensor& Y,
                                          bool accuracy_check = false);
diff --git a/paddle/fluid/eager/tests/task_tests/backward_test.cc b/paddle/fluid/eager/tests/task_tests/backward_test.cc
index a4bc56bd606f3fbb0f9152d58acb5c8edeecf905..0c894ed267fcdd08d44d4df08bfaf0554874aebf 100644
--- a/paddle/fluid/eager/tests/task_tests/backward_test.cc
+++ b/paddle/fluid/eager/tests/task_tests/backward_test.cc
@@ -30,6 +30,10 @@
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/tensor_meta.h"
 
+#include "paddle/phi/core/kernel_registry.h"
+
+PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT);
+
 namespace egr {
 
 TEST(Backward, SingleNodeEmptyGrad) {
diff --git a/paddle/fluid/eager/tests/task_tests/cross_batch_accumulation_test.cc b/paddle/fluid/eager/tests/task_tests/cross_batch_accumulation_test.cc
index 524872b2e55638d25697388aa50724f49f6e3818..36594f1aac8cdb131bb77f1396dca19a0c2e8cc0 100644
--- a/paddle/fluid/eager/tests/task_tests/cross_batch_accumulation_test.cc
+++ b/paddle/fluid/eager/tests/task_tests/cross_batch_accumulation_test.cc
@@ -31,6 +31,10 @@
 
 #include "paddle/fluid/eager/tests/test_utils.h"
 
+#include "paddle/phi/core/kernel_registry.h"
+
+PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT);
+
 namespace egr {
 
 TEST(CrossBatchAccumulation, SingleScaleNode) {
diff --git a/paddle/fluid/eager/tests/task_tests/forward_autograd_test.cc b/paddle/fluid/eager/tests/task_tests/forward_autograd_test.cc
index 49bbfc77741a5b82ac9a564e25b484e5dabf77a7..dc44d95daac1d9109bbf2a1d04a8a47b081cead9 100644
--- a/paddle/fluid/eager/tests/task_tests/forward_autograd_test.cc
+++ b/paddle/fluid/eager/tests/task_tests/forward_autograd_test.cc
@@ -27,6 +27,10 @@
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/tensor_meta.h"
 
+#include "paddle/phi/core/kernel_registry.h"
+
+PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT);
+
 namespace egr {
 
 TEST(Forward, SingleNode) {
diff --git a/paddle/fluid/eager/tests/task_tests/fwd_bwd_joint_test.cc b/paddle/fluid/eager/tests/task_tests/fwd_bwd_joint_test.cc
index 5a7bafb2fe37051c0ad054c130d77dd6e05319d2..f7fa642ea8dd17d20816e74c9bfb4cd92b184b4a 100644
--- a/paddle/fluid/eager/tests/task_tests/fwd_bwd_joint_test.cc
+++ b/paddle/fluid/eager/tests/task_tests/fwd_bwd_joint_test.cc
@@ -30,6 +30,13 @@
 #include "paddle/fluid/eager/hooks.h"
 #include "paddle/fluid/eager/tests/test_utils.h"
 
+#include "paddle/phi/core/kernel_registry.h"
+
+PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT);
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+PD_DECLARE_KERNEL(full, GPU, ALL_LAYOUT);
+#endif
+
 namespace egr {
 
 paddle::experimental::Tensor hook_function(
diff --git a/paddle/fluid/eager/tests/task_tests/generated_test.cc b/paddle/fluid/eager/tests/task_tests/generated_test.cc
index 4b7077b13bdd6c48a0a3846656bd3a6337eb9f80..2a5ad53204a6201149bec0b3dac0fa3baf441f2e 100644
--- a/paddle/fluid/eager/tests/task_tests/generated_test.cc
+++ b/paddle/fluid/eager/tests/task_tests/generated_test.cc
@@ -30,6 +30,12 @@
 #include "paddle/fluid/eager/api/generated/fluid_generated/dygraph_forward_api.h"
 #include "paddle/phi/core/kernel_registry.h"
 
+PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(matmul, CPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(matmul_grad, CPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(add_grad, CPU, ALL_LAYOUT);
+
 namespace egr {
 
 TEST(Generated, Sigmoid) {
diff --git a/paddle/fluid/eager/tests/task_tests/hook_test.cc b/paddle/fluid/eager/tests/task_tests/hook_test.cc
index 9cda961741f55e9b4b7fc8dac61fe4a7c96567cf..d546df4ed087a99a28096a5336fab3826991534a 100644
--- a/paddle/fluid/eager/tests/task_tests/hook_test.cc
+++ b/paddle/fluid/eager/tests/task_tests/hook_test.cc
@@ -31,6 +31,10 @@
 #include "paddle/fluid/eager/hooks.h"
 #include "paddle/fluid/eager/tests/test_utils.h"
 
+#include "paddle/phi/core/kernel_registry.h"
+
+PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT);
+
 namespace egr {
 
 paddle::experimental::Tensor hook_function(
diff --git a/paddle/fluid/eager/tests/task_tests/hook_test_intermidiate.cc b/paddle/fluid/eager/tests/task_tests/hook_test_intermidiate.cc
index 15b2a62dca751859882e82d46acaa46f27c2c518..56813c498d2410caa452da7a334c393b230c65bf 100644
--- a/paddle/fluid/eager/tests/task_tests/hook_test_intermidiate.cc
+++ b/paddle/fluid/eager/tests/task_tests/hook_test_intermidiate.cc
@@ -27,6 +27,12 @@
 #include "paddle/fluid/eager/hooks.h"
 #include "paddle/phi/core/kernel_registry.h"
 
+PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(matmul, CPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(matmul_grad, CPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(add_grad, CPU, ALL_LAYOUT);
+
 namespace egr {
 
 paddle::experimental::Tensor hook_function(
diff --git a/paddle/fluid/eager/tests/task_tests/tensor_utils_test.cc b/paddle/fluid/eager/tests/task_tests/tensor_utils_test.cc
index ea821d195099f3d632e0d1b2d4937bac812563c8..24e5da060111f083ef9b65574e75295fa07f8f43 100644
--- a/paddle/fluid/eager/tests/task_tests/tensor_utils_test.cc
+++ b/paddle/fluid/eager/tests/task_tests/tensor_utils_test.cc
@@ -23,6 +23,10 @@
 #include "paddle/fluid/eager/tests/test_utils.h"
 #include "paddle/phi/api/lib/utils/allocator.h"
 
+#include "paddle/phi/core/kernel_registry.h"
+
+PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT);
+
 namespace egr {
 
 TEST(TensorUtils, Test) {
diff --git a/paddle/fluid/eager/to_static/run_program_op_func.h b/paddle/fluid/eager/to_static/run_program_op_func.h
new file mode 100644
index 0000000000000000000000000000000000000000..9967d8c36900f45fdd76272bc4416df1d30f2a6a
--- /dev/null
+++ b/paddle/fluid/eager/to_static/run_program_op_func.h
@@ -0,0 +1,83 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <vector>
+
+#include "paddle/fluid/eager/autograd_meta.h"
+#include "paddle/fluid/eager/eager_tensor.h"
+#include "paddle/fluid/eager/to_static/run_program_op_node.h"
+#include "paddle/fluid/eager/utils.h"
+
+inline void run_program_dygraph_function(
+    const std::vector<paddle::experimental::Tensor>& x,
+    const std::vector<paddle::experimental::Tensor>& params,
+    std::vector<paddle::experimental::Tensor*>& out,     // NOLINT
+    std::vector<paddle::framework::Scope*>& step_scope,  // NOLINT
+    std::vector<paddle::experimental::Tensor*>& dout,    // NOLINT
+    const paddle::framework::AttributeMap& attrs) {
+  VLOG(2) << "start run run_program";
+  // Call forward function
+  RunProgramAPI(x, params, out, step_scope, dout, attrs);
+  VLOG(2) << "start run run_program grad";
+
+  // Prepare Autograd Meta
+  auto deref_out = details::DereferenceTensors(out);
+  std::vector<egr::AutogradMeta*> p_autograd_x =
+      egr::EagerUtils::nullable_autograd_meta(x);
+  std::vector<egr::AutogradMeta*> p_autograd_params =
+      egr::EagerUtils::nullable_autograd_meta(params);
+  std::vector<egr::AutogradMeta*> p_autograd_outs =
+      egr::EagerUtils::nullable_autograd_meta(deref_out);
+
+  bool trace_backward = egr::Controller::Instance().HasGrad();
+  bool require_any_grad = egr::EagerUtils::ComputeRequireGrad(
+      trace_backward, &p_autograd_x, &p_autograd_params);
+
+  if (require_any_grad) {
+    std::vector<std::string> out_names;
+    for (auto& t : deref_out) {
+      out_names.emplace_back(t.name());
+    }
+
+    egr::EagerUtils::PassStopGradient(false, &p_autograd_outs);
+    // Create GradOpNode (1 means [out_grad], 2 means [x_grad, paramx_grad])
+    auto grad_node = std::make_shared<GradNodeRunProgram>(1, 2);
+
+    grad_node->SetFwdOutNames(out_names);
+    grad_node->SetOut(out);
+    // Set Attributes
+    grad_node->SetAttrMap(attrs);
+    // Set TensorWrappers
+    grad_node->SetFwdX(x);
+    grad_node->SetFwdParams(params);
+    grad_node->SetStepScope(step_scope);
+
+    // Set Grad out rank as same as fwd input and set stop gradient to bwd
+    grad_node->SetGradOutMeta(&p_autograd_x, /*slot id*/ 0);
+    grad_node->SetGradOutMeta(&p_autograd_params, /*slot id*/ 1);
+
+    grad_node->SetGradInMeta(&p_autograd_outs, 0);
+    // Set Next Edges
+    grad_node->AddEdges(&p_autograd_x, /*slot id*/ 0);
+    grad_node->AddEdges(&p_autograd_params, /*slot id*/ 1);
+
+    egr::EagerUtils::SetOutRankWithSlot(&p_autograd_outs, 0);
+
+    // Set History for output set current Grad Node for
+    egr::EagerUtils::SetHistory(&p_autograd_outs, grad_node);
+    egr::EagerUtils::CheckAndRetainGrad(deref_out);
+  }
+}
diff --git a/paddle/fluid/eager/to_static/run_program_op_node.h b/paddle/fluid/eager/to_static/run_program_op_node.h
new file mode 100644
index 0000000000000000000000000000000000000000..d99624e49324853d513a20a725c1a3d12b6aaab5
--- /dev/null
+++ b/paddle/fluid/eager/to_static/run_program_op_node.h
@@ -0,0 +1,481 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/fluid/eager/api/utils/global_utils.h"
+#include "paddle/fluid/eager/grad_node_info.h"
+#include "paddle/fluid/eager/tensor_wrapper.h"
+
+#include "paddle/fluid/operators/run_program_op.h"
+#include "paddle/fluid/platform/enforce.h"
+
+namespace details {
+using Tensor = paddle::experimental::Tensor;
+
+static std::vector<Tensor> DereferenceTensors(
+    const std::vector<Tensor *> &tensor_ptr) {
+  std::vector<Tensor> res;
+  for (auto *t : tensor_ptr) {
+    res.emplace_back(*t);
+  }
+  return res;
+}
+
+static std::vector<std::string> GetTensorsName(const std::vector<Tensor> &ins) {
+  std::vector<std::string> in_names;
+  for (auto &in_t : ins) {
+    in_names.emplace_back(in_t.name());
+  }
+  return in_names;
+}
+
+static std::vector<std::string> GetTensorsName(
+    const std::vector<Tensor *> &ins) {
+  std::vector<std::string> in_names;
+  for (auto *in_t : ins) {
+    in_names.emplace_back(in_t->name());
+  }
+  return in_names;
+}
+
+static void CheckInputVarStatus(const Tensor &tensor) {
+  PADDLE_ENFORCE_EQ(
+      tensor.defined() && phi::DenseTensor::classof(tensor.impl().get()), true,
+      paddle::platform::errors::InvalidArgument(
+          "The input tensor %s of "
+          "RunProgram(Grad)Op holds "
+          "wrong type. Expect type is DenseTensor.",
+          tensor.name()));
+
+  PADDLE_ENFORCE_EQ(tensor.initialized(), true,
+                    paddle::platform::errors::InvalidArgument(
+                        "The tensor in input tensor %s of "
+                        "RunProgram(Grad)Op "
+                        "is not initialized.",
+                        tensor.name()));
+}
+
+static void CheckOutputVarStatus(const paddle::framework::Variable &src_var,
+                                 const Tensor &dst_tensor) {
+  auto name = dst_tensor.name();
+  PADDLE_ENFORCE_EQ(dst_tensor.defined(), true,
+                    paddle::platform::errors::InvalidArgument(
+                        "dst_tensor shall be defined."));
+
+  if (phi::DenseTensor::classof(dst_tensor.impl().get())) {
+    auto &src_tensor = src_var.Get<phi::DenseTensor>();
+    PADDLE_ENFORCE_EQ(phi::DenseTensor::classof(&src_tensor), true,
+                      paddle::platform::errors::InvalidArgument(
+                          "The output tensor %s get from "
+                          "RunProgram(Grad)Op's internal scope holds "
+                          "wrong type. Expect type is DenseTensor",
+                          name));
+    PADDLE_ENFORCE_EQ(src_tensor.initialized(), true,
+                      paddle::platform::errors::InvalidArgument(
+                          "The tensor in output tensor %s get from "
+                          "RunProgram(Grad)Op's internal "
+                          "scope is not initialized.",
+                          name));
+  } else if (phi::SelectedRows::classof(dst_tensor.impl().get())) {
+    auto &src_tensor = src_var.Get<phi::SelectedRows>();
+    PADDLE_ENFORCE_EQ(phi::SelectedRows::classof(&src_tensor), true,
+                      paddle::platform::errors::InvalidArgument(
+                          "The output tensodfr %s get from "
+                          "RunProgram(Grad)Op's internal scope holds "
+                          "wrong type. Expect type is SelectedRows",
+                          name));
+    PADDLE_ENFORCE_EQ(src_tensor.initialized(), true,
+                      paddle::platform::errors::InvalidArgument(
+                          "The tensor in output tensor %s get from "
+                          "RunProgram(Grad)Op's "
+                          "internal scope is not initialized.",
+                          name));
+
+  } else {
+    PADDLE_THROW(paddle::platform::errors::InvalidArgument(
+        "The RunProgram(Grad)Op only support output "
+        "variable of type LoDTensor or SelectedRows",
+        name));
+  }
+}
+
+static void ShareTensorsIntoScope(const std::vector<Tensor> &tensors,
+                                  paddle::framework::Scope *scope) {
+  for (size_t i = 0; i < tensors.size(); ++i) {
+    auto name = tensors[i].name();
+    if (name == "Fake_var" || !tensors[i].is_initialized()) {
+      continue;
+    }
+    auto *var = scope->Var(name);
+    CheckInputVarStatus(tensors[i]);
+    // share tensor
+    auto tensor_base = tensors[i].impl();
+    if (phi::DenseTensor::classof(tensor_base.get())) {
+      auto *dst_tensor = var->GetMutable<phi::DenseTensor>();
+      auto t = std::dynamic_pointer_cast<phi::DenseTensor>(tensor_base);
+      *dst_tensor = *t;
+    } else if (phi::SelectedRows::classof(tensor_base.get())) {
+      auto *dst_tensor = var->GetMutable<phi::SelectedRows>();
+      auto t = std::dynamic_pointer_cast<phi::SelectedRows>(tensor_base);
+      *dst_tensor = *t;
+    }
+  }
+}
+
+static void ShareTensorsFromScope(
+    const std::vector<Tensor *> &tensors,
+    const paddle::framework::BlockDesc &global_block,
+    paddle::framework::Scope *scope) {
+  for (size_t i = 0; i < tensors.size(); ++i) {
+    // NOTE: In case of setting out_tmp.stop_gradient = True in model code, all
+    // parameters before generating out_tmp have no @GRAD, it will raise error
+    // because we can't find them in scope. So we skip sharing these vars or
+    // var@GRAD if they don't appear in global block.
+    auto &name = tensors[i]->name();
+    if (name == paddle::framework::kEmptyVarName || name == "Fake_var" ||
+        !global_block.HasVar(name)) {
+      VLOG(2) << "find tensor name is " << name << ", skip it!";
+      continue;
+    }
+    // NOTE: Here skip not found var is dangerous, if a bug is caused here,
+    // the result is grad calculation error, which will be very hidden!
+    auto *var = scope->FindVar(name);
+    PADDLE_ENFORCE_NOT_NULL(var, paddle::platform::errors::NotFound(
+                                     "The output tensor %s is not in "
+                                     "RunProgram(Grad)Op'"
+                                     "s internal scope.",
+                                     name));
+    CheckOutputVarStatus(*var, *tensors[i]);
+    // share tensor
+    // TODO(dev): Determine Tensor type by scope.var
+    // auto tensor_base = tensors[i]->impl();
+    // if (phi::DenseTensor::classof(tensor_base.get())) {
+    if (var->IsType<phi::DenseTensor>()) {
+      auto &src_tensor = var->Get<phi::DenseTensor>();
+      auto *dst_tensor = const_cast<phi::DenseTensor *>(
+          dynamic_cast<const phi::DenseTensor *>(tensors[i]->impl().get()));
+      VLOG(2) << "share " << name << " from scope";
+      *dst_tensor = src_tensor;
+    } else if (var->IsType<phi::SelectedRows>()) {
+      // } else if (phi::SelectedRows::classof(tensor_base.get())) {
+      auto &src_tensor = var->Get<phi::SelectedRows>();
+      auto *dst_tensor = const_cast<phi::SelectedRows *>(
+          dynamic_cast<const phi::SelectedRows *>(tensors[i]->impl().get()));
+      *dst_tensor = src_tensor;
+    }
+  }
+}
+
+}  // namespace details
+
+inline void RunProgramAPI(
+    const std::vector<paddle::experimental::Tensor> &x,
+    const std::vector<paddle::experimental::Tensor> &params,
+    std::vector<paddle::experimental::Tensor *> &out,     // NOLINT
+    std::vector<paddle::framework::Scope *> &step_scope,  // NOLINT
+    std::vector<paddle::experimental::Tensor *> &dout,    // NOLINT
+    const paddle::framework::AttributeMap &attrs) {
+  VLOG(2) << "RunProgramOpKernel Compute";
+  auto start_op_index = BOOST_GET_CONST(int64_t, attrs.at("start_op_index"));
+  auto end_op_index = BOOST_GET_CONST(int64_t, attrs.at("end_op_index"));
+  auto is_test = BOOST_GET_CONST(bool, attrs.at("is_test"));
+  auto program_id = BOOST_GET_CONST(int64_t, attrs.at("program_id"));
+
+  // NOTE(chenweihang): In order not to add new variable type, use vector
+  // here. Originally, here can use scope directly.
+  auto *out_scope_vec = &step_scope;
+  PADDLE_ENFORCE_EQ(
+      out_scope_vec->size(), 1,
+      paddle::platform::errors::InvalidArgument(
+          "The OutScope of RunProgramGradOp should only hold one scope."));
+
+  // Step 2. prepare executor and init persistable variables
+
+  // NOTE(Aurelius84): While training some models, forward can be called many
+  // times and then apply backpropagation all at once, such as Reinforcement
+  // Learning. Tensor data in multi-step training should be saved into single
+  // scope separately. Otherwise, the gradients can be miscalculated because
+  // always using the Tensor data of the last step in forward.
+  paddle::framework::Scope *global_inner_scope = out_scope_vec->front();
+  VLOG(2) << "The number of sub scopes before forward: "
+          << out_scope_vec->front()->kids().size();
+  paddle::framework::Scope &scope = global_inner_scope->NewScope();
+
+  // share input_vars & parameters into scope
+  details::ShareTensorsIntoScope(x, &scope);
+  details::ShareTensorsIntoScope(params, &scope);
+
+  auto *global_block =
+      BOOST_GET_CONST(paddle::framework::BlockDesc *, attrs.at("global_block"));
+  const auto &place = egr::Controller::Instance().GetExpectedPlace();
+
+  if (end_op_index > start_op_index) {
+    auto input_names = details::GetTensorsName(x);
+    auto output_names = details::GetTensorsName(out);
+    auto dout_names = details::GetTensorsName(dout);
+    auto *program = global_block->Program();
+
+    auto cache_info = paddle::framework::GetExecutorInfoFromCache(
+        *program, place, start_op_index, end_op_index,
+        /*is_grad=*/false, program_id, &scope);
+    auto &parallel_executor = cache_info.first;
+    // all out_vars are skip_eager_var
+    auto &skip_eager_delete_vars =
+        paddle::framework::ExecutorInfoCache::Instance().SkipEagerDeleteVars(
+            program_id, false);
+    if (cache_info.second /*is_new_created*/) {
+      parallel_executor->SkipMemoryReuse(/*scope_idx=*/0, input_names);
+      skip_eager_delete_vars.insert(skip_eager_delete_vars.end(),
+                                    output_names.begin(), output_names.end());
+      skip_eager_delete_vars.insert(skip_eager_delete_vars.end(),
+                                    dout_names.begin(), dout_names.end());
+      paddle::framework::details::ParseSafeEagerDeletionSkipVars(
+          *program, end_op_index, output_names, &skip_eager_delete_vars);
+    }
+
+    // Step 3. run ops
+    parallel_executor->RunWithoutFetch(skip_eager_delete_vars);
+  }
+  // Step 4. Get Output
+  details::ShareTensorsFromScope(out, *global_block, &scope);
+  details::ShareTensorsFromScope(dout, *global_block, &scope);
+
+  // Debug info: scope info when run end
+  VLOG(3) << paddle::framework::GenScopeTreeDebugInfo(out_scope_vec->front());
+  // Step 5. Drop all children scopes while testing.
+  if (is_test) {
+    out_scope_vec->front()->DropKids();
+  }
+  VLOG(2) << "The number of sub scopes after forward: "
+          << out_scope_vec->front()->kids().size();
+#ifdef PADDLE_WITH_MKLDNN
+  if (FLAGS_use_mkldnn) paddle::platform::DontClearMKLDNNCache(place);
+#endif
+}
+
+inline void RunProgramGradAPI(
+    const std::vector<paddle::experimental::Tensor> &x,
+    const std::vector<paddle::experimental::Tensor> &params,
+    const std::vector<paddle::experimental::Tensor> &out_grad,
+    const std::vector<paddle::framework::Scope *> &step_scope,  // NOLINT
+    const paddle::framework::AttributeMap &attrs,
+    std::vector<paddle::experimental::Tensor *> &x_grad,      // NOLINT
+    std::vector<paddle::experimental::Tensor *> &params_grad  // NOLINT
+    ) {
+  // if all output vars are set to stop_gradient, grad op no need to executed
+  if (x_grad.empty() && params_grad.empty()) return;
+
+  // TODO(dev): Remove this line hard code. And need to deal with the out_grad
+  // name problem.
+  // const_cast<paddle::experimental::Tensor &>(out_grad[0])
+  //     .set_name("matmul_v2_0.tmp_0@GRAD");
+
+  auto *global_block =
+      BOOST_GET_CONST(paddle::framework::BlockDesc *, attrs.at("global_block"));
+  auto orig_end_op_index = BOOST_GET_CONST(int64_t, attrs.at("end_op_index"));
+
+  auto program_id = BOOST_GET_CONST(int64_t, attrs.at("program_id"));
+  // NOTE: skip `shape` and `fill_constant` op created by
+  // fluid.backward.gradients, one forward output will generate one `shape`
+  // and `fill_constant`
+  int64_t start_op_index = orig_end_op_index + (out_grad.size() * 2);
+  int64_t end_op_index = global_block->OpSize();
+
+  auto *out_scope_vec = &step_scope;
+  PADDLE_ENFORCE_EQ(
+      out_scope_vec->size(), 1,
+      paddle::platform::errors::InvalidArgument(
+          "The OutScope of RunProgramGradOp should only hold one scope."));
+
+  paddle::framework::Scope *global_inner_scope = out_scope_vec->front();
+  auto sub_scope_num = global_inner_scope->kids().size();
+  VLOG(2) << "The number of sub scopes before backward: " << sub_scope_num;
+  PADDLE_ENFORCE_GT(sub_scope_num, 0,
+                    paddle::platform::errors::InvalidArgument(
+                        "The OutScope of RunProgramGradOp should hold at "
+                        "least one sub scope."));
+
+  auto &scope = *(global_inner_scope->kids().front());
+  const auto &place = egr::Controller::Instance().GetExpectedPlace();
+
+  if (end_op_index > start_op_index) {
+    auto out_grad_names = details::GetTensorsName(out_grad);
+    // NOTE: after PR22939 [Add double grad] merged, the grad op maker's
+    //   SetOutput will set to None if the input var stop_gradient=True,
+    //   it will cause an NotFound error when ctx.OutputNames() is called
+    std::vector<std::string> x_grad_names;
+    std::vector<std::string> param_grad_names;
+    if (!x_grad.empty()) {
+      x_grad_names = details::GetTensorsName(x_grad);
+    }
+    if (!params_grad.empty()) {
+      param_grad_names = details::GetTensorsName(params_grad);
+    }
+
+    // Step 2. prepare executor and scope
+    auto *program = global_block->Program();
+    auto cache_info = paddle::framework::GetExecutorInfoFromCache(
+        *program, place, start_op_index, end_op_index,
+        /*is_grad*/ true, program_id, &scope);
+    auto &parallel_executor = cache_info.first;
+
+    auto &skip_eager_delete_vars =
+        paddle::framework::ExecutorInfoCache::Instance().SkipEagerDeleteVars(
+            program_id, true);
+    if (cache_info.second /*is_new_created*/) {
+      parallel_executor->SkipMemoryReuse(/*scope_idx=*/0, out_grad_names);
+
+      skip_eager_delete_vars.insert(skip_eager_delete_vars.end(),
+                                    x_grad_names.begin(), x_grad_names.end());
+      paddle::framework::details::AppendSkipDeletionVars(
+          param_grad_names, &skip_eager_delete_vars);
+    }
+
+    details::ShareTensorsIntoScope(out_grad, &scope);
+    // Debug info: scope info when run end
+    VLOG(3) << paddle::framework::GenScopeTreeDebugInfo(out_scope_vec->front());
+
+    // Step 3. run ops
+    parallel_executor->RunWithoutFetch(
+        /*skip_eager_delete_vars=*/skip_eager_delete_vars);
+  }
+
+  // Step 4. get outputs
+  details::ShareTensorsFromScope(x_grad, *global_block, &scope);
+  details::ShareTensorsFromScope(params_grad, *global_block, &scope);
+
+  // Step5. drop current scope
+  global_inner_scope->DeleteScope(&scope);
+  VLOG(2) << "The number of sub scopes after backward: "
+          << global_inner_scope->kids().size();
+}
+
+class GradNodeRunProgram : public egr::GradNodeBase {
+ public:
+  GradNodeRunProgram(size_t bwd_in_slot_num, size_t bwd_out_slot_num)
+      : egr::GradNodeBase(bwd_in_slot_num, bwd_out_slot_num) {}
+
+  ~GradNodeRunProgram() override = default;
+  // Functor: perform backward computations
+  virtual std::vector<std::vector<paddle::experimental::Tensor>> operator()(
+      const std::vector<std::vector<paddle::experimental::Tensor>> &grads)
+      override {
+    VLOG(3) << "Running Eager Backward Node: GradNodeRunProgram";
+    PADDLE_ENFORCE_EQ(
+        grads.size(), 1,
+        paddle::platform::errors::InvalidArgument(
+            "The out_grads.size() of RunProgramGradOp should be equal to 1."));
+
+    VLOG(3) << "out_grads[0].size() : " << grads[0].size();
+    std::vector<paddle::experimental::Tensor> x_grad;
+    std::vector<paddle::experimental::Tensor> params_grad;
+    ConstructGradTensors(x_, &x_grad);
+    ConstructGradTensors(params_, &params_grad);
+    std::vector<paddle::experimental::Tensor *> x_grad_ptr;
+    std::vector<paddle::experimental::Tensor *> params_grad_ptr;
+    for (auto &i : x_grad) {
+      x_grad_ptr.emplace_back(&i);
+    }
+    for (auto &i : params_grad) {
+      params_grad_ptr.emplace_back(&i);
+    }
+
+    // auto x_grad_ptr = ConstructGradTensors(x_);
+    // auto params_grad_ptr = ConstructGradTensors(params_);
+
+    PADDLE_ENFORCE_EQ(
+        grads[0].size(), fwd_out_names_.size(),
+        paddle::platform::errors::InvalidArgument(
+            "The grads[0].size() and fwd_out_names_.size() should be equal."));
+    for (size_t i = 0; i < fwd_out_names_.size(); ++i) {
+      auto &out_grad = egr::EagerUtils::unsafe_autograd_meta(*out_[i])->Grad();
+      const_cast<paddle::experimental::Tensor &>(out_grad).set_impl(
+          grads[0][i].impl());
+
+      const_cast<paddle::experimental::Tensor &>(grads[0][i])
+          .set_name(fwd_out_names_[i] + "@GRAD");
+    }
+
+    RunProgramGradAPI(x_, params_, grads[0], step_scope_, attrs_, x_grad_ptr,
+                      params_grad_ptr);
+    VLOG(3) << "End Eager Backward Node: GradNodeRunProgram";
+    return {x_grad, params_grad};
+    // return {x_grad, details::DereferenceTensors(params_grad_ptr)};
+  }
+
+  // SetAttrMap
+  void SetAttrMap(const paddle::framework::AttributeMap &attrs) {
+    attrs_ = attrs;
+  }
+
+  void SetFwdX(const std::vector<paddle::experimental::Tensor> &tensors) {
+    x_ = tensors;
+  }
+
+  void SetFwdParams(const std::vector<paddle::experimental::Tensor> &tensors) {
+    params_ = tensors;
+  }
+
+  void SetStepScope(const std::vector<paddle::framework::Scope *> &scopes) {
+    step_scope_ = scopes;
+  }
+
+  void SetFwdOutNames(std::vector<std::string> out_names) {
+    fwd_out_names_ = out_names;
+  }
+
+  void SetOut(const std::vector<paddle::experimental::Tensor *> &out) {
+    out_ = out;
+  }
+
+ protected:
+  void ConstructGradTensors(
+      const std::vector<paddle::experimental::Tensor> &fwd_tensors,
+      std::vector<paddle::experimental::Tensor> *grad_tensors) {
+    // TODO(dev): Need an elegant way to determine inforamtion of grad_tensor,
+    // such as: name, tensor type(DenseTensor or SelectedRows).
+    VLOG(3) << "fwd_tensors.size(): " << fwd_tensors.size();
+    for (auto &fwd_t : fwd_tensors) {
+      if (phi::DenseTensor::classof(fwd_t.impl().get())) {
+        grad_tensors->emplace_back(std::make_shared<phi::DenseTensor>());
+      } else if (phi::SelectedRows::classof(fwd_t.impl().get())) {
+        grad_tensors->emplace_back(std::make_shared<phi::SelectedRows>());
+      }
+      auto &grad_t = grad_tensors->back();
+      grad_t.set_name(fwd_t.name() + "@GRAD");
+    }
+  }
+
+  void ConstructGradTensors(
+      const std::vector<paddle::experimental::Tensor> &fwd_tensors) {
+    VLOG(3) << "fwd_tensors.size(): " << fwd_tensors.size();
+    for (auto &fwd_t : fwd_tensors) {
+      auto grad_tesnor = egr::EagerUtils::unsafe_autograd_meta(fwd_t)->Grad();
+      grad_tesnor.set_name(fwd_t.name() + "@GRAD");
+    }
+  }
+
+ private:
+  // TensorWrappers
+  std::vector<paddle::experimental::Tensor> x_;
+  std::vector<paddle::experimental::Tensor> params_;
+  std::vector<paddle::framework::Scope *> step_scope_;
+
+  std::vector<std::string> fwd_out_names_;
+  std::vector<paddle::experimental::Tensor *> out_;
+
+  // Attribute Map
+  paddle::framework::AttributeMap attrs_;
+};
diff --git a/paddle/fluid/eager/utils.cc b/paddle/fluid/eager/utils.cc
index 39861c80522a920502fff91177256a4b7abf6dc6..8a57d2694535e9c27e88416468fe5a67ce020b43 100644
--- a/paddle/fluid/eager/utils.cc
+++ b/paddle/fluid/eager/utils.cc
@@ -122,9 +122,10 @@ paddle::experimental::Tensor* EagerUtils::mutable_grad(
 void EagerUtils::SetHistory(std::vector<AutogradMeta*>* autograd_metas,
                             const std::shared_ptr<GradNodeBase>& grad_node) {
   for (const auto& autograd_meta : *autograd_metas) {
-    if (dynamic_cast<GradNodeAccumulation*>(autograd_meta->GradNode())) {
-      VLOG(6) << "Warning: Reseting GradNodeAccumulation for leaf tensor is "
-                 "detected";
+    if (autograd_meta->GradNode()) {
+      VLOG(7) << "Should not set grad node twice, original node is:"
+              << autograd_meta->GradNode()->name()
+              << "current is: " << grad_node->name();
     }
     autograd_meta->SetGradNode(grad_node);
   }
@@ -132,11 +133,11 @@ void EagerUtils::SetHistory(std::vector<AutogradMeta*>* autograd_metas,
 
 void EagerUtils::SetHistory(AutogradMeta* autograd_meta,
                             const std::shared_ptr<GradNodeBase>& grad_node) {
-  if (dynamic_cast<GradNodeAccumulation*>(autograd_meta->GradNode())) {
-    VLOG(6)
-        << "Warning: Reseting GradNodeAccumulation for leaf tensor is detected";
+  if (autograd_meta->GradNode()) {
+    VLOG(7) << "Should not set grad node twice, original node is:"
+            << autograd_meta->GradNode()->name()
+            << "current is: " << grad_node->name();
   }
-
   autograd_meta->SetGradNode(grad_node);
 }
 
diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index 02d90b9c6da1e9f5ca72124b8661658fe005e214..5dc3d9e89c557e86f5af821446b82ad691ad5c95 100755
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -440,11 +440,11 @@ message(STATUS "branch: ${PADDLE_BRANCH}")
 configure_file(commit.h.in commit.h)
 
 cc_library(custom_operator SRCS custom_operator.cc DEPS tensor attribute framework_proto op_registry operator dynamic_loader string_helper phi_tensor op_meta_info phi_api)
-cc_library(custom_kernel SRCS custom_kernel.cc DEPS op_registry phi_custom_kernel phi_tensor_raw)
+
 #cc_binary(test_executor SRCS test_executor.cc DEPS executor op_registry ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} )
 #cc_binary(new_executor SRCS new_exec_test.cc DEPS operator op_registry executor ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} profiler)
 
-set(FLUID_FRAMEWORK_MODULES proto_desc memory lod_tensor executor data_feed_proto layer dynamic_loader custom_operator custom_kernel)
+set(FLUID_FRAMEWORK_MODULES proto_desc memory lod_tensor executor data_feed_proto layer dynamic_loader custom_operator)
 
 cc_library(paddle_framework DEPS ${FLUID_FRAMEWORK_MODULES})
 
diff --git a/paddle/fluid/framework/custom_kernel.cc b/paddle/fluid/framework/custom_kernel.cc
deleted file mode 100644
index 49a1e0774a6b1a7a1afd154029850ceb52040759..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/custom_kernel.cc
+++ /dev/null
@@ -1,47 +0,0 @@
-/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#if defined _WIN32 || defined __APPLE__
-#else
-#define _LINUX
-#endif
-
-#include "paddle/fluid/framework/custom_kernel.h"
-#include "paddle/phi/core/custom_kernel.h"
-
-namespace paddle {
-namespace framework {
-
-void LoadCustomKernelLib(const std::string& dso_lib_path, void* dso_handle) {
-#ifdef _LINUX
-  typedef phi::CustomKernelMap& get_custom_kernel_map_t();
-  auto* func = reinterpret_cast<get_custom_kernel_map_t*>(
-      dlsym(dso_handle, "PD_GetCustomKernelMap"));
-
-  if (func == nullptr) {
-    LOG(WARNING) << "Skipped lib [" << dso_lib_path << "]: fail to find "
-                 << "PD_GetCustomKernelMap symbol in this lib.";
-    return;
-  }
-  auto& custom_kernel_map = func();
-  phi::RegisterCustomKernels(custom_kernel_map);
-  LOG(INFO) << "Successed in loading custom kernels in lib: " << dso_lib_path;
-#else
-  VLOG(3) << "Unsupported: Custom kernel is only implemented on Linux.";
-#endif
-  return;
-}
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/custom_operator.cc b/paddle/fluid/framework/custom_operator.cc
index b9e3bee25f6b5377dde7b525138643964fd8366a..478e39b99dcc9935306603a48810d46ba792d3c3 100644
--- a/paddle/fluid/framework/custom_operator.cc
+++ b/paddle/fluid/framework/custom_operator.cc
@@ -25,6 +25,7 @@ limitations under the License. */
 #include <utility>
 #include <vector>
 
+#include "paddle/fluid/eager/api/utils/global_utils.h"
 #include "paddle/fluid/framework/attribute.h"
 #include "paddle/fluid/framework/convert_utils.h"
 #include "paddle/fluid/framework/op_meta_info_helper.h"
@@ -946,15 +947,16 @@ void RegisterOperatorWithMetaInfoMap(
 ////////////////////// User APIs ///////////////////////
 
 // load op api
-void LoadOpMetaInfoAndRegisterOp(const std::string& dso_name) {
+const std::unordered_map<std::string, std::vector<OpMetaInfo>>&
+LoadOpMetaInfoAndRegisterOp(const std::string& dso_name) {
   void* handle = paddle::platform::dynload::GetOpDsoHandle(dso_name);
   VLOG(3) << "load custom_op lib: " << dso_name;
   typedef OpMetaInfoMap& get_op_meta_info_map_t();
   auto* get_op_meta_info_map =
       detail::DynLoad<get_op_meta_info_map_t>(handle, "PD_GetOpMetaInfoMap");
   auto& op_meta_info_map = get_op_meta_info_map();
-
   RegisterOperatorWithMetaInfoMap(op_meta_info_map, handle);
+  return op_meta_info_map.GetMap();
 }
 
 }  // namespace framework
diff --git a/paddle/fluid/framework/custom_operator.h b/paddle/fluid/framework/custom_operator.h
index 4310b564371822d0238a55b9091f524d8d419966..fef1e82a14fe6e03de40c8376f922f87f64564f8 100644
--- a/paddle/fluid/framework/custom_operator.h
+++ b/paddle/fluid/framework/custom_operator.h
@@ -20,9 +20,9 @@ limitations under the License. */
 
 namespace paddle {
 namespace framework {
-
 // Load custom op api: register op after user compiled
-void LoadOpMetaInfoAndRegisterOp(const std::string& dso_name);
+const std::unordered_map<std::string, std::vector<OpMetaInfo>>&
+LoadOpMetaInfoAndRegisterOp(const std::string& dso_name);
 
 // Register custom op api: register op directly
 void RegisterOperatorWithMetaInfoMap(
@@ -31,6 +31,5 @@ void RegisterOperatorWithMetaInfoMap(
 // Interface for selective register custom op.
 void RegisterOperatorWithMetaInfo(const std::vector<OpMetaInfo>& op_meta_infos,
                                   void* dso_handle = nullptr);
-
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt
index 66dfb81755f1c9cc16ab8a52df429af8d94ab718..948eaab40b4f64f2a87a83fab80d4eade5288e91 100644
--- a/paddle/fluid/framework/details/CMakeLists.txt
+++ b/paddle/fluid/framework/details/CMakeLists.txt
@@ -139,7 +139,7 @@ set(IR_PASS_DEPS graph_viz_pass multi_devices_graph_pass
     coalesce_grad_tensor_pass fuse_all_reduce_op_pass backward_optimizer_op_deps_pass
     fuse_adam_op_pass fuse_sgd_op_pass fuse_momentum_op_pass
     sync_batch_norm_pass runtime_context_cache_pass graph_to_program_pass
-    fix_op_run_order_pass)
+    fix_op_run_order_pass fuse_gemm_epilogue_pass)
 
 if (WITH_CINN)
   set(IR_PASS_DEPS ${IR_PASS_DEPS} build_cinn_pass)
diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc
index c99200ec98aa8f0736610f659d3b94e3c2f1e023..fdf74d2f769fcdd49da19c0118a23d6b8fbb06e4 100644
--- a/paddle/fluid/framework/details/build_strategy.cc
+++ b/paddle/fluid/framework/details/build_strategy.cc
@@ -1,4 +1,5 @@
 /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Copyright (c) 2022 NVIDIA Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -175,6 +176,11 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
     !defined(_WIN32) && !defined(__APPLE__)
     AppendPassWithCheck(strategy_.enable_auto_fusion_, "fusion_group_pass");
 #endif
+
+#if (defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 11060)
+    AppendPassWithCheck(strategy_.fuse_gemm_epilogue_,
+                        "fuse_gemm_epilogue_pass");
+#endif
     AppendPassWithCheck(strategy_.fuse_elewise_add_act_ops_,
                         "fuse_elewise_add_act_pass");
     // for single card training, fuse_all_reduce_ops is unnecessary.
@@ -507,3 +513,6 @@ USE_PASS(mkldnn_placement_pass);
     !defined(_WIN32) && !defined(__APPLE__)
 USE_PASS(fusion_group_pass);
 #endif
+#if (defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 11060)
+USE_PASS(fuse_gemm_epilogue_pass);
+#endif
diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h
index 70a083dd70bc3b48bf24b050673f3da7b69b1755..5eb584aaefa981ab6c6f25df7a765ae9a3d0194a 100644
--- a/paddle/fluid/framework/details/build_strategy.h
+++ b/paddle/fluid/framework/details/build_strategy.h
@@ -1,4 +1,5 @@
 // Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+// Copyright (c) 2022 NVIDIA Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -124,6 +125,8 @@ struct BuildStrategy {
   paddle::optional<bool> fuse_broadcast_ops_{paddle::none};
   // replace batch_norm with sync_batch_norm.
   bool sync_batch_norm_{false};
+  // Fuse GEMM+Epilogue via cublasLt epilogue.
+  bool fuse_gemm_epilogue_{false};
 
   // mkldnn_enabled_op_types specify the operator type list to
   // use MKLDNN acceleration. It is null in default, means
diff --git a/paddle/fluid/framework/fleet/heter_ps/hashtable_inl.h b/paddle/fluid/framework/fleet/heter_ps/hashtable_inl.h
index b7cb2ce0f0102bd34940864960118f396c5dcad7..59220fc9cdaf1f05f70e8cfe961071c1fad3a760 100644
--- a/paddle/fluid/framework/fleet/heter_ps/hashtable_inl.h
+++ b/paddle/fluid/framework/fleet/heter_ps/hashtable_inl.h
@@ -186,45 +186,63 @@ void HashTable<KeyType, ValType>::insert(const KeyType* d_keys, size_t len,
 template <typename KeyType, typename ValType>
 void HashTable<KeyType, ValType>::dump_to_cpu(int devid, cudaStream_t stream) {
   container_->prefetch(cudaCpuDeviceId, stream);
+  std::vector<std::thread> threads;
   size_t num = container_->size();
   KeyType unuse_key = std::numeric_limits<KeyType>::max();
   thrust::pair<KeyType, ValType>* kv = container_->data();
-  for (size_t i = 0; i < num; ++i) {
-    if (kv[i].first == unuse_key) {
-      continue;
-    }
-    ValType& gpu_val = kv[i].second;
+
+  int thread_num = 8;
+  int len_per_thread = num / thread_num;
+  int remain = num % thread_num;
+  int begin = 0;
+
+  auto dump_func = [unuse_key, kv](int left, int right) {
+    for (int i = left; i < right; i++) {
+      if (kv[i].first == unuse_key) {
+        continue;
+      }
+      ValType& gpu_val = kv[i].second;
 #ifdef PADDLE_WITH_PSLIB
-    auto* downpour_value =
-        (paddle::ps::DownpourFixedFeatureValue*)(gpu_val.cpu_ptr);
-    int downpour_value_size = downpour_value->size();
-    if (gpu_val.mf_size > 0 && downpour_value_size == 7) {
-      downpour_value->resize(gpu_val.mf_size + downpour_value_size);
-    }
-    float* cpu_val = downpour_value->data();
-    // cpu_val[0] = 0;
-    cpu_val[1] = gpu_val.delta_score;
-    cpu_val[2] = gpu_val.show;
-    cpu_val[3] = gpu_val.clk;
-    cpu_val[4] = gpu_val.lr;
-    cpu_val[5] = gpu_val.lr_g2sum;
-    cpu_val[6] = gpu_val.slot;
-    if (gpu_val.mf_size > 0) {
-      for (int x = 0; x < gpu_val.mf_size; x++) {
-        cpu_val[x + 7] = gpu_val.mf[x];
+      auto* downpour_value =
+          (paddle::ps::DownpourFixedFeatureValue*)(gpu_val.cpu_ptr);
+      int downpour_value_size = downpour_value->size();
+      if (gpu_val.mf_size > 0 && downpour_value_size == 7) {
+        downpour_value->resize(gpu_val.mf_size + downpour_value_size);
+      }
+      float* cpu_val = downpour_value->data();
+      // cpu_val[0] = 0;
+      cpu_val[1] = gpu_val.delta_score;
+      cpu_val[2] = gpu_val.show;
+      cpu_val[3] = gpu_val.clk;
+      cpu_val[4] = gpu_val.lr;
+      cpu_val[5] = gpu_val.lr_g2sum;
+      cpu_val[6] = gpu_val.slot;
+      if (gpu_val.mf_size > 0) {
+        for (int x = 0; x < gpu_val.mf_size; x++) {
+          cpu_val[x + 7] = gpu_val.mf[x];
+        }
       }
-    }
 #endif
 #ifdef PADDLE_WITH_PSCORE
-    auto* downpour_value = (paddle::distributed::VALUE*)(gpu_val.cpu_ptr);
-    downpour_value->count_ = gpu_val.show;
-    for (int x = 0; x < gpu_val.mf_size; x++) {
-      downpour_value->data_[x] = gpu_val.mf[x];
-    }
+      auto* downpour_value = (paddle::distributed::VALUE*)(gpu_val.cpu_ptr);
+      downpour_value->count_ = gpu_val.show;
+      for (int x = 0; x < gpu_val.mf_size; x++) {
+        downpour_value->data_[x] = gpu_val.mf[x];
+      }
 #endif
+    }
+  };
+
+  for (int i = 0; i < thread_num; i++) {
+    threads.push_back(std::thread(
+        dump_func, begin, begin + len_per_thread + (i < remain ? 1 : 0)));
+    begin += len_per_thread + (i < remain ? 1 : 0);
+  }
+  for (std::thread& t : threads) {
+    t.join();
   }
 
-  container_->prefetch(devid, stream);
+  // container_->prefetch(devid, stream);
 }
 
 template <typename KeyType, typename ValType>
diff --git a/paddle/fluid/framework/garbage_collector.cc b/paddle/fluid/framework/garbage_collector.cc
index 9f2bdeffecf62764f5cbe5bea9cb50d4830be43b..c1f8041cc1eca34b858608ffb77598ce095d0b4f 100644
--- a/paddle/fluid/framework/garbage_collector.cc
+++ b/paddle/fluid/framework/garbage_collector.cc
@@ -231,19 +231,19 @@ void CustomDeviceUnsafeFastGarbageCollector::ClearCallback(
 CustomStreamGarbageCollector::CustomStreamGarbageCollector(
     const platform::CustomPlace &place, size_t max_memory_size)
     : GarbageCollector(place, max_memory_size) {
-  platform::DeviceGuard guard(place);
-  stream_.reset(new platform::stream::Stream);
+  phi::DeviceGuard guard(place);
+  stream_.reset(new phi::stream::Stream);
   stream_->Init(place);
-  callback_manager_.reset(new platform::CallbackManager(stream_.get()));
+  callback_manager_.reset(new phi::CallbackManager(stream_.get()));
 }
 
 CustomStreamGarbageCollector::~CustomStreamGarbageCollector() {
-  platform::DeviceGuard guard(this->dev_ctx_->GetPlace());
+  phi::DeviceGuard guard(this->dev_ctx_->GetPlace());
   stream_->Synchronize();
   stream_->Destroy();
 }
 
-platform::stream::Stream *CustomStreamGarbageCollector::stream() const {
+phi::stream::Stream *CustomStreamGarbageCollector::stream() const {
   return stream_.get();
 }
 
diff --git a/paddle/fluid/framework/garbage_collector.h b/paddle/fluid/framework/garbage_collector.h
index a67860c6087e0f173e09d2a7c131703260c562fd..f0027c676050b8c31c0bc0ca4ab3b6444f29e1a2 100644
--- a/paddle/fluid/framework/garbage_collector.h
+++ b/paddle/fluid/framework/garbage_collector.h
@@ -230,14 +230,14 @@ class CustomStreamGarbageCollector : public GarbageCollector {
 
   void Wait() const override;
 
-  platform::stream::Stream *stream() const;
+  phi::stream::Stream *stream() const;
 
  protected:
   void ClearCallback(const std::function<void()> &callback) override;
 
  private:
-  std::unique_ptr<platform::stream::Stream> stream_;
-  std::unique_ptr<platform::CallbackManager> callback_manager_;
+  std::unique_ptr<phi::stream::Stream> stream_;
+  std::unique_ptr<phi::CallbackManager> callback_manager_;
 };
 #endif
 
diff --git a/paddle/fluid/framework/infershape_utils.cc b/paddle/fluid/framework/infershape_utils.cc
index 57fb68e80427afa56372bebb31ff5822135858b6..b1d7059f311cd370a40e83d7b0016d5af8cdb163 100644
--- a/paddle/fluid/framework/infershape_utils.cc
+++ b/paddle/fluid/framework/infershape_utils.cc
@@ -90,6 +90,8 @@ class InferShapeArgumentMappingContext : public phi::ArgumentMappingContext {
 
   bool IsForInferShape() const override { return true; }
 
+  bool IsRuntime() const override { return ctx_.IsRuntime(); }
+
  private:
   const InferShapeContext& ctx_;
 };
@@ -232,16 +234,8 @@ class CompatMetaTensor : public phi::MetaTensor {
     }
   }
 
-  void share_meta(const MetaTensor& meta_tensor) override {
+  void share_dims(const MetaTensor& meta_tensor) override {
     set_dims(meta_tensor.dims());
-    set_dtype(meta_tensor.dtype());
-    // VarDesc doesn't contains layout, so we cannot share layout
-    // set_layout(meta_tensor.layout());
-
-    // special case 1: share lod of LoDTensor
-    share_lod(meta_tensor);
-
-    // special case 2: share height and rows of SelectedRows in runtime
     if (is_runtime_) {
       auto* var = BOOST_GET(Variable*, var_);
       if (var->IsType<phi::SelectedRows>()) {
@@ -254,6 +248,16 @@ class CompatMetaTensor : public phi::MetaTensor {
     }
   }
 
+  void share_meta(const MetaTensor& meta_tensor) override {
+    share_dims(meta_tensor);
+    set_dtype(meta_tensor.dtype());
+    // VarDesc doesn't contains layout, so we cannot share layout
+    // set_layout(meta_tensor.layout());
+
+    // special case: share lod of LoDTensor
+    share_lod(meta_tensor);
+  }
+
  private:
   const LoD& GetRuntimeLoD() const {
     auto* var = BOOST_GET_CONST(Variable*, var_);
@@ -293,7 +297,8 @@ phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx,
   VLOG(3) << "BuildInferMetaContext: op kernel signature - " << signature;
 
   // 2. build infermeta context
-  phi::InferMetaContext infer_meta_context(ctx->IsRuntime());
+  phi::InferMetaContext infer_meta_context(
+      {ctx->IsRuntime(), ctx->IsRunMKLDNNKernel()});
 
   auto& input_names = std::get<0>(signature.args);
   auto& attr_names = std::get<1>(signature.args);
@@ -381,6 +386,10 @@ phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx,
             std::type_index(typeid(std::vector<int32_t>))) {
           infer_meta_context.EmplaceBackAttr(std::move(
               phi::ScalarArray(BOOST_GET_CONST(std::vector<int32_t>, attr))));
+        } else if (std::type_index(attr.type()) ==
+                   std::type_index(typeid(std::vector<int64_t>))) {
+          infer_meta_context.EmplaceBackAttr(std::move(
+              phi::ScalarArray(BOOST_GET_CONST(std::vector<int64_t>, attr))));
         } else if (std::type_index(attr.type()) ==
                    std::type_index(typeid(int))) {
           infer_meta_context.EmplaceBackAttr(
@@ -491,8 +500,22 @@ phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx,
             "Unsupported attribute type is received when call "
             "InferShapeFunctor."));
       }
-    } else {
-      // do nothing
+    } else if (ctx->HasInput(attr_name)) {
+      // convert from data
+      if (attr_defs[i].type_index == std::type_index(typeid(int32_t))) {
+        if (ctx->IsRuntime()) {
+          const auto& infershape_inputs = ctx->GetInputVarPtrs(attr_name);
+          auto var_temp = BOOST_GET_CONST(Variable*, infershape_inputs[i]);
+          auto val = experimental::MakePhiScalarFromVar(*var_temp);
+          int32_t val_int = val.template to<int32_t>();
+          infer_meta_context.EmplaceBackAttr(val_int);
+        } else {
+          infer_meta_context.EmplaceBackAttr(-1);
+        }
+      } else {
+        PADDLE_THROW(platform::errors::Unimplemented(
+            "Get value from variable only support int yet"));
+      }
     }
   }
 
diff --git a/paddle/fluid/framework/infershape_utils.h b/paddle/fluid/framework/infershape_utils.h
index 64c8371d583ffef621e5009504d14308dd7b997c..b692b6ffab08014f7de6ef4e5488445204396853 100644
--- a/paddle/fluid/framework/infershape_utils.h
+++ b/paddle/fluid/framework/infershape_utils.h
@@ -29,7 +29,7 @@ namespace framework {
 phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx,
                                             const std::string& op_type);
 
-#define DELCARE_INFER_SHAPE_FUNCTOR(op_type, functor_name, fn)      \
+#define DECLARE_INFER_SHAPE_FUNCTOR(op_type, functor_name, fn)      \
   struct functor_name : public paddle::framework::InferShapeBase {  \
     void operator()(                                                \
         paddle::framework::InferShapeContext* ctx) const override { \
diff --git a/paddle/fluid/framework/infershape_utils_test.cc b/paddle/fluid/framework/infershape_utils_test.cc
index 53dcc19fcbae88ab5ccfcc498037327946029927..2eeefb19a1aa8c5c9e4f92ff06618c719bb30785 100644
--- a/paddle/fluid/framework/infershape_utils_test.cc
+++ b/paddle/fluid/framework/infershape_utils_test.cc
@@ -110,9 +110,9 @@ void InferShapeUtilsTestKernel(
 }  // namespace framework
 }  // namespace paddle
 
-DELCARE_INFER_SHAPE_FUNCTOR(infer_shape_utils_test,
+DECLARE_INFER_SHAPE_FUNCTOR(infer_shape_utils_test,
                             InferShapeUtilsTestInferShapeFunctor,
-                            PT_INFER_META(paddle::framework::TestInferMeta));
+                            PD_INFER_META(paddle::framework::TestInferMeta));
 REGISTER_OPERATOR(infer_shape_utils_test,
                   paddle::framework::InferShapeUtilsTestOp,
                   paddle::framework::InferShapeUtilsTestOpMaker,
diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt
index 0d53a54ff822ae4dde9fcba7c2559569c7e1bd4f..623c8a048c2417ab51772c55b681031d9bcfd925 100755
--- a/paddle/fluid/framework/ir/CMakeLists.txt
+++ b/paddle/fluid/framework/ir/CMakeLists.txt
@@ -126,6 +126,7 @@ if(WITH_MKLDNN)
     pass_library(interpolate_mkldnn_pass inference DIR mkldnn)
     pass_library(softplus_activation_mkldnn_fuse_pass inference DIR mkldnn)
     pass_library(fc_act_mkldnn_fuse_pass inference DIR mkldnn)
+    pass_library(elt_act_mkldnn_fuse_pass inference DIR mkldnn)
     pass_library(cpu_quantize_placement_pass base DIR mkldnn)
     pass_library(cpu_quantize_pass inference DIR mkldnn)
     pass_library(cpu_quantize_squash_pass inference DIR mkldnn)
@@ -157,6 +158,7 @@ endif()
 cc_library(fuse_bn_act_pass SRCS fuse_bn_act_pass.cc DEPS pass graph_pattern_detector )
 cc_library(fuse_bn_add_act_pass SRCS fuse_bn_add_act_pass.cc DEPS pass graph_pattern_detector )
 cc_library(fuse_elewise_add_act_pass SRCS fuse_elewise_add_act_pass.cc DEPS pass graph_pattern_detector )
+cc_library(fuse_gemm_epilogue_pass SRCS fuse_gemm_epilogue_pass.cc DEPS pass graph_pattern_detector )
 cc_library(fuse_relu_depthwise_conv_pass SRCS fuse_relu_depthwise_conv_pass.cc DEPS pass graph_pattern_detector )
 
 set(GLOB_PASS_LIB ${PASS_LIBRARY} CACHE INTERNAL "Global PASS library")
diff --git a/paddle/fluid/framework/ir/fuse_gemm_epilogue_pass.cc b/paddle/fluid/framework/ir/fuse_gemm_epilogue_pass.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f48224cbdc24fe9706a3c4eae029c6dc35381ad2
--- /dev/null
+++ b/paddle/fluid/framework/ir/fuse_gemm_epilogue_pass.cc
@@ -0,0 +1,471 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+// Copyright (c) 2022 NVIDIA Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/ir/fuse_gemm_epilogue_pass.h"
+#include <string>
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+void FuseGemmEpiloguePass::ApplyImpl(ir::Graph *graph) const {
+  EpiloguePassActivationCache cache;
+
+  graph = FuseLinearActFwd(graph, {"relu", "gelu"}, false, false, &cache);
+  graph = FuseLinearActFwd(graph, {"relu"}, true, true, &cache);
+  graph = FuseLinearActFwd(graph, {"gelu"}, true, false, &cache);
+  graph = FuseLinearFwd(graph, false);
+  graph = FuseLinearFwd(graph, true);
+  graph = FuseLinearActBwd(graph, {"relu_grad"}, true, &cache);
+  graph = FuseLinearActBwd(graph, {"gelu_grad"}, false, &cache);
+  graph = FuseLinearBwd(graph, false);
+  graph = FuseLinearBwd(graph, true);
+}
+
+ir::Graph *FuseGemmEpiloguePass::FuseLinearFwd(ir::Graph *graph,
+                                               bool is_training) const {
+  PADDLE_ENFORCE_NOT_NULL(
+      graph, platform::errors::InvalidArgument("Graph cannot be nullptr."));
+  const std::string scope_name("gemm_epilogue");
+  FusePassBase::Init(scope_name, graph);
+
+  GraphPatternDetector gpd;
+  auto *x = gpd.mutable_pattern()
+                ->NewNode(patterns::PDNodeName(scope_name, "x"))
+                ->AsInput()
+                ->assert_is_op_input("matmul_v2", "X");
+  patterns::LinearAct linear_act_pattern(gpd.mutable_pattern(), "linear_act");
+
+  linear_act_pattern(x, {}, is_training, false);
+
+  int found_linear_count = 0;
+
+  auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph,
+                     Graph *g) {
+    VLOG(4) << "handle LinearAct fuse";
+
+    GET_IR_NODE_FROM_SUBGRAPH(matmul_op, matmul, linear_act_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(matmul_w, matmul_w, linear_act_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(matmul_out, matmul_out, linear_act_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(ele_add_op, ele_add, linear_act_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(ele_bias, ele_bias, linear_act_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(ele_out, elewise_add_out, linear_act_pattern);
+
+    std::vector<int64_t> matmul_x_shape = subgraph.at(x)->Var()->GetShape();
+    std::vector<int64_t> matmul_w_shape = matmul_w->Var()->GetShape();
+
+    // Note (Ming Huang): We only support matmul_v2 from paddle.nn.Linear
+    // currently. The conditions below are used to verify wether matmul_v2
+    // is created by paddle.nn.Linear
+    auto matmul_op_desc = matmul_op->Op();
+    if (!IsGemmFromLinear_(matmul_x_shape, matmul_w_shape, matmul_op_desc))
+      return;
+
+    OpDesc fused_gemm_epilogue_op_desc(matmul_op->Op()->Block());
+    std::string activation = "none";
+    fused_gemm_epilogue_op_desc.SetType("fused_gemm_epilogue");
+    fused_gemm_epilogue_op_desc.SetInput("X", {subgraph.at(x)->Name()});
+    fused_gemm_epilogue_op_desc.SetInput("Y", {matmul_w->Name()});
+    fused_gemm_epilogue_op_desc.SetInput("Bias", {ele_bias->Name()});
+    fused_gemm_epilogue_op_desc.SetOutput("Out", {ele_out->Name()});
+    fused_gemm_epilogue_op_desc.SetAttr("activation", activation);
+    fused_gemm_epilogue_op_desc.SetAttr("op_role",
+                                        matmul_op_desc->GetAttr("op_role"));
+    auto gemm_epilogue_node = g->CreateOpNode(&fused_gemm_epilogue_op_desc);
+
+    IR_NODE_LINK_TO(subgraph.at(x), gemm_epilogue_node);
+    IR_NODE_LINK_TO(matmul_w, gemm_epilogue_node);
+    IR_NODE_LINK_TO(ele_bias, gemm_epilogue_node);
+    IR_NODE_LINK_TO(gemm_epilogue_node, ele_out);
+
+    GraphSafeRemoveNodes(g, {matmul_op, matmul_out, ele_add_op});
+
+    VLOG(4) << "\n\t " << subgraph.at(x)->Name() << " and " << matmul_w->Name()
+            << " -> " << matmul_op->Name() << " -> " << matmul_out->Name()
+            << "\n\t " << matmul_out->Name() << " and " << ele_bias->Name()
+            << " -> " << ele_add_op->Name() << " -> " << ele_out->Name()
+            << "\n\t " << ele_out->Name();
+    found_linear_count++;
+  };
+
+  gpd(graph, handler);
+
+  AddStatis(found_linear_count);
+  return graph;
+}
+
+ir::Graph *FuseGemmEpiloguePass::FuseLinearActFwd(
+    ir::Graph *graph, const std::unordered_set<std::string> &act_types,
+    bool is_training, bool is_act_grad_x_from_act,
+    EpiloguePassActivationCache *cache) const {
+  PADDLE_ENFORCE_NOT_NULL(
+      graph, platform::errors::InvalidArgument("Graph cannot be nullptr."));
+
+  const std::string scope_name("gemm_epilogue");
+  FusePassBase::Init(scope_name, graph);
+
+  GraphPatternDetector gpd;
+  auto *x = gpd.mutable_pattern()
+                ->NewNode(patterns::PDNodeName(scope_name, "x"))
+                ->AsInput()
+                ->assert_is_op_input("matmul_v2", "X");
+  patterns::LinearAct linear_act_pattern(gpd.mutable_pattern(), "linear_act");
+
+  linear_act_pattern(x, act_types, is_training, is_act_grad_x_from_act);
+
+  int found_linear_act_count = 0;
+
+  auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph,
+                     Graph *g) {
+    VLOG(4) << "handle LinearAct fuse";
+
+    GET_IR_NODE_FROM_SUBGRAPH(matmul_op, matmul, linear_act_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(matmul_w, matmul_w, linear_act_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(matmul_out, matmul_out, linear_act_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(ele_add_op, ele_add, linear_act_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(ele_bias, ele_bias, linear_act_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(ele_out, elewise_add_out, linear_act_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(act_op, act, linear_act_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(act_out, act_out, linear_act_pattern);
+
+    std::vector<int64_t> matmul_x_shape = subgraph.at(x)->Var()->GetShape();
+    std::vector<int64_t> matmul_w_shape = matmul_w->Var()->GetShape();
+
+    // Note (Ming Huang): We only support matmul_v2 from paddle.nn.Linear
+    // currently. The conditions below are used to verify wether matmul_v2
+    // is created by paddle.nn.Linear
+    auto matmul_op_desc = matmul_op->Op();
+    if (!IsGemmFromLinear_(matmul_x_shape, matmul_w_shape, matmul_op_desc))
+      return;
+
+    auto activation = act_op->Op()->Type();
+
+    OpDesc fused_gemm_epilogue_op_desc(matmul_op->Op()->Block());
+    fused_gemm_epilogue_op_desc.SetType("fused_gemm_epilogue");
+    fused_gemm_epilogue_op_desc.SetInput("X", {subgraph.at(x)->Name()});
+    fused_gemm_epilogue_op_desc.SetInput("Y", {matmul_w->Name()});
+    fused_gemm_epilogue_op_desc.SetInput("Bias", {ele_bias->Name()});
+    fused_gemm_epilogue_op_desc.SetOutput("Out", {act_out->Name()});
+    fused_gemm_epilogue_op_desc.SetAttr("activation", activation);
+    fused_gemm_epilogue_op_desc.SetAttr("op_role",
+                                        matmul_op_desc->GetAttr("op_role"));
+
+    auto gemm_epilogue_node = g->CreateOpNode(&fused_gemm_epilogue_op_desc);
+
+    IR_NODE_LINK_TO(subgraph.at(x), gemm_epilogue_node);
+    IR_NODE_LINK_TO(matmul_w, gemm_epilogue_node);
+    IR_NODE_LINK_TO(ele_bias, gemm_epilogue_node);
+    IR_NODE_LINK_TO(gemm_epilogue_node, act_out);
+
+    // Only need to check weight.shape[1] for auxiliary pointer
+    // and mark it the act op is fused for backward epilogue fusion.
+    // That because cuBlasLt epilogue's restriction.
+    if (is_training) {
+      int divisor_of_n = activation == "relu" ? 128 : 8;
+      if (matmul_w_shape[1] % divisor_of_n) return;
+
+      VarDesc reserve_space(patterns::PDNodeName(scope_name, "ReserveSpace"));
+      auto *reserve_space_node = g->CreateVarNode(&reserve_space);
+
+      cache->InsertFusedActivation(
+          GetReserveSpaceCacheKey(act_out->Var()->Name(), g->GetBlockId()),
+          reserve_space_node);
+
+      gemm_epilogue_node->Op()->SetOutput("ReserveSpace",
+                                          {reserve_space_node->Name()});
+
+      if (!is_act_grad_x_from_act) {
+        GET_IR_NODE_FROM_SUBGRAPH(act_grad_op, act_grad, linear_act_pattern);
+        act_grad_op->Op()->RenameInput(ele_out->Name(),
+                                       reserve_space_node->Name());
+        IR_NODE_LINK_TO(reserve_space_node, act_grad_op);
+      }
+      IR_NODE_LINK_TO(gemm_epilogue_node, reserve_space_node);
+    }
+
+    GraphSafeRemoveNodes(g,
+                         {matmul_op, matmul_out, ele_add_op, ele_out, act_op});
+
+    VLOG(4) << "\n\t " << subgraph.at(x)->Name() << " and " << matmul_w->Name()
+            << " -> " << matmul_op->Name() << " -> " << matmul_out->Name()
+            << "\n\t " << matmul_out->Name() << " and " << ele_bias->Name()
+            << " -> " << ele_add_op->Name() << " -> " << ele_out->Name()
+            << "\n\t " << ele_out->Name() << " -> " << act_op->Name() << " -> "
+            << act_out->Name();
+    found_linear_act_count++;
+  };
+
+  gpd(graph, handler);
+
+  AddStatis(found_linear_act_count);
+  return graph;
+}
+
+ir::Graph *FuseGemmEpiloguePass::FuseLinearBwd(ir::Graph *graph,
+                                               bool without_x_gradient) const {
+  PADDLE_ENFORCE_NOT_NULL(
+      graph, platform::errors::InvalidArgument("Graph cannot be nullptr."));
+  const std::string scope_name("gemm_epilogue");
+  FusePassBase::Init(scope_name, graph);
+
+  GraphPatternDetector gpd;
+  auto *dout =
+      gpd.mutable_pattern()
+          ->NewNode(patterns::PDNodeName(scope_name, "dout"))
+          ->AsInput()
+          ->assert_is_op_input("elementwise_add_grad", GradVarName("Out"));
+
+  patterns::ElewiseAddMatmulAct ele_add_matmul_act_pattern(
+      gpd.mutable_pattern(), "ele_add_matmul_act");
+  ele_add_matmul_act_pattern(dout, {}, without_x_gradient, false);
+
+  int found_ele_add_matmul_act_count = 0;
+
+  auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph,
+                     Graph *g) {
+    VLOG(4) << "handle ElewiseAddMatmulAct fuse";
+
+    GET_IR_NODE_FROM_SUBGRAPH(ele_add_grad_op, ele_add_grad,
+                              ele_add_matmul_act_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(ele_grad_bias, ele_grad_bias,
+                              ele_add_matmul_act_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(ele_grad_dx, ele_grad_dx,
+                              ele_add_matmul_act_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(ele_grad_dbias, ele_grad_dbias,
+                              ele_add_matmul_act_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(matmul_grad_op, matmul_grad,
+                              ele_add_matmul_act_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(matmul_grad_x, matmul_grad_x,
+                              ele_add_matmul_act_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(matmul_grad_w, matmul_grad_w,
+                              ele_add_matmul_act_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(matmul_grad_dw, matmul_grad_dw,
+                              ele_add_matmul_act_pattern);
+
+    Node *matmul_grad_dx = nullptr;
+    if (!without_x_gradient) {
+      GET_IR_NODE_FROM_SUBGRAPH(matmul_grad_dx_ptr, matmul_grad_dx,
+                                ele_add_matmul_act_pattern);
+      matmul_grad_dx = matmul_grad_dx_ptr;
+    }
+
+    std::vector<int64_t> matmul_grad_x_shape = matmul_grad_x->Var()->GetShape();
+    std::vector<int64_t> matmul_grad_w_shape = matmul_grad_w->Var()->GetShape();
+
+    // Note (Ming Huang): We only support matmul_v2_grad from paddle.nn.Linear
+    // currently. The conditions below are used to verify wether matmul_v2
+    // is created by paddle.nn.Linear
+    auto matmul_grad_op_desc = matmul_grad_op->Op();
+    if (!IsGemmFromLinear_(matmul_grad_x_shape, matmul_grad_w_shape,
+                           matmul_grad_op_desc))
+      return;
+
+    OpDesc fused_gemm_epilogue_grad_op_desc(ele_add_grad_op->Op()->Block());
+    std::string activation_grad = "none";
+    fused_gemm_epilogue_grad_op_desc.SetType("fused_gemm_epilogue_grad");
+    fused_gemm_epilogue_grad_op_desc.SetInput("DOut",
+                                              {subgraph.at(dout)->Name()});
+    fused_gemm_epilogue_grad_op_desc.SetInput("X", {matmul_grad_x->Name()});
+    fused_gemm_epilogue_grad_op_desc.SetInput("Y", {matmul_grad_w->Name()});
+    if (matmul_grad_dx) {
+      fused_gemm_epilogue_grad_op_desc.SetOutput("DX",
+                                                 {matmul_grad_dx->Name()});
+    }
+    fused_gemm_epilogue_grad_op_desc.SetOutput("DY", {matmul_grad_dw->Name()});
+    fused_gemm_epilogue_grad_op_desc.SetOutput("DBias",
+                                               {ele_grad_dbias->Name()});
+    fused_gemm_epilogue_grad_op_desc.SetAttr("activation_grad",
+                                             activation_grad);
+    fused_gemm_epilogue_grad_op_desc.SetAttr(
+        "op_role", matmul_grad_op_desc->GetAttr("op_role"));
+
+    auto gemm_epilogue_grad_node =
+        g->CreateOpNode(&fused_gemm_epilogue_grad_op_desc);
+
+    IR_NODE_LINK_TO(subgraph.at(dout), gemm_epilogue_grad_node);
+    IR_NODE_LINK_TO(matmul_grad_x, gemm_epilogue_grad_node);
+    IR_NODE_LINK_TO(matmul_grad_w, gemm_epilogue_grad_node);
+    IR_NODE_LINK_TO(gemm_epilogue_grad_node, matmul_grad_dw);
+    IR_NODE_LINK_TO(gemm_epilogue_grad_node, ele_grad_dbias);
+    if (matmul_grad_dx) {
+      IR_NODE_LINK_TO(gemm_epilogue_grad_node, matmul_grad_dx);
+    }
+
+    GraphSafeRemoveNodes(g, {ele_add_grad_op, ele_grad_dx, matmul_grad_op});
+
+    std::string matmul_grad_dx_name =
+        matmul_grad_dx != nullptr ? matmul_grad_dx->Name() : " ";
+    VLOG(4) << "\n\t " << subgraph.at(dout)->Name() << " and "
+            << ele_grad_bias->Name() << " -> " << ele_add_grad_op->Name()
+            << " -> " << ele_grad_dx->Name() << " and "
+            << ele_grad_dbias->Name() << "\n\t " << ele_grad_dx->Name() << ", "
+            << matmul_grad_x->Name() << " and " << matmul_grad_w->Name()
+            << " -> " << matmul_grad_op->Name() << " -> "
+            << matmul_grad_w->Name() << " and " << matmul_grad_dx_name;
+    found_ele_add_matmul_act_count++;
+  };
+
+  gpd(graph, handler);
+
+  AddStatis(found_ele_add_matmul_act_count);
+  return graph;
+}
+
+ir::Graph *FuseGemmEpiloguePass::FuseLinearActBwd(
+    ir::Graph *graph, const std::unordered_set<std::string> &act_grad_types,
+    bool is_act_grad_x_from_act, EpiloguePassActivationCache *cache) const {
+  PADDLE_ENFORCE_NOT_NULL(
+      graph, platform::errors::InvalidArgument("Graph cannot be nullptr."));
+  const std::string scope_name("gemm_epilogue");
+  FusePassBase::Init(scope_name, graph);
+
+  GraphPatternDetector gpd;
+  auto *dout =
+      gpd.mutable_pattern()
+          ->NewNode(patterns::PDNodeName(scope_name, "dout"))
+          ->AsInput()
+          ->assert_is_op_input("elementwise_add_grad", GradVarName("Out"));
+
+  patterns::ElewiseAddMatmulAct ele_add_matmul_act_pattern(
+      gpd.mutable_pattern(), "ele_add_matmul_act");
+  ele_add_matmul_act_pattern(dout, act_grad_types, false,
+                             is_act_grad_x_from_act);
+
+  int found_ele_add_matmul_act_count = 0;
+
+  auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph,
+                     Graph *g) {
+    VLOG(4) << "handle ElewiseAddMatmulAct fuse";
+
+    GET_IR_NODE_FROM_SUBGRAPH(ele_add_grad_op, ele_add_grad,
+                              ele_add_matmul_act_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(ele_grad_bias, ele_grad_bias,
+                              ele_add_matmul_act_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(ele_grad_dx, ele_grad_dx,
+                              ele_add_matmul_act_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(ele_grad_dbias, ele_grad_dbias,
+                              ele_add_matmul_act_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(matmul_grad_op, matmul_grad,
+                              ele_add_matmul_act_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(matmul_grad_x, matmul_grad_x,
+                              ele_add_matmul_act_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(matmul_grad_w, matmul_grad_w,
+                              ele_add_matmul_act_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(matmul_grad_dx, matmul_grad_dx,
+                              ele_add_matmul_act_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(matmul_grad_dw, matmul_grad_dw,
+                              ele_add_matmul_act_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(act_grad_op, act_grad,
+                              ele_add_matmul_act_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(act_grad_dx, act_grad_dx,
+                              ele_add_matmul_act_pattern);
+
+    auto key =
+        GetReserveSpaceCacheKey(matmul_grad_x->Var()->Name(), g->GetBlockId());
+    if (!cache->HasFusedActivation(key)) {
+      return;
+    }
+    auto *reserve_space_node = cache->GetFusedActivationSpace(key);
+
+    std::vector<int64_t> matmul_grad_x_shape = matmul_grad_x->Var()->GetShape();
+    std::vector<int64_t> matmul_grad_w_shape = matmul_grad_w->Var()->GetShape();
+
+    // Note (Ming Huang): We only support matmul_v2_grad from paddle.nn.Linear
+    // currently. The conditions below are used to verify wether matmul_v2
+    // is created by paddle.nn.Linear
+    auto matmul_grad_op_desc = matmul_grad_op->Op();
+    if (!IsGemmFromLinear_(matmul_grad_x_shape, matmul_grad_w_shape,
+                           matmul_grad_op_desc))
+      return;
+
+    auto activation_grad = act_grad_op->Op()->Type();
+
+    OpDesc fused_gemm_epilogue_grad_op_desc(ele_add_grad_op->Op()->Block());
+    fused_gemm_epilogue_grad_op_desc.SetType("fused_gemm_epilogue_grad");
+    fused_gemm_epilogue_grad_op_desc.SetInput("DOut",
+                                              {subgraph.at(dout)->Name()});
+    fused_gemm_epilogue_grad_op_desc.SetInput("X", {matmul_grad_x->Name()});
+    fused_gemm_epilogue_grad_op_desc.SetInput("Y", {matmul_grad_w->Name()});
+    fused_gemm_epilogue_grad_op_desc.SetInput("ReserveSpace",
+                                              {reserve_space_node->Name()});
+    fused_gemm_epilogue_grad_op_desc.SetOutput("DX", {act_grad_dx->Name()});
+    fused_gemm_epilogue_grad_op_desc.SetOutput("DY", {matmul_grad_dw->Name()});
+    fused_gemm_epilogue_grad_op_desc.SetOutput("DBias",
+                                               {ele_grad_dbias->Name()});
+    fused_gemm_epilogue_grad_op_desc.SetAttr("activation_grad",
+                                             activation_grad);
+    fused_gemm_epilogue_grad_op_desc.SetAttr(
+        "op_role", matmul_grad_op_desc->GetAttr("op_role"));
+
+    auto gemm_epilogue_grad_node =
+        g->CreateOpNode(&fused_gemm_epilogue_grad_op_desc);
+
+    IR_NODE_LINK_TO(subgraph.at(dout), gemm_epilogue_grad_node);
+    IR_NODE_LINK_TO(matmul_grad_x, gemm_epilogue_grad_node);
+    IR_NODE_LINK_TO(matmul_grad_w, gemm_epilogue_grad_node);
+    IR_NODE_LINK_TO(gemm_epilogue_grad_node, act_grad_dx);
+    IR_NODE_LINK_TO(gemm_epilogue_grad_node, matmul_grad_dw);
+    IR_NODE_LINK_TO(gemm_epilogue_grad_node, ele_grad_dbias);
+    IR_NODE_LINK_TO(reserve_space_node, gemm_epilogue_grad_node);
+
+    GraphSafeRemoveNodes(g, {ele_add_grad_op, ele_grad_dx, matmul_grad_op,
+                             matmul_grad_dx, act_grad_op});
+
+    VLOG(4) << "\n\t " << subgraph.at(dout)->Name() << " and "
+            << ele_grad_bias->Name() << " -> " << ele_add_grad_op->Name()
+            << " -> " << ele_grad_dx->Name() << " and "
+            << ele_grad_dbias->Name() << "\n\t " << ele_grad_dx->Name() << ", "
+            << matmul_grad_x->Name() << " and " << matmul_grad_w->Name()
+            << " -> " << matmul_grad_op->Name() << " -> "
+            << matmul_grad_dx->Name() << " and " << matmul_grad_w->Name()
+            << "\n\t " << matmul_grad_dx->Name() << " -> "
+            << act_grad_op->Name() << " -> " << act_grad_dx->Name();
+    found_ele_add_matmul_act_count++;
+  };
+
+  gpd(graph, handler);
+
+  AddStatis(found_ele_add_matmul_act_count);
+  return graph;
+}
+
+bool FuseGemmEpiloguePass::IsGemmFromLinear_(
+    const std::vector<int64_t> &x_shape, const std::vector<int64_t> &w_shape,
+    OpDesc *matmul_v2_op) const {
+  if (w_shape.size() != 2 || x_shape.size() < 2) return false;
+  for (auto attr_name :
+       {"fused_reshape_Out", "fused_reshape_X", "fused_reshape_Y",
+        "fused_transpose_Out", "fused_transpose_X", "fused_transpose_Y"}) {
+    if (matmul_v2_op->HasAttr(attr_name)) {
+      std::vector<int> tmp_vec =
+          BOOST_GET_CONST(std::vector<int>, matmul_v2_op->GetAttr(attr_name));
+      if (tmp_vec.size() > 0) return false;
+    }
+  }
+  if (BOOST_GET_CONST(bool, matmul_v2_op->GetAttr("trans_x")) ||
+      BOOST_GET_CONST(bool, matmul_v2_op->GetAttr("trans_y")))
+    return false;
+
+  return true;
+}
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_PASS(fuse_gemm_epilogue_pass,
+              paddle::framework::ir::FuseGemmEpiloguePass);
diff --git a/paddle/fluid/framework/ir/fuse_gemm_epilogue_pass.h b/paddle/fluid/framework/ir/fuse_gemm_epilogue_pass.h
new file mode 100644
index 0000000000000000000000000000000000000000..575ffee73d60e9bd5d4f5af7538d01789268cc9a
--- /dev/null
+++ b/paddle/fluid/framework/ir/fuse_gemm_epilogue_pass.h
@@ -0,0 +1,100 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+// Copyright (c) 2022 NVIDIA Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+
+#include <mutex>
+#include <string>
+#include <unordered_set>
+
+#include "paddle/fluid/framework/ir/fuse_pass_base.h"
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
+#include "paddle/fluid/framework/ir/pass.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+/*
+ * Fuse the ElewiseAdd and activation
+ */
+class Graph;
+class Node;
+
+class EpiloguePassActivationCache {
+ public:
+  EpiloguePassActivationCache() {}
+
+  EpiloguePassActivationCache(const EpiloguePassActivationCache &) = delete;
+  void operator=(const EpiloguePassActivationCache &) = delete;
+
+  bool HasFusedActivation(const std::string &key) const {
+    return fused_activation_space_map_.count(key);
+  }
+
+  ir::Node *GetFusedActivationSpace(const std::string &key) {
+    if (HasFusedActivation(key)) {
+      return fused_activation_space_map_.find(key)->second;
+    }
+    PADDLE_THROW(platform::errors::InvalidArgument(
+        "The key (%d) of EpiloguePassActivationCache does not exist.", key));
+  }
+
+  void InsertFusedActivation(const std::string &key, ir::Node *const value) {
+    if (!HasFusedActivation(key)) {
+      mtx.lock();
+      fused_activation_space_map_.insert({key, value});
+      mtx.unlock();
+    } else {
+      PADDLE_THROW(platform::errors::AlreadyExists(
+          "The key (%d) of EpiloguePassActivationCache already exist.", key));
+    }
+  }
+
+ private:
+  std::unordered_map<std::string, ir::Node *> fused_activation_space_map_;
+  std::mutex mtx;
+};
+
+class FuseGemmEpiloguePass : public FusePassBase {
+ public:
+  virtual ~FuseGemmEpiloguePass() {}
+
+ protected:
+  void ApplyImpl(ir::Graph *graph) const override;
+
+  ir::Graph *FuseLinearFwd(ir::Graph *graph, bool is_training) const;
+  ir::Graph *FuseLinearActFwd(ir::Graph *graph,
+                              const std::unordered_set<std::string> &act_types,
+                              bool is_training, bool is_act_grad_x_from_act,
+                              EpiloguePassActivationCache *cache) const;
+  ir::Graph *FuseLinearBwd(ir::Graph *graph, bool without_x_gradient) const;
+  ir::Graph *FuseLinearActBwd(
+      ir::Graph *graph, const std::unordered_set<std::string> &act_grad_types,
+      bool is_act_grad_x_from_act, EpiloguePassActivationCache *cache) const;
+
+ private:
+  bool IsGemmFromLinear_(const std::vector<int64_t> &x_shape,
+                         const std::vector<int64_t> &w_shape,
+                         OpDesc *matmul_v2_op) const;
+  const std::string GetReserveSpaceCacheKey(const std::string var_name,
+                                            int block_id) const {
+    return std::to_string(block_id) + var_name;
+  }
+};
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc
index e4c9dc72128f4850b2e0e4af739fdd381e4a3b1e..18068e22b7f3c31d59636bc7ab6a234e109d5ee6 100644
--- a/paddle/fluid/framework/ir/graph_pattern_detector.cc
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc
@@ -918,6 +918,36 @@ PDNode *patterns::ConvActivation::operator()(
   return activation_out_var;
 }
 
+PDNode *patterns::ElementwiseActivation::operator()(
+    paddle::framework::ir::PDNode *elementwise_a,
+    const std::string &elementwise_type, const std::string &activation_type) {
+  // Create Operators
+  elementwise_a->assert_is_op_input(elementwise_type, "X");
+  auto *elementwise_op =
+      pattern->NewNode(elementwise_repr())->assert_is_op(elementwise_type);
+  auto *activation_op =
+      pattern->NewNode(activation_repr())->assert_is_op(activation_type);
+  // Create variables
+  auto *elementwise_b = pattern->NewNode(elementwise_b_repr())
+                            ->AsInput()
+                            ->assert_is_op_input(elementwise_type, "Y");
+  // intermediate variable, will be removed in the IR after fuse.
+  auto *elementwise_out_var =
+      pattern->NewNode(elementwise_out_repr())
+          ->AsIntermediate()
+          ->assert_is_only_output_of_op(elementwise_type)
+          ->assert_is_op_input(activation_type);
+  // output
+  auto *activation_out_var = pattern->NewNode(activation_out_repr())
+                                 ->AsOutput()
+                                 ->assert_is_op_output(activation_type);
+
+  elementwise_op->LinksFrom({elementwise_a, elementwise_b})
+      .LinksTo({elementwise_out_var});
+  activation_op->LinksFrom({elementwise_out_var}).LinksTo({activation_out_var});
+  return activation_out_var;
+}
+
 PDNode *patterns::SeqConvEltAddRelu::operator()(
     paddle::framework::ir::PDNode *seqconv_input) {
   // Create Operators
@@ -1461,31 +1491,6 @@ PDNode *patterns::BatchNormAddActGrad::operator()(
   return bn_grad;
 }
 
-PDNode *patterns::ElewiseAddAct::operator()(
-    paddle::framework::ir::PDNode *ele_x_var,
-    std::unordered_set<std::string> act_types) {
-  auto *ele_y_var = pattern->NewNode(ele_y_repr())
-                        ->assert_is_op_input("elementwise_add", "Y");
-
-  auto *ele_add =
-      pattern->NewNode(ele_add_repr())->assert_is_op("elementwise_add");
-
-  auto *ele_out_var = pattern->NewNode(elewise_add_out_repr())
-                          ->assert_is_op_output("elementwise_add", "Out");
-
-  ele_out_var->AsIntermediate()->assert_is_ops_input(act_types);
-
-  auto *act = pattern->NewNode(act_repr())->assert_is_ops(act_types);
-
-  auto *act_out_var =
-      pattern->NewNode(act_out_repr())->assert_is_ops_output(act_types, "Out");
-
-  ele_add->LinksFrom({ele_x_var, ele_y_var}).LinksTo({ele_out_var});
-  act->LinksFrom({ele_out_var}).LinksTo({act_out_var});
-
-  return act_out_var;
-}
-
 PDNode *patterns::ElewiseAddActInplaceGrad::operator()(
     paddle::framework::ir::PDNode *d_act_out_var,
     std::unordered_set<std::string> act_types) {
@@ -1526,6 +1531,159 @@ PDNode *patterns::ElewiseAddActInplaceGrad::operator()(
   return ele_add_grad;
 }
 
+PDNode *patterns::ElewiseAddAct::operator()(
+    paddle::framework::ir::PDNode *ele_x_var,
+    std::unordered_set<std::string> act_types) {
+  auto *ele_y_var = pattern->NewNode(ele_y_repr())
+                        ->assert_is_op_input("elementwise_add", "Y");
+
+  auto *ele_add =
+      pattern->NewNode(ele_add_repr())->assert_is_op("elementwise_add");
+
+  auto *ele_out_var = pattern->NewNode(elewise_add_out_repr())
+                          ->assert_is_op_output("elementwise_add", "Out");
+
+  ele_out_var->AsIntermediate()->assert_is_ops_input(act_types);
+
+  auto *act = pattern->NewNode(act_repr())->assert_is_ops(act_types);
+
+  auto *act_out_var =
+      pattern->NewNode(act_out_repr())->assert_is_ops_output(act_types, "Out");
+
+  ele_add->LinksFrom({ele_x_var, ele_y_var}).LinksTo({ele_out_var});
+  act->LinksFrom({ele_out_var}).LinksTo({act_out_var});
+
+  return act_out_var;
+}
+
+PDNode *patterns::LinearAct::operator()(
+    paddle::framework::ir::PDNode *linear_x_var,
+    const std::unordered_set<std::string> &act_types, bool with_grad_link,
+    bool is_act_grad_x_from_act) {
+  auto *matmul_w_var =
+      pattern->NewNode(matmul_w_repr())->assert_is_op_input("matmul_v2", "Y");
+
+  auto *matmul = pattern->NewNode(matmul_repr())->assert_is_op("matmul_v2");
+
+  auto *matmul_out_var = pattern->NewNode(matmul_out_repr())
+                             ->assert_is_op_output("matmul_v2", "Out");
+
+  matmul_out_var->AsIntermediate()->assert_is_op_input("elementwise_add", "X");
+
+  auto *ele_bias_var = pattern->NewNode(ele_bias_repr())
+                           ->assert_is_op_input("elementwise_add", "Y");
+
+  auto *ele_add =
+      pattern->NewNode(ele_add_repr())->assert_is_op("elementwise_add");
+
+  auto *ele_out_var = pattern->NewNode(elewise_add_out_repr())
+                          ->assert_is_op_output("elementwise_add", "Out");
+
+  matmul->LinksFrom({linear_x_var, matmul_w_var}).LinksTo({matmul_out_var});
+  ele_add->LinksFrom({matmul_out_var, ele_bias_var}).LinksTo({ele_out_var});
+
+  if (with_grad_link) {
+    matmul_out_var->assert_is_op_input("elementwise_add_grad", "X");
+    auto *elementwise_add_grad_op = pattern->NewNode("elementwise_add_grad")
+                                        ->assert_is_op("elementwise_add_grad");
+    elementwise_add_grad_op->LinksFrom({matmul_out_var});
+  }
+
+  if (act_types.size() > 0) {
+    ele_out_var->AsIntermediate()->assert_is_ops_input(act_types);
+
+    auto *act = pattern->NewNode(act_repr())->assert_is_ops(act_types);
+    auto *act_out_var = pattern->NewNode(act_out_repr())
+                            ->assert_is_ops_output(act_types, "Out");
+
+    act->LinksFrom({ele_out_var}).LinksTo({act_out_var});
+
+    if (with_grad_link && !is_act_grad_x_from_act) {
+      std::unordered_set<std::string> act_grad_types;
+      for (const auto &act : act_types) {
+        std::string act_grad(act);
+        act_grad.append("_grad");
+        act_grad_types.insert(act_grad);
+      }
+
+      ele_out_var->assert_is_ops_input(act_grad_types, "X");
+      auto *act_grad_op =
+          pattern->NewNode(act_grad_repr())->assert_is_ops(act_grad_types);
+      act_grad_op->LinksFrom({ele_out_var});
+    }
+
+    return act_out_var;
+  }
+
+  return ele_out_var;
+}
+
+PDNode *patterns::ElewiseAddMatmulAct::operator()(
+    paddle::framework::ir::PDNode *dout_var,
+    const std::unordered_set<std::string> &act_grad_types,
+    bool without_x_gradient, bool is_act_grad_x_from_act) {
+  auto *ele_grad_bias_var =
+      pattern->NewNode(ele_grad_bias_repr())
+          ->assert_is_op_input("elementwise_add_grad", "Y");
+  auto *ele_add_grad = pattern->NewNode(ele_add_grad_repr())
+                           ->assert_is_op("elementwise_add_grad");
+  auto *ele_grad_dx_var =
+      pattern->NewNode(ele_grad_dx_repr())
+          ->assert_is_op_output("elementwise_add_grad", GradVarName("X"));
+  auto *ele_grad_dbias_var =
+      pattern->NewNode(ele_grad_dbias_repr())
+          ->assert_is_op_output("elementwise_add_grad", GradVarName("Y"));
+  ele_add_grad->LinksFrom({dout_var, ele_grad_bias_var})
+      .LinksTo({ele_grad_dx_var, ele_grad_dbias_var});
+
+  ele_grad_dx_var->AsIntermediate()->assert_is_op_input("matmul_v2_grad",
+                                                        GradVarName("Out"));
+
+  auto *matmul_grad_x_var = pattern->NewNode(matmul_grad_x_repr())
+                                ->assert_is_op_input("matmul_v2_grad", "X");
+  auto *matmul_grad_w_var = pattern->NewNode(matmul_grad_w_repr())
+                                ->assert_is_op_input("matmul_v2_grad", "Y");
+  auto *matmul_grad =
+      pattern->NewNode(matmul_grad_repr())->assert_is_op("matmul_v2_grad");
+  auto *matmul_grad_dx_var =
+      pattern->NewNode(matmul_grad_dx_repr())
+          ->assert_is_op_output("matmul_v2_grad", GradVarName("X"));
+  auto *matmul_grad_dw_var =
+      pattern->NewNode(matmul_grad_dw_repr())
+          ->assert_is_op_output("matmul_v2_grad", GradVarName("Y"));
+  matmul_grad->LinksFrom(
+      {ele_grad_dx_var, matmul_grad_x_var, matmul_grad_w_var});
+  if (without_x_gradient) {
+    matmul_grad->LinksTo({matmul_grad_dw_var});
+  } else {
+    matmul_grad->LinksTo({matmul_grad_dx_var, matmul_grad_dw_var});
+  }
+
+  if (!without_x_gradient && act_grad_types.size() > 0) {
+    matmul_grad_dx_var->AsIntermediate()->assert_is_ops_input(
+        act_grad_types, GradVarName("Out"));
+
+    auto *act_grad =
+        pattern->NewNode(act_grad_repr())->assert_is_ops(act_grad_types);
+    auto *act_grad_dx_var =
+        pattern->NewNode(act_grad_dx_repr())
+            ->assert_is_ops_output(act_grad_types, GradVarName("X"));
+
+    auto *act_grad_x_var = matmul_grad_x_var;
+    if (!is_act_grad_x_from_act) {
+      auto *ele_out_var = pattern->NewNode(ele_out_repr())
+                              ->assert_is_ops_input(act_grad_types, "X");
+      act_grad_x_var = ele_out_var;
+    }
+
+    act_grad->LinksFrom({matmul_grad_dx_var, act_grad_x_var})
+        .LinksTo({act_grad_dx_var});
+    return act_grad;
+  }
+
+  return matmul_grad;
+}
+
 // conv_type: conv2d, conv3d, conv2d_transpose
 PDNode *patterns::ConvBias::operator()(
     paddle::framework::ir::PDNode *conv_input, std::string conv_type) {
diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h
index d6400ed6945bf8a60c1d4f357bf58a11d5b87094..062d2f9dedce65f6e16b70f0b201a4ca63b0531a 100644
--- a/paddle/fluid/framework/ir/graph_pattern_detector.h
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.h
@@ -487,6 +487,28 @@ struct ConvActivation : public PatternBase {
   PATTERN_DECL_NODE(activation_out);
 };
 
+// Elementwise with Activation
+// op: elementwise + activation
+// named nodes:
+// elementwise_a, elementwise_b,
+// elementwise_out, elementwise,
+// activation_out, activation
+struct ElementwiseActivation : public PatternBase {
+  ElementwiseActivation(PDPattern* pattern, const std::string& name_scope)
+      : PatternBase(pattern, name_scope, "elementwise_add_activation") {}
+
+  PDNode* operator()(PDNode* elementwise_a, const std::string& elementwise_type,
+                     const std::string& activation_type);
+
+  // declare operator node's name
+  PATTERN_DECL_NODE(elementwise);
+  PATTERN_DECL_NODE(activation);
+  // declare variable node's name
+  PATTERN_DECL_NODE(elementwise_b);
+  PATTERN_DECL_NODE(elementwise_out);
+  PATTERN_DECL_NODE(activation_out);
+};
+
 // SEQCONV with Elementwise_Add ReLU
 // op: seqconv + elementwise_add + relu
 // named nodes:
@@ -863,6 +885,65 @@ struct ElewiseAddActInplaceGrad : public PatternBase {
   PATTERN_DECL_NODE(ele_y);
 };
 
+// The following patterns are used to fuse linear and act (ReLu or GeLU)
+// formula: act(F.linear(x))
+// op: matmul_v2 + elementwise_add + act
+// named nodes: matmul, elementwise_add, act
+//              matmul_w, matmul_out
+//              ele_bias, elewise_add_out, act_out
+struct LinearAct : public PatternBase {
+  LinearAct(PDPattern* pattern, const std::string& name_scope)
+      : PatternBase(pattern, name_scope, "linear_act") {}
+
+  PDNode* operator()(PDNode* x,
+                     const std::unordered_set<std::string>& act_types,
+                     bool with_grad_link, bool is_act_grad_x_from_act);
+
+  // declare operator node's name
+  PATTERN_DECL_NODE(matmul);
+  PATTERN_DECL_NODE(ele_add);
+  PATTERN_DECL_NODE(act);
+  PATTERN_DECL_NODE(act_grad);
+  // declare variable node's name
+  PATTERN_DECL_NODE(matmul_w);
+  PATTERN_DECL_NODE(matmul_out);
+  PATTERN_DECL_NODE(elewise_add_out);
+  PATTERN_DECL_NODE(ele_bias);
+  PATTERN_DECL_NODE(act_out);
+};
+
+// The following patterns are used to fuse linear_grad and act_grad (ReLu or
+// GeLU)
+// formula: the backward of F.linear( act(x) )
+// op: elementwise_add_grad + matmul_v2_grad + act_grad
+// named nodes: ele_add_grad, matmul_grad, act_grad
+//              ele_grad_bias, ele_grad_dx, ele_grad_dbias
+//              matmul_grad_x, matmul_grad_dx, matmul_grad_dx
+//              matmul_grad_dw, act_grad_dx
+struct ElewiseAddMatmulAct : public PatternBase {
+  ElewiseAddMatmulAct(PDPattern* pattern, const std::string& name_scope)
+      : PatternBase(pattern, name_scope, "elewiseadd_matmul_act") {}
+
+  PDNode* operator()(PDNode* x,
+                     const std::unordered_set<std::string>& act_grad_types,
+                     bool without_x_gradient, bool is_act_grad_x_from_act);
+
+  // declare operator node's name
+  PATTERN_DECL_NODE(ele_add_grad);
+  PATTERN_DECL_NODE(matmul_grad);
+  PATTERN_DECL_NODE(act_grad);
+  // declare variable node's name
+  PATTERN_DECL_NODE(ele_out);
+  PATTERN_DECL_NODE(ele_grad_bias);
+  PATTERN_DECL_NODE(ele_grad_dx);
+  PATTERN_DECL_NODE(ele_grad_dbias);
+  PATTERN_DECL_NODE(matmul_grad_x);
+  PATTERN_DECL_NODE(matmul_grad_w);
+  PATTERN_DECL_NODE(matmul_grad_dx);
+  PATTERN_DECL_NODE(matmul_grad_dw);
+  PATTERN_DECL_NODE(act_grad_dx);
+};
+
 // Conv with Elementwise_add as bias
 // op: conv + elementwise_add
 // named nodes:
diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/test_reference_count_pass_last_lived_ops.cc b/paddle/fluid/framework/ir/memory_optimize_pass/test_reference_count_pass_last_lived_ops.cc
index d33dc7f49feb0f4c9e585d13186d65b6c2d618c0..636a594a657cb0744aac161d928ff9078b1f92bc 100644
--- a/paddle/fluid/framework/ir/memory_optimize_pass/test_reference_count_pass_last_lived_ops.cc
+++ b/paddle/fluid/framework/ir/memory_optimize_pass/test_reference_count_pass_last_lived_ops.cc
@@ -20,12 +20,15 @@
 #include "paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.h"
 #include "paddle/fluid/framework/parallel_executor.h"
 #include "paddle/fluid/framework/program_desc.h"
+#include "paddle/phi/core/kernel_registry.h"
 
 USE_OP_ITSELF(scale);
 USE_OP(elementwise_mul);
 USE_OP_ITSELF(elementwise_add);
 USE_OP_ITSELF(elementwise_add_grad);
 
+PD_DECLARE_KERNEL(scale, CPU, ALL_LAYOUT);
+
 DECLARE_double(eager_delete_tensor_gb);
 
 namespace paddle {
diff --git a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc
index c537d05738529dcb885e86cbcabf4405fd75270b..2403e60df3918394e99c3284b2a417e336fc3bae 100644
--- a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc
@@ -22,6 +22,7 @@
 
 #include "paddle/fluid/framework/ir/graph_traits.h"
 #include "paddle/fluid/framework/op_version_registry.h"
+#include "paddle/fluid/string/pretty_log.h"
 
 namespace paddle {
 namespace framework {
@@ -135,157 +136,9 @@ ResidualConnectionMKLDNNFusePass::ResidualConnectionMKLDNNFusePass() {
       .End();
 }
 
-ResidualConnectionMKLDNNFusePass::IdentityFuseHandle::IdentityFuseHandle(
-    const ResidualConnectionMKLDNNFusePass::CanFuseFunc& can_fuse_func,
-    const ResidualConnectionMKLDNNFusePass::IdentityConvFunc&
-        get_node_from_conv_op,
-    const ResidualConnectionMKLDNNFusePass::IdentityElementwiseAddFunc&
-        get_node_from_elementwise_add_op,
-    const ResidualConnectionMKLDNNFusePass* pass)
-    : fusion_stats{std::make_shared<int>(0)},
-      can_fuse_func{can_fuse_func},
-      get_node_from_conv_op{get_node_from_conv_op},
-      get_node_from_elementwise_add_op{get_node_from_elementwise_add_op},
-      pass_{pass} {}
-
-void ResidualConnectionMKLDNNFusePass::IdentityFuseHandle::operator()(
-    const GraphPatternDetector::subgraph_t& subgraph, Graph* graph) {
-  Node* conv_op;
-  Node* conv_input;
-  Node* conv_filter;
-  Node* conv_output;
-
-  Node* elementwise_add_op;
-  Node* elementwise_add_identity;
-  Node* elementwise_add_out;
-
-  std::tie(conv_op, conv_input, conv_filter, conv_output) =
-      get_node_from_conv_op(subgraph);
-  std::tie(elementwise_add_op, elementwise_add_identity, elementwise_add_out) =
-      get_node_from_elementwise_add_op(subgraph);
-
-  if (!can_fuse_func(conv_op, elementwise_add_op)) return;
-
-  if (!IsReachable(graph, elementwise_add_identity, conv_output)) return;
-
-  if (HasFusedActivation(conv_op)) return;
-
-  if (!pass_->IsCompat(subgraph, graph)) {
-    LOG(WARNING)
-        << "conv_elementwise_add_mkldnn_fuse_pass in op compat failed.";
-    return;
-  }
-
-  conv_op->Op()->SetInput("ResidualData", {elementwise_add_identity->Name()});
-  conv_op->Op()->SetOutput("Output", {elementwise_add_out->Name()});
-  conv_op->Op()->SetAttr("fuse_residual_connection", true);
-
-  GraphSafeRemoveNodes(graph, {conv_output, elementwise_add_op});
-
-  IR_NODE_LINK_TO(elementwise_add_identity, conv_op);
-  IR_NODE_LINK_TO(conv_op, elementwise_add_out);
-
-  (*fusion_stats)++;
-}
-
-ResidualConnectionMKLDNNFusePass::ProjectionFuseHandle::ProjectionFuseHandle(
-    const ResidualConnectionMKLDNNFusePass::CanFuseFunc& can_fuse_func,
-    const ResidualConnectionMKLDNNFusePass::ProjectionConvFunc&
-        get_node_from_conv_x_op,
-    const ResidualConnectionMKLDNNFusePass::ProjectionConvFunc&
-        get_node_from_conv_y_op,
-    const ResidualConnectionMKLDNNFusePass::ProjectionElementwiseAddFunc&
-        get_node_from_elementwise_add_op,
-    const ResidualConnectionMKLDNNFusePass* pass)
-    : fusion_stats{std::make_shared<int>(0)},
-      can_fuse_func{can_fuse_func},
-      get_node_from_conv_x_op{get_node_from_conv_x_op},
-      get_node_from_conv_y_op{get_node_from_conv_y_op},
-      get_node_from_elementwise_add_op{get_node_from_elementwise_add_op},
-      pass_{pass} {}
-
-void ResidualConnectionMKLDNNFusePass::ProjectionFuseHandle::operator()(
-    const GraphPatternDetector::subgraph_t& subgraph, Graph* graph) {
-  Node* conv_x_op;
-  Node* conv_x_input;
-  Node* conv_x_filter;
-  Node* conv_x_output;
-
-  Node* conv_y_op;
-  Node* conv_y_input;
-  Node* conv_y_filter;
-  Node* conv_y_output;
-
-  Node* elementwise_add_op;
-  Node* elementwise_add_out;
-
-  if (!pass_->IsCompat(subgraph, graph)) {
-    LOG(WARNING)
-        << "conv_elementwise_add_mkldnn_fuse_pass in op compat failed.";
-    return;
-  }
-
-  std::tie(conv_x_op, conv_x_input, conv_x_filter, conv_x_output) =
-      get_node_from_conv_x_op(subgraph);
-  std::tie(conv_y_op, conv_y_input, conv_y_filter, conv_y_output) =
-      get_node_from_conv_y_op(subgraph);
-  std::tie(elementwise_add_op, elementwise_add_out) =
-      get_node_from_elementwise_add_op(subgraph);
-
-  if (!can_fuse_func(conv_x_op, elementwise_add_op)) return;
-  if (!can_fuse_func(conv_y_op, elementwise_add_op)) return;
-
-  Node* projection_node;
-  Node* residual_conv_op;
-  Node* residual_conv_output;
-
-  if (IsReachable(graph, conv_x_input, conv_y_output)) {
-    projection_node = conv_x_output;
-    residual_conv_op = conv_y_op;
-    residual_conv_output = conv_y_output;
-  } else if (IsReachable(graph, conv_y_input, conv_x_output)) {
-    projection_node = conv_y_output;
-    residual_conv_op = conv_x_op;
-    residual_conv_output = conv_x_output;
-  } else {
-    return;
-  }
-
-  if (HasFusedActivation(residual_conv_op)) return;
-
-  residual_conv_op->Op()->SetInput("ResidualData", {projection_node->Name()});
-  residual_conv_op->Op()->SetOutput("Output", {elementwise_add_out->Name()});
-
-  residual_conv_op->Op()->SetAttr("fuse_residual_connection", true);
-
-  GraphSafeRemoveNodes(graph, {residual_conv_output, elementwise_add_op});
-
-  IR_NODE_LINK_TO(projection_node, residual_conv_op);
-  IR_NODE_LINK_TO(residual_conv_op, elementwise_add_out);
-
-  (*fusion_stats)++;
-}
-
-std::tuple<Node*, Node*, Node*, Node*>
-ResidualConnectionMKLDNNFusePass::GetNodesFromConv(
-    const patterns::Conv& conv_pattern,
-    const GraphPatternDetector::subgraph_t& subgraph) const {
-  GET_IR_NODE_FROM_SUBGRAPH(conv_op, conv_op, conv_pattern);
-  GET_IR_NODE_FROM_SUBGRAPH(conv_input, conv_input, conv_pattern);
-  GET_IR_NODE_FROM_SUBGRAPH(conv_filter, conv_filter, conv_pattern);
-  GET_IR_NODE_FROM_SUBGRAPH(conv_output, conv_output, conv_pattern);
-
-  return std::make_tuple(conv_op, conv_input, conv_filter, conv_output);
-}
-
 GraphWithStats ResidualConnectionMKLDNNFusePass::FuseConvAsX(
     const std::string& name_scope,
     const GraphWithStats& graph_with_stats) const {
-  ir::Graph* graph;
-  int stats;
-
-  std::tie(graph, stats) = graph_with_stats;
-
   GraphPatternDetector gpd;
   auto pattern = gpd.mutable_pattern();
 
@@ -298,26 +151,56 @@ GraphWithStats ResidualConnectionMKLDNNFusePass::FuseConvAsX(
       pattern->NewNode(elementwise_add_pattern.elementwise_add_y_repr()));
   conv_output->AsIntermediate();
 
-  auto get_node_from_elementwise_add = [&elementwise_add_pattern](
-      const GraphPatternDetector::subgraph_t& subgraph)
-      -> std::tuple<Node*, Node*, Node*> {
-        GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_op, elementwise_add_op,
-                                  elementwise_add_pattern);
-        GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_y, elementwise_add_y,
-                                  elementwise_add_pattern);
-        GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_out, elementwise_add_out,
-                                  elementwise_add_pattern);
-
-        return std::make_tuple(elementwise_add_op, elementwise_add_y,
-                               elementwise_add_out);
-      };
-
-  return ExecuteHandleOnGraph<IdentityFuseHandle>(
-      &gpd, graph_with_stats,
-      [this, &conv_pattern](const GraphPatternDetector::subgraph_t& subgraph) {
-        return GetNodesFromConv(conv_pattern, subgraph);
-      },
-      get_node_from_elementwise_add, this);
+  int found_conv_as_x_count = 0;
+
+  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
+                     Graph* g) {
+    GET_IR_NODE_FROM_SUBGRAPH(conv_op, conv_op, conv_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(conv_input, conv_input, conv_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(conv_filter, conv_filter, conv_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(conv_output, conv_output, conv_pattern);
+
+    GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_op, elementwise_add_op,
+                              elementwise_add_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_identity, elementwise_add_y,
+                              elementwise_add_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_out, elementwise_add_out,
+                              elementwise_add_pattern);
+
+    if (FindFuseOption(*conv_op, *elementwise_add_op) != FUSE_MKLDNN) return;
+
+    if (!IsReachable(g, elementwise_add_identity, conv_output)) return;
+
+    if (HasFusedActivation(conv_op)) return;
+
+    if (!IsCompat(subgraph, g)) {
+      LOG(WARNING)
+          << "conv_elementwise_add_mkldnn_fuse_pass in op compat failed.";
+      return;
+    }
+
+    conv_op->Op()->SetInput("ResidualData", {elementwise_add_identity->Name()});
+    conv_op->Op()->SetOutput("Output", {elementwise_add_out->Name()});
+    conv_op->Op()->SetAttr("fuse_residual_connection", true);
+
+    GraphSafeRemoveNodes(g, {conv_output, elementwise_add_op});
+
+    IR_NODE_LINK_TO(elementwise_add_identity, conv_op);
+    IR_NODE_LINK_TO(conv_op, elementwise_add_out);
+
+    found_conv_as_x_count++;
+  };
+
+  gpd(graph_with_stats.first, handler);
+  if (!Has("disable_logs") || !Get<bool>("disable_logs")) {
+    std::stringstream msg_ss;
+    msg_ss << "---    Fused " << found_conv_as_x_count
+           << " conv (as x) + elementwise_add patterns";
+    paddle::string::PrettyLogDetail(msg_ss.str().c_str());
+  }
+
+  return std::make_pair(graph_with_stats.first,
+                        found_conv_as_x_count + graph_with_stats.second);
 }
 
 GraphWithStats ResidualConnectionMKLDNNFusePass::FuseConvAsY(
@@ -335,26 +218,56 @@ GraphWithStats ResidualConnectionMKLDNNFusePass::FuseConvAsY(
       conv_output);
   conv_output->AsIntermediate();
 
-  auto get_node_from_elementwise_add = [&elementwise_add_pattern](
-      const GraphPatternDetector::subgraph_t& subgraph)
-      -> std::tuple<Node*, Node*, Node*> {
-        GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_op, elementwise_add_op,
-                                  elementwise_add_pattern);
-        GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_x, elementwise_add_x,
-                                  elementwise_add_pattern);
-        GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_out, elementwise_add_out,
-                                  elementwise_add_pattern);
-
-        return std::make_tuple(elementwise_add_op, elementwise_add_x,
-                               elementwise_add_out);
-      };
-
-  return ExecuteHandleOnGraph<IdentityFuseHandle>(
-      &gpd, graph_with_stats,
-      [this, &conv_pattern](const GraphPatternDetector::subgraph_t& subgraph) {
-        return GetNodesFromConv(conv_pattern, subgraph);
-      },
-      get_node_from_elementwise_add, this);
+  int found_conv_as_y_count = 0;
+
+  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
+                     Graph* g) {
+    GET_IR_NODE_FROM_SUBGRAPH(conv_op, conv_op, conv_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(conv_input, conv_input, conv_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(conv_filter, conv_filter, conv_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(conv_output, conv_output, conv_pattern);
+
+    GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_op, elementwise_add_op,
+                              elementwise_add_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_x, elementwise_add_x,
+                              elementwise_add_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_out, elementwise_add_out,
+                              elementwise_add_pattern);
+
+    if (FindFuseOption(*conv_op, *elementwise_add_op) != FUSE_MKLDNN) return;
+
+    if (!IsReachable(g, elementwise_add_x, conv_output)) return;
+
+    if (HasFusedActivation(conv_op)) return;
+
+    if (!IsCompat(subgraph, g)) {
+      LOG(WARNING)
+          << "conv_elementwise_add_mkldnn_fuse_pass in op compat failed.";
+      return;
+    }
+
+    conv_op->Op()->SetInput("ResidualData", {elementwise_add_x->Name()});
+    conv_op->Op()->SetOutput("Output", {elementwise_add_out->Name()});
+    conv_op->Op()->SetAttr("fuse_residual_connection", true);
+
+    GraphSafeRemoveNodes(g, {conv_output, elementwise_add_op});
+
+    IR_NODE_LINK_TO(elementwise_add_x, conv_op);
+    IR_NODE_LINK_TO(conv_op, elementwise_add_out);
+
+    found_conv_as_y_count++;
+  };
+
+  gpd(graph_with_stats.first, handler);
+  if (!Has("disable_logs") || !Get<bool>("disable_logs")) {
+    std::stringstream msg_ss;
+    msg_ss << "---    Fused " << found_conv_as_y_count
+           << " conv (as y) + elementwise_add patterns";
+    paddle::string::PrettyLogDetail(msg_ss.str().c_str());
+  }
+
+  return std::make_pair(graph_with_stats.first,
+                        found_conv_as_y_count + graph_with_stats.second);
 }
 
 GraphWithStats ResidualConnectionMKLDNNFusePass::FuseProjectionConv(
@@ -374,39 +287,84 @@ GraphWithStats ResidualConnectionMKLDNNFusePass::FuseProjectionConv(
   conv_x_output->AsIntermediate();
   conv_y_output->AsIntermediate();
 
-  auto get_node_from_elementwise_add = [&elementwise_add_pattern](
-      const GraphPatternDetector::subgraph_t& subgraph)
-      -> std::tuple<Node*, Node*> {
-        GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_op, elementwise_add_op,
-                                  elementwise_add_pattern);
-        GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_out, elementwise_add_out,
-                                  elementwise_add_pattern);
-
-        return std::make_tuple(elementwise_add_op, elementwise_add_out);
-      };
-
-  return ExecuteHandleOnGraph<ProjectionFuseHandle>(
-      &gpd, graph_with_stats,
-      [this,
-       &conv_x_pattern](const GraphPatternDetector::subgraph_t& subgraph) {
-        return GetNodesFromConv(conv_x_pattern, subgraph);
-      },
-      [this,
-       &conv_y_pattern](const GraphPatternDetector::subgraph_t& subgraph) {
-        return GetNodesFromConv(conv_y_pattern, subgraph);
-      },
-      get_node_from_elementwise_add, this);
+  int found_projection_conv_count = 0;
+
+  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
+                     Graph* g) {
+    GET_IR_NODE_FROM_SUBGRAPH(conv_x_op, conv_op, conv_x_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(conv_x_input, conv_input, conv_x_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(conv_x_filter, conv_filter, conv_x_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(conv_x_output, conv_output, conv_x_pattern);
+
+    GET_IR_NODE_FROM_SUBGRAPH(conv_y_op, conv_op, conv_y_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(conv_y_input, conv_input, conv_y_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(conv_y_filter, conv_filter, conv_y_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(conv_y_output, conv_output, conv_y_pattern);
+
+    GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_op, elementwise_add_op,
+                              elementwise_add_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_out, elementwise_add_out,
+                              elementwise_add_pattern);
+
+    if (!IsCompat(subgraph, g)) {
+      LOG(WARNING)
+          << "conv_elementwise_add_mkldnn_fuse_pass in op compat failed.";
+      return;
+    }
+
+    if (FindFuseOption(*conv_x_op, *elementwise_add_op) != FUSE_MKLDNN) return;
+    if (FindFuseOption(*conv_y_op, *elementwise_add_op) != FUSE_MKLDNN) return;
+
+    Node* projection_node;
+    Node* residual_conv_op;
+    Node* residual_conv_output;
+    if (IsReachable(g, conv_x_input, conv_y_output)) {
+      projection_node = conv_x_output;
+      residual_conv_op = conv_y_op;
+      residual_conv_output = conv_y_output;
+    } else if (IsReachable(g, conv_y_input, conv_x_output)) {
+      projection_node = conv_y_output;
+      residual_conv_op = conv_x_op;
+      residual_conv_output = conv_x_output;
+    } else {
+      return;
+    }
+
+    if (HasFusedActivation(residual_conv_op)) return;
+
+    residual_conv_op->Op()->SetInput("ResidualData", {projection_node->Name()});
+    residual_conv_op->Op()->SetOutput("Output", {elementwise_add_out->Name()});
+
+    residual_conv_op->Op()->SetAttr("fuse_residual_connection", true);
+
+    GraphSafeRemoveNodes(g, {residual_conv_output, elementwise_add_op});
+
+    IR_NODE_LINK_TO(projection_node, residual_conv_op);
+    IR_NODE_LINK_TO(residual_conv_op, elementwise_add_out);
+
+    found_projection_conv_count++;
+  };
+
+  gpd(graph_with_stats.first, handler);
+  if (!Has("disable_logs") || !Get<bool>("disable_logs")) {
+    std::stringstream msg_ss;
+    msg_ss << "---    Fused " << found_projection_conv_count
+           << " projection conv (as y) + elementwise_add patterns";
+    paddle::string::PrettyLogDetail(msg_ss.str().c_str());
+  }
+
+  return std::make_pair(graph_with_stats.first,
+                        found_projection_conv_count + graph_with_stats.second);
 }
 
-void ResidualConnectionMKLDNNFusePass::ApplyImpl(graph_ptr graph) const {
+void ResidualConnectionMKLDNNFusePass::ApplyImpl(ir::Graph* graph) const {
   FusePassBase::Init(name_scope_, graph);
-  auto fused_graph_with_stats = FuseConvAsY(
-      name_scope_,
-      FuseConvAsX(name_scope_,
-                  FuseProjectionConv(name_scope_, std::make_pair(graph, 0))));
+  auto graph_with_stats =
+      FuseProjectionConv(name_scope_, std::make_pair(graph, 0));
+  graph_with_stats = FuseConvAsX(name_scope_, graph_with_stats);
+  graph_with_stats = FuseConvAsY(name_scope_, graph_with_stats);
 
-  LOG(INFO) << "Fused graph " << fused_graph_with_stats.second << "\n";
-  AddStatis(fused_graph_with_stats.second);
+  AddStatis(graph_with_stats.second);
 }
 }  // namespace ir
 }  // namespace framework
diff --git a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h
index c83335da2f629c128fcf4819b2645ab1ef5eae42..c4351b382187d9062a059d013ddb237520645b6d 100644
--- a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h
+++ b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h
@@ -28,19 +28,9 @@ namespace paddle {
 namespace framework {
 namespace ir {
 
-class Graph;
-class GraphPatternDetector;
-class Node;
-namespace patterns {
-struct Conv;
-}  // namespace patterns
-
-using graph_ptr = ir::Graph*;
 using GraphWithStats = std::pair<ir::Graph*, int>;
 
-void CorrectGraphEdges(Graph* graph, Node* from, Node* to);
 bool IsReachable(ir::Graph* graph, Node* from, Node* to);
-paddle::optional<Node*> HasBias(const Node& op, const std::string& bias_name);
 
 class ResidualConnectionMKLDNNFusePass : public FusePassBase {
  private:
@@ -52,91 +42,13 @@ class ResidualConnectionMKLDNNFusePass : public FusePassBase {
       const std::string& name_scope,
       const GraphWithStats& graph_with_stats) const;
 
-  template <typename RetType>
-  using GetNodeFunc =
-      std::function<RetType(const GraphPatternDetector::subgraph_t& subgraph)>;
-  using IdentityConvFunc = GetNodeFunc<std::tuple<Node*, Node*, Node*, Node*>>;
-  using IdentityElementwiseAddFunc =
-      GetNodeFunc<std::tuple<Node*, Node*, Node*>>;
-
-  using ProjectionConvFunc = IdentityConvFunc;
-  using ProjectionElementwiseAddFunc = GetNodeFunc<std::tuple<Node*, Node*>>;
-
-  using CanFuseFunc = std::function<bool(Node*, Node*)>;
-
-  std::tuple<Node*, Node*, Node*, Node*> GetNodesFromConv(
-      const patterns::Conv& conv_pattern,
-      const GraphPatternDetector::subgraph_t& subgraph) const;
-
-  std::tuple<Node*, Node*, Node*, Node*> GetNodesFromProjectionConv(
-      const patterns::Conv& conv_pattern,
-      const GraphPatternDetector::subgraph_t& subgraph) const;
-
-  template <typename HandleType, typename... OpFuncs>
-  GraphWithStats ExecuteHandleOnGraph(GraphPatternDetector* gpd,
-                                      const GraphWithStats& graph_with_stats,
-                                      OpFuncs&&... op_funcs) const {
-    ir::Graph* graph;
-    int stats;
-
-    std::tie(graph, stats) = graph_with_stats;
-
-    auto can_fuse = [this](Node* op1, Node* op2) -> bool {
-      return this->FindFuseOption(*op1, *op2) == FUSE_MKLDNN;
-    };
-    auto fuse_handle = HandleType{can_fuse, std::forward<OpFuncs>(op_funcs)...};
-
-    (*gpd)(graph, fuse_handle);
-
-    return std::make_pair(graph, stats + fuse_handle.get_stats());
-  }
-
-  struct IdentityFuseHandle {
-    IdentityFuseHandle(
-        const CanFuseFunc& can_fuse_func,
-        const IdentityConvFunc& get_node_from_conv_op,
-        const IdentityElementwiseAddFunc& get_node_from_elementwise_add_op,
-        const ResidualConnectionMKLDNNFusePass* pass);
-
-    void operator()(const GraphPatternDetector::subgraph_t& subgraph,
-                    Graph* graph);
-    int get_stats() const { return *fusion_stats; }
-
-   private:
-    std::shared_ptr<int> fusion_stats;
-    CanFuseFunc can_fuse_func;
-    IdentityConvFunc get_node_from_conv_op;
-    IdentityElementwiseAddFunc get_node_from_elementwise_add_op;
-    const ResidualConnectionMKLDNNFusePass* pass_;
-  };
-
-  struct ProjectionFuseHandle {
-    ProjectionFuseHandle(
-        const CanFuseFunc& can_fuse_func,
-        const ProjectionConvFunc& get_node_from_conv_x_op,
-        const ProjectionConvFunc& get_node_from_conv_y_op,
-        const ProjectionElementwiseAddFunc& get_node_from_elementwise_add_op,
-        const ResidualConnectionMKLDNNFusePass* pass);
-
-    void operator()(const GraphPatternDetector::subgraph_t& subgraph,
-                    Graph* graph);
-    int get_stats() const { return *fusion_stats; }
-
-   private:
-    std::shared_ptr<int> fusion_stats;
-    CanFuseFunc can_fuse_func;
-    ProjectionConvFunc get_node_from_conv_x_op;
-    ProjectionConvFunc get_node_from_conv_y_op;
-    ProjectionElementwiseAddFunc get_node_from_elementwise_add_op;
-    const ResidualConnectionMKLDNNFusePass* pass_;
-  };
-
  public:
   ResidualConnectionMKLDNNFusePass();
   virtual ~ResidualConnectionMKLDNNFusePass() {}
 
  protected:
-  void ApplyImpl(graph_ptr graph) const;
+  void ApplyImpl(ir::Graph* graph) const;
+
   static bool HasFusedActivation(Node* conv_node) {
     return !(conv_node->Op()
                  ->GetAttrIfExists<std::string>("fuse_activation")
diff --git a/paddle/fluid/framework/ir/mkldnn/elt_act_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/elt_act_mkldnn_fuse_pass.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b7f7a8071d21413f45d86e98b8649a3aaba5d2f5
--- /dev/null
+++ b/paddle/fluid/framework/ir/mkldnn/elt_act_mkldnn_fuse_pass.cc
@@ -0,0 +1,145 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/ir/mkldnn/elt_act_mkldnn_fuse_pass.h"
+#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
+#include "paddle/fluid/framework/op_version_registry.h"
+#include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/string/pretty_log.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+using string::PrettyLogDetail;
+
+void ElementwiseActivationOneDNNPass::ApplyImpl(Graph *graph) const {
+  std::vector<std::string> act_types = {
+      "relu", "tanh", "leaky_relu", "swish", "hardswish", "sqrt",
+      "abs",  "clip", "gelu",       "relu6", "sigmoid"};
+  std::vector<std::string> elt_types = {"elementwise_add", "elementwise_sub",
+                                        "elementwise_mul"};
+
+  for (const auto &elt_type : elt_types)
+    for (const auto &act_type : act_types) {
+      std::unordered_map<std::string, std::string> attr_map;
+
+      if (act_type == "swish")
+        attr_map.emplace("beta", "activation_alpha");
+      else if (act_type == "relu6")
+        attr_map.emplace("threshold", "activation_alpha");
+      else if (act_type == "clip") {
+        attr_map.emplace("min", "activation_alpha");
+        attr_map.emplace("max", "activation_beta");
+      } else {
+        attr_map.emplace("alpha", "activation_alpha");
+        attr_map.emplace("beta", "activation_beta");
+      }
+      FuseElementwiseAct(graph, elt_type, act_type, attr_map);
+    }
+}
+
+void ElementwiseActivationOneDNNPass::FuseElementwiseAct(
+    Graph *graph, const std::string &elt_type, const std::string &act_type,
+    const std::unordered_map<std::string, std::string> &attr_map) const {
+  PADDLE_ENFORCE_NOT_NULL(
+      graph, platform::errors::InvalidArgument("Graph cannot be nullptr."));
+  FusePassBase::Init("elementwise_act", graph);
+
+  GraphPatternDetector gpd;
+  auto *elementwise_input = gpd.mutable_pattern()
+                                ->NewNode(elt_type + "_act/elementwise_input")
+                                ->AsInput()
+                                ->assert_is_op_input(elt_type, "X");
+  patterns::ElementwiseActivation elementwise_act_pattern(gpd.mutable_pattern(),
+                                                          elt_type + "_act");
+  elementwise_act_pattern(elementwise_input, elt_type, act_type);
+
+  int found_elementwise_activation_count = 0;
+  auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph,
+                     Graph *g) {
+    VLOG(4) << "Fuse " << elt_type << " with activation op.";
+    // Elementwise output
+    GET_IR_NODE_FROM_SUBGRAPH(elementwise_out, elementwise_out,
+                              elementwise_act_pattern);
+    // ACT output
+    GET_IR_NODE_FROM_SUBGRAPH(activation_out, activation_out,
+                              elementwise_act_pattern);
+    // ops
+    GET_IR_NODE_FROM_SUBGRAPH(elementwise, elementwise,
+                              elementwise_act_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(activation, activation, elementwise_act_pattern);
+
+    auto *elementwise_op = elementwise->Op();
+
+    if (elementwise_op->HasAttr("use_mkldnn")) {
+      const std::string wo_elt_type =
+          "The " + elt_type;  // Workaround for PP error message checking.
+      PADDLE_ENFORCE_EQ(
+          BOOST_GET_CONST(bool, elementwise_op->GetAttr("use_mkldnn")), true,
+          platform::errors::PreconditionNotMet(
+              wo_elt_type + "+Act fusion may happen only when oneDNN library "
+                            "is used."));
+    }
+
+    auto *activation_op = activation->Op();
+    for (const auto &attr : attr_map) {
+      if (activation_op->HasAttr(attr.first)) {
+        elementwise_op->SetAttr(attr.second,
+                                activation_op->GetAttr(attr.first));
+      }
+    }
+
+    if (act_type == "gelu" && activation_op->HasAttr("approximate") &&
+        BOOST_GET_CONST(bool, activation_op->GetAttr("approximate")))
+      elementwise_op->SetAttr("activation_type", std::string("gelu_tanh"));
+    else
+      elementwise_op->SetAttr("activation_type", act_type);
+
+    elementwise_op->SetOutput("Out", {activation_out->Name()});
+
+    IR_OP_VAR_LINK(elementwise, activation_out);
+    GraphSafeRemoveNodes(g, {activation, elementwise_out});
+    found_elementwise_activation_count++;
+  };
+
+  gpd(graph, handler);
+  AddStatis(found_elementwise_activation_count);
+  PrettyLogDetail("---    fused %d %s with %s activation",
+                  found_elementwise_activation_count, elt_type, act_type);
+}
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_PASS(elt_act_mkldnn_fuse_pass,
+              paddle::framework::ir::ElementwiseActivationOneDNNPass);
+REGISTER_PASS_CAPABILITY(elt_act_mkldnn_fuse_pass)
+    .AddCombination(
+        paddle::framework::compatible::OpVersionComparatorCombination()
+            .LE("elementwise_add", 1)
+            .LE("elementwise_sub", 1)
+            .LE("elementwise_mul", 1)
+            .LE("relu", 0)
+            .LE("tanh", 0)
+            .LE("leaky_relu", 1)
+            .LE("swish", 0)
+            .LE("hard_swish", 0)
+            .LE("sqrt", 0)
+            .LE("abs", 0)
+            .LE("clip", 1)
+            .LE("gelu", 0)
+            .LE("relu6", 0)
+            .LE("sigmoid", 0));
diff --git a/paddle/fluid/framework/ir/mkldnn/elt_act_mkldnn_fuse_pass.h b/paddle/fluid/framework/ir/mkldnn/elt_act_mkldnn_fuse_pass.h
new file mode 100644
index 0000000000000000000000000000000000000000..b8b7d06a828508e9773301bfc602e01f9354eac4
--- /dev/null
+++ b/paddle/fluid/framework/ir/mkldnn/elt_act_mkldnn_fuse_pass.h
@@ -0,0 +1,44 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <string>
+
+#include "paddle/fluid/framework/ir/fuse_pass_base.h"
+#include "paddle/fluid/framework/ir/graph.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+/*
+ * \brief   Fuse the Elementwise and activation operators into single
+ * OneDNN's Elementwise with post-op.
+ */
+class ElementwiseActivationOneDNNPass : public FusePassBase {
+ public:
+  virtual ~ElementwiseActivationOneDNNPass() {}
+
+ protected:
+  void ApplyImpl(Graph *graph) const override;
+
+  void FuseElementwiseAct(
+      Graph *graph, const std::string &elt_types, const std::string &act_types,
+      const std::unordered_map<std::string, std::string> &attr_map) const;
+};
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc
index 96aa95bde337436dd6eb584b3eea5395b5301a34..11190309814e7c75777a6cddd7e4d24bfc7ba9e6 100644
--- a/paddle/fluid/framework/ir/mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc
+++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc
@@ -12,12 +12,13 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include <random>
 #include <string>
+#include <unordered_set>
 
-#include <gtest/gtest.h>
 #include <boost/logic/tribool.hpp>
-#include <random>
-#include <unordered_set>
+
+#include "gtest/gtest.h"
 #include "paddle/fluid/framework/ir/graph_traits.h"
 #include "paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h"
 #include "paddle/fluid/framework/ir/pass_tester_helper.h"
@@ -25,7 +26,7 @@
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/place.h"
 
-USE_OP(batch_norm);
+USE_OP_ITSELF(batch_norm);
 USE_OP_DEVICE_KERNEL(batch_norm, MKLDNN);
 USE_OP(conv2d_transpose);
 USE_OP_DEVICE_KERNEL(conv2d_transpose, MKLDNN);
diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc
index 0a95444f852dd0abdd150d04dc7536e26151c218..ef2e83ced26e07f199a122ee3157eb428b63aec9 100644
--- a/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc
+++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc
@@ -15,8 +15,9 @@
 #include "paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass.h"
 
 #include <gtest/gtest.h>
-#include <boost/logic/tribool.hpp>
 #include <unordered_set>
+
+#include <boost/logic/tribool.hpp>
 #include "paddle/fluid/framework/ir/pass_tester_helper.h"
 #include "paddle/fluid/framework/op_registry.h"
 
@@ -24,11 +25,11 @@ USE_OP_ITSELF(softmax);
 USE_OP_DEVICE_KERNEL(softmax, MKLDNN);
 USE_OP_ITSELF(elementwise_add);
 USE_OP_DEVICE_KERNEL(elementwise_add, MKLDNN);
-USE_OP(leaky_relu);
+USE_OP_ITSELF(leaky_relu);
 USE_OP_DEVICE_KERNEL(leaky_relu, MKLDNN);
 USE_OP(gelu);
-USE_OP(relu);
-USE_OP(tanh);
+USE_OP_ITSELF(relu);
+USE_OP_ITSELF(tanh);
 USE_OP_DEVICE_KERNEL(tanh, MKLDNN);
 
 namespace paddle {
diff --git a/paddle/fluid/framework/new_executor/standalone_executor_test.cc b/paddle/fluid/framework/new_executor/standalone_executor_test.cc
index 2c3359ffa8e46f0d30a01d73fccb95d88771480a..eadb00b9e88e14075c46a53c711fd43774f26581 100644
--- a/paddle/fluid/framework/new_executor/standalone_executor_test.cc
+++ b/paddle/fluid/framework/new_executor/standalone_executor_test.cc
@@ -32,12 +32,12 @@ USE_OP(concat);
 USE_OP(matmul);
 USE_OP_ITSELF(elementwise_add);
 USE_OP(sigmoid);
-USE_OP(tanh);
+USE_OP_ITSELF(tanh);
 USE_OP(elementwise_mul);
 USE_OP(softmax_with_cross_entropy);
 USE_OP_ITSELF(reduce_mean);
 USE_OP_ITSELF(reduce_sum);
-USE_OP(reduce_sum_grad);
+USE_OP_ITSELF(reduce_sum_grad);
 USE_OP(reduce_mean_grad);
 USE_OP_ITSELF(reshape2_grad);
 USE_OP(softmax_with_cross_entropy_grad);
@@ -46,15 +46,15 @@ USE_OP(matmul_grad);
 USE_OP(square);
 USE_OP(transpose2_grad);
 USE_OP(concat_grad);
-USE_OP(elementwise_mul_grad);
+USE_OP_ITSELF(elementwise_mul_grad);
 USE_OP(sigmoid_grad);
-USE_OP(tanh_grad);
+USE_OP_ITSELF(tanh_grad);
 USE_OP(sum);
 USE_OP(slice_grad);
 USE_OP(lookup_table_grad);
 USE_OP(sqrt);
 USE_OP(elementwise_max);
-USE_OP(elementwise_div);
+USE_OP_ITSELF(elementwise_div);
 USE_OP(sgd);
 USE_OP(squared_l2_norm);
 USE_OP(memcpy_h2d);
diff --git a/paddle/fluid/framework/new_executor/workqueue/nonblocking_threadpool.h b/paddle/fluid/framework/new_executor/workqueue/nonblocking_threadpool.h
index 7b3916bafc93eda8cb1afbf54b706e032c5233dd..bc65231abe7371a931f709c9190b55fde24f0543 100644
--- a/paddle/fluid/framework/new_executor/workqueue/nonblocking_threadpool.h
+++ b/paddle/fluid/framework/new_executor/workqueue/nonblocking_threadpool.h
@@ -409,7 +409,7 @@ class ThreadPoolTempl {
       return false;
     }
     platform::RecordEvent("SleepWaitForWork",
-                          platform::TracerEventType::UserDefined, 2);
+                          platform::TracerEventType::UserDefined, 10);
     ec_.CommitWait(waiter);
     blocked_--;
     return true;
diff --git a/paddle/fluid/framework/op_registry.h b/paddle/fluid/framework/op_registry.h
index c45bf32d8b710cb35ec5f86a4a8ba2e1078537e6..eb40a49b4066a7a8c8e9c142a310b815fd73da20 100644
--- a/paddle/fluid/framework/op_registry.h
+++ b/paddle/fluid/framework/op_registry.h
@@ -286,8 +286,8 @@ struct OpKernelRegistrarFunctorEx<PlaceType, false, I,
     return 0;                                                            \
   }
 
-#define REGISTER_OP_WITHOUT_GRADIENT(op_type, op_class, op_maker_class) \
-  REGISTER_OPERATOR(op_type, op_class, op_maker_class, \
+#define REGISTER_OP_WITHOUT_GRADIENT(op_type, op_class, ...) \
+  REGISTER_OPERATOR(op_type, op_class, __VA_ARGS__, \
         paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,   \
         paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>)
 
diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index 6414dd455db4f2e39d958760449e3eb9d7d362f0..f23a266ef03641bc8f8d273b15ab4982e377cb03 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -254,7 +254,7 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
           "reinstall Paddle with CustomDevice support.",
           place));
 #else
-      platform::DeviceManager::SetDevice(place);
+      phi::DeviceManager::SetDevice(place);
 #endif
     }
 
@@ -539,6 +539,20 @@ bool ExecutionContext::HasInput(const std::string& name) const {
   return var != nullptr;
 }
 
+bool ExecutionContext::HasInputs(const std::string& name) const {
+  const auto& ins = ctx_.inputs;
+  auto it = ins.find(name);
+  if (it == ins.end() || it->second.empty()) {
+    return false;
+  }
+  for (const auto* input : it->second) {
+    if (input == nullptr) {
+      return false;
+    }
+  }
+  return true;
+}
+
 bool ExecutionContext::HasOutput(const std::string& name) const {
   auto* var = OutputVar(name);
   return var != nullptr;
@@ -2106,15 +2120,19 @@ void OperatorWithKernel::BuildPhiKernelContext(
     for (size_t offset = 0; offset < outs_vector.size(); ++offset) {
       phi::TensorBase* tensor_out = nullptr;
       auto* var = outs_vector[offset];
-      if (var->template IsType<framework::LoDTensor>()) {
-        tensor_out = var->template GetMutable<framework::LoDTensor>();
-      } else if (var->template IsType<phi::SelectedRows>()) {
-        tensor_out = var->template GetMutable<phi::SelectedRows>();
-      } else {
-        PADDLE_THROW(platform::errors::Unimplemented(
-            "Unsupported output `%s` type when call pt kernel.",
-            framework::ToTypeName(var->Type())));
+
+      if (var) {
+        if (var->template IsType<framework::LoDTensor>()) {
+          tensor_out = var->template GetMutable<framework::LoDTensor>();
+        } else if (var->template IsType<phi::SelectedRows>()) {
+          tensor_out = var->template GetMutable<phi::SelectedRows>();
+        } else {
+          PADDLE_THROW(platform::errors::Unimplemented(
+              "Unsupported output `%s` type when call pt kernel.",
+              framework::ToTypeName(var->Type())));
+        }
       }
+
       pt_kernel_context->EmplaceBackOutputWithoutSetRange(tensor_out);
     }
 
@@ -2185,41 +2203,109 @@ void OperatorWithKernel::BuildPhiKernelContext(
             std::move(experimental::MakePhiScalarFromVar(*ins_vector.front())));
       }
 
+    } else if (attr_defs[i].type_index ==
+               std::type_index(typeid(std::vector<phi::Scalar>))) {
+      auto& attr = Attrs().at(attr_names[i]);
+      if (std::type_index(attr.type()) ==
+          std::type_index(typeid(std::vector<int32_t>))) {
+        const auto& vec = BOOST_GET_CONST(std::vector<int32_t>, attr);
+        std::vector<phi::Scalar> scalar_list;
+        scalar_list.reserve(vec.size());
+        for (const auto& val : vec) {
+          scalar_list.emplace_back(val);
+        }
+        pt_kernel_context->EmplaceBackAttr(std::move(scalar_list));
+      } else if (std::type_index(attr.type()) ==
+                 std::type_index(typeid(std::vector<int64_t>))) {
+        const auto& vec = BOOST_GET_CONST(std::vector<int64_t>, attr);
+        std::vector<phi::Scalar> scalar_list;
+        scalar_list.reserve(vec.size());
+        for (const auto& val : vec) {
+          scalar_list.emplace_back(val);
+        }
+        pt_kernel_context->EmplaceBackAttr(std::move(scalar_list));
+      } else if (std::type_index(attr.type()) ==
+                 std::type_index(typeid(std::vector<float>))) {
+        const auto& vec = BOOST_GET_CONST(std::vector<float>, attr);
+        std::vector<phi::Scalar> scalar_list;
+        scalar_list.reserve(vec.size());
+        for (const auto& val : vec) {
+          scalar_list.emplace_back(val);
+        }
+        pt_kernel_context->EmplaceBackAttr(std::move(scalar_list));
+      } else if (std::type_index(attr.type()) ==
+                 std::type_index(typeid(std::vector<double>))) {
+        const auto& vec = BOOST_GET_CONST(std::vector<double>, attr);
+        std::vector<phi::Scalar> scalar_list;
+        scalar_list.reserve(vec.size());
+        for (const auto& val : vec) {
+          scalar_list.emplace_back(val);
+        }
+        pt_kernel_context->EmplaceBackAttr(std::move(scalar_list));
+      } else {
+        PADDLE_THROW(platform::errors::Unimplemented(
+            "Unsupported cast op attribute `%s` to vector<Scalar> when "
+            "construct KernelContext.",
+            attr_names[i]));
+      }
     } else {
       // TODO(chenweihang): support other attrs later
-      auto& attr = Attrs().at(attr_names[i]);
+      auto attr_it = attrs_.find(attr_names[i]);
       if (attr_defs[i].type_index == std::type_index(typeid(int))) {
-        pt_kernel_context->EmplaceBackAttr(BOOST_GET_CONST(int, attr));
+        if (attr_it == attrs_.end()) {
+          auto in_it = ctx.inputs.find(attr_names[i]);
+          if (in_it != ctx.inputs.end()) {
+            // get data from input
+            auto val = experimental::MakePhiScalarFromVar(*(in_it->second[0]));
+            int32_t val_int = val.template to<int32_t>();
+            pt_kernel_context->EmplaceBackAttr(val_int);
+          } else {
+            PADDLE_THROW(platform::errors::NotFound(
+                "can not find attribute `%s` both in attribute and input ",
+                attr_names[i]));
+          }
+        } else {
+          pt_kernel_context->EmplaceBackAttr(
+              BOOST_GET_CONST(int, attr_it->second));
+        }
       } else if (attr_defs[i].type_index == std::type_index(typeid(float))) {
-        pt_kernel_context->EmplaceBackAttr(BOOST_GET_CONST(float, attr));
+        pt_kernel_context->EmplaceBackAttr(
+            BOOST_GET_CONST(float, attr_it->second));
       } else if (attr_defs[i].type_index == std::type_index(typeid(bool))) {
-        pt_kernel_context->EmplaceBackAttr(BOOST_GET_CONST(bool, attr));
+        pt_kernel_context->EmplaceBackAttr(
+            BOOST_GET_CONST(bool, attr_it->second));
       } else if (attr_defs[i].type_index == std::type_index(typeid(int64_t))) {
-        pt_kernel_context->EmplaceBackAttr(BOOST_GET_CONST(int64_t, attr));
+        pt_kernel_context->EmplaceBackAttr(
+            BOOST_GET_CONST(int64_t, attr_it->second));
       } else if (attr_defs[i].type_index ==
                  std::type_index(typeid(std::string))) {
-        pt_kernel_context->EmplaceBackAttr(BOOST_GET_CONST(std::string, attr));
+        pt_kernel_context->EmplaceBackAttr(
+            BOOST_GET_CONST(std::string, attr_it->second));
       } else if (attr_defs[i].type_index ==
                  std::type_index(typeid(phi::DataType))) {
         auto data_type = paddle::framework::TransToPhiDataType(
             static_cast<framework::proto::VarType::Type>(
-                BOOST_GET_CONST(int, attr)));
+                BOOST_GET_CONST(int, attr_it->second)));
         pt_kernel_context->EmplaceBackAttr(data_type);
       } else if (attr_defs[i].type_index ==
                  std::type_index(typeid(std::vector<int64_t>))) {
-        if (std::type_index(attr.type()) ==
-            std::type_index(typeid(std::vector<int>))) {
+        if (std::type_index(attr_it->second.type()) ==
+            std::type_index(typeid(std::vector<int64_t>))) {
+          pt_kernel_context->EmplaceBackAttr(
+              BOOST_GET_CONST(std::vector<int64_t>, attr_it->second));
+        } else if (std::type_index(attr_it->second.type()) ==
+                   std::type_index(typeid(std::vector<int>))) {
           // Emplace Back Attr according to the type of Phi_Kernel args.
-          const auto& vector_int_attr = BOOST_GET_CONST(std::vector<int>, attr);
+          const auto& vector_int_attr =
+              BOOST_GET_CONST(std::vector<int>, attr_it->second);
           const std::vector<int64_t> vector_int64_attr(vector_int_attr.begin(),
                                                        vector_int_attr.end());
           pt_kernel_context->EmplaceBackAttr(vector_int64_attr);
         }
-        // TODO(YuanRisheng) Need support vector<int64_t> attr
-
       } else if (attr_defs[i].type_index ==
                  std::type_index(typeid(std::vector<int32_t>))) {
-        const auto& vector_int_attr = BOOST_GET_CONST(std::vector<int>, attr);
+        const auto& vector_int_attr =
+            BOOST_GET_CONST(std::vector<int>, attr_it->second);
         pt_kernel_context->EmplaceBackAttr(vector_int_attr);
       } else {
         PADDLE_THROW(platform::errors::Unimplemented(
diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h
index e33d4feb82a9e7a92c3dabea0ccc5fe370afda66..1a1171f1dba4d794796ef1421fe386f60a0e587d 100644
--- a/paddle/fluid/framework/operator.h
+++ b/paddle/fluid/framework/operator.h
@@ -295,6 +295,8 @@ class ExecutionContext {
 
   virtual bool HasInput(const std::string& name) const;
 
+  virtual bool HasInputs(const std::string& name) const;
+
   virtual bool HasOutput(const std::string& name) const;
 
   virtual size_t InputSize(const std::string& name) const {
@@ -449,7 +451,7 @@ class ExecutionArgumentMappingContext : public phi::ArgumentMappingContext {
       : ctx_(ctx) {}
 
   bool HasInput(const std::string& name) const override {
-    return ctx_.HasInput(name);
+    return ctx_.HasInputs(name);
   }
 
   bool HasOutput(const std::string& name) const override {
diff --git a/paddle/fluid/framework/paddle2cinn/build_cinn_pass_test.cc b/paddle/fluid/framework/paddle2cinn/build_cinn_pass_test.cc
index bf9d1baaf394f05d125563311dd2047383373834..47dffd47b7cbbf4a37e6715b40d41024330bc679 100644
--- a/paddle/fluid/framework/paddle2cinn/build_cinn_pass_test.cc
+++ b/paddle/fluid/framework/paddle2cinn/build_cinn_pass_test.cc
@@ -675,7 +675,7 @@ TEST(BuildCinnPassTest, NoNeedBufferInput) {
 
 USE_PASS(build_cinn_pass);
 USE_OP(mul);
-USE_OP(relu);
+USE_OP_ITSELF(relu);
 USE_OP_ITSELF(elementwise_add);
-USE_OP(relu_grad);
+USE_OP_ITSELF(relu_grad);
 USE_OP_ITSELF(elementwise_add_grad);
diff --git a/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc b/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc
index 706815185a1b5b53d1bb8e26274206abc126cfd5..c015e90f71e54691e92c3a36c3d6e053372f64f3 100644
--- a/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc
+++ b/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc
@@ -241,7 +241,6 @@ std::unique_ptr<CinnCompiledObject> CinnCompiler::CompileGraph(
       std::make_unique<GraphCompiler>(target, scope, cinn_graph);
   GraphCompiler::CompileOptions options;
   options.with_instantiate_variables = false;
-  options.with_buffer_handle_instruction_inserted = true;
   auto compiled_res =
       graph_compiler->Build(options, std::move(fetch_ids), stream);
   auto compiled_obj = std::make_unique<CinnCompiledObject>();
diff --git a/paddle/fluid/framework/paddle2cinn/cinn_compiler_test.cc b/paddle/fluid/framework/paddle2cinn/cinn_compiler_test.cc
index e8badab27b9b97aade81bf496ce211485f924757..cdccc4c5546900a141a084281f419c2940b23817 100644
--- a/paddle/fluid/framework/paddle2cinn/cinn_compiler_test.cc
+++ b/paddle/fluid/framework/paddle2cinn/cinn_compiler_test.cc
@@ -301,5 +301,5 @@ TEST(CinnCompilerTest, Compile) {
 USE_PASS(build_cinn_pass);
 USE_PASS(graph_viz_pass);
 USE_OP(mul);
-USE_OP(relu);
+USE_OP_ITSELF(relu);
 USE_OP_ITSELF(elementwise_add);
diff --git a/paddle/fluid/framework/paddle2cinn/cinn_lib_test.cc b/paddle/fluid/framework/paddle2cinn/cinn_lib_test.cc
index 23cb653fef22ac966655e5650d20c128e2bd3cdd..7a7a7b2798f5920f89e15222959a935da9af2c25 100644
--- a/paddle/fluid/framework/paddle2cinn/cinn_lib_test.cc
+++ b/paddle/fluid/framework/paddle2cinn/cinn_lib_test.cc
@@ -45,8 +45,8 @@ Program CreateAddProgram() {
   NetBuilder builder("net_builder");
   auto a = builder.CreateInput(Float(32), {M, N});
   auto b = builder.CreateInput(Float(32), {M, N});
-  auto c = builder.add(a, b);
-  auto d = builder.add(a, c);
+  auto c = builder.Add(a, b);
+  auto d = builder.Add(a, c);
   auto program = builder.Build();
 
   return program;
@@ -116,8 +116,8 @@ TEST(net_build, program_execute_fc) {
   auto w = builder.CreateInput(Float(32), {N, K}, "W");  // weight
   auto b = builder.CreateInput(Float(32), {N}, "B");     // bias
 
-  auto mul_out = builder.mul(a, w, 2, 1);
-  auto add_out = builder.add(mul_out, b);
+  auto mul_out = builder.Mul(a, w, 2, 1);
+  auto add_out = builder.Add(mul_out, b);
   auto program = builder.Build();
 
 #ifdef PADDLE_WITH_CUDA
diff --git a/paddle/fluid/imperative/CMakeLists.txt b/paddle/fluid/imperative/CMakeLists.txt
index e1ce705533ab4ba1c75d8f656683608365e97907..3d8a5ab21f00fcc4137d177b741023a827e325d7 100644
--- a/paddle/fluid/imperative/CMakeLists.txt
+++ b/paddle/fluid/imperative/CMakeLists.txt
@@ -33,6 +33,7 @@ if(NOT WIN32)
     endif()
     if(WITH_CNCL)
         cc_library(cncl_context SRCS cncl_context.cc DEPS collective_helper device_context tensor var_type_traits)
+	cc_library(reducer SRCS reducer.cc DEPS layer)
     endif()
     if(WITH_NCCL OR WITH_RCCL OR WITH_XPU_BKCL OR WITH_ASCEND_CL)
         cc_library(heter_ccl_context SRCS heter_ccl_context.cc DEPS collective_helper device_context tensor var_type_traits)
@@ -41,7 +42,7 @@ if(NOT WIN32)
 endif(NOT WIN32)
 if(WITH_GLOO)
     cc_library(imperative_gloo_context SRCS gloo_context.cc DEPS collective_helper device_context tensor var_type_traits)
-    if ( WIN32 OR (NOT (WITH_NCCL OR WITH_RCCL OR WITH_XPU_BKCL OR WITH_ASCEND_CL) ))
+    if ( WIN32 OR (NOT (WITH_NCCL OR WITH_RCCL OR WITH_XPU_BKCL OR WITH_ASCEND_CL OR WITH_CNCL) ))
         cc_library(reducer SRCS reducer.cc DEPS layer)
     endif()
 endif()
diff --git a/paddle/fluid/imperative/basic_engine.cc b/paddle/fluid/imperative/basic_engine.cc
index 7416d206fc43eaf5a56c3eb606bb0672d1172c0b..d7478b18dba0616fdc995866d8892c7c052a0e35 100644
--- a/paddle/fluid/imperative/basic_engine.cc
+++ b/paddle/fluid/imperative/basic_engine.cc
@@ -389,6 +389,9 @@ static void PerformBackwardInplace(const std::string& op_type,
 }
 
 void BasicEngine::Execute() {
+  platform::RecordEvent backward_record_event(
+      "backward", platform::TracerEventType::Operator, 1);
+
   if (init_nodes_.empty()) {
     return;
   }
@@ -412,7 +415,7 @@ void BasicEngine::Execute() {
 
     for (auto& cur_op : *shared_cur_node) {
       platform::RecordEvent op_type_record_event(
-          cur_op.Type(), platform::TracerEventType::Operator, 1);
+          cur_op.Type() + " grad_node", platform::TracerEventType::Operator, 1);
 
       ++op_num;
 
diff --git a/paddle/fluid/imperative/execution_context.h b/paddle/fluid/imperative/execution_context.h
index fe5ac73b0046915c4a52087ed792925b0b0ed200..fbc47f81fd33169f54aeb2c251f9b6c90cb44637 100644
--- a/paddle/fluid/imperative/execution_context.h
+++ b/paddle/fluid/imperative/execution_context.h
@@ -133,6 +133,11 @@ class DygraphExecutionContext : public framework::ExecutionContext {
     return (it != var_map_in_.end() && it->second.size() > 0);
   }
 
+  bool HasInputs(const std::string& name) const override {
+    auto it = var_map_in_.find(name);
+    return (it != var_map_in_.end() && it->second.size() > 0);
+  }
+
   bool HasOutput(const std::string& name) const override {
     auto it = var_map_out_.find(name);
     return (it != var_map_out_.end() && it->second.size() > 0);
diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc
index 2317bfdd7c0d5ee94e91e081da47177625f5bfd8..bae49fb381a475dd8227d1dc855a6db28c9cd273 100644
--- a/paddle/fluid/imperative/prepared_operator.cc
+++ b/paddle/fluid/imperative/prepared_operator.cc
@@ -247,6 +247,7 @@ PreparedOp PrepareImpl(const NameVarMap<VarType>& ins,
 #endif
 
 #ifdef PADDLE_WITH_XPU_KP
+  expected_kernel_key.place_ = platform::XPUPlace();
   bool use_xpu_kp_kernel_rt =
       FLAGS_run_kp_kernel &&
       paddle::platform::is_xpu_kp_support_op(op.Type(), expected_kernel_key);
diff --git a/paddle/fluid/imperative/prepared_operator.h b/paddle/fluid/imperative/prepared_operator.h
index 3b5762720e7fb4a9eb0be157f6dabf07aa9353c2..8deb3b93e9c50489dcfc6805063f23e3705cb634 100644
--- a/paddle/fluid/imperative/prepared_operator.h
+++ b/paddle/fluid/imperative/prepared_operator.h
@@ -264,14 +264,23 @@ void BuildDygraphPhiKernelContext(
 
     size_t start_idx = (i == 0 ? 0 : kernel_ctx->InputRangeAt(i - 1).second);
 
-    if ((it == ins.end()) &&
-        (input_defs[i].type_index ==
-         std::type_index(typeid(paddle::optional<const phi::DenseTensor&>)))) {
-      kernel_ctx->EmplaceBackInputWithoutSetRange(nullptr);
-      auto end_idx = start_idx + 1;
-      kernel_ctx->AssignInputRange(std::make_pair(start_idx, end_idx), i);
-      continue;
+    if (it == ins.end()) {
+      if (LIKELY(input_defs[i].type_index ==
+                 std::type_index(
+                     typeid(paddle::optional<const phi::DenseTensor&>)))) {
+        kernel_ctx->EmplaceBackInputWithoutSetRange(nullptr);
+        auto end_idx = start_idx + 1;
+        kernel_ctx->AssignInputRange(std::make_pair(start_idx, end_idx), i);
+        continue;
+      } else {
+        PADDLE_THROW(phi::errors::NotFound(
+            "Can not find input variable '%s' for %s OP, please check whether "
+            "the name setting in OpArgumentMapping is consistent with that in "
+            "OpMaker.",
+            input_names[i], pt_kernel_signature.name));
+      }
     }
+
     auto ins_vector = it->second;
     size_t end_idx = start_idx + ins_vector.size();
 
@@ -314,21 +323,25 @@ void BuildDygraphPhiKernelContext(
 
       phi::TensorBase* tensor_out = nullptr;
       auto* var = outs_vector[offset]->MutableVar();
-      if (var->template IsType<phi::DenseTensor>()) {
-        tensor_out = var->template GetMutable<phi::DenseTensor>();
-      } else if (var->template IsType<phi::SelectedRows>()) {
-        tensor_out = var->template GetMutable<phi::SelectedRows>();
-      } else {
-        PADDLE_THROW(platform::errors::Unimplemented(
-            "Unsupported output `%s` type when call pt kernel.",
-            framework::ToTypeName(var->Type())));
+      if (var) {
+        if (var->template IsType<phi::DenseTensor>()) {
+          tensor_out = var->template GetMutable<phi::DenseTensor>();
+        } else if (var->template IsType<phi::SelectedRows>()) {
+          tensor_out = var->template GetMutable<phi::SelectedRows>();
+        } else {
+          PADDLE_THROW(platform::errors::Unimplemented(
+              "Unsupported output `%s` type when call pt kernel.",
+              framework::ToTypeName(var->Type())));
+        }
       }
+
       kernel_ctx->EmplaceBackOutputWithoutSetRange(tensor_out);
     }
     kernel_ctx->AssignOutputRange(std::make_pair(start_idx, end_idx), i);
   }
 
   for (size_t i = 0; i < attr_names.size(); ++i) {
+    VLOG(1) << "############## attr_name: " << i << " : " << attr_names[i];
     if (attr_defs[i].type_index == std::type_index(typeid(phi::ScalarArray))) {
       if (attrs.find(attr_names[i]) !=
           attrs.end()) {  // shape is in the attribute
@@ -406,8 +419,74 @@ void BuildDygraphPhiKernelContext(
             experimental::MakePhiScalarFromVar(ins_vector[0]->Var())));
       }
 
+    } else if (ins.find(attr_names[i]) != ins.end()) {
+      // deal tensor attr here
+      auto& ins_vector = ins.at(attr_names[i]);
+      auto tensor_attr =
+          experimental::MakePhiScalarFromVar(ins_vector[0]->Var());
+      if (attr_defs[i].type_index == std::type_index(typeid(int))) {
+        int val = tensor_attr.template to<int>();
+        kernel_ctx->EmplaceBackAttr(val);
+      } else {
+        PADDLE_THROW(platform::errors::Unimplemented("only support int here"));
+      }
+    } else if (attr_defs[i].type_index ==
+               std::type_index(typeid(std::vector<phi::Scalar>))) {
+      auto& attr = GetAttr(attrs, default_attrs, attr_names[i]);
+      if (std::type_index(attr.type()) ==
+          std::type_index(typeid(std::vector<int32_t>))) {
+        const auto& vec = BOOST_GET_CONST(std::vector<int32_t>, attr);
+        std::vector<phi::Scalar> scalar_list;
+        scalar_list.reserve(vec.size());
+        for (const auto& val : vec) {
+          scalar_list.emplace_back(val);
+        }
+        kernel_ctx->EmplaceBackAttr(std::move(scalar_list));
+      } else if (std::type_index(attr.type()) ==
+                 std::type_index(typeid(std::vector<int64_t>))) {
+        const auto& vec = BOOST_GET_CONST(std::vector<int64_t>, attr);
+        std::vector<phi::Scalar> scalar_list;
+        scalar_list.reserve(vec.size());
+        for (const auto& val : vec) {
+          scalar_list.emplace_back(val);
+        }
+        kernel_ctx->EmplaceBackAttr(std::move(scalar_list));
+      } else if (std::type_index(attr.type()) ==
+                 std::type_index(typeid(std::vector<float>))) {
+        const auto& vec = BOOST_GET_CONST(std::vector<float>, attr);
+        std::vector<phi::Scalar> scalar_list;
+        scalar_list.reserve(vec.size());
+        for (const auto& val : vec) {
+          scalar_list.emplace_back(val);
+        }
+        kernel_ctx->EmplaceBackAttr(std::move(scalar_list));
+      } else if (std::type_index(attr.type()) ==
+                 std::type_index(typeid(std::vector<double>))) {
+        const auto& vec = BOOST_GET_CONST(std::vector<double>, attr);
+        std::vector<phi::Scalar> scalar_list;
+        scalar_list.reserve(vec.size());
+        for (const auto& val : vec) {
+          scalar_list.emplace_back(val);
+        }
+        kernel_ctx->EmplaceBackAttr(std::move(scalar_list));
+      } else if (std::type_index(attr.type()) ==
+                 std::type_index(typeid(std::vector<bool>))) {
+        const auto& vec = BOOST_GET_CONST(std::vector<bool>, attr);
+        std::vector<phi::Scalar> scalar_list;
+        scalar_list.reserve(vec.size());
+        for (const auto& val : vec) {
+          scalar_list.emplace_back(val);
+        }
+        kernel_ctx->EmplaceBackAttr(std::move(scalar_list));
+      } else {
+        PADDLE_THROW(platform::errors::Unimplemented(
+            "Unsupported cast op attribute `%s` to vector<Scalar> when "
+            "construct KernelContext.",
+            attr_names[i]));
+      }
     } else {
       // TODO(chenweihang): support other attrs later
+
       auto& attr = GetAttr(attrs, default_attrs, attr_names[i]);
       if (attr_defs[i].type_index == std::type_index(typeid(int))) {
         kernel_ctx->EmplaceBackAttr(BOOST_GET_CONST(int, attr));
@@ -429,7 +508,11 @@ void BuildDygraphPhiKernelContext(
       } else if (attr_defs[i].type_index ==
                  std::type_index(typeid(std::vector<int64_t>))) {
         if (std::type_index(attr.type()) ==
-            std::type_index(typeid(std::vector<int>))) {
+            std::type_index(typeid(std::vector<int64_t>))) {
+          kernel_ctx->EmplaceBackAttr(
+              BOOST_GET_CONST(std::vector<int64_t>, attr));
+        } else if (std::type_index(attr.type()) ==
+                   std::type_index(typeid(std::vector<int>))) {
           // Emplace Back Attr according to the type of Phi_Kernel args.
           const auto& vector_int_attr = BOOST_GET_CONST(std::vector<int>, attr);
           const std::vector<int64_t> vector_int64_attr(vector_int_attr.begin(),
diff --git a/paddle/fluid/imperative/reducer.cc b/paddle/fluid/imperative/reducer.cc
index 3a6365b2af21ae9012fe37293699caed9bb23855..fec9afbf3b403ca2fd45633326c7f7dec46e1243 100644
--- a/paddle/fluid/imperative/reducer.cc
+++ b/paddle/fluid/imperative/reducer.cc
@@ -31,7 +31,7 @@ namespace imperative {
 
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) ||     \
     defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_GLOO) || \
-    defined(PADDLE_WITH_ASCEND_CL)
+    defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_CNCL)
 // div the nranks
 void Group::DivNRanks(const platform::DeviceContext &context, int64_t nranks) {
   framework::Tensor *tensor =
@@ -67,6 +67,9 @@ void Group::DivNRanks(const platform::DeviceContext &context, int64_t nranks) {
 #ifdef PADDLE_WITH_XPU_BKCL
 // TODO(liuyuhui) support xpu about div nranks in the future
 #endif
+  } else if (platform::is_mlu_place(tensor->place())) {
+    // TODO(zhangna)
+    VLOG(4) << "divnrank for mlu not support yet";
   }
 }
 
@@ -222,6 +225,56 @@ void SplitTensorsWithType<platform::XPUDeviceContext>(
 }
 #endif
 
+#ifdef PADDLE_WITH_CNCL
+// context is used to select the stream for concat
+template <>
+void ConcatTensorsWithType<platform::MLUDeviceContext>(
+    const platform::MLUDeviceContext &context,
+    const std::vector<framework::Tensor> &dense_tensors_,
+    framework::Variable *p_dense_contents,
+    framework::proto::VarType::Type type) {
+  switch (type) {
+    case framework::proto::VarType::FP16:
+      ConcatTensorsForAllReduce<platform::MLUDeviceContext, platform::float16>(
+          context, dense_tensors_, p_dense_contents);
+      break;
+    case framework::proto::VarType::FP32:
+      ConcatTensorsForAllReduce<platform::MLUDeviceContext, float>(
+          context, dense_tensors_, p_dense_contents);
+      break;
+    default:
+      PADDLE_THROW(platform::errors::Unimplemented(
+          "Data type (%s) is not supported when it concats tensors for "
+          "allreduce.",
+          framework::DataTypeToString(type)));
+  }
+}
+
+// context is used to select the stream for split
+template <>
+void SplitTensorsWithType<platform::MLUDeviceContext>(
+    const platform::MLUDeviceContext &context,
+    framework::Variable *p_dense_contents,
+    std::vector<framework::Tensor> *p_dense_tensors,
+    framework::proto::VarType::Type type) {
+  switch (type) {
+    case framework::proto::VarType::FP16:
+      SplitTensorsForAllReduce<platform::MLUDeviceContext, platform::float16>(
+          context, p_dense_contents, p_dense_tensors);
+      break;
+    case framework::proto::VarType::FP32:
+      SplitTensorsForAllReduce<platform::MLUDeviceContext, float>(
+          context, p_dense_contents, p_dense_tensors);
+      break;
+    default:
+      PADDLE_THROW(platform::errors::Unimplemented(
+          "Data type (%s) is not supported when it splits tensors for "
+          "allreduce.",
+          framework::DataTypeToString(type)));
+  }
+}
+#endif
+
 void Group::ConcatTensors(const platform::DeviceContext &context) {
   auto place = context.GetPlace();
   if (platform::is_gpu_place(place)) {
@@ -253,6 +306,16 @@ void Group::ConcatTensors(const platform::DeviceContext &context) {
     PADDLE_THROW(platform::errors::PermissionDenied(
         "Paddle can't concat npu grads since it's not compiled with HCCL,"
         "Please recompile or reinstall Paddle with HCCL support."));
+#endif
+  } else if (platform::is_mlu_place(place)) {
+#ifdef PADDLE_WITH_CNCL
+    ConcatTensorsWithType(
+        static_cast<const platform::MLUDeviceContext &>(context),
+        dense_tensors_, &dense_contents_, dtype_);
+#else
+    PADDLE_THROW(platform::errors::PermissionDenied(
+        "Paddle can't concat mlu grads since it's not compiled with CNCL,"
+        "Please recompile or reinstall Paddle with CNCL support."));
 #endif
   } else if (platform::is_cpu_place(place)) {
     ConcatTensorsWithType(
@@ -295,6 +358,16 @@ void Group::SplitTensors(const platform::DeviceContext &context) {
     PADDLE_THROW(platform::errors::PermissionDenied(
         "Paddle can't split npu grad since it's not compiled with HCCL,"
         "Please recompile or reinstall Paddle with HCCL support."));
+#endif
+  } else if (platform::is_mlu_place(place)) {
+#ifdef PADDLE_WITH_CNCL
+    SplitTensorsWithType(
+        static_cast<const platform::MLUDeviceContext &>(context),
+        &dense_contents_, &dense_tensors_, dtype_);
+#else
+    PADDLE_THROW(platform::errors::PermissionDenied(
+        "Paddle can't split mlu grad since it's not compiled with CNCL,"
+        "Please recompile or reinstall Paddle with CNCL support."));
 #endif
   } else if (platform::is_cpu_place(place)) {
     SplitTensorsWithType(
@@ -746,6 +819,11 @@ void Reducer::MarkVarReady(const size_t var_index, const bool is_used_var) {
         // TODO(liuyuhui) support XPU set constant
         VLOG(3) << "XPU doesn't support set_constant";
       }
+#elif defined(PADDLE_WITH_CNCL)
+      if (platform::is_mlu_place(group_tensor.place())) {
+        // TODO(liuyuhui) support MLU set constant
+        VLOG(3) << "MLU doesn't support set_constant";
+      }
 #else
       auto *dev_ctx = platform::DeviceContextPool::Instance().Get(place_);
       if (HasGrad(var_index)) {
@@ -846,12 +924,13 @@ void Reducer::MarkGroupReady(size_t group_index) {
         cv_.notify_all();
       }
     });
-#elif defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_NCCL) || \
-    defined(PADDLE_WITH_GLOO) || defined(PADDLE_WITH_ASCEND_CL)
+#elif defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_NCCL) ||    \
+    defined(PADDLE_WITH_GLOO) || defined(PADDLE_WITH_ASCEND_CL) || \
+    defined(PADDLE_WITH_CNCL)
     FusedAllReduceSchedule(run_order, group, next_group_);
 #else
     PADDLE_THROW(platform::errors::PreconditionNotMet(
-        "Not compiled with BKCL or NCCL or GLOO."));
+        "Not compiled with BKCL or NCCL or CNCL or GLOO."));
 #endif
   }
 }
diff --git a/paddle/fluid/imperative/reducer.h b/paddle/fluid/imperative/reducer.h
index cca773b840c279f05cd6bcd0ed82fda7fdd55a25..9fac4b41cbde01f365dcc603844b06c473a58843 100644
--- a/paddle/fluid/imperative/reducer.h
+++ b/paddle/fluid/imperative/reducer.h
@@ -45,7 +45,7 @@ namespace imperative {
 
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) ||     \
     defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_GLOO) || \
-    defined(PADDLE_WITH_ASCEND_CL)
+    defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_CNCL)
 
 template <typename T>
 struct DivNRanksFunctor {
diff --git a/paddle/fluid/imperative/tests/CMakeLists.txt b/paddle/fluid/imperative/tests/CMakeLists.txt
index e4f1cfdb3baeed9b5945b7843b6593528df48c29..09de0106ed6190c5f627ba9fb7cc038593b5088a 100644
--- a/paddle/fluid/imperative/tests/CMakeLists.txt
+++ b/paddle/fluid/imperative/tests/CMakeLists.txt
@@ -21,6 +21,6 @@ cc_test(test_prepare_op SRCS test_prepare_op.cc DEPS prepared_operator op_info s
 cc_test(test_tracer SRCS test_tracer.cc DEPS tracer layer proto_desc operator op_registry variable_helper mul_op reduce_sum_op elementwise_add_op memcpy)
 cc_test(test_hooks SRCS test_hooks.cc DEPS tracer basic_engine layer proto_desc operator op_registry variable_helper mul_op elementwise_add_op memcpy)
 cc_test(test_eager SRCS test_eager.cc DEPS tracer layer prepared_operator mul_op)
-if (WITH_NCCL OR WITH_RCCL OR WITH_XPU_BKCL)
+if (WITH_NCCL OR WITH_RCCL OR WITH_XPU_BKCL OR WITH_CNCL)
 cc_test(test_group SRCS test_group.cc DEPS reducer concat_and_split memcpy)
 endif()
diff --git a/paddle/fluid/imperative/tests/test_group.cc b/paddle/fluid/imperative/tests/test_group.cc
index 6c304278d21fde7af093b25cdd8f62a1d4528d31..5e674af1a08a87c11bfab1080be42e623661b38e 100644
--- a/paddle/fluid/imperative/tests/test_group.cc
+++ b/paddle/fluid/imperative/tests/test_group.cc
@@ -72,8 +72,10 @@ void GroupConcatSplit(Place place, size_t size) {
       value.push_back(static_cast<T>(1.0 * j));
     }
 
-    if (std::is_same<Place, platform::CUDAPlace>::value) {
-#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
+    if (std::is_same<Place, platform::CUDAPlace>::value ||
+        std::is_same<Place, platform::MLUPlace>::value) {
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
+    defined(PADDLE_WITH_CNCL)
       paddle::memory::Copy(place, data, cpu_place, value.data(),
                            sizeof(T) * value.size(), 0);
 #endif
@@ -180,5 +182,19 @@ TEST(TestGroup, TestXPUConcatSplit) {
 }
 #endif
 
+#if defined(PADDLE_WITH_CNCL)
+TEST(TestGroup, TestMLUConcatSplit) {
+  platform::MLUPlace mlu_place(0);
+  platform::CPUPlace cpu_place;
+
+  int size = 3;
+  GroupConcatSplit<float>(cpu_place, size);
+  GroupConcatSplit<float>(mlu_place, size);
+
+  size = 15;
+  GroupConcatSplit<float>(cpu_place, size);
+  GroupConcatSplit<float>(mlu_place, size);
+}
+#endif
 }  // namespace imperative
 }  // namespace paddle
diff --git a/paddle/fluid/imperative/tests/test_hooks.cc b/paddle/fluid/imperative/tests/test_hooks.cc
index 3ac2028790608529e0745dde2ce41ed57748f46d..02a1689c23a3fe5e1543a2e52d7661d5997bc062 100644
--- a/paddle/fluid/imperative/tests/test_hooks.cc
+++ b/paddle/fluid/imperative/tests/test_hooks.cc
@@ -24,6 +24,10 @@
 #include "paddle/fluid/imperative/hooks.h"
 #include "paddle/fluid/imperative/tracer.h"
 #include "paddle/fluid/memory/memcpy.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(add_grad, CPU, ALL_LAYOUT);
 
 namespace platform = paddle::platform;
 namespace framework = paddle::framework;
diff --git a/paddle/fluid/imperative/tests/test_prepare_op.cc b/paddle/fluid/imperative/tests/test_prepare_op.cc
index f5ca13cb99ad3df6b9283565b5681c36f7197ae8..4cda3f32fdf3fdd2d14b201fa902c1f50f3ff98d 100644
--- a/paddle/fluid/imperative/tests/test_prepare_op.cc
+++ b/paddle/fluid/imperative/tests/test_prepare_op.cc
@@ -24,6 +24,13 @@
 #include "paddle/fluid/framework/op_info.h"
 #include "paddle/fluid/imperative/prepared_operator.h"
 #include "paddle/fluid/imperative/type_defs.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+PD_DECLARE_KERNEL(split, CPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(relu, CPU, ALL_LAYOUT);
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+PD_DECLARE_KERNEL(relu, GPU, ALL_LAYOUT);
+#endif
 
 namespace imperative = paddle::imperative;
 namespace platform = paddle::platform;
@@ -226,7 +233,7 @@ TEST(test_prepare_op, test_prepare_data_cpu_mkldnn) {
 }  // namespace paddle
 
 USE_OP_ITSELF(split);
-USE_OP(relu);
+USE_OP_ITSELF(relu);
 #ifdef PADDLE_WITH_MKLDNN
 USE_OP_DEVICE_KERNEL(relu, MKLDNN);
 #endif
diff --git a/paddle/fluid/imperative/tests/test_tracer.cc b/paddle/fluid/imperative/tests/test_tracer.cc
index d05036f7a12ebdc3db5fbfda5eb50c295c0478e4..f754c6fdd0ee7742f0e544baad0225502c172848 100644
--- a/paddle/fluid/imperative/tests/test_tracer.cc
+++ b/paddle/fluid/imperative/tests/test_tracer.cc
@@ -28,6 +28,14 @@
 #include "paddle/fluid/imperative/tracer.h"
 #include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/platform/device_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(add_grad, CPU, ALL_LAYOUT);
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+PD_DECLARE_KERNEL(add_grad, GPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(sum_grad, GPU, ALL_LAYOUT);
+#endif
 
 namespace imperative = paddle::imperative;
 namespace platform = paddle::platform;
@@ -591,5 +599,5 @@ TEST(test_tracer, eager_tracer) {
 USE_OP(mul);
 USE_OP(mul_grad);
 USE_OP_ITSELF(reduce_sum);
-USE_OP(reduce_sum_grad);
+USE_OP_ITSELF(reduce_sum_grad);
 USE_OP_ITSELF(elementwise_add);
diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc
index 85bcbd1458f24a592b646dfcda750f37f113f73f..c55599cc9aa954e2bd437f0917c792e4fdb6b577 100644
--- a/paddle/fluid/imperative/tracer.cc
+++ b/paddle/fluid/imperative/tracer.cc
@@ -18,12 +18,14 @@
 #include <utility>
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/imperative/amp_auto_cast.h"
+#include "paddle/fluid/imperative/execution_context.h"
 #include "paddle/fluid/imperative/op_base.h"
 #include "paddle/fluid/platform/denormal.h"
 #include "paddle/fluid/platform/device/device_wrapper.h"
 #include "paddle/fluid/platform/profiler.h"
 #include "paddle/fluid/platform/profiler/event_tracing.h"
 #include "paddle/fluid/string/string_helper.h"
+#include "paddle/phi/common/place.h"
 
 DECLARE_bool(use_mkldnn);
 DECLARE_string(tracer_mkldnn_ops_on);
@@ -175,7 +177,7 @@ void Tracer::TraceOp(const std::string& type, const NameVarMap<VarType>& ins,
                      paddle::framework::AttributeMap* passed_default_attrs_,
                      bool use_default_attr_map) {
   platform::RecordEvent op_type_record_event(
-      type, platform::TracerEventType::Operator, 1);
+      type + " trace_op", platform::TracerEventType::Operator, 1);
   platform::ScopedFlushDenormal flush;
   VLOG(1) << "Trace Op: " << type;
   if (FLAGS_use_mkldnn) {
@@ -253,7 +255,7 @@ void Tracer::TraceOp(const std::string& type, const NameVarMap<VarType>& ins,
 #endif
     } else if (platform::is_custom_place(place)) {
 #ifdef PADDLE_WITH_CUSTOM_DEVICE
-      platform::DeviceManager::SetDevice(place);
+      phi::DeviceManager::SetDevice(place);
 #else
       PADDLE_THROW(platform::errors::PreconditionNotMet(
           "PaddlePaddle should compile with CustomDevice if use "
@@ -295,19 +297,24 @@ void Tracer::TraceOp(const std::string& type, const NameVarMap<VarType>& ins,
     program_desc_tracer_->InsertOp(type, new_ins, outs, attrs);
   }
 
-  if (ComputeRequiredGrad(new_ins, outs, trace_backward)) {
-    PADDLE_ENFORCE_EQ(
-        passed_default_attrs_, nullptr,
-        paddle::platform::errors::PermissionDenied(
-            "We expect passed_default_attrs_ is nullptr while "
-            "use_default_attr_map is true, however we got not null "
-            "passed_default_attrs_. Please check your usage of trace_op. "));
-    CreateGradOpNode(*op, new_ins, outs, attrs, default_attrs, place,
-                     inplace_map);
-  } else {
-    VLOG(3) << "No Grad to track for Op: " << type;
+  {
+    platform::RecordEvent node_creation_record_event(
+        type + " node_creation", platform::TracerEventType::Operator, 1);
+
+    if (ComputeRequiredGrad(new_ins, outs, trace_backward)) {
+      PADDLE_ENFORCE_EQ(
+          passed_default_attrs_, nullptr,
+          paddle::platform::errors::PermissionDenied(
+              "We expect passed_default_attrs_ is nullptr while "
+              "use_default_attr_map is true, however we got not null "
+              "passed_default_attrs_. Please check your usage of trace_op. "));
+      CreateGradOpNode(*op, new_ins, outs, attrs, default_attrs, place,
+                       inplace_map);
+    } else {
+      VLOG(3) << "No Grad to track for Op: " << type;
+    }
+    VLOG(6) << "Finish Trace Op: " << type;
   }
-  VLOG(6) << "Finish Trace Op: " << type;
 }
 
 template void Tracer::TraceOp<VarBase>(
@@ -382,5 +389,36 @@ bool Tracer::ComputeRequiredGrad(const NameTensorMap& ins,
   return false;
 }
 
+phi::KernelSignature Tracer::GetExpectedKernelSignature(
+    const std::string& type, const NameVarBaseMap& ins,
+    const NameVarBaseMap& outs, framework::AttributeMap attrs) const {
+  auto op = framework::OpRegistry::CreateOp(type, {}, {}, {}, false);
+  framework::RuntimeContext ctx({}, {});
+  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+  auto* dev_ctx = pool.Get(phi::CPUPlace());
+  const auto& op_info = op->Info();
+  auto* attr_checker = op_info.Checker();
+  if (attr_checker) {
+    attr_checker->Check(&attrs, true, /*only_check_exist_value=*/true);
+  }
+  static paddle::framework::AttributeMap empty_attrs_map = {};
+  const paddle::framework::AttributeMap& default_attrs =
+      attr_checker == nullptr ? empty_attrs_map
+                              : attr_checker->GetDefaultAttrMap();
+  auto dygraph_exe_ctx =
+      imperative::DygraphExecutionContext<imperative::VarBase>(
+          *op, framework::Scope(), *dev_ctx, ctx, ins, outs, attrs,
+          default_attrs);
+  auto* opbase_with_kernel =
+      dynamic_cast<framework::OperatorWithKernel*>(op.get());
+  PADDLE_ENFORCE_NE(opbase_with_kernel, nullptr,
+                    platform::errors::InvalidArgument(
+                        "This op type:`%s` is not a OperatorWithKernel, only "
+                        "OperatorWithKernel can get KernelSignature",
+                        type));
+  return phi::KernelSignature(
+      std::move(opbase_with_kernel->GetExpectedPhiKernelArgs(dygraph_exe_ctx)));
+}
+
 }  // namespace imperative
 }  // namespace paddle
diff --git a/paddle/fluid/imperative/tracer.h b/paddle/fluid/imperative/tracer.h
index 73ecbbe6143ca8e68049c2d2886e9eee93b741f1..fd13fce6a6e17a47a7a91dfa78598a99ec22f0b7 100644
--- a/paddle/fluid/imperative/tracer.h
+++ b/paddle/fluid/imperative/tracer.h
@@ -28,6 +28,7 @@
 #include "paddle/fluid/imperative/jit/program_desc_tracer.h"
 #include "paddle/fluid/imperative/layer.h"
 #include "paddle/fluid/platform/macros.h"
+#include "paddle/phi/core/compat/arg_map_context.h"
 
 namespace paddle {
 namespace imperative {
@@ -154,6 +155,10 @@ class Tracer {
     }
   }
 
+  phi::KernelSignature GetExpectedKernelSignature(
+      const std::string& type, const NameVarBaseMap& ins,
+      const NameVarBaseMap& outs, framework::AttributeMap attrs) const;
+
   paddle::framework::GarbageCollector* MutableGarbageCollectorIfNotExists(
       const platform::Place& place);
 
diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt
index 26b8b9e8e17e046964d648f564c26293036e4033..5d0c3c98d2f618eb1f3d41e6a4e2434e5cd80401 100644
--- a/paddle/fluid/inference/CMakeLists.txt
+++ b/paddle/fluid/inference/CMakeLists.txt
@@ -45,6 +45,11 @@ add_subdirectory(api)
 set(STATIC_INFERENCE_API paddle_inference_api analysis_predictor
      zero_copy_tensor reset_tensor_array
         analysis_config paddle_pass_builder activation_functions ${mkldnn_quantizer_cfg})
+
+if(WITH_ONNXRUNTIME)
+  set(STATIC_INFERENCE_API ${STATIC_INFERENCE_API} onnxruntime_predictor)
+endif()
+
 #TODO(wilber, T8T9): Do we still need to support windows gpu static library?
 if(WIN32 AND WITH_GPU)
   cc_library(paddle_inference DEPS ${fluid_modules} ${phi_modules} ${STATIC_INFERENCE_API} ${utils_modules})
@@ -91,6 +96,13 @@ if (WITH_PSCORE)
     set(SHARED_INFERENCE_DEPS ${SHARED_INFERENCE_DEPS} fleet ps_service)
 endif ()
 
+if (WITH_ONNXRUNTIME)
+  set(SHARED_INFERENCE_SRCS ${SHARED_INFERENCE_SRCS} 
+      ${CMAKE_CURRENT_SOURCE_DIR}/api/onnxruntime_predictor.cc
+  )
+  set(SHARED_INFERENCE_DEPS ${SHARED_INFERENCE_DEPS} onnxruntime_predictor)
+endif (WITH_ONNXRUNTIME)
+
 # Create shared inference library
 cc_library(paddle_inference_shared SHARED SRCS ${SHARED_INFERENCE_SRCS}
     DEPS ${SHARED_INFERENCE_DEPS})
diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt
index 87efe5ec5190372b48f1bd6387e1c92f456865a1..bdc16ef4c7907764473c552461cde35f011ad489 100755
--- a/paddle/fluid/inference/api/CMakeLists.txt
+++ b/paddle/fluid/inference/api/CMakeLists.txt
@@ -31,7 +31,7 @@ cc_library(paddle_infer_contrib SRCS paddle_infer_contrib.cc DEPS zero_copy_tens
 cc_library(paddle_pass_builder SRCS paddle_pass_builder.cc)
 
 set(paddle_inference_api_deps lod_tensor scope reset_tensor_array
-    analysis_config paddle_infer_contrib zero_copy_tensor trainer_desc_proto custom_operator custom_kernel)
+    analysis_config paddle_infer_contrib zero_copy_tensor trainer_desc_proto custom_operator)
 
 if(WITH_CRYPTO)
     list(APPEND paddle_inference_api_deps paddle_crypto)
@@ -49,8 +49,15 @@ if(WITH_GPU AND TENSORRT_FOUND)
     set(inference_deps ${inference_deps} tensorrt_engine tensorrt_converter)
 endif()
 
-cc_library(analysis_predictor SRCS analysis_predictor.cc ${mkldnn_quantizer_src} DEPS ${inference_deps} 
-          zero_copy_tensor ir_pass_manager op_compatible_info infer_io_utils)
+if (WITH_ONNXRUNTIME)
+    cc_library(analysis_predictor SRCS analysis_predictor.cc ${mkldnn_quantizer_src} DEPS ${inference_deps} 
+              zero_copy_tensor ir_pass_manager op_compatible_info infer_io_utils onnxruntime paddle2onnx)
+    cc_library(onnxruntime_predictor SRCS onnxruntime_predictor.cc DEPS analysis_predictor)
+else (WITH_ONNXRUNTIME)
+    cc_library(analysis_predictor SRCS analysis_predictor.cc ${mkldnn_quantizer_src} DEPS ${inference_deps} 
+              zero_copy_tensor ir_pass_manager op_compatible_info infer_io_utils)
+endif (WITH_ONNXRUNTIME)
+
 
 cc_test(test_paddle_inference_api SRCS api_tester.cc DEPS paddle_inference_api)
 
@@ -75,6 +82,16 @@ elseif (WIN32)
           ARGS --dirname=${WORD2VEC_MODEL_DIR})
 endif()
 
+if (WITH_ONNXRUNTIME)
+  if (NOT APPLE AND NOT WIN32)
+    cc_test(test_onnxruntime_predictor SRCS onnxruntime_predictor_tester.cc DEPS paddle_inference_shared
+            ARGS --dirname=${MOBILENETV2_MODEL_DIR})
+  elseif (WIN32)
+    cc_test(test_onnxruntime_predictor SRCS onnxruntime_predictor_tester.cc DEPS onnxruntime_predictor benchmark ${inference_deps}
+            ARGS --dirname=${MOBILENETV2_MODEL_DIR})
+  endif()
+endif()
+
 if(WITH_TESTING AND WITH_MKLDNN)
   if (NOT APPLE AND NOT WIN32)
     cc_test(test_mkldnn_quantizer SRCS mkldnn_quantizer_tester.cc DEPS paddle_inference_shared ARGS --dirname=${WORD2VEC_MODEL_DIR})
diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc
index 9c33d7003064532db7276d0f6dad90e1b2c55104..41c01d3b7e261314d8dc6b852f5b2a597421fe48 100644
--- a/paddle/fluid/inference/api/analysis_config.cc
+++ b/paddle/fluid/inference/api/analysis_config.cc
@@ -168,6 +168,33 @@ void AnalysisConfig::SetIpuConfig(bool ipu_enable_fp16, int ipu_replica_num,
   Update();
 }
 
+void AnalysisConfig::EnableONNXRuntime() {
+#ifdef PADDLE_WITH_ONNXRUNTIME
+  use_onnxruntime_ = true;
+#else
+  LOG(ERROR) << "Please compile with onnxruntime to EnableONNXRuntime()";
+  use_onnxruntime_ = false;
+#endif
+
+  Update();
+}
+
+void AnalysisConfig::DisableONNXRuntime() {
+  use_onnxruntime_ = false;
+  Update();
+}
+
+void AnalysisConfig::EnableORTOptimization() {
+#ifdef PADDLE_WITH_ONNXRUNTIME
+  enable_ort_optimization_ = true;
+#else
+  LOG(ERROR) << "Please compile with onnxruntime to EnableORTOptimization()";
+  enable_ort_optimization_ = false;
+#endif
+
+  Update();
+}
+
 AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
 #define CP_MEMBER(member__) member__ = other.member__;
 
diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index 5492c3b0d26453c590e6a0a1350d88b442b789f7..871ed596a3ee9d6362b03e99ca10313765826a51 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -65,6 +65,10 @@
 #include "paddle/fluid/inference/api/mkldnn_quantizer.h"
 #endif
 
+#ifdef PADDLE_WITH_ONNXRUNTIME
+#include "paddle/fluid/inference/api/onnxruntime_predictor.h"
+#endif
+
 #if PADDLE_WITH_TENSORRT
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 #include "paddle/fluid/inference/tensorrt/helper.h"
@@ -80,6 +84,8 @@ using inference::tensorrt::TRTCalibratorEngine;
 using inference::tensorrt::TRTCalibratorEngineManager;
 #endif
 
+int AnalysisPredictor::clone_num_ = 1;
+
 namespace {
 bool IsPersistable(const framework::VarDesc *var) {
   if (var->Persistable() &&
@@ -1633,7 +1639,7 @@ std::unique_ptr<PaddlePredictor> AnalysisPredictor::Clone() {
   std::lock_guard<std::mutex> lk(clone_mutex_);
   auto *x = new AnalysisPredictor(config_);
   x->Init(scope_, inference_program_);
-  x->executor_->ResetTrtOps(++x->clone_num_);
+  x->executor_->ResetTrtOps(++AnalysisPredictor::clone_num_);
   return std::unique_ptr<PaddlePredictor>(x);
 }
 
@@ -1760,6 +1766,27 @@ namespace paddle_infer {
 Predictor::Predictor(const Config &config) {
   const_cast<Config *>(&config)->SwitchUseFeedFetchOps(false);
   // The second parameter indicates that the discard log is not printed
+  if (config.use_onnxruntime()) {
+#ifdef PADDLE_WITH_ONNXRUNTIME
+    if (config.use_gpu()) {
+      LOG(WARNING) << "The current ONNXRuntime backend doesn't support GPU,"
+                      "and it falls back to use Paddle Inference.";
+    } else if (!paddle::CheckConvertToONNX(config)) {
+      LOG(WARNING)
+          << "Paddle2ONNX do't support convert the Model， fall back to using "
+             "Paddle Inference.";
+    } else {
+      predictor_ = paddle::CreatePaddlePredictor<
+          Config, paddle::PaddleEngineKind::kONNXRuntime>(config);
+      return;
+    }
+#else
+    LOG(WARNING)
+        << "The onnxruntime backend isn't enabled,"
+           " and please re-compile Paddle with WITH_ONNXRUNTIME option,"
+           "fall back to using Paddle Inference.";
+#endif
+  }
   predictor_ = paddle::CreatePaddlePredictor<
       Config, paddle::PaddleEngineKind::kAnalysis>(config);
 }
diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h
index 8ed183dae0b1b00f8e0014b2d9b470ac177152f0..21a7e9658bbeeb16d4cbff6364aaef68edcae16d 100644
--- a/paddle/fluid/inference/api/analysis_predictor.h
+++ b/paddle/fluid/inference/api/analysis_predictor.h
@@ -486,7 +486,7 @@ class AnalysisPredictor : public PaddlePredictor {
   bool status_is_cloned_{false};
 
   std::map<std::string, std::vector<std::vector<int32_t>>> shape_info_;
-  int clone_num_{1};
+  static int clone_num_;
 
 #if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \
     !defined(PADDLE_WITH_ASCEND_CL)
diff --git a/paddle/fluid/inference/api/analysis_predictor_tester.cc b/paddle/fluid/inference/api/analysis_predictor_tester.cc
index 9c7e5c6b27e68ee10be5f8b56d6de4aea4524078..2c6e8f4f1a4d9ea0dfba8f400c7d3782a5e2c32d 100644
--- a/paddle/fluid/inference/api/analysis_predictor_tester.cc
+++ b/paddle/fluid/inference/api/analysis_predictor_tester.cc
@@ -357,6 +357,24 @@ TEST(AnalysisPredictor, set_xpu_device_id) {
 }
 #endif
 
+TEST(AnalysisPredictor, enable_onnxruntime) {
+  AnalysisConfig config;
+  config.EnableONNXRuntime();
+#ifdef PADDLE_WITH_ONNXRUNTIME
+  ASSERT_TRUE(config.use_onnxruntime());
+#else
+  ASSERT_TRUE(!config.use_onnxruntime());
+#endif
+  config.EnableORTOptimization();
+#ifdef PADDLE_WITH_ONNXRUNTIME
+  ASSERT_TRUE(config.ort_optimization_enabled());
+#else
+  ASSERT_TRUE(!config.ort_optimization_enabled());
+#endif
+  config.DisableONNXRuntime();
+  ASSERT_TRUE(!config.use_onnxruntime());
+}
+
 }  // namespace paddle
 
 namespace paddle_infer {
@@ -408,6 +426,14 @@ TEST(Predictor, Run) {
   predictor->TryShrinkMemory();
 }
 
+TEST(Predictor, EnableONNXRuntime) {
+  Config config;
+  config.SetModel(FLAGS_dirname);
+  config.EnableONNXRuntime();
+  config.EnableORTOptimization();
+  auto predictor = CreatePredictor(config);
+}
+
 TEST(Tensor, CpuShareExternalData) {
   Config config;
   config.SetModel(FLAGS_dirname);
diff --git a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt
index d03840ada36bce8cfdc2213284697e6d873cbde0..df98a7b05cf3f2035e9a21ec10e4b44eca843bbd 100644
--- a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt
+++ b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt
@@ -4,6 +4,7 @@ option(WITH_MKL        "Compile demo with MKL/OpenBlas support, default use MKL.
 option(WITH_GPU        "Compile demo with GPU/CPU, default use CPU."                    OFF)
 option(WITH_STATIC_LIB "Compile demo with static/shared library, default use static."   ON)
 option(USE_TENSORRT "Compile demo with TensorRT."   OFF)
+option(WITH_ONNXRUNTIME       "Compile demo with ONNXRuntime"       OFF)
 
 if(NOT WITH_STATIC_LIB)
   add_definitions("-DPADDLE_WITH_SHARED_LIB")
@@ -46,6 +47,13 @@ link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}gflags/lib")
 link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}xxhash/lib")
 link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}cryptopp/lib")
 link_directories("${PADDLE_LIB}/paddle/lib")
+if (WITH_ONNXRUNTIME)
+  include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/include")
+  include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}paddle2onnx/include")
+
+  link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib")
+  link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}paddle2onnx/lib")
+endif()
 
 if (WIN32)
   add_definitions("/DGOOGLE_GLOG_DLL_DECL=")
@@ -151,6 +159,17 @@ else()
   endif()
 endif()
 
+if (WITH_ONNXRUNTIME)
+  if(WIN32)
+    set(DEPS ${DEPS} ${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib/onnxruntime.lib paddle2onnx)
+  elseif(APPLE)
+    set(DEPS ${DEPS} ${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib/libonnxruntime.1.10.0.dylib paddle2onnx)
+  else()
+    set(DEPS ${DEPS} ${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib/libonnxruntime.so.1.10.0 paddle2onnx)
+  endif()
+endif()
+
+
 if (NOT WIN32)
   set(EXTERNAL_LIB "-lrt -ldl -lpthread")
   set(DEPS ${DEPS}
@@ -213,6 +232,14 @@ if(WIN32)
           COMMAND ${CMAKE_COMMAND} -E copy ${OPENBLAS_LIB_PATH}/lib/openblas.dll ${CMAKE_BINARY_DIR}/Release
     )
   endif()
+  if(WITH_ONNXRUNTIME)
+    add_custom_command(TARGET ${DEMO_NAME} POST_BUILD
+    COMMAND ${CMAKE_COMMAND} -E copy ${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib/onnxruntime.dll
+      ${CMAKE_BINARY_DIR}/${CMAKE_BUILD_TYPE}
+    COMMAND ${CMAKE_COMMAND} -E copy ${PADDLE_LIB_THIRD_PARTY_PATH}paddle2onnx/lib/paddle2onnx.dll
+      ${CMAKE_BINARY_DIR}/${CMAKE_BUILD_TYPE}
+    )
+  endif()
   if(NOT WITH_STATIC_LIB)
       add_custom_command(TARGET ${DEMO_NAME} POST_BUILD 
         COMMAND ${CMAKE_COMMAND} -E copy "${PADDLE_LIB}/paddle/lib/paddle_inference.dll" ${CMAKE_BINARY_DIR}/${CMAKE_BUILD_TYPE}
diff --git a/paddle/fluid/inference/api/demo_ci/onnxruntime_mobilenet_demo.cc b/paddle/fluid/inference/api/demo_ci/onnxruntime_mobilenet_demo.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ef5c08cd041eb7af4c7f17a95c4fd9b8601e4bad
--- /dev/null
+++ b/paddle/fluid/inference/api/demo_ci/onnxruntime_mobilenet_demo.cc
@@ -0,0 +1,64 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+/*
+ * This file contains demo of mobilenet for tensorrt.
+ */
+
+#include <glog/logging.h>  // use glog instead of CHECK to avoid importing other paddle header files.
+#include <vector>
+#include "gflags/gflags.h"
+#include "utils.h"  // NOLINT
+
+DEFINE_string(modeldir, "", "Directory of the inference model.");
+
+namespace paddle {
+namespace demo {
+
+/*
+ * Use the onnxruntime engine to inference the demo.
+ */
+void Main() {
+  paddle::AnalysisConfig config;
+  config.EnableONNXRuntime();
+  config.SetModel(FLAGS_modeldir + "/inference.pdmodel",
+                  FLAGS_modeldir + "/inference.pdiparams");
+  auto predictor = paddle_infer::CreatePredictor(config);
+
+  // Inference.
+  std::vector<int> input_shape = {1, 3, 224, 224};
+  std::vector<float> input_data(1 * 3 * 224 * 224, 1.0);
+  std::vector<float> out_data;
+  out_data.resize(1000);
+  auto input_names = predictor->GetInputNames();
+  auto output_names = predictor->GetOutputNames();
+  auto input_tensor = predictor->GetInputHandle(input_names[0]);
+  input_tensor->Reshape(input_shape);
+  auto output_tensor = predictor->GetOutputHandle(output_names[0]);
+
+  input_tensor->CopyFromCpu(input_data.data());
+  predictor->Run();
+  output_tensor->CopyToCpu(out_data.data());
+
+  VLOG(3) << "output.size " << out_data.size();
+}
+
+}  // namespace demo
+}  // namespace paddle
+
+int main(int argc, char** argv) {
+  ::GFLAGS_NAMESPACE::ParseCommandLineFlags(&argc, &argv, true);
+  paddle::demo::Main();
+  return 0;
+}
diff --git a/paddle/fluid/inference/api/demo_ci/run.sh b/paddle/fluid/inference/api/demo_ci/run.sh
index 5f062e8063253a08466b2491e80417af07047394..79a31555c7f0b1cb4a8d9c48bae16145d605935b 100755
--- a/paddle/fluid/inference/api/demo_ci/run.sh
+++ b/paddle/fluid/inference/api/demo_ci/run.sh
@@ -21,7 +21,8 @@ TEST_GPU_CPU=$3 # test both GPU/CPU mode or only CPU mode
 DATA_DIR=$4 # dataset
 USE_TENSORRT=$5
 TENSORRT_ROOT_DIR=$6 # TensorRT root dir, default to /usr
-MSVC_STATIC_CRT=$7
+WITH_ONNXRUNTIME=$7
+MSVC_STATIC_CRT=$8
 inference_install_dir=${PADDLE_ROOT}/build/paddle_inference_install_dir
 WIN_DETECT=$(echo `uname` | grep "Win") # detect current platform
 
@@ -38,6 +39,26 @@ else
   use_gpu_list='false'
 fi
 
+mkdir -p $DATA_DIR
+cd $DATA_DIR
+
+if [ $7 == ON ]; then
+  ONNXRUNTIME_LIB=${inference_install_dir}/third_party/install/onnxruntime/lib
+  export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${ONNXRUNTIME_LIB}
+  PADDLE2ONNX_LIB=${inference_install_dir}/third_party/install/paddle2onnx/lib
+  export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${PADDLE2ONNX_LIB}
+  #download model
+  mkdir -p MobileNetV2
+  cd MobileNetV2
+  if [[ -e "MobileNetV2.inference.model.tar.gz" ]]; then
+    echo "MobileNetV2.inference.model.tar.gz has been downloaded."
+  else
+    wget -q --no-proxy http://paddle-inference-dist.bj.bcebos.com/MobileNetV2.inference.model.tar.gz
+    tar xzf *.tar.gz
+  fi
+  cd ..
+fi
+
 PREFIX=inference-vis-demos%2F
 URL_ROOT=http://paddlemodels.bj.bcebos.com/${PREFIX}
 
@@ -58,8 +79,7 @@ function download() {
   fi
   cd ..
 }
-mkdir -p $DATA_DIR
-cd $DATA_DIR
+
 vis_demo_list='se_resnext50 ocr mobilenet'
 for vis_demo_name in $vis_demo_list; do
   download $vis_demo_name
@@ -93,7 +113,8 @@ for WITH_STATIC_LIB in ON OFF; do
       -DDEMO_NAME=simple_on_word2vec \
       -DWITH_GPU=$TEST_GPU_CPU \
       -DWITH_STATIC_LIB=$WITH_STATIC_LIB \
-      -DMSVC_STATIC_CRT=$MSVC_STATIC_CRT
+      -DMSVC_STATIC_CRT=$MSVC_STATIC_CRT \
+      -DWITH_ONNXRUNTIME=$WITH_ONNXRUNTIME
     msbuild  /maxcpucount /property:Configuration=Release cpp_inference_demo.sln
     for use_gpu in $use_gpu_list; do
       Release/simple_on_word2vec.exe \
@@ -112,7 +133,8 @@ for WITH_STATIC_LIB in ON OFF; do
       -DDEMO_NAME=vis_demo \
       -DWITH_GPU=$TEST_GPU_CPU \
       -DWITH_STATIC_LIB=$WITH_STATIC_LIB \
-      -DMSVC_STATIC_CRT=$MSVC_STATIC_CRT
+      -DMSVC_STATIC_CRT=$MSVC_STATIC_CRT \
+      -DWITH_ONNXRUNTIME=$WITH_ONNXRUNTIME
     msbuild  /maxcpucount /property:Configuration=Release cpp_inference_demo.sln
     for use_gpu in $use_gpu_list; do
       for vis_demo_name in $vis_demo_list; do
@@ -138,7 +160,8 @@ for WITH_STATIC_LIB in ON OFF; do
         -DWITH_STATIC_LIB=$WITH_STATIC_LIB \
         -DMSVC_STATIC_CRT=$MSVC_STATIC_CRT \
         -DUSE_TENSORRT=$USE_TENSORRT \
-        -DTENSORRT_ROOT=$TENSORRT_ROOT_DIR
+        -DTENSORRT_ROOT=$TENSORRT_ROOT_DIR \
+        -DWITH_ONNXRUNTIME=$WITH_ONNXRUNTIME
       msbuild  /maxcpucount /property:Configuration=Release cpp_inference_demo.sln
       Release/trt_mobilenet_demo.exe \
         --modeldir=$DATA_DIR/mobilenet/model \
@@ -156,7 +179,8 @@ for WITH_STATIC_LIB in ON OFF; do
       -DWITH_MKL=$TURN_ON_MKL \
       -DDEMO_NAME=simple_on_word2vec \
       -DWITH_GPU=$TEST_GPU_CPU \
-      -DWITH_STATIC_LIB=$WITH_STATIC_LIB
+      -DWITH_STATIC_LIB=$WITH_STATIC_LIB \
+      -DWITH_ONNXRUNTIME=$WITH_ONNXRUNTIME
     make -j$(nproc)
     word2vec_model=$DATA_DIR'/word2vec/word2vec.inference.model'
     if [ -d $word2vec_model ]; then
@@ -176,7 +200,8 @@ for WITH_STATIC_LIB in ON OFF; do
       -DWITH_MKL=$TURN_ON_MKL \
       -DDEMO_NAME=vis_demo \
       -DWITH_GPU=$TEST_GPU_CPU \
-      -DWITH_STATIC_LIB=$WITH_STATIC_LIB
+      -DWITH_STATIC_LIB=$WITH_STATIC_LIB \
+      -DWITH_ONNXRUNTIME=$WITH_ONNXRUNTIME
     make -j$(nproc)
     for use_gpu in $use_gpu_list; do
       for vis_demo_name in $vis_demo_list; do
@@ -200,7 +225,8 @@ for WITH_STATIC_LIB in ON OFF; do
         -DWITH_GPU=$TEST_GPU_CPU \
         -DWITH_STATIC_LIB=$WITH_STATIC_LIB \
         -DUSE_TENSORRT=$USE_TENSORRT \
-        -DTENSORRT_ROOT=$TENSORRT_ROOT_DIR
+        -DTENSORRT_ROOT=$TENSORRT_ROOT_DIR \
+        -DWITH_ONNXRUNTIME=$WITH_ONNXRUNTIME
       make -j$(nproc)
       ./trt_mobilenet_demo \
         --modeldir=$DATA_DIR/mobilenet/model \
@@ -211,6 +237,26 @@ for WITH_STATIC_LIB in ON OFF; do
         exit 1
       fi
     fi
+
+    # --------onnxruntime mobilenetv2 on linux/mac------
+    if [ $WITH_ONNXRUNTIME == ON ]; then
+      rm -rf *
+      cmake .. -DPADDLE_LIB=${inference_install_dir} \
+        -DWITH_MKL=$TURN_ON_MKL \
+        -DDEMO_NAME=onnxruntime_mobilenet_demo \
+        -DWITH_GPU=$TEST_GPU_CPU \
+        -DWITH_STATIC_LIB=$WITH_STATIC_LIB \
+        -DUSE_TENSORRT=$USE_TENSORRT \
+        -DTENSORRT_ROOT=$TENSORRT_ROOT_DIR \
+        -DWITH_ONNXRUNTIME=$WITH_ONNXRUNTIME
+      make -j$(nproc)
+      ./onnxruntime_mobilenet_demo \
+        --modeldir=$DATA_DIR/MobileNetV2/MobileNetV2
+      if [ $? -ne 0 ]; then
+        echo "onnxruntime demo onnxruntime_mobilenet_demo runs fail."
+        exit 1
+      fi
+    fi
   fi
 done
 set +x
diff --git a/paddle/fluid/inference/api/onnxruntime_predictor.cc b/paddle/fluid/inference/api/onnxruntime_predictor.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ee82da139d8f39c26002763c4a4835050c48fc99
--- /dev/null
+++ b/paddle/fluid/inference/api/onnxruntime_predictor.cc
@@ -0,0 +1,354 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/inference/api/onnxruntime_predictor.h"
+
+#include <glog/logging.h>
+
+#include <algorithm>
+#include <fstream>
+#include <memory>
+#include <set>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "paddle/fluid//platform/device/gpu/gpu_types.h"
+#include "paddle/fluid/framework/feed_fetch_method.h"
+#include "paddle/fluid/framework/feed_fetch_type.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/framework/var_type_traits.h"
+#include "paddle/fluid/framework/variable_helper.h"
+#include "paddle/fluid/framework/version.h"
+#include "paddle/fluid/inference/analysis/helper.h"
+#include "paddle/fluid/inference/analysis/passes/memory_optimize_pass.h"
+#include "paddle/fluid/inference/api/helper.h"
+#include "paddle/fluid/inference/api/paddle_inference_api.h"
+#include "paddle/fluid/inference/api/paddle_inference_pass.h"
+#include "paddle/fluid/inference/utils/io_utils.h"
+#include "paddle/fluid/memory/memcpy.h"
+#include "paddle/fluid/platform/cpu_helper.h"
+#include "paddle/fluid/platform/device/gpu/gpu_info.h"
+#include "paddle/fluid/platform/place.h"
+#include "paddle/fluid/platform/profiler.h"
+
+namespace paddle {
+
+framework::proto::VarType::Type ConvertONNXType(
+    ONNXTensorElementDataType type) {
+  switch (type) {
+    case ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT:
+      return framework::proto::VarType::FP32;
+    // case ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16:
+    //   return DataType::FP16;
+    case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT8:
+      return framework::proto::VarType::INT8;
+    case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32:
+      return framework::proto::VarType::INT32;
+    case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64:
+      return framework::proto::VarType::INT64;
+    case ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8:
+      return framework::proto::VarType::UINT8;
+    default:
+      LOG(ERROR) << "unsupported ONNX Tensor Type: " << static_cast<int>(type);
+      return framework::proto::VarType::FP32;
+  }
+}
+
+bool CheckConvertToONNX(const AnalysisConfig &config) {
+  if (!config.model_dir().empty()) {
+    LOG(ERROR) << "Paddle2ONNX not support model_dir config";
+    // TODO(heliqi jiangjiajun): Paddle2ONNX not support
+    // config.model_dir() + "/__model__"
+    // config.model_dir() + var_name
+    return false;
+  } else if (config.prog_file().empty() || config.params_file().empty()) {
+    LOG(ERROR) << string::Sprintf(
+        "not valid model path '%s' or program path '%s' or params path '%s'.",
+        config.model_dir(), config.prog_file(), config.params_file());
+    return false;
+  }
+  return paddle2onnx::IsExportable(config.prog_file(), config.params_file(),
+                                   config.model_from_memory());
+}
+
+bool ONNXRuntimePredictor::Init() {
+  VLOG(3) << "ONNXRuntime Predictor::init()";
+
+  // Now ONNXRuntime only suuport CPU
+  if (config_.use_gpu()) {
+    place_ = paddle::platform::CUDAPlace(config_.gpu_device_id());
+  } else {
+    place_ = paddle::platform::CPUPlace();
+  }
+  scope_.reset(new paddle::framework::Scope());
+  sub_scope_ = &scope_->NewScope();
+
+  std::string onnx_proto;
+  paddle2onnx::Export(config_.prog_file(), config_.params_file(), &onnx_proto,
+                      config_.model_from_memory());
+
+  Ort::SessionOptions session_options;
+  if (config_.ort_optimization_enabled()) {
+    session_options.SetGraphOptimizationLevel(
+        GraphOptimizationLevel::ORT_ENABLE_ALL);
+  }
+  // Turn optimization off first, and then turn it on when it's stable
+  // session_options.SetExecutionMode(ExecutionMode::ORT_SEQUENTIAL);
+  // session_options.EnableCpuMemArena();
+  // session_options.EnableMemPattern();
+  // session_options.SetInterOpNumThreads(config_.cpu_math_library_num_threads());
+  session_options.SetIntraOpNumThreads(config_.cpu_math_library_num_threads());
+  VLOG(2) << "ONNXRuntime threads " << config_.cpu_math_library_num_threads();
+  if (config_.profile_enabled()) {
+    LOG(WARNING) << "ONNXRuntime Profiler is activated, which might affect the "
+                    "performance";
+#if defined(_WIN32)
+    session_options.EnableProfiling(L"ONNX");
+#else
+    session_options.EnableProfiling("ONNX");
+#endif
+  } else {
+    VLOG(2) << "ONNXRuntime Profiler is deactivated, and no profiling report "
+               "will be "
+               "generated.";
+  }
+  session_ = {env_, onnx_proto.data(), onnx_proto.size(), session_options};
+
+  auto memory_info =
+      Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault);
+  Ort::Allocator allocator(session_, memory_info);
+
+  framework::proto::VarType::Type proto_type =
+      framework::proto::VarType::LOD_TENSOR;
+  size_t n_inputs = session_.GetInputCount();
+  for (size_t i = 0; i < n_inputs; ++i) {
+    auto input_name = session_.GetInputName(i, allocator);
+    auto type_info = session_.GetInputTypeInfo(i);
+    std::vector<int64_t> shape =
+        type_info.GetTensorTypeAndShapeInfo().GetShape();
+    ONNXTensorElementDataType data_type =
+        type_info.GetTensorTypeAndShapeInfo().GetElementType();
+    input_desc_.emplace_back(ONNXDesc{input_name, shape, data_type});
+    auto *ptr = scope_->Var(input_name);
+    framework::InitializeVariable(ptr, proto_type);
+    allocator.Free(input_name);
+  }
+
+  size_t n_outputs = session_.GetOutputCount();
+  for (size_t i = 0; i < n_outputs; ++i) {
+    auto output_name = session_.GetOutputName(i, allocator);
+    auto type_info = session_.GetOutputTypeInfo(i);
+    std::vector<int64_t> shape =
+        type_info.GetTensorTypeAndShapeInfo().GetShape();
+    ONNXTensorElementDataType data_type =
+        type_info.GetTensorTypeAndShapeInfo().GetElementType();
+    output_desc_.emplace_back(ONNXDesc{output_name, shape, data_type});
+    auto *ptr = scope_->Var(output_name);
+    framework::InitializeVariable(ptr, proto_type);
+    allocator.Free(output_name);
+  }
+
+  return true;
+}
+
+template <>
+std::unique_ptr<PaddlePredictor>
+CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kONNXRuntime>(
+    const AnalysisConfig &config) {
+  if (config.glog_info_disabled()) {
+    FLAGS_logtostderr = 1;
+    FLAGS_minloglevel = 2;  // GLOG_ERROR
+  }
+
+  PADDLE_ENFORCE_EQ(
+      config.is_valid(), true,
+      platform::errors::InvalidArgument(
+          "Note: Each config can only be used for one predictor."));
+
+  VLOG(3) << "create ONNXRuntimePredictor";
+
+  std::unique_ptr<PaddlePredictor> predictor(new ONNXRuntimePredictor(config));
+  // Each config can only be used for one predictor.
+  config.SetInValid();
+  auto predictor_p = dynamic_cast<ONNXRuntimePredictor *>(predictor.get());
+
+  if (!predictor_p->Init()) {
+    return nullptr;
+  }
+
+  return predictor;
+}
+
+std::vector<std::string> ONNXRuntimePredictor::GetInputNames() {
+  std::vector<std::string> input_names;
+  for (auto input_desc : input_desc_) {
+    input_names.push_back(input_desc.name);
+  }
+  return input_names;
+}
+
+std::map<std::string, std::vector<int64_t>>
+ONNXRuntimePredictor::GetInputTensorShape() {
+  std::map<std::string, std::vector<int64_t>> input_shapes;
+  for (auto input_desc : input_desc_) {
+    input_shapes[input_desc.name] = input_desc.shape;
+  }
+  return input_shapes;
+}
+
+std::vector<std::string> ONNXRuntimePredictor::GetOutputNames() {
+  std::vector<std::string> output_names;
+  for (auto output_desc : output_desc_) {
+    output_names.push_back(output_desc.name);
+  }
+  return output_names;
+}
+
+std::unique_ptr<ZeroCopyTensor> ONNXRuntimePredictor::GetInputTensor(
+    const std::string &name) {
+  PADDLE_ENFORCE_NOT_NULL(scope_->FindVar(name),
+                          platform::errors::PreconditionNotMet(
+                              "The in variable named %s is not found in the "
+                              "scope of the ONNXPredictor.",
+                              name));
+  std::unique_ptr<ZeroCopyTensor> res(
+      new ZeroCopyTensor(static_cast<void *>(scope_.get())));
+  res->input_or_output_ = true;
+  res->SetName(name);
+  if (platform::is_cpu_place(place_)) {
+    res->SetPlace(PaddlePlace::kCPU);
+  } else {
+    auto gpu_place = place_;
+    res->SetPlace(PaddlePlace::kGPU, gpu_place.GetDeviceId());
+  }
+  return res;
+}
+
+std::unique_ptr<ZeroCopyTensor> ONNXRuntimePredictor::GetOutputTensor(
+    const std::string &name) {
+  PADDLE_ENFORCE_NOT_NULL(scope_->FindVar(name),
+                          platform::errors::PreconditionNotMet(
+                              "The out variable named %s is not found in the "
+                              "scope of the ONNXPredictor.",
+                              name));
+  std::unique_ptr<ZeroCopyTensor> res(
+      new ZeroCopyTensor(static_cast<void *>(scope_.get())));
+  res->input_or_output_ = false;
+  res->SetName(name);
+  if (platform::is_cpu_place(place_)) {
+    res->SetPlace(PaddlePlace::kCPU);
+  } else {
+    auto gpu_place = place_;
+    res->SetPlace(PaddlePlace::kGPU, gpu_place.GetDeviceId());
+  }
+  return res;
+}
+
+Ort::Value ONNXRuntimePredictor::GetOrtValue(const ONNXDesc &desc,
+                                             const char *device_name) {
+  Ort::MemoryInfo memory_info(device_name, OrtDeviceAllocator,
+                              place_.GetDeviceId(), OrtMemTypeDefault);
+  auto *var = scope_->FindVar(desc.name);
+  auto *tensor = var->GetMutable<framework::LoDTensor>();
+  size_t size =
+      tensor->numel() *
+      framework::SizeOfType(framework::TransToProtoVarType(tensor->dtype()));
+  std::vector<int64_t> shape = phi::vectorize<int64_t>(tensor->dims());
+  return Ort::Value::CreateTensor(memory_info,
+                                  static_cast<void *>(tensor->data()), size,
+                                  shape.data(), shape.size(), desc.dtype);
+}
+
+void ONNXRuntimePredictor::AsTensor(const Ort::Value &value,
+                                    const ONNXDesc &desc) {
+  auto info = value.GetTensorTypeAndShapeInfo();
+
+  auto *var = scope_->FindVar(desc.name);
+  auto *tensor = var->GetMutable<framework::LoDTensor>();
+  tensor->Resize(phi::make_ddim(info.GetShape()));
+  auto dtype = ConvertONNXType(info.GetElementType());
+  auto *ptr = tensor->mutable_data(place_, dtype);
+
+  if (platform::is_cpu_place(place_)) {
+    std::memcpy(ptr, const_cast<void *>(value.GetTensorData<void>()),
+                tensor->numel() * framework::SizeOfType(dtype));
+  } else {
+    auto src_place = place_;
+    auto dst_place = place_;
+    memory::Copy(dst_place, ptr, src_place,
+                 const_cast<void *>(value.GetTensorData<void>()),
+                 tensor->numel() * framework::SizeOfType(dtype));
+  }
+}
+
+bool ONNXRuntimePredictor::Run(const std::vector<PaddleTensor> &inputs,
+                               std::vector<PaddleTensor> *output_data,
+                               int batch_size) {
+  LOG(ERROR) << "Not support Run";
+  return false;
+}
+
+bool ONNXRuntimePredictor::ZeroCopyRun() {
+  try {
+    Ort::IoBinding binding(session_);
+    std::vector<Ort::Value> inputs;
+    std::vector<Ort::Value> outputs;
+    Ort::RunOptions options;
+
+    inputs.reserve(input_desc_.size());
+    const char *device_name = config_.use_gpu() ? "Cuda" : "Cpu";
+    for (auto desc : input_desc_) {
+      inputs.push_back(GetOrtValue(desc, device_name));
+      binding.BindInput(desc.name.c_str(), inputs.back());
+    }
+
+    // TODO(heliqi): Optimization —— move to  Init()
+    for (auto desc : output_desc_) {
+      Ort::MemoryInfo memory_info(device_name, OrtDeviceAllocator,
+                                  place_.GetDeviceId(), OrtMemTypeDefault);
+      binding.BindOutput(desc.name.c_str(), memory_info);
+    }
+
+    session_.Run({}, binding);
+
+    outputs = binding.GetOutputValues();
+    for (size_t i = 0; i < output_desc_.size(); ++i) {
+      AsTensor(outputs[i], output_desc_[i]);
+    }
+  } catch (const std::exception &e) {
+    LOG(ERROR) << e.what();
+    return false;
+  }
+
+  return true;
+}
+
+std::unique_ptr<PaddlePredictor> ONNXRuntimePredictor::Clone() {
+  LOG(ERROR) << "Not support Clone(), Please create new Predictor";
+  return nullptr;
+}
+
+uint64_t ONNXRuntimePredictor::TryShrinkMemory() {
+  return paddle::memory::Release(place_);
+}
+
+ONNXRuntimePredictor::~ONNXRuntimePredictor() {
+  if (sub_scope_) {
+    scope_->DeleteScope(sub_scope_);
+  }
+  memory::Release(place_);
+}
+
+}  // namespace paddle
diff --git a/paddle/fluid/inference/api/onnxruntime_predictor.h b/paddle/fluid/inference/api/onnxruntime_predictor.h
new file mode 100644
index 0000000000000000000000000000000000000000..7fb07aa97bd2746773192456ddeba941a24e8906
--- /dev/null
+++ b/paddle/fluid/inference/api/onnxruntime_predictor.h
@@ -0,0 +1,225 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <algorithm>
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/naive_executor.h"
+#include "paddle/fluid/framework/op_compatible_info.h"
+#include "paddle/fluid/inference/analysis/analyzer.h"
+#include "paddle/fluid/inference/api/api_impl.h"
+#include "paddle/fluid/inference/api/details/reset_tensor_array.h"
+#include "paddle/fluid/inference/api/helper.h"
+#include "paddle/fluid/inference/api/paddle_inference_api.h"
+#include "paddle/fluid/platform/device/gpu/gpu_types.h"
+#include "paddle/fluid/string/printf.h"
+
+#include "onnxruntime_c_api.h"    // NOLINT
+#include "onnxruntime_cxx_api.h"  // NOLINT
+#include "paddle2onnx/converter.h"
+
+#ifdef PADDLE_WITH_TESTING
+#include <gtest/gtest.h>
+#include <gtest/gtest_prod.h>
+#endif
+
+///
+/// \file onnxruntime_predictor.h
+///
+/// \brief A predictor using ONNXRuntime
+///
+/// \author heliqi@baidu.com
+/// \date 2022-02-14
+/// \since 2.3.0
+///
+
+namespace paddle {
+
+bool CheckConvertToONNX(const AnalysisConfig &config);
+
+struct ONNXDesc {
+  std::string name;
+  std::vector<int64_t> shape;
+  ONNXTensorElementDataType dtype;
+};
+
+///
+/// \class ONNXRuntimePredictor
+///
+/// \brief The ONNXRuntimePredictor using ONNXRuntime for inference
+///
+/// The predictor has the following typical uses:
+///
+/// Get predictor
+/// \code{cpp}
+///   auto predictor = CreatePaddlePredictor(config);
+/// \endcode
+///
+/// Get input or output names
+/// \code{cpp}
+///   auto input_names = predictor->GetInputNames();
+///   auto output_names = predictor->GetOutputNames();
+/// \endcode
+///
+/// Get input or output tensors
+/// \code{cpp}
+///   auto input_t = predictor->GetInputTensor(input_names[0]);
+///   auto output_t = predictor->GetOutputTensor(output_names[0]);
+/// \endcode
+///
+/// Run predictor
+/// \code{cpp}
+///   predictor->ZeroCopyRun();
+/// \endcode
+///
+class ONNXRuntimePredictor : public PaddlePredictor {
+ public:
+  ///
+  /// \brief Construct a new ONNXRuntime Predictor object
+  ///
+  /// \param[in] AnalysisConfig config
+  ///
+  explicit ONNXRuntimePredictor(const AnalysisConfig &config)
+      : config_(config) {
+    predictor_id_ = inference::GetUniqueId();
+    env_ = Ort::Env(ORT_LOGGING_LEVEL_INFO, "onnx");
+  }
+  ///
+  /// \brief Destroy the ONNXRuntime Predictor object
+  ///
+  ~ONNXRuntimePredictor();
+
+  ///
+  /// \brief Initialize predictor
+  ///
+  /// \return Whether the init function executed successfully
+  ///
+  bool Init();
+
+  ///
+  /// \brief Get the input names
+  ///
+  /// \return input names
+  ///
+  std::vector<std::string> GetInputNames();
+
+  ///
+  /// \brief Get the output names
+  ///
+  /// \return output names
+  ///
+  std::vector<std::string> GetOutputNames();
+
+  ///
+  /// \brief Get the Input Tensor object
+  ///
+  /// \param[in] name input name
+  /// \return input tensor
+  ///
+  std::unique_ptr<ZeroCopyTensor> GetInputTensor(
+      const std::string &name) override;
+
+  ///
+  /// \brief Get the Output Tensor object
+  ///
+  /// \param[in] name otuput name
+  /// \return output tensor
+  ///
+  std::unique_ptr<ZeroCopyTensor> GetOutputTensor(
+      const std::string &name) override;
+  ///
+  /// \brief Get all input names and their corresponding shapes
+  ///
+  /// \return the map of input names and shapes
+  ///
+  std::map<std::string, std::vector<int64_t>> GetInputTensorShape() override;
+
+  /// Not supoort
+  bool Run(const std::vector<PaddleTensor> &inputs,
+           std::vector<PaddleTensor> *output_data,
+           int batch_size = -1) override;
+
+  ///
+  /// \brief Run the prediction engine
+  ///
+  /// \return Whether the function executed successfully
+  ///
+  bool ZeroCopyRun() override;
+
+  ///
+  /// \brief Release all tmp tensor to compress the size of the memory pool.
+  /// The memory pool is considered to be composed of a list of chunks, if
+  /// the chunk is not occupied, it can be released.
+  ///
+  /// \return Number of bytes released. It may be smaller than the actual
+  /// released memory, because part of the memory is not managed by the
+  /// MemoryPool.
+  ///
+  uint64_t TryShrinkMemory() override;
+  ///
+  /// \brief Clone to get the new predictor. thread safe.
+  ///
+  /// \return get a new predictor
+  ///
+  std::unique_ptr<PaddlePredictor> Clone() override;
+
+  std::shared_ptr<framework::Scope> scope_;
+
+ private:
+  ///
+  /// \brief get the Ort Value(input Tensor).
+  ///
+  /// \param[in] desc ONNXDesce(name、shape、dtype)
+  ///
+  /// \param[in] device_name "cpu" or "gpu" of device
+  ///
+  /// \return get a Ort::Value
+  ///
+  Ort::Value GetOrtValue(const ONNXDesc &desc, const char *device_name);
+
+  ///
+  /// \brief Ort::Value to Paddle::ZeroCopyTensor.
+  ///
+  /// \param[in] value Ort::Value(output Tensor)
+  ///
+  /// \param[in] desc a ONNXDesce(name、shape、dtype)
+  ///
+  /// \return get a Ort::Value
+  ///
+  void AsTensor(const Ort::Value &value, const ONNXDesc &desc);
+
+ private:
+  AnalysisConfig config_;
+
+  // ONNXRuntime
+  Ort::Env env_;
+  Ort::Session session_{nullptr};
+
+  platform::Place place_;
+  framework::Scope *sub_scope_{nullptr};
+  std::vector<ONNXDesc> input_desc_;
+  std::vector<ONNXDesc> output_desc_;
+  int predictor_id_;
+
+// Some more detailed tests, they are made the friends of the predictor, so that
+// the all the details can be tested.
+#if PADDLE_WITH_TESTING
+  FRIEND_TEST(ONNXRuntimePredictor, onnxruntime_on);
+#endif
+};
+
+}  // namespace paddle
diff --git a/paddle/fluid/inference/api/onnxruntime_predictor_tester.cc b/paddle/fluid/inference/api/onnxruntime_predictor_tester.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2be2de9c60bb1c3fdedf13212d50a6f4e155d4df
--- /dev/null
+++ b/paddle/fluid/inference/api/onnxruntime_predictor_tester.cc
@@ -0,0 +1,84 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/inference/api/onnxruntime_predictor.h"
+
+#include <glog/logging.h>
+#include <gtest/gtest.h>
+#include <string>
+#include <thread>  // NOLINT
+#include <vector>
+#include "paddle/fluid/framework/ir/pass.h"
+#include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/inference/api/helper.h"
+#include "paddle/fluid/inference/api/paddle_api.h"
+#include "paddle/fluid/inference/api/paddle_inference_api.h"
+#include "paddle/fluid/inference/tests/api/tester_helper.h"
+#include "paddle/fluid/inference/utils/io_utils.h"
+#include "paddle/fluid/platform/cpu_info.h"
+
+DEFINE_string(dirname, "", "dirname to tests.");
+
+namespace paddle {
+
+TEST(ONNXRuntimePredictor, onnxruntime_on) {
+  AnalysisConfig config;
+  config.SetModel(FLAGS_dirname + "/inference.pdmodel",
+                  FLAGS_dirname + "/inference.pdiparams");
+  config.EnableONNXRuntime();
+  config.EnableORTOptimization();
+  config.SetCpuMathLibraryNumThreads(2);
+  LOG(INFO) << config.Summary();
+
+  auto _predictor =
+      CreatePaddlePredictor<AnalysisConfig,
+                            paddle::PaddleEngineKind::kONNXRuntime>(config);
+  ASSERT_TRUE(_predictor);
+  auto* predictor = static_cast<ONNXRuntimePredictor*>(_predictor.get());
+
+  ASSERT_TRUE(predictor);
+  ASSERT_TRUE(!predictor->Clone());
+  ASSERT_TRUE(predictor->scope_);
+  ASSERT_TRUE(predictor->sub_scope_);
+  ASSERT_EQ(predictor->scope_->parent(), nullptr);
+  ASSERT_EQ(predictor->sub_scope_->parent(), predictor->scope_.get());
+  // Dummy Input Data
+  std::vector<int64_t> input_shape = {-1, 3, 224, 224};
+  std::vector<float> input_data(1 * 3 * 224 * 224, 1.0);
+  std::vector<float> out_data;
+  out_data.resize(1000);
+
+  // testing all interfaces
+  auto input_names = predictor->GetInputNames();
+  auto output_names = predictor->GetOutputNames();
+  auto get_input_shape = predictor->GetInputTensorShape();
+
+  ASSERT_EQ(input_names.size(), 1UL);
+  ASSERT_EQ(output_names.size(), 1UL);
+  ASSERT_EQ(input_names[0], "inputs");
+  ASSERT_EQ(output_names[0], "save_infer_model/scale_0.tmp_1");
+  ASSERT_EQ(get_input_shape["inputs"], input_shape);
+
+  auto input_tensor = predictor->GetInputTensor(input_names[0]);
+  input_tensor->Reshape({1, 3, 224, 224});
+  auto output_tensor = predictor->GetOutputTensor(output_names[0]);
+
+  input_tensor->CopyFromCpu(input_data.data());
+  ASSERT_TRUE(predictor->ZeroCopyRun());
+  output_tensor->CopyToCpu(out_data.data());
+
+  predictor->TryShrinkMemory();
+}
+
+}  // namespace paddle
diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h
index b4a358394404fa7d28838a00c96290747f146a1f..7b765e3fa8a24ef1b81b68da8ba12dd8e5577572 100644
--- a/paddle/fluid/inference/api/paddle_analysis_config.h
+++ b/paddle/fluid/inference/api/paddle_analysis_config.h
@@ -319,6 +319,18 @@ struct PD_INFER_DECL AnalysisConfig {
   ///
   void EnableNpu(int device_id = 0);
   ///
+  /// \brief Turn on ONNXRuntime.
+  ///
+  void EnableONNXRuntime();
+  ///
+  /// \brief Turn off ONNXRuntime.
+  ///
+  void DisableONNXRuntime();
+  ///
+  /// \brief Turn on ONNXRuntime Optimization.
+  ///
+  void EnableORTOptimization();
+  ///
   /// \brief A boolean state telling whether the GPU is turned on.
   ///
   /// \return bool Whether the GPU is turned on.
@@ -342,6 +354,19 @@ struct PD_INFER_DECL AnalysisConfig {
   ///
   bool use_ipu() const { return use_ipu_; }
   ///
+  /// \brief A boolean state telling whether the ONNXRuntime is turned on.
+  ///
+  /// \return bool Whether the ONNXRuntime is turned on.
+  ///
+  bool use_onnxruntime() const { return use_onnxruntime_; }
+  ///
+  /// \brief A boolean state telling whether the ONNXRuntime Optimization is
+  /// turned on.
+  ///
+  /// \return bool Whether the ONNXRuntime Optimization is turned on.
+  ///
+  bool ort_optimization_enabled() const { return enable_ort_optimization_; }
+  ///
   /// \brief Get the GPU device id.
   ///
   /// \return int The GPU device id.
@@ -841,6 +866,10 @@ struct PD_INFER_DECL AnalysisConfig {
   bool use_npu_{false};
   int npu_device_id_{0};
 
+  // ONNXRuntime related
+  bool use_onnxruntime_{false};
+  bool enable_ort_optimization_{false};
+
   // Padding related
   bool use_fc_padding_{true};
 
diff --git a/paddle/fluid/inference/api/paddle_api.h b/paddle/fluid/inference/api/paddle_api.h
index c129efe494b4fb36bc72d3c93e24951ba7fef322..657dd9b600cce7173e3aa8d0156ba0975199cf98 100644
--- a/paddle/fluid/inference/api/paddle_api.h
+++ b/paddle/fluid/inference/api/paddle_api.h
@@ -192,6 +192,7 @@ class PD_INFER_DECL ZeroCopyTensor : public paddle_infer::Tensor {
 
  private:
   friend class AnalysisPredictor;
+  friend class ONNXRuntimePredictor;
   explicit ZeroCopyTensor(void* scope) : paddle_infer::Tensor{scope} {}
 };
 
@@ -381,6 +382,7 @@ enum class PaddleEngineKind {
   kNative = 0,         ///< Use the native Fluid facility.
   kAutoMixedTensorRT,  ///< Automatically mix Fluid with TensorRT.
   kAnalysis,           ///< More optimization.
+  kONNXRuntime,        ///< Use ONNXRuntime
 };
 
 template <typename ConfigT, PaddleEngineKind engine>
@@ -395,6 +397,11 @@ template <>
 PD_INFER_DECL std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
     AnalysisConfig, PaddleEngineKind::kAnalysis>(const AnalysisConfig& config);
 
+template <>
+PD_INFER_DECL std::unique_ptr<PaddlePredictor>
+CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kONNXRuntime>(
+    const AnalysisConfig& config);
+
 PD_INFER_DECL int PaddleDtypeSize(PaddleDType dtype);
 
 PD_INFER_DECL std::string get_version();
diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc
index f5f36d805b43ea0815683e3b65bf157fe5beb2de..22d9dedb32ebfcc229e0034cc5cf6092907dc8df 100644
--- a/paddle/fluid/inference/api/paddle_pass_builder.cc
+++ b/paddle/fluid/inference/api/paddle_pass_builder.cc
@@ -262,6 +262,7 @@ void CpuPassStrategy::EnableMKLDNN() {
              //  "fc_act_mkldnn_fuse_pass",
              "batch_norm_act_fuse_pass",              //
              "softplus_activation_mkldnn_fuse_pass",  //
+             "elt_act_mkldnn_fuse_pass",              //
              // TODO(intel): Please fix the bug on windows.
              // https://github.com/PaddlePaddle/Paddle/issues/29710
              // "mkldnn_inplace_pass",  // This pass should be activated after
diff --git a/paddle/fluid/inference/capi_exp/pd_config.cc b/paddle/fluid/inference/capi_exp/pd_config.cc
index e342190fda1aca53a6814806e1afec1335224d79..d7b07652babbd1e24e2c650ac8ac079f03523d12 100644
--- a/paddle/fluid/inference/capi_exp/pd_config.cc
+++ b/paddle/fluid/inference/capi_exp/pd_config.cc
@@ -126,6 +126,26 @@ PD_Bool PD_ConfigUseGpu(__pd_keep PD_Config* pd_config) {
   return config->use_gpu();
 }
 
+void PD_ConfigEnableONNXRuntime(__pd_keep PD_Config* pd_config) {
+  CHECK_AND_CONVERT_PD_CONFIG;
+  config->EnableONNXRuntime();
+}
+
+void PD_ConfigDisableONNXRuntime(__pd_keep PD_Config* pd_config) {
+  CHECK_AND_CONVERT_PD_CONFIG;
+  config->DisableONNXRuntime();
+}
+
+PD_Bool PD_ConfigONNXRuntimeEnabled(__pd_keep PD_Config* pd_config) {
+  CHECK_AND_CONVERT_PD_CONFIG;
+  return config->use_onnxruntime();
+}
+
+void PD_ConfigEnableORTOptimization(__pd_keep PD_Config* pd_config) {
+  CHECK_AND_CONVERT_PD_CONFIG;
+  config->EnableORTOptimization();
+}
+
 void PD_ConfigEnableXpu(__pd_keep PD_Config* pd_config,
                         int32_t l3_workspace_size, PD_Bool locked,
                         PD_Bool autotune, const char* autotune_file,
diff --git a/paddle/fluid/inference/capi_exp/pd_config.h b/paddle/fluid/inference/capi_exp/pd_config.h
index c314aca918f141d30661d9034656899bbb816063..f6b754cad213f8d5249317468b5ceb21e863f6ad 100644
--- a/paddle/fluid/inference/capi_exp/pd_config.h
+++ b/paddle/fluid/inference/capi_exp/pd_config.h
@@ -152,6 +152,34 @@ PADDLE_CAPI_EXPORT extern void PD_ConfigDisableGpu(
 PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigUseGpu(
     __pd_keep PD_Config* pd_config);
 ///
+/// \brief Turn on ONNXRuntime.
+///
+/// \param[in] pd_onfig config
+///
+PADDLE_CAPI_EXPORT extern void PD_ConfigEnableONNXRuntime(
+    __pd_keep PD_Config* pd_config);
+///
+/// \brief Turn off ONNXRuntime.
+///
+/// \param[in] pd_onfig config
+///
+PADDLE_CAPI_EXPORT extern void PD_ConfigDisableONNXRuntime(
+    __pd_keep PD_Config* pd_config);
+///
+/// \brief A boolean state telling whether the ONNXRutnime is turned on.
+///
+/// \return Whether the ONNXRuntime is turned on.
+///
+PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigONNXRuntimeEnabled(
+    __pd_keep PD_Config* pd_config);
+///
+/// \brief Turn on ONNXRuntime Optimization.
+///
+/// \param[in] pd_onfig config
+///
+PADDLE_CAPI_EXPORT extern void PD_ConfigEnableORTOptimization(
+    __pd_keep PD_Config* pd_config);
+///
 /// \brief Turn on XPU.
 ///
 /// \param[in] pd_onfig config
diff --git a/paddle/fluid/inference/goapi/config.go b/paddle/fluid/inference/goapi/config.go
index def26913b0a1c082b3a983cea5fa8021c468b59c..8f9f34c06b4768317d6f710ac49a7610a9ef9d6a 100644
--- a/paddle/fluid/inference/goapi/config.go
+++ b/paddle/fluid/inference/goapi/config.go
@@ -160,6 +160,36 @@ func (config *Config) EnableUseGpu(memorySize uint64, deviceId int32) {
 	C.PD_ConfigEnableUseGpu(config.c, C.uint64_t(memorySize), C.int32_t(deviceId))
 }
 
+///
+/// \brief Turn on ONNXRuntime.
+///
+func (config *Config) EnableONNXRuntime() {
+	C.PD_ConfigEnableONNXRuntime(config.c)
+}
+
+///
+/// \brief Turn off ONNXRuntime.
+///
+func (config *Config) DisableONNXRuntime() {
+	C.PD_ConfigDisableONNXRuntime(config.c)
+}
+
+///
+/// \brief A boolean state telling whether the ONNXRuntime is turned on.
+///
+/// \return bool Whether the ONNXRuntime is turned on.
+///
+func (config *Config) ONNXRuntimeEnabled() bool {
+	return cvtPDBoolToGo(C.PD_ConfigONNXRuntimeEnabled(config.c))
+}
+
+///
+/// \brief Turn on ONNXRuntime Optimization.
+///
+func (config *Config) EnableORTOptimization() {
+	C.PD_ConfigEnableORTOptimization(config.c)
+}
+
 ///
 /// \brief Turn on XPU.
 ///
diff --git a/paddle/fluid/inference/goapi/config_test.go b/paddle/fluid/inference/goapi/config_test.go
index b82161880839e500a20b787914e2827da151106b..297841dcbcf6c19aef4a536557ec30e76ea9f82c 100644
--- a/paddle/fluid/inference/goapi/config_test.go
+++ b/paddle/fluid/inference/goapi/config_test.go
@@ -122,3 +122,20 @@ func TestMkldnn(t *testing.T) {
 
 	config.SetBfloat16Op([]string{"fc", "mul"})
 }
+
+func TestONNXRuntime(t *testing.T) {
+	config := NewConfig()
+	config.SetModelDir("modelDir")
+	t.Log(config.ModelDir())
+
+	config.EnableONNXRuntime()
+	t.Logf("ONNXRuntimeEnabled:%+v", config.ONNXRuntimeEnabled())
+
+	config.DisableONNXRuntime()
+	t.Logf("ONNXRuntimeEnabled:%+v", config.ONNXRuntimeEnabled())
+
+	config.EnableORTOptimization()
+
+	config.SetCpuMathLibraryNumThreads(4)
+	t.Logf("CpuMathLibraryNumThreads:%+v", config.CpuMathLibraryNumThreads())
+}
\ No newline at end of file
diff --git a/paddle/fluid/inference/goapi/predictor_test.go b/paddle/fluid/inference/goapi/predictor_test.go
index 40e518304510c57fec9cd7609ecbd6eefa456050..755558f96238d11842f8245c2b36210c60d8a057 100644
--- a/paddle/fluid/inference/goapi/predictor_test.go
+++ b/paddle/fluid/inference/goapi/predictor_test.go
@@ -66,6 +66,42 @@ func TestNewPredictor(t *testing.T) {
 	cloned.ClearIntermediateTensor()
 }
 
+func TestONNXRuntimePredictor(t *testing.T) {
+	t.Logf("Version:\n%+v", Version())
+	config := NewConfig()
+	config.SetModel("./mobilenetv1/inference.pdmodel", "./mobilenetv1/inference.pdiparams")
+	config.EnableONNXRuntime()
+	config.EnableORTOptimization()
+	predictor := NewPredictor(config)
+	inNames := predictor.GetInputNames()
+	t.Logf("InputNames:%+v", inNames)
+	outNames := predictor.GetOutputNames()
+	t.Logf("OutputNames:%+v", outNames)
+
+	inHandle := predictor.GetInputHandle(inNames[0])
+	inHandle.Reshape([]int32{1, 3, 224, 224})
+	t.Logf("inHandle name:%+v, shape:%+v", inHandle.Name(), inHandle.Shape())
+
+	data := make([]float32, numElements([]int32{1, 3, 224, 224}))
+	for i := 0; i < int(numElements([]int32{1, 3, 224, 224})); i++ {
+		data[i] = float32(i%255) * 0.1
+	}
+	inHandle.CopyFromCpu(data)
+	t.Logf("inHandle Type:%+v", inHandle.Type())
+
+	predictor.Run()
+
+	outHandle := predictor.GetOutputHandle(outNames[0])
+	t.Logf("outHandle name:%+v", outHandle.Name())
+
+	outShape := outHandle.Shape()
+	t.Logf("outHandle Shape:%+v", outShape)
+	outData := make([]float32, numElements(outShape))
+	outHandle.CopyToCpu(outData)
+	t.Log(outData)
+}
+
+
 func TestFromBuffer(t *testing.T) {
 	modelFile, err := os.Open("./mobilenetv1/inference.pdmodel")
 	if err != nil {
diff --git a/paddle/fluid/inference/goapi/test.sh b/paddle/fluid/inference/goapi/test.sh
index edccc2648c012fda9e22c2fc14ffe4f90dc26cfe..cff9fd4aa7ceada2a37d9650c9ce3653f0155447 100644
--- a/paddle/fluid/inference/goapi/test.sh
+++ b/paddle/fluid/inference/goapi/test.sh
@@ -22,6 +22,7 @@ fi
 
 # 2. set LD_LIBRARY_PATH
 export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$PWD/paddle_inference_c/third_party/install/mklml/lib/:$PWD/paddle_inference_c/third_party/install/mkldnn/lib/:$PWD/paddle_inference_c/paddle/lib/
+export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$PWD/paddle_inference_c/third_party/install/onnxruntime/lib/:$PWD/paddle_inference_c/third_party/install/paddle2onnx/lib/
 
 # 3. go test
 go clean -testcache
diff --git a/paddle/fluid/inference/tensorrt/convert/dropout_op.cc b/paddle/fluid/inference/tensorrt/convert/dropout_op.cc
index 8c61200f7f57cdf57b372c37c8f7cea40c4a8d4c..b69292827aa136fd1d8a1f66d80823e6344a6174 100644
--- a/paddle/fluid/inference/tensorrt/convert/dropout_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/dropout_op.cc
@@ -89,5 +89,5 @@ class DropoutOpConverter : public OpConverter {
 }  // namespace inference
 }  // namespace paddle
 
-USE_OP(dropout);
+USE_OP_ITSELF(dropout);
 REGISTER_TRT_OP_CONVERTER(dropout, DropoutOpConverter);
diff --git a/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc b/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc
index a432ff62810aa30c01c1980c80bf3f344039f7dd..f19b21d3e632633d7066c3e9e14cadd2900eb339 100644
--- a/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc
@@ -335,15 +335,37 @@ class MultiheadMatMulOpConverter : public OpConverter {
         reshape_before_fc_dim.d[4] = 1;
         auto* reshape_before_fc_layer =
             TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *input);
+        if (enable_int8) {
+          engine_->SetTensorDynamicRange(reshape_before_fc_layer->getOutput(0),
+                                         in_scale);
+        }
         reshape_before_fc_layer->setReshapeDimensions(reshape_before_fc_dim);
         reshape_before_fc_layer->setName(
             ("shuffle_before_multihead_mamul(Output: " + output_name + ")")
                 .c_str());
 
         // add layer fc
-        auto* fc_layer = TRT_ENGINE_ADD_LAYER(
-            engine_, FullyConnected, *reshape_before_fc_layer->getOutput(0), n,
-            weight.get(), bias.get());
+        nvinfer1::ILayer* fc_layer = nullptr;
+        if (enable_int8) {
+          nvinfer1::DimsHW nv_ksize(1, 1);
+          fc_layer = TRT_ENGINE_ADD_LAYER(
+              engine_, Convolution, *reshape_before_fc_layer->getOutput(0), n,
+              nv_ksize, weight.get(), bias.get());
+        } else {
+          fc_layer = TRT_ENGINE_ADD_LAYER(
+              engine_, FullyConnected, *reshape_before_fc_layer->getOutput(0),
+              n, weight.get(), bias.get());
+        }
+
+        if (enable_int8) {
+          PADDLE_ENFORCE_EQ(
+              op_desc.HasAttr("fc_out_threshold"), true,
+              platform::errors::InvalidArgument(
+                  "must have out threshold in multihead layers in int8 mode"));
+          float out_scale =
+              BOOST_GET_CONST(float, op_desc.GetAttr("fc_out_threshold"));
+          engine_->SetTensorDynamicRange(fc_layer->getOutput(0), out_scale);
+        }
         fc_layer->setName(
             ("multihead_mamul_fc(Output: " + output_name + ")").c_str());
 
@@ -359,6 +381,10 @@ class MultiheadMatMulOpConverter : public OpConverter {
         plugin_inputs.push_back(input_bias_qk);
         bool with_fp16 =
             engine_->WithFp16() && !engine_->disable_trt_plugin_fp16();
+
+        if (enable_int8) {
+          with_fp16 = 1;
+        }
         plugin::DynamicPluginTensorRT* plugin =
             new plugin::QkvToContextPluginDynamic(hidden_in, head_number,
                                                   head_size, scale, with_fp16);
diff --git a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc
index fe04d552e40263a396059e3da59de4d51def67e0..7b65d2d7c97cca335f76f1d0399a25bcd8a00c92 100644
--- a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc
@@ -328,5 +328,5 @@ class Pool2dOpConverter : public OpConverter {
 }  // namespace inference
 }  // namespace paddle
 
-USE_OP(pool2d);
+USE_OP_ITSELF(pool2d);
 REGISTER_TRT_OP_CONVERTER(pool2d, Pool2dOpConverter);
diff --git a/paddle/fluid/inference/tensorrt/convert/pool3d_op.cc b/paddle/fluid/inference/tensorrt/convert/pool3d_op.cc
index b8e87a8d94d1f43d35da1a46c300a1b37c9382ec..5a306f622adbe7a298ab53daae1168ad50b402a9 100644
--- a/paddle/fluid/inference/tensorrt/convert/pool3d_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/pool3d_op.cc
@@ -224,5 +224,5 @@ class Pool3dOpConverter : public OpConverter {
 }  // namespace inference
 }  // namespace paddle
 
-USE_OP(pool3d);
+USE_OP_ITSELF(pool3d);
 REGISTER_TRT_OP_CONVERTER(pool3d, Pool3dOpConverter);
diff --git a/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc b/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc
index f2dc5ba1c7c2c832e0239f6a30760c354aaf4699..1946f9e28388e3ab6d1d580d0f7d91c1ef3e604f 100644
--- a/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc
@@ -52,7 +52,7 @@ TEST(Relu6OpConverter, main) { test_activation("relu6"); }
 }  // namespace inference
 }  // namespace paddle
 
-USE_OP(relu);
+USE_OP_ITSELF(relu);
 USE_OP(sigmoid);
-USE_OP(tanh);
+USE_OP_ITSELF(tanh);
 USE_OP(relu6);
diff --git a/paddle/fluid/inference/tensorrt/convert/test_conv2d_op.cc b/paddle/fluid/inference/tensorrt/convert/test_conv2d_op.cc
index 95916746d6fcb528d26a8f8bb39980b55c4f3704..b96992ef8514abe0f71dbf23d38abb626f6c4a5b 100644
--- a/paddle/fluid/inference/tensorrt/convert/test_conv2d_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/test_conv2d_op.cc
@@ -16,7 +16,7 @@ limitations under the License. */
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 #include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
 
-USE_OP(conv2d);
+USE_OP_ITSELF(conv2d);
 USE_OP(conv2d_transpose);
 
 namespace paddle {
diff --git a/paddle/fluid/inference/tensorrt/convert/test_dropout_op.cc b/paddle/fluid/inference/tensorrt/convert/test_dropout_op.cc
index 474fd92071fb0795b868f0cd86591061cf8b6581..cf377396087637f115523ddc60a468e2a23d57d4 100644
--- a/paddle/fluid/inference/tensorrt/convert/test_dropout_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/test_dropout_op.cc
@@ -57,4 +57,4 @@ TEST(DropoutOpConverter, main) {
 }  // namespace inference
 }  // namespace paddle
 
-USE_OP(dropout);
+USE_OP_ITSELF(dropout);
diff --git a/paddle/fluid/inference/tensorrt/convert/test_leaky_relu_op.cc b/paddle/fluid/inference/tensorrt/convert/test_leaky_relu_op.cc
index 1725888abc379bfa4ffbbc5cfc4cecd1872c7c18..f17e00de0eeb7c8f4d782f0a4eaecc2fc1df268b 100644
--- a/paddle/fluid/inference/tensorrt/convert/test_leaky_relu_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/test_leaky_relu_op.cc
@@ -45,4 +45,4 @@ TEST(leaky_relu_op, test_leaky_relu) {
 }  // namespace paddle
 
 // USE_OP(leaky_relu);
-USE_OP(leaky_relu);
+USE_OP_ITSELF(leaky_relu);
diff --git a/paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc b/paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc
index bded833505cd25352adc4123de415613d1fc926d..36f13262a73d703a6d9776855adbab3c44075aa7 100644
--- a/paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc
@@ -71,4 +71,4 @@ TEST(Pool2dOpConverter, avg_ceil_test) { test_pool2d(false, true, "avg"); }
 }  // namespace inference
 }  // namespace paddle
 
-USE_OP(pool2d);
+USE_OP_ITSELF(pool2d);
diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h
index b2764ca61c11219e5546867813157b7f05ee3ce8..d53a8923af6120adb460d95fc81820b6dfa03a60 100644
--- a/paddle/fluid/inference/tensorrt/engine.h
+++ b/paddle/fluid/inference/tensorrt/engine.h
@@ -54,6 +54,8 @@ TRT_DT FluidDataType2TRT(FluidDT type) {
       return TRT_DT::kFLOAT;
     case FluidDT::VarType_Type_INT32:
       return TRT_DT::kINT32;
+    case FluidDT::VarType_Type_FP16:
+      return TRT_DT::kHALF;
     default:
       return TRT_DT::kINT32;
   }
diff --git a/paddle/fluid/inference/tensorrt/plugin/pool3d_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/pool3d_op_plugin.cu
index 861a9aa9d000bff9e6dcc673cc5c8d99c3a7a6ec..5596a89a083fe9ff177aa9abc769b8fa27105c1f 100644
--- a/paddle/fluid/inference/tensorrt/plugin/pool3d_op_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/pool3d_op_plugin.cu
@@ -13,7 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/inference/tensorrt/plugin/pool3d_op_plugin.h"
-#include "paddle/fluid/operators/math/pooling.h"
+#include "paddle/phi/kernels/funcs/pooling.h"
 
 namespace paddle {
 namespace inference {
@@ -108,16 +108,14 @@ int Pool3DPlugin::enqueue(int batchSize, const void *const *inputs,
   output_shape.insert(output_shape.begin(), batchSize);
 
   if (pool3d_type_ == Pool3DType::max) {
-    paddle::operators::math::MaxPool<float> pool_process;
-    paddle::operators::math::Pool3dDirectCUDAFunctor<
-        paddle::operators::math::MaxPool<float>, float>
+    phi::funcs::MaxPool<float> pool_process;
+    phi::funcs::Pool3dDirectCUDAFunctor<phi::funcs::MaxPool<float>, float>
         pool3d_forward;
     pool3d_forward(idata, input_shape, output_shape, ksize_, strides_,
                    paddings_, true, adaptive_, odatas[0], stream, pool_process);
   } else if (pool3d_type_ == Pool3DType::avg) {
-    paddle::operators::math::AvgPool<float> pool_process;
-    paddle::operators::math::Pool3dDirectCUDAFunctor<
-        paddle::operators::math::AvgPool<float>, float>
+    phi::funcs::AvgPool<float> pool_process;
+    phi::funcs::Pool3dDirectCUDAFunctor<phi::funcs::AvgPool<float>, float>
         pool3d_forward;
     pool3d_forward(idata, input_shape, output_shape, ksize_, strides_,
                    paddings_, true, adaptive_, odatas[0], stream, pool_process);
@@ -351,16 +349,14 @@ int Pool3DPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc *input_desc,
   }
 
   if (pool3d_type_ == "max") {
-    paddle::operators::math::MaxPool<float> pool_process;
-    paddle::operators::math::Pool3dDirectCUDAFunctor<
-        paddle::operators::math::MaxPool<float>, float>
+    phi::funcs::MaxPool<float> pool_process;
+    phi::funcs::Pool3dDirectCUDAFunctor<phi::funcs::MaxPool<float>, float>
         pool3d_forward;
     pool3d_forward(input, input_shape, output_shape, ksize, strides_, paddings,
                    true, adaptive_, output, stream, pool_process);
   } else if (pool3d_type_ == "avg") {
-    paddle::operators::math::AvgPool<float> pool_process;
-    paddle::operators::math::Pool3dDirectCUDAFunctor<
-        paddle::operators::math::AvgPool<float>, float>
+    phi::funcs::AvgPool<float> pool_process;
+    phi::funcs::Pool3dDirectCUDAFunctor<phi::funcs::AvgPool<float>, float>
         pool3d_forward;
     pool3d_forward(input, input_shape, output_shape, ksize, strides_, paddings,
                    true, adaptive_, output, stream, pool_process);
diff --git a/paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.cu
index 6d711c26adc6ff8e49375d15f32322303f3ae6ef..9bfe98d759d8e29bc34b42fa667e5cda5f1493de 100644
--- a/paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.cu
@@ -13,7 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.h"
-#include "paddle/fluid/operators/math/pooling.h"
+#include "paddle/phi/kernels/funcs/pooling.h"
 
 namespace paddle {
 namespace inference {
@@ -84,16 +84,14 @@ int PoolPlugin::enqueue(int batchSize, const void *const *inputs,
   output_shape.insert(output_shape.begin(), batchSize);
 
   if (pool_type_ == PoolType::max) {
-    paddle::operators::math::MaxPool<float> pool_process;
-    paddle::operators::math::Pool2dDirectCUDAFunctor<
-        paddle::operators::math::MaxPool<float>, float>
+    phi::funcs::MaxPool<float> pool_process;
+    phi::funcs::Pool2dDirectCUDAFunctor<phi::funcs::MaxPool<float>, float>
         pool2d_forward;
     pool2d_forward(idata, input_shape, output_shape, ksize_, strides_,
                    paddings_, true, false, odatas[0], stream, pool_process);
   } else if (pool_type_ == PoolType::avg) {
-    paddle::operators::math::AvgPool<float> pool_process;
-    paddle::operators::math::Pool2dDirectCUDAFunctor<
-        paddle::operators::math::AvgPool<float>, float>
+    phi::funcs::AvgPool<float> pool_process;
+    phi::funcs::Pool2dDirectCUDAFunctor<phi::funcs::AvgPool<float>, float>
         pool2d_forward;
     pool2d_forward(idata, input_shape, output_shape, ksize_, strides_,
                    paddings_, exclusive_, adaptive_, odatas[0], stream,
@@ -292,16 +290,14 @@ int PoolPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc *input_desc,
   }
 
   if (pool_type_ == "max") {
-    paddle::operators::math::MaxPool<float> pool_process;
-    paddle::operators::math::Pool2dDirectCUDAFunctor<
-        paddle::operators::math::MaxPool<float>, float>
+    phi::funcs::MaxPool<float> pool_process;
+    phi::funcs::Pool2dDirectCUDAFunctor<phi::funcs::MaxPool<float>, float>
         pool2d_forward;
     pool2d_forward(input, input_shape, output_shape, ksize, strides_, paddings,
                    true, false, output, stream, pool_process);
   } else if (pool_type_ == "avg") {
-    paddle::operators::math::AvgPool<float> pool_process;
-    paddle::operators::math::Pool2dDirectCUDAFunctor<
-        paddle::operators::math::AvgPool<float>, float>
+    phi::funcs::AvgPool<float> pool_process;
+    phi::funcs::Pool2dDirectCUDAFunctor<phi::funcs::AvgPool<float>, float>
         pool2d_forward;
     pool2d_forward(input, input_shape, output_shape, ksize, strides_, paddings,
                    exclusive_, adaptive_, output, stream, pool_process);
diff --git a/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.cu
index 57177cfa8b421e1d79004bb1a7f738d98dc00f97..336005d883b0f523213060645e540c35a14e4e9c 100644
--- a/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.cu
@@ -16,7 +16,6 @@
 #include <cassert>
 
 #include "paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.h"
-#include "paddle/fluid/operators/detection/yolo_box_op.h"
 
 namespace paddle {
 namespace inference {
diff --git a/paddle/fluid/inference/tests/api/analyzer_capi_exp_pd_config_tester.cc b/paddle/fluid/inference/tests/api/analyzer_capi_exp_pd_config_tester.cc
index df0eb58c2bd587e69215602512cc51f19c97a978..a341ffd7a081c24500e3b061b0ce3510a2aaacbc 100644
--- a/paddle/fluid/inference/tests/api/analyzer_capi_exp_pd_config_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_capi_exp_pd_config_tester.cc
@@ -81,6 +81,18 @@ TEST(PD_Config, interface) {
   PD_ConfigSetBfloat16Op(config, 1, &ops_name);
 #endif
 
+  PD_ConfigEnableONNXRuntime(config);
+  bool onnxruntime_enabled = PD_ConfigONNXRuntimeEnabled(config);
+#ifdef PADDLE_WITH_ONNXRUNTIME
+  EXPECT_TRUE(onnxruntime_enabled);
+#else
+  EXPECT_FALSE(onnxruntime_enabled);
+#endif
+  PD_ConfigDisableONNXRuntime(config);
+  bool onnxruntime_disabled = PD_ConfigONNXRuntimeEnabled(config);
+  EXPECT_FALSE(onnxruntime_disabled);
+  PD_ConfigEnableORTOptimization(config);
+
   PD_ConfigEnableMemoryOptim(config, true);
   bool memory_enabled = PD_ConfigMemoryOptimEnabled(config);
   EXPECT_TRUE(memory_enabled);
diff --git a/paddle/fluid/inference/tests/infer_ut/CMakeLists.txt b/paddle/fluid/inference/tests/infer_ut/CMakeLists.txt
index 9d83f8ff8fdc4756450c0fe9ae4d7096d9afa76f..f376cbd4fb302b1d7a038d958465f24db653e220 100644
--- a/paddle/fluid/inference/tests/infer_ut/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/infer_ut/CMakeLists.txt
@@ -5,6 +5,7 @@ option(WITH_GPU        "Compile demo with GPU/CPU, default use CPU."
 option(WITH_STATIC_LIB "Compile demo with static/shared library, default use static."   OFF)
 option(USE_TENSORRT "Compile demo with TensorRT."   OFF)
 option(WITH_GTEST "Compile demo with GTEST"   OFF)
+option(WITH_ONNXRUNTIME       "Compile demo with ONNXRuntime"       OFF)
 
 if(NOT WITH_STATIC_LIB)
   add_definitions("-DPADDLE_WITH_SHARED_LIB")
@@ -45,6 +46,13 @@ link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}gflags/lib")
 link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}xxhash/lib")
 link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}cryptopp/lib")
 link_directories("${PADDLE_LIB}/paddle/lib")
+if (WITH_ONNXRUNTIME)
+  include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/include")
+  include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}paddle2onnx/include")
+
+  link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib")
+  link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}paddle2onnx/lib")
+endif()
 
 if (WIN32)
   add_definitions("/DGOOGLE_GLOG_DLL_DECL=")
@@ -172,6 +180,16 @@ else()
   endif()
 endif()
 
+if (WITH_ONNXRUNTIME)
+  if(WIN32)
+    set(DEPS ${DEPS} ${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib/onnxruntime.lib paddle2onnx)
+  elseif(APPLE)
+    set(DEPS ${DEPS} ${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib/libonnxruntime.1.10.0.dylib paddle2onnx)
+  else()
+    set(DEPS ${DEPS} ${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib/libonnxruntime.so.1.10.0 paddle2onnx)
+  endif()
+endif()
+
 if (NOT WIN32)
   set(EXTERNAL_LIB "-lrt -ldl -lpthread")
   set(DEPS ${DEPS}
@@ -248,6 +266,14 @@ if(WIN32)
           COMMAND ${CMAKE_COMMAND} -E copy ${OPENBLAS_LIB_PATH}/lib/openblas.dll ${CMAKE_BINARY_DIR}/Release
     )
   endif()
+  if(WITH_ONNXRUNTIME)
+    add_custom_command(TARGET ${DEMO_NAME} POST_BUILD
+    COMMAND ${CMAKE_COMMAND} -E copy ${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib/onnxruntime.dll
+      ${CMAKE_BINARY_DIR}/${CMAKE_BUILD_TYPE}
+    COMMAND ${CMAKE_COMMAND} -E copy ${PADDLE_LIB_THIRD_PARTY_PATH}paddle2onnx/lib/paddle2onnx.dll
+      ${CMAKE_BINARY_DIR}/${CMAKE_BUILD_TYPE}
+    )
+  endif()
   if(NOT WITH_STATIC_LIB)
       add_custom_command(TARGET ${DEMO_NAME} POST_BUILD 
         COMMAND ${CMAKE_COMMAND} -E copy "${PADDLE_LIB}/paddle/lib/paddle_inference.dll" ${CMAKE_BINARY_DIR}/${CMAKE_BUILD_TYPE}
diff --git a/paddle/fluid/inference/tests/infer_ut/run.sh b/paddle/fluid/inference/tests/infer_ut/run.sh
index dd4b64f28d739776ee750205d41b4dce35a97640..8123d3785003471fd5f63f24fbb1166913d7e571 100755
--- a/paddle/fluid/inference/tests/infer_ut/run.sh
+++ b/paddle/fluid/inference/tests/infer_ut/run.sh
@@ -20,7 +20,8 @@ TURN_ON_MKL=$2 # use MKL or Openblas
 TEST_GPU_CPU=$3 # test both GPU/CPU mode or only CPU mode
 DATA_DIR=$4 # dataset
 TENSORRT_ROOT_DIR=$5 # TensorRT ROOT dir, default to /usr/local/TensorRT
-MSVC_STATIC_CRT=$6
+WITH_ONNXRUNTIME=$6
+MSVC_STATIC_CRT=$7
 inference_install_dir=${PADDLE_ROOT}/build/paddle_inference_install_dir
 EXIT_CODE=0 # init default exit code
 WIN_DETECT=$(echo `uname` | grep "Win") # detect current platform
@@ -144,7 +145,8 @@ function compile_test() {
              -DMSVC_STATIC_CRT=$MSVC_STATIC_CRT \
              -DWITH_GTEST=ON \
              -DCMAKE_CXX_FLAGS='/std:c++17' \
-             -DCMAKE_BUILD_TYPE=Release
+             -DCMAKE_BUILD_TYPE=Release \
+             -DWITH_ONNXRUNTIME=$WITH_ONNXRUNTIME
         msbuild /maxcpucount /property:Configuration=Release ALL_BUILD.vcxproj
     else
         cmake .. -DPADDLE_LIB=${inference_install_dir} \
@@ -154,7 +156,8 @@ function compile_test() {
                  -DWITH_STATIC_LIB=OFF \
                  -DUSE_TENSORRT=$USE_TENSORRT \
                  -DTENSORRT_ROOT=$TENSORRT_ROOT_DIR \
-                 -DWITH_GTEST=ON
+                 -DWITH_GTEST=ON \
+                 -DWITH_ONNXRUNTIME=$WITH_ONNXRUNTIME
         make -j$(nproc)
     fi;
     cd -
diff --git a/paddle/fluid/inference/tests/test.cmake b/paddle/fluid/inference/tests/test.cmake
index 05c468b798886ac135ed30bff75ce9400f1ca3a1..6b6c0cd22f03b902f08d7a79236b1091b9fe6677 100644
--- a/paddle/fluid/inference/tests/test.cmake
+++ b/paddle/fluid/inference/tests/test.cmake
@@ -80,6 +80,14 @@ if(NOT EXISTS ${IMG_CLS_RESNET_INSTALL_DIR}/image_classification_resnet.inferenc
 endif()
 set(IMG_CLS_RESNET_MODEL_DIR "${IMG_CLS_RESNET_INSTALL_DIR}/image_classification_resnet.inference.model")
 
+if(WITH_ONNXRUNTIME)
+  set(MOBILENETV2_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/MobileNetV2")
+  if(NOT EXISTS ${MOBILENETV2_INSTALL_DIR}/MobileNetV2.inference.model.tar.gz)
+    inference_download_and_uncompress_without_verify(${MOBILENETV2_INSTALL_DIR} ${INFERENCE_URL} "MobileNetV2.inference.model.tar.gz")
+  endif()
+  set(MOBILENETV2_MODEL_DIR "${MOBILENETV2_INSTALL_DIR}/MobileNetV2")
+endif()
+
 function (inference_base_test_build TARGET)
    set(options "")
    set(oneValueArgs "")
diff --git a/paddle/fluid/memory/allocation/CMakeLists.txt b/paddle/fluid/memory/allocation/CMakeLists.txt
index 6cd7d87332323f4bafd49b8b16254f9610405658..f296ce96d4e5f6dca5c4ad2668eea8508b37068f 100644
--- a/paddle/fluid/memory/allocation/CMakeLists.txt
+++ b/paddle/fluid/memory/allocation/CMakeLists.txt
@@ -17,7 +17,7 @@ if (WITH_GPU)
   nv_library(cuda_allocator SRCS cuda_allocator.cc DEPS allocator cuda_device_guard)
   nv_library(cuda_managed_allocator SRCS cuda_managed_allocator.cc DEPS allocator cuda_device_guard gpu_info)
   nv_library(pinned_allocator SRCS pinned_allocator.cc DEPS allocator)
-  nv_library(stream_safe_cuda_allocator SRCS stream_safe_cuda_allocator.cc DEPS allocator)
+  nv_library(stream_safe_cuda_allocator SRCS stream_safe_cuda_allocator.cc DEPS allocator cuda_graph)
   nv_library(thread_local_allocator SRCS thread_local_allocator.cc DEPS allocator)
 
   cc_test(thread_local_allocator_test SRCS thread_local_allocator_test.cc DEPS thread_local_allocator)
@@ -131,4 +131,7 @@ cc_library(virtual_memory_auto_growth_best_fit_allocator SRCS virtual_memory_aut
 if(NOT WIN32)
   cc_library(mmap_allocator SRCS mmap_allocator.cc DEPS allocator)
   cc_test(mmap_allocator_test SRCS mmap_allocator_test.cc DEPS mmap_allocator allocator)
+  if (WITH_GPU)
+    cc_library(cuda_ipc_allocator SRCS cuda_ipc_allocator.cc DEPS allocator)
+  endif()
 endif(NOT WIN32)
diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc
index 4d0e485285146e5668793d29fd8effc789fcc339..61e292a922f0e98a958d4fe2f8fc7850bdf47e18 100644
--- a/paddle/fluid/memory/allocation/allocator_facade.cc
+++ b/paddle/fluid/memory/allocation/allocator_facade.cc
@@ -193,10 +193,10 @@ class AllocatorFacadePrivate {
         }
 #endif
 #ifdef PADDLE_WITH_CUSTOM_DEVICE
-        auto device_types = platform::DeviceManager::GetAllCustomDeviceTypes();
+        auto device_types = phi::DeviceManager::GetAllCustomDeviceTypes();
         for (const auto& dev_type : device_types) {
           for (size_t dev_id = 0;
-               dev_id < platform::DeviceManager::GetDeviceCount(dev_type);
+               dev_id < phi::DeviceManager::GetDeviceCount(dev_type);
                ++dev_id) {
             InitNaiveBestFitCustomDeviceAllocator(
                 platform::CustomPlace(dev_type, dev_id));
@@ -210,12 +210,7 @@ class AllocatorFacadePrivate {
         InitNaiveBestFitCPUAllocator();
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
         allow_free_idle_chunk_ = allow_free_idle_chunk;
-        if (FLAGS_use_stream_safe_cuda_allocator) {
-          for (int dev_id = 0; dev_id < platform::GetGPUDeviceCount();
-               ++dev_id) {
-            InitStreamSafeCUDAAllocator(platform::CUDAPlace(dev_id), nullptr);
-          }
-        } else {
+        if (!FLAGS_use_stream_safe_cuda_allocator) {
           for (int dev_id = 0; dev_id < platform::GetGPUDeviceCount();
                ++dev_id) {
             InitAutoGrowthCUDAAllocator(platform::CUDAPlace(dev_id),
@@ -240,10 +235,10 @@ class AllocatorFacadePrivate {
         }
 #endif
 #ifdef PADDLE_WITH_CUSTOM_DEVICE
-        auto device_types = platform::DeviceManager::GetAllCustomDeviceTypes();
+        auto device_types = phi::DeviceManager::GetAllCustomDeviceTypes();
         for (const auto& dev_type : device_types) {
           for (size_t dev_id = 0;
-               dev_id < platform::DeviceManager::GetDeviceCount(dev_type);
+               dev_id < phi::DeviceManager::GetDeviceCount(dev_type);
                ++dev_id) {
             InitAutoGrowthCustomDeviceAllocator(
                 platform::CustomPlace(dev_type, dev_id), allow_free_idle_chunk);
@@ -298,6 +293,12 @@ class AllocatorFacadePrivate {
     }
 
     CheckAllocThreadSafe();
+
+#ifdef PADDLE_WITH_CUDA
+    if (UNLIKELY(platform::CUDAGraph::IsThisThreadCapturing())) {
+      WrapCUDAGraphAllocator();
+    }
+#endif
   }
 
   inline const std::shared_ptr<Allocator>& GetAllocator(
@@ -388,39 +389,6 @@ class AllocatorFacadePrivate {
                                 allocation.get()));
     return stream_safe_cuda_allocation->GetOwningStream();
   }
-
-#ifdef PADDLE_WITH_CUDA
-  void PrepareMemoryPoolForCUDAGraph(CUDAGraphID id) {
-    PADDLE_ENFORCE_EQ(strategy_, AllocatorStrategy::kAutoGrowth,
-                      platform::errors::InvalidArgument(
-                          "CUDA Graph is only supported when the "
-                          "FLAGS_allocator_strategy=\"auto_growth\", but got "
-                          "FLAGS_allocator_strategy=\"%s\"",
-                          FLAGS_allocator_strategy));
-    auto& allocator = cuda_graph_allocator_map_[id];
-    PADDLE_ENFORCE_EQ(
-        allocator.get(), nullptr,
-        platform::errors::InvalidArgument(
-            "The memory pool of the CUDA Graph with ID %d have been prepared.",
-            id));
-    allocator.reset(
-        new AllocatorFacadePrivate(/*allow_free_idle_chunk=*/false));
-    for (auto& item : allocator->allocators_) {
-      auto& old_allocator = item.second;
-      old_allocator = CUDAGraphAllocator::Create(old_allocator);
-    }
-    VLOG(10) << "Prepare memory pool for CUDA Graph with ID " << id;
-  }
-
-  void RemoveMemoryPoolOfCUDAGraph(CUDAGraphID id) {
-    auto iter = cuda_graph_allocator_map_.find(id);
-    PADDLE_ENFORCE_NE(iter, cuda_graph_allocator_map_.end(),
-                      platform::errors::InvalidArgument(
-                          "Cannot find CUDA Graph with ID = %d", id));
-    cuda_graph_allocator_map_.erase(iter);
-    VLOG(10) << "Remove memory pool of CUDA Graph with ID " << id;
-  }
-#endif
 #endif
 
  private:
@@ -439,24 +407,7 @@ class AllocatorFacadePrivate {
     platform::Place place_;
   };
 
-  const AllocatorMap& GetAllocatorMap() {
-#ifdef PADDLE_WITH_CUDA
-    if (UNLIKELY(platform::CUDAGraph::IsThisThreadCapturing())) {
-      auto id = platform::CUDAGraph::CapturingID();
-      auto iter = cuda_graph_allocator_map_.find(id);
-      PADDLE_ENFORCE_NE(
-          iter, cuda_graph_allocator_map_.end(),
-          platform::errors::PermissionDenied(
-              "No memory pool is prepared for CUDA Graph capturing."));
-      VLOG(10) << "Choose CUDA Graph memory pool to allocate memory";
-      return iter->second->allocators_;
-    } else {
-      return allocators_;
-    }
-#else
-    return allocators_;
-#endif
-  }
+  const AllocatorMap& GetAllocatorMap() { return allocators_; }
 
   void InitNaiveBestFitCPUAllocator() {
     allocators_[platform::CPUPlace()] =
@@ -672,10 +623,10 @@ class AllocatorFacadePrivate {
   }
 
   void WrapStreamSafeCUDAAllocator(platform::CUDAPlace p, gpuStream_t stream) {
-    const std::shared_ptr<Allocator>& underlying_allocator =
-        cuda_allocators_[p][stream];
-    cuda_allocators_[p][stream] = std::make_shared<StreamSafeCUDAAllocator>(
-        underlying_allocator, p, stream);
+    std::shared_ptr<Allocator>& allocator = cuda_allocators_[p][stream];
+    allocator = std::make_shared<StreamSafeCUDAAllocator>(
+        allocator, p, stream,
+        /* in_cuda_graph_capturing = */ !allow_free_idle_chunk_);
   }
 
   void WrapCUDARetryAllocator(platform::CUDAPlace p, gpuStream_t stream,
@@ -684,10 +635,19 @@ class AllocatorFacadePrivate {
         retry_time, 0,
         platform::errors::InvalidArgument(
             "Retry time should be larger than 0, but got %d", retry_time));
-    std::shared_ptr<Allocator> allocator = cuda_allocators_[p][stream];
+    std::shared_ptr<Allocator>& allocator = cuda_allocators_[p][stream];
     allocator = std::make_shared<RetryAllocator>(allocator, retry_time);
   }
 
+#ifdef PADDLE_WITH_CUDA
+  void WrapCUDAGraphAllocator() {
+    for (auto& item : allocators_) {
+      auto& allocator = item.second;
+      allocator = CUDAGraphAllocator::Create(allocator);
+    }
+  }
+#endif
+
   static void CheckCUDAAllocThreadSafe(const CUDAAllocatorMap& allocators) {
     for (auto& place_pair : allocators) {
       for (auto& stream_pair : place_pair.second) {
@@ -738,7 +698,7 @@ class AllocatorFacadePrivate {
     auto custom_allocator =
         std::make_shared<paddle::memory::allocation::CustomAllocator>(p);
     allocators_[p] = std::make_shared<AutoGrowthBestFitAllocator>(
-        custom_allocator, platform::DeviceManager::GetMinChunkSize(p),
+        custom_allocator, phi::DeviceManager::GetMinChunkSize(p),
         allow_free_idle_chunk);
   }
 #endif
@@ -814,11 +774,10 @@ class AllocatorFacadePrivate {
     }
 #endif
 #ifdef PADDLE_WITH_CUSTOM_DEVICE
-    auto device_types = platform::DeviceManager::GetAllCustomDeviceTypes();
+    auto device_types = phi::DeviceManager::GetAllCustomDeviceTypes();
     for (const auto& dev_type : device_types) {
       for (size_t dev_id = 0;
-           dev_id < platform::DeviceManager::GetDeviceCount(dev_type);
-           dev_id++) {
+           dev_id < phi::DeviceManager::GetDeviceCount(dev_type); dev_id++) {
         places.emplace_back(platform::CustomPlace(dev_type, dev_id));
       }
     }
@@ -865,10 +824,6 @@ class AllocatorFacadePrivate {
   // a standalone CUDA allocator to support multi-stream GC in new executor
   CUDAAllocatorMap cuda_allocators_;
   std::shared_timed_mutex cuda_allocator_mutex_;
-#ifdef PADDLE_WITH_CUDA
-  std::unordered_map<CUDAGraphID, std::unique_ptr<AllocatorFacadePrivate>>
-      cuda_graph_allocator_map_;
-#endif
 #endif
   AllocatorStrategy strategy_;
   AllocatorMap allocators_;
@@ -887,8 +842,24 @@ AllocatorFacade::AllocatorFacade() : m_(new AllocatorFacadePrivate()) {}
 AllocatorFacade::~AllocatorFacade() {}
 
 AllocatorFacade& AllocatorFacade::Instance() {
-  static AllocatorFacade instance;
-  return instance;
+  static AllocatorFacade* instance = new AllocatorFacade;
+  return *instance;
+}
+
+AllocatorFacadePrivate* AllocatorFacade::GetPrivate() const {
+#ifdef PADDLE_WITH_CUDA
+  if (UNLIKELY(platform::CUDAGraph::IsThisThreadCapturing())) {
+    auto id = platform::CUDAGraph::CapturingID();
+    auto iter = cuda_graph_map_.find(id);
+    PADDLE_ENFORCE_NE(
+        iter, cuda_graph_map_.end(),
+        platform::errors::PermissionDenied(
+            "No memory pool is prepared for CUDA Graph capturing."));
+    VLOG(10) << "Choose CUDA Graph memory pool";
+    return iter->second.get();
+  }
+#endif
+  return m_;
 }
 
 const std::shared_ptr<Allocator>& AllocatorFacade::GetAllocator(
@@ -896,19 +867,14 @@ const std::shared_ptr<Allocator>& AllocatorFacade::GetAllocator(
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) &&
       FLAGS_use_system_allocator == false) {
-#ifdef PADDLE_WITH_CUDA
-    if (UNLIKELY(platform::CUDAGraph::IsCapturing())) {
-      return m_->GetAllocator(place,
-                              /* A non-zero num to choose allocator_ */ 1);
-    }
-#endif
-
+    AllocatorFacadePrivate* m = GetPrivate();
     platform::CUDAPlace cuda_place(place.GetDeviceId());
-    return m_->GetAllocator(cuda_place, m_->GetDefaultStream(cuda_place));
+    return m->GetAllocator(cuda_place, m->GetDefaultStream(cuda_place));
   }
 #endif
 
-  return m_->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1);
+  return GetPrivate()->GetAllocator(
+      place, /* A non-zero num to choose allocator_ */ 1);
 }
 
 void* AllocatorFacade::GetBasePtr(
@@ -923,7 +889,7 @@ void* AllocatorFacade::GetBasePtr(
                         "GetBasePtr() is only implemented for CUDAPlace(), not "
                         "suppot place: %s",
                         allocation->place()));
-  return m_->GetBasePtr(allocation);
+  return GetPrivate()->GetBasePtr(allocation);
 }
 
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
@@ -931,21 +897,17 @@ const std::shared_ptr<Allocator>& AllocatorFacade::GetAllocator(
     const platform::Place& place, const gpuStream_t& stream) {
   if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) &&
       FLAGS_use_system_allocator == false) {
-#ifdef PADDLE_WITH_CUDA
-    if (UNLIKELY(platform::CUDAGraph::IsCapturing())) {
-      return m_->GetAllocator(place,
-                              /* A non-zero num to choose allocator_ */ 1);
-    }
-#endif
-    return m_->GetAllocator(place, stream, /*create_if_not_found=*/true);
+    return GetPrivate()->GetAllocator(place, stream,
+                                      /*create_if_not_found=*/true);
   }
-  return m_->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1);
+  return GetPrivate()->GetAllocator(
+      place, /* A non-zero num to choose allocator_ */ 1);
 }
 #endif
 
 const std::shared_ptr<Allocator>& AllocatorFacade::GetZeroAllocator(
     const platform::Place& place) {
-  return m_->GetAllocator(place, /* zero size */ 0);
+  return GetPrivate()->GetAllocator(place, /* zero size */ 0);
 }
 
 std::shared_ptr<phi::Allocation> AllocatorFacade::AllocShared(
@@ -958,43 +920,30 @@ AllocationPtr AllocatorFacade::Alloc(const platform::Place& place,
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) &&
       size > 0 && FLAGS_use_system_allocator == false) {
-#ifdef PADDLE_WITH_CUDA
-    if (UNLIKELY(platform::CUDAGraph::IsCapturing())) {
-      return m_->GetAllocator(place, size)->Allocate(size);
-    }
-#endif
-
     platform::CUDAPlace cuda_place(place.GetDeviceId());
-    return Alloc(cuda_place, size, m_->GetDefaultStream(cuda_place));
+    phi::Stream default_stream = phi::Stream(reinterpret_cast<phi::StreamId>(
+        GetPrivate()->GetDefaultStream(cuda_place)));
+    return Alloc(cuda_place, size, default_stream);
   }
 #endif
-
-  return m_->GetAllocator(place, size)->Allocate(size);
+  return GetPrivate()->GetAllocator(place, size)->Allocate(size);
 }
 
 uint64_t AllocatorFacade::Release(const platform::Place& place) {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) &&
       FLAGS_use_system_allocator == false) {
-#ifdef PADDLE_WITH_CUDA
-    if (UNLIKELY(platform::CUDAGraph::IsCapturing())) {
-      return m_
-          ->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1)
-          ->Release(place);
-    }
-#endif
-
     platform::CUDAPlace cuda_place(place.GetDeviceId());
-    return Release(cuda_place, m_->GetDefaultStream(cuda_place));
+    return Release(cuda_place, GetPrivate()->GetDefaultStream(cuda_place));
   }
 #endif
-  return m_->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1)
+  return GetPrivate()
+      ->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1)
       ->Release(place);
 }
 
 std::shared_ptr<phi::Allocation> AllocatorFacade::AllocShared(
     const platform::Place& place, size_t size, const phi::Stream& stream) {
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   PADDLE_ENFORCE_EQ(
       FLAGS_use_stream_safe_cuda_allocator, true,
       platform::errors::Unimplemented(
@@ -1002,71 +951,53 @@ std::shared_ptr<phi::Allocation> AllocatorFacade::AllocShared(
           "multi-stream 'AllocaShared' function. To enable it, you can enter"
           "'export FLAGS_use_stream_safe_cuda_allocator=true' in the "
           "terminal."));
-
-#ifdef PADDLE_WITH_CUDA
-  if (UNLIKELY(platform::CUDAGraph::IsCapturing())) {
-    PADDLE_THROW(platform::errors::Unavailable(
-        "Not allow to use StreamSafeCUDAAllocator with CUDAGraphAllocator"));
-  }
-#endif
-  gpuStream_t s = reinterpret_cast<gpuStream_t>(stream.id());
-  return std::shared_ptr<phi::Allocation>(Alloc(place, size, s));
-#else
-  PADDLE_THROW(platform::errors::PreconditionNotMet("Not compiled with GPU."));
-#endif
+  return std::shared_ptr<phi::Allocation>(Alloc(place, size, stream));
 }
 
-bool AllocatorFacade::InSameStream(
-    const std::shared_ptr<phi::Allocation>& allocation,
-    const phi::Stream& stream) {
+AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, size_t size,
+                                     const phi::Stream& stream) {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   PADDLE_ENFORCE_EQ(
       FLAGS_use_stream_safe_cuda_allocator, true,
       platform::errors::Unimplemented(
           "StreamSafeCUDAAllocator is disabled, you should not call this "
-          "multi-stream 'InSameStream' function. To enable it, you can enter"
+          "multi-stream 'Alloc' function. To enable it, you can enter"
           "'export FLAGS_use_stream_safe_cuda_allocator=true' in the "
           "terminal."));
 
-#ifdef PADDLE_WITH_CUDA
-  if (UNLIKELY(platform::CUDAGraph::IsCapturing())) {
-    PADDLE_THROW(platform::errors::Unavailable(
-        "Not allow to use StreamSafeCUDAAllocator with CUDAGraphAllocator"));
+  platform::CUDAPlace p(place.GetDeviceId());
+  if (LIKELY(size > 0 && FLAGS_use_system_allocator == false)) {
+    gpuStream_t s = reinterpret_cast<gpuStream_t>(stream.id());
+    return GetPrivate()
+        ->GetAllocator(p, s, /* create_if_not_found = */ true)
+        ->Allocate(size);
+  } else {
+    return GetPrivate()->GetAllocator(p, size)->Allocate(size);
   }
-#endif
-  gpuStream_t s = reinterpret_cast<gpuStream_t>(stream.id());
-  return s == GetStream(allocation);
 #else
   PADDLE_THROW(platform::errors::PreconditionNotMet("Not compiled with GPU."));
 #endif
 }
 
+bool AllocatorFacade::InSameStream(
+    const std::shared_ptr<phi::Allocation>& allocation,
+    const phi::Stream& stream) {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, size_t size,
-                                     const gpuStream_t& stream) {
   PADDLE_ENFORCE_EQ(
       FLAGS_use_stream_safe_cuda_allocator, true,
       platform::errors::Unimplemented(
           "StreamSafeCUDAAllocator is disabled, you should not call this "
-          "multi-stream 'Alloc' function. To enable it, you can enter"
+          "multi-stream 'InSameStream' function. To enable it, you can enter"
           "'export FLAGS_use_stream_safe_cuda_allocator=true' in the "
           "terminal."));
-
-#ifdef PADDLE_WITH_CUDA
-  if (UNLIKELY(platform::CUDAGraph::IsCapturing())) {
-    PADDLE_THROW(platform::errors::Unavailable(
-        "Not allow to use StreamSafeCUDAAllocator with CUDAGraphAllocator"));
-  }
+  gpuStream_t s = reinterpret_cast<gpuStream_t>(stream.id());
+  return s == GetStream(allocation);
+#else
+  PADDLE_THROW(platform::errors::PreconditionNotMet("Not compiled with GPU."));
 #endif
-  platform::CUDAPlace p(place.GetDeviceId());
-  if (LIKELY(size > 0 && FLAGS_use_system_allocator == false)) {
-    return m_->GetAllocator(p, stream, /* create_if_not_found = */ true)
-        ->Allocate(size);
-  } else {
-    return m_->GetAllocator(p, size)->Allocate(size);
-  }
 }
 
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 uint64_t AllocatorFacade::Release(const platform::CUDAPlace& place,
                                   const gpuStream_t& stream) {
   PADDLE_ENFORCE_EQ(
@@ -1076,15 +1007,7 @@ uint64_t AllocatorFacade::Release(const platform::CUDAPlace& place,
           "multi-stream 'Release' function. To enable it, you can enter"
           "'export FLAGS_use_stream_safe_cuda_allocator=true' in the "
           "terminal."));
-
-#ifdef PADDLE_WITH_CUDA
-  if (UNLIKELY(platform::CUDAGraph::IsCapturing())) {
-    PADDLE_THROW(platform::errors::Unavailable(
-        "Not allow to use StreamSafeCUDAAllocator with CUDAGraphAllocator"));
-  }
-#endif
-
-  return m_->GetAllocator(place, stream)->Release(place);
+  return GetPrivate()->GetAllocator(place, stream)->Release(place);
 }
 
 void AllocatorFacade::RecordStream(std::shared_ptr<phi::Allocation> allocation,
@@ -1096,15 +1019,7 @@ void AllocatorFacade::RecordStream(std::shared_ptr<phi::Allocation> allocation,
           "'RecordStream' function. To enable it, you can enter"
           "'export FLAGS_use_stream_safe_cuda_allocator=true' in the "
           "terminal."));
-
-#ifdef PADDLE_WITH_CUDA
-  if (UNLIKELY(platform::CUDAGraph::IsCapturing())) {
-    PADDLE_THROW(platform::errors::Unavailable(
-        "Not allow to use StreamSafeCUDAAllocator with CUDAGraphAllocator"));
-  }
-#endif
-
-  m_->RecordStream(allocation, stream);
+  GetPrivate()->RecordStream(allocation, stream);
 }
 
 const gpuStream_t& AllocatorFacade::GetStream(
@@ -1116,24 +1031,34 @@ const gpuStream_t& AllocatorFacade::GetStream(
           "'GetStream' function. To enable it, you can enter"
           "'export FLAGS_use_stream_safe_cuda_allocator=true' in the "
           "terminal."));
-
-#ifdef PADDLE_WITH_CUDA
-  if (UNLIKELY(platform::CUDAGraph::IsCapturing())) {
-    PADDLE_THROW(platform::errors::Unavailable(
-        "Not allow to use StreamSafeCUDAAllocator with CUDAGraphAllocator"));
-  }
-#endif
-
-  return m_->GetStream(allocation);
+  return GetPrivate()->GetStream(allocation);
 }
 
 #ifdef PADDLE_WITH_CUDA
 void AllocatorFacade::PrepareMemoryPoolForCUDAGraph(CUDAGraphID id) {
-  return m_->PrepareMemoryPoolForCUDAGraph(id);
+  PADDLE_ENFORCE_EQ(GetAllocatorStrategy(), AllocatorStrategy::kAutoGrowth,
+                    platform::errors::InvalidArgument(
+                        "CUDA Graph is only supported when the "
+                        "FLAGS_allocator_strategy=\"auto_growth\", but got "
+                        "FLAGS_allocator_strategy=\"%s\"",
+                        FLAGS_allocator_strategy));
+  auto& allocator = cuda_graph_map_[id];
+  PADDLE_ENFORCE_EQ(
+      allocator.get(), nullptr,
+      platform::errors::InvalidArgument(
+          "The memory pool of the CUDA Graph with ID %d have been prepared.",
+          id));
+  allocator.reset(new AllocatorFacadePrivate(/*allow_free_idle_chunk=*/false));
+  VLOG(10) << "Prepare memory pool for CUDA Graph with ID " << id;
 }
 
 void AllocatorFacade::RemoveMemoryPoolOfCUDAGraph(CUDAGraphID id) {
-  return m_->RemoveMemoryPoolOfCUDAGraph(id);
+  auto iter = cuda_graph_map_.find(id);
+  PADDLE_ENFORCE_NE(iter, cuda_graph_map_.end(),
+                    platform::errors::InvalidArgument(
+                        "Cannot find CUDA Graph with ID = %d", id));
+  cuda_graph_map_.erase(iter);
+  VLOG(10) << "Remove memory pool of CUDA Graph with ID " << id;
 }
 #endif
 #endif
diff --git a/paddle/fluid/memory/allocation/allocator_facade.h b/paddle/fluid/memory/allocation/allocator_facade.h
index 1722a06b01f1302c3bb1f98c99af0431ab62f955..9066bb284e28af197111b5d3ea129cc65b5fe914 100644
--- a/paddle/fluid/memory/allocation/allocator_facade.h
+++ b/paddle/fluid/memory/allocation/allocator_facade.h
@@ -49,6 +49,8 @@ class AllocatorFacade {
 
   static AllocatorFacade& Instance();
 
+  AllocatorFacadePrivate* GetPrivate() const;
+
   const std::shared_ptr<Allocator>& GetAllocator(const platform::Place& place);
 
   void* GetBasePtr(const std::shared_ptr<Allocation>& allocation);
@@ -73,13 +75,14 @@ class AllocatorFacade {
                                           size_t size,
                                           const phi::Stream& stream);
 
+  AllocationPtr Alloc(const platform::Place& place, size_t size,
+                      const phi::Stream& stream);
+
   bool InSameStream(const std::shared_ptr<Allocation>& allocation,
                     const phi::Stream& stream);
 
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   // TODO(zhiqiu): change gpuStream_t to phi::Stream if needed.
-  AllocationPtr Alloc(const platform::Place& place, size_t size,
-                      const gpuStream_t& stream);
   uint64_t Release(const platform::CUDAPlace& place, const gpuStream_t& stream);
   void RecordStream(std::shared_ptr<Allocation> allocation,
                     const gpuStream_t& stream);
@@ -96,6 +99,10 @@ class AllocatorFacade {
  private:
   AllocatorFacade();
   AllocatorFacadePrivate* m_;
+#ifdef PADDLE_WITH_CUDA
+  std::unordered_map<CUDAGraphID, std::unique_ptr<AllocatorFacadePrivate>>
+      cuda_graph_map_;
+#endif
 };
 
 }  // namespace allocation
diff --git a/paddle/fluid/memory/allocation/cuda_ipc_allocator.cc b/paddle/fluid/memory/allocation/cuda_ipc_allocator.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b2f24d5aed1eb827b4857f5936a19b206a38c788
--- /dev/null
+++ b/paddle/fluid/memory/allocation/cuda_ipc_allocator.cc
@@ -0,0 +1,80 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef _WIN32
+
+#include "paddle/fluid/memory/allocation/cuda_ipc_allocator.h"
+#include "paddle/fluid/platform/cuda_device_guard.h"
+
+#include <fcntl.h>
+#include <stdlib.h>
+#include <sys/mman.h>
+#include <random>
+#include <string>
+
+#include "glog/logging.h"
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace memory {
+namespace allocation {
+
+namespace {
+std::mutex ipc_mutex_;
+std::unordered_map<std::string, std::weak_ptr<void>> ipc_handle_to_baseptr_;
+}  // namespace
+
+std::shared_ptr<void> GetIpcBasePtr(std::string handle) {
+  std::lock_guard<std::mutex> lock(ipc_mutex_);
+
+  auto iter = ipc_handle_to_baseptr_.find(handle);
+  if (iter != ipc_handle_to_baseptr_.end()) {
+    auto baseptr = iter->second.lock();
+    if (baseptr) return baseptr;
+  }
+  // The IpcMemHandle can only open once for the same handle,
+  // so here we cache it here.
+  void *baseptr = nullptr;
+  auto ipc_handle =
+      reinterpret_cast<const cudaIpcMemHandle_t *>(handle.c_str());
+  PADDLE_ENFORCE_GPU_SUCCESS(cudaIpcOpenMemHandle(
+      &baseptr, *ipc_handle, cudaIpcMemLazyEnablePeerAccess));
+  // Close ipc handle on the same device.
+  int device_id = platform::GetCurrentDeviceId();
+  // Add deleter to close ipc handle.
+  auto sp = std::shared_ptr<void>(baseptr, [handle, device_id](void *ptr) {
+    platform::CUDADeviceGuard guard(device_id);
+    std::lock_guard<std::mutex> lock(ipc_mutex_);
+    PADDLE_ENFORCE_GPU_SUCCESS(cudaIpcCloseMemHandle(ptr));
+    ipc_handle_to_baseptr_.erase(handle);
+    VLOG(6) << "cudaIpcCloseMemHandle for ptr:"
+            << "\t" << ptr;
+  });
+  std::weak_ptr<void> wp = sp;
+  ipc_handle_to_baseptr_.insert(iter, {handle, wp});
+
+  return sp;
+}
+
+CudaIpcAllocation::~CudaIpcAllocation() {
+  shared_ptr_.reset();
+  VLOG(6) << "tensor deleted cudaIpcCloseMemHandle for ptr:"
+          << "\t" << this->ptr();
+}
+
+}  // namespace allocation
+}  // namespace memory
+}  // namespace paddle
+
+#endif
diff --git a/paddle/fluid/memory/allocation/cuda_ipc_allocator.h b/paddle/fluid/memory/allocation/cuda_ipc_allocator.h
new file mode 100644
index 0000000000000000000000000000000000000000..52e3cf10ea73a787d87d19beeedcdedca1e3dd3b
--- /dev/null
+++ b/paddle/fluid/memory/allocation/cuda_ipc_allocator.h
@@ -0,0 +1,56 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef _WIN32
+#pragma once
+
+#include <memory>
+#include <mutex>  // NOLINT
+#include <string>
+#include <unordered_set>
+#include <utility>
+
+#include "paddle/fluid/memory/allocation/allocator.h"
+#include "paddle/fluid/platform/cuda_device_guard.h"
+#include "paddle/fluid/platform/device/gpu/gpu_info.h"
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace memory {
+namespace allocation {
+
+std::shared_ptr<void> GetIpcBasePtr(std::string handle);
+
+class CudaIpcAllocation : public Allocation {
+ public:
+  explicit CudaIpcAllocation(void *ptr, size_t size, int device_id,
+                             std::shared_ptr<void> shared_ptr)
+      : Allocation(ptr, size, platform::CUDAPlace(device_id)),
+        device_id_(std::move(device_id)),
+        shared_ptr_(std::move(shared_ptr)) {}
+
+  inline const int &device_id() const { return device_id_; }
+
+  ~CudaIpcAllocation() override;
+
+ private:
+  int device_id_;
+  std::shared_ptr<void> shared_ptr_;
+};
+
+}  // namespace allocation
+}  // namespace memory
+}  // namespace paddle
+
+#endif
diff --git a/paddle/fluid/memory/allocation/custom_allocator.cc b/paddle/fluid/memory/allocation/custom_allocator.cc
index bd52c8f4ad270f0f70a23ab39b78bd9363ede769..e53d7b1cc766a3e277ef0a773671ef678bcb3ac7 100644
--- a/paddle/fluid/memory/allocation/custom_allocator.cc
+++ b/paddle/fluid/memory/allocation/custom_allocator.cc
@@ -32,17 +32,16 @@ void CustomAllocator::FreeImpl(phi::Allocation* allocation) {
 }
 
 phi::Allocation* CustomAllocator::AllocateImpl(size_t size) {
-  std::call_once(once_flag_,
-                 [this] { platform::DeviceManager::SetDevice(place_); });
+  std::call_once(once_flag_, [this] { phi::DeviceManager::SetDevice(place_); });
 
   void* ptr =
-      platform::DeviceManager::GetDeviceWithPlace(place_)->MemoryAllocate(size);
+      phi::DeviceManager::GetDeviceWithPlace(place_)->MemoryAllocate(size);
   if (LIKELY(ptr)) {
     return new Allocation(ptr, size, place_);
   }
 
   size_t avail, total;
-  platform::DeviceManager::MemoryStats(place_, &total, &avail);
+  phi::DeviceManager::MemoryStats(place_, &total, &avail);
 
   auto dev_type = platform::PlaceHelper::GetDeviceType(place_);
   auto dev_id = platform::PlaceHelper::GetDeviceId(place_);
diff --git a/paddle/fluid/memory/allocation/mmap_allocator.cc b/paddle/fluid/memory/allocation/mmap_allocator.cc
index acaf5d548555cc3ee69bc5a03309645006256487..25c2235cce85369babc4d601de96c7475a0b1fbd 100644
--- a/paddle/fluid/memory/allocation/mmap_allocator.cc
+++ b/paddle/fluid/memory/allocation/mmap_allocator.cc
@@ -29,6 +29,155 @@ namespace paddle {
 namespace memory {
 namespace allocation {
 
+std::string GetIPCName() {
+  static std::random_device rd;
+  std::string handle = "/paddle_";
+#ifdef _WIN32
+  handle += std::to_string(GetCurrentProcessId());
+#else
+  handle += std::to_string(getpid());
+#endif
+  handle += "_";
+  handle += std::to_string(rd());
+  return handle;
+}
+
+struct CountInfo {
+  std::atomic<int> refcount;
+};
+
+void AllocateMemoryMap(std::string filename, int flags, size_t size,
+                       void **map_ptr_, int *fd_) {
+  // TODO(@ZHUI): support win32
+  int file_flags = 0;
+  int fd = -1;
+  if (flags & MAPPED_SHAREDMEM) {
+    file_flags = O_RDWR | O_CREAT;
+  } else {
+    file_flags = O_RDONLY;
+  }
+  if (flags & MAPPED_EXCLUSIVE) {
+    file_flags |= O_EXCL;
+  }
+  if (flags & MAPPED_NOCREATE) {
+    file_flags &= ~O_CREAT;
+  }
+
+  if (!(flags & MAPPED_FROMFD)) {
+    if (flags & MAPPED_SHAREDMEM) {
+      fd = shm_open(filename.c_str(), file_flags, (mode_t)0600);
+      PADDLE_ENFORCE_NE(
+          fd, -1,
+          platform::errors::Unavailable(
+              "File descriptor %s open failed, unable in read-write mode",
+              filename.c_str()));
+      VLOG(6) << "shm_open: " << filename;
+    }
+  } else {
+    fd = -1;
+  }
+
+  PADDLE_ENFORCE_EQ(ftruncate(fd, size), 0,
+                    platform::errors::Unavailable(
+                        "Fruncate a file to a specified length failed!"));
+
+  if (flags & MAPPED_SHAREDMEM) {
+    *map_ptr_ = mmap(nullptr, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+  } else {
+    *map_ptr_ = mmap(nullptr, size, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
+  }
+
+  PADDLE_ENFORCE_NE(*map_ptr_, MAP_FAILED,
+                    platform::errors::Unavailable(
+                        "Memory map failed when create shared memory."));
+
+  if (flags & MAPPED_KEEPFD) {
+    *fd_ = fd;
+  } else {
+    PADDLE_ENFORCE_NE(::close(fd), -1,
+                      platform::errors::Unavailable(
+                          "Error closing memory maped file <", filename, ">"));
+
+    *fd_ = -1;
+  }
+}
+
+std::shared_ptr<RefcountedMemoryMapAllocation>
+AllocateRefcountedMemoryMapAllocation(std::string filename, int flags,
+                                      size_t size) {
+  int fd = -1;
+  void *base_ptr = nullptr;
+  AllocateMemoryMap(filename, flags, size + mmap_alignment, &base_ptr, &fd);
+  void *aliged_base_ptr =
+      static_cast<void *>(static_cast<char *>(base_ptr) + mmap_alignment);
+  return std::make_shared<RefcountedMemoryMapAllocation>(aliged_base_ptr, size,
+                                                         filename, flags, fd);
+}
+
+RefcountedMemoryMapAllocation::RefcountedMemoryMapAllocation(
+    void *ptr, size_t size, std::string ipc_name, int fd, int flags)
+    : MemoryMapAllocation(ptr, size, ipc_name, fd, flags) {
+  // must reset base ptr first.
+  resetBaseptr();
+  initializeRefercount();
+}
+
+void MemoryMapAllocation::close() {
+  if (closed_) {
+    return;
+  }
+  closed_ = true;
+}
+
+MemoryMapAllocation::~MemoryMapAllocation() { close(); }
+
+void RefcountedMemoryMapAllocation::incref() {
+  CountInfo *info = static_cast<CountInfo *>(map_ptr_);
+  ++info->refcount;
+}
+
+int RefcountedMemoryMapAllocation::decref() {
+  CountInfo *info = static_cast<CountInfo *>(map_ptr_);
+  return --info->refcount == 0;
+}
+
+void RefcountedMemoryMapAllocation::resetBaseptr() {
+  map_ptr_ =
+      static_cast<void *>(static_cast<char *>(map_ptr_) - mmap_alignment);
+  map_size_ = map_size_ + mmap_alignment;
+}
+
+void RefcountedMemoryMapAllocation::initializeRefercount() {
+  CountInfo *info = reinterpret_cast<CountInfo *>(map_ptr_);
+
+  if (flags_ & MAPPED_EXCLUSIVE) {
+    new (&info->refcount) std::atomic<int>(1);
+  } else {
+    info->refcount++;
+  }
+}
+
+void RefcountedMemoryMapAllocation::close() {
+  if (closed_) {
+    return;
+  }
+  closed_ = true;
+  void *data = map_ptr_;
+  CountInfo *info = reinterpret_cast<CountInfo *>(data);
+  if (--info->refcount == 0) {
+    PADDLE_ENFORCE_NE(
+        shm_unlink(ipc_name_.c_str()), -1,
+        platform::errors::Unavailable(
+            "could not unlink the shared memory file ", ipc_name_));
+    VLOG(6) << "shm_unlink file: " << ipc_name_;
+  }
+
+  PADDLE_ENFORCE_NE(
+      munmap(map_ptr_, map_size_), -1,
+      platform::errors::Unavailable("could not unmap the shared memory file: ",
+                                    strerror(errno), " (", errno, ")"));
+}
+
 MemoryMapWriterAllocation::~MemoryMapWriterAllocation() {
   PADDLE_ENFORCE_NE(
       munmap(this->ptr(), this->size()), -1,
@@ -44,30 +193,30 @@ MemoryMapReaderAllocation::~MemoryMapReaderAllocation() {
   /* Here we do not pay attention to the result of shm_unlink,
      because the memory mapped file may have been cleared due to the
      MemoryMapFdSet::Clear() */
+
+  // Code of DataLoader subprocess:
+  //
+  //    core._array_to_share_memory_tensor(b)
+  //    out_queue.put((idx, tensor_list, structure))
+  //    core._remove_tensor_list_mmap_fds(tensor_list)
+
+  /* If the tensor in already in the send queue, the tensor will be
+   * deconstructed by the function. If the tensor not send yet, it
+   * will be cleared by MemoryMapFdSet::Clear().
+   * If the `_remove_tensor_list_mmap_fds` have be interrupted, the
+   * tensor will be cleared by both methods.
+   * */
+
   shm_unlink(this->ipc_name().c_str());
   MemoryMapFdSet::Instance().Remove(this->ipc_name());
   VLOG(3) << "~MemoryMapReaderAllocation: " << this->ipc_name();
 }
 
-std::string GetIPCName() {
-  static std::random_device rd;
-  std::string handle = "/paddle_";
-#ifdef _WIN32
-  handle += std::to_string(GetCurrentProcessId());
-#else
-  handle += std::to_string(getpid());
-#endif
-  handle += "_";
-  handle += std::to_string(rd());
-  return handle;
-}
-
 std::shared_ptr<MemoryMapWriterAllocation> AllocateMemoryMapWriterAllocation(
     size_t size) {
   const std::string &ipc_name = GetIPCName();
   int flags = O_RDWR | O_CREAT;
-
-  int fd = shm_open(ipc_name.c_str(), flags, 0644);
+  int fd = shm_open(ipc_name.c_str(), flags, 0600);
   PADDLE_ENFORCE_NE(
       fd, -1, platform::errors::Unavailable("File descriptor %s open failed",
                                             ipc_name.c_str()));
@@ -86,12 +235,14 @@ std::shared_ptr<MemoryMapWriterAllocation> AllocateMemoryMapWriterAllocation(
 
 std::shared_ptr<MemoryMapReaderAllocation> RebuildMemoryMapReaderAllocation(
     const std::string &ipc_name, size_t size) {
-  int fd = shm_open(ipc_name.c_str(), O_RDONLY, 0644);
+  int flags = O_RDWR | O_CREAT;
+  flags &= ~O_CREAT;
+
+  int fd = shm_open(ipc_name.c_str(), flags, 0600);
   PADDLE_ENFORCE_NE(
       fd, -1, platform::errors::Unavailable("File descriptor %s open failed",
                                             ipc_name.c_str()));
-
-  void *ptr = mmap(NULL, size, PROT_READ, MAP_SHARED, fd, 0);
+  void *ptr = mmap(nullptr, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
   PADDLE_ENFORCE_NE(ptr, MAP_FAILED,
                     platform::errors::Unavailable(
                         "Memory map failed when rebuild shared memory."));
diff --git a/paddle/fluid/memory/allocation/mmap_allocator.h b/paddle/fluid/memory/allocation/mmap_allocator.h
index 3f91e5c42780826ae0ef2e61e982da2336d10a3f..4f8dbfbb51e66db227dfcf46bc3ce313d8406dd1 100644
--- a/paddle/fluid/memory/allocation/mmap_allocator.h
+++ b/paddle/fluid/memory/allocation/mmap_allocator.h
@@ -16,8 +16,9 @@
 
 #ifndef _WIN32
 
+#include <atomic>
 #include <memory>
-#include <mutex>  // NOLINT
+#include <mutex>
 #include <string>
 #include <unordered_set>
 #include <utility>
@@ -28,6 +29,72 @@ namespace paddle {
 namespace memory {
 namespace allocation {
 
+std::string GetIPCName();
+
+static constexpr int64_t mmap_alignment = 64;
+
+enum MappedModes {
+  MAPPED_SHAREDMEM = 1,
+  MAPPED_EXCLUSIVE = 2,
+  MAPPED_NOCREATE = 4,
+  MAPPED_KEEPFD = 8,
+  MAPPED_FROMFD = 16,
+  MAPPED_UNLINK = 32
+};
+
+class MemoryMapAllocation : public Allocation {
+ public:
+  explicit MemoryMapAllocation(void *ptr, size_t size, std::string ipc_name)
+      : Allocation(ptr, size, platform::CPUPlace()),
+        ipc_name_(std::move(ipc_name)),
+        map_ptr_(ptr),
+        map_size_(size) {}
+  explicit MemoryMapAllocation(void *ptr, size_t size, std::string ipc_name,
+                               int flags, int fd)
+      : Allocation(ptr, size, platform::CPUPlace()),
+        ipc_name_(std::move(ipc_name)),
+        fd_(fd),
+        flags_(flags),
+        map_ptr_(ptr),
+        map_size_(size) {}
+
+  inline const std::string &ipc_name() const { return ipc_name_; }
+
+  virtual void close();
+
+  ~MemoryMapAllocation() override;
+
+ protected:
+  std::string ipc_name_;
+  int fd_ = -1;
+  int flags_ = 0;
+  void *map_ptr_ = nullptr;
+  size_t map_size_ = 0;
+  bool closed_ = false;
+};
+
+class RefcountedMemoryMapAllocation : public MemoryMapAllocation {
+ public:
+  RefcountedMemoryMapAllocation(void *ptr, size_t size, std::string ipc_name,
+                                int flags, int fd);
+
+  void incref();
+  int decref();
+  void close() override;
+  virtual ~RefcountedMemoryMapAllocation() { close(); }
+
+ protected:
+  void initializeRefercount();
+  void resetBaseptr();
+};
+
+void AllocateMemoryMap(std::string filename, int flags, size_t size,
+                       void **base_ptr_, int *fd_);
+
+std::shared_ptr<RefcountedMemoryMapAllocation>
+AllocateRefcountedMemoryMapAllocation(std::string filename, int flags,
+                                      size_t size);
+
 class MemoryMapWriterAllocation : public Allocation {
  public:
   explicit MemoryMapWriterAllocation(void *ptr, size_t size,
diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc
index ea6d7019be6c1caf4844469276f3113525b33dfc..0bfbe2c6962294fc7e4aa2fff079e9cf411f26f8 100644
--- a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc
+++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc
@@ -739,7 +739,7 @@ class BuddyAllocatorList {
  private:
   explicit BuddyAllocatorList(const std::string &device_type)
       : device_type_(device_type) {
-    auto devices = platform::DeviceManager::GetDeviceList(device_type);
+    auto devices = phi::DeviceManager::GetDeviceList(device_type);
     for (auto dev_id : devices) {
       init_flags_[dev_id].reset(new std::once_flag());
     }
@@ -766,15 +766,15 @@ class BuddyAllocatorList {
                           device_type_, dev_id));
 
     std::call_once(*init_flags_[dev_id], [this, dev_id] {
-      platform::DeviceManager::SetDevice(device_type_, dev_id);
+      phi::DeviceManager::SetDevice(device_type_, dev_id);
       platform::CustomPlace place(device_type_, dev_id);
 
       allocators_[dev_id].reset(new BuddyAllocator(
           std::unique_ptr<detail::SystemAllocator>(
               new detail::CustomAllocator(device_type_, dev_id)),
-          platform::DeviceManager::GetMinChunkSize(place),
-          platform::DeviceManager::GetMaxChunkSize(place),
-          platform::DeviceManager::GetExtraPaddingSize(place), device_type_));
+          phi::DeviceManager::GetMinChunkSize(place),
+          phi::DeviceManager::GetMaxChunkSize(place),
+          phi::DeviceManager::GetExtraPaddingSize(place), device_type_));
     });
 
     return allocators_[dev_id].get();
@@ -808,9 +808,9 @@ void *Alloc<platform::CustomPlace>(const platform::CustomPlace &place,
   auto *ptr = buddy_allocator->Alloc(size);
 
   if (ptr == nullptr) {
-    platform::DeviceGuard guard(place);
+    phi::DeviceGuard guard(place);
     size_t avail, total;
-    platform::DeviceManager::MemoryStats(place, &total, &avail);
+    phi::DeviceManager::MemoryStats(place, &total, &avail);
     PADDLE_THROW(platform::errors::ResourceExhausted(
         "Cannot allocate %s in %s:%d, avaliable %s, total %s, used "
         "%s. ",
@@ -819,8 +819,7 @@ void *Alloc<platform::CustomPlace>(const platform::CustomPlace &place,
         string::HumanReadableSize(total - avail)));
   } else {
     if (FLAGS_init_allocated_mem) {
-      platform::DeviceManager::GetDeviceWithPlace(place)->MemorySet(ptr, 0xEF,
-                                                                    size);
+      phi::DeviceManager::GetDeviceWithPlace(place)->MemorySet(ptr, 0xEF, size);
     }
   }
   VLOG(10) << "  pointer=" << ptr;
diff --git a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc
index 8627e3e6f8811e162ce3014c01145f331a03ee4b..072c4dee3bc45b4ff5f23f5288d3412a14f63b0f 100644
--- a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc
+++ b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc
@@ -15,56 +15,52 @@
 #include "paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h"
 #include "paddle/fluid/platform/profiler/event_tracing.h"
 
+#ifdef PADDLE_WITH_CUDA
+#include "paddle/fluid/platform/device/gpu/cuda/cuda_graph.h"
+#endif
+
 namespace paddle {
 namespace memory {
 namespace allocation {
 
 StreamSafeCUDAAllocation::StreamSafeCUDAAllocation(
-    DecoratedAllocationPtr underlying_allocation, gpuStream_t owning_stream)
+    DecoratedAllocationPtr underlying_allocation, gpuStream_t owning_stream,
+    StreamSafeCUDAAllocator* allocator)
     : Allocation(underlying_allocation->ptr(),
                  underlying_allocation->base_ptr(),
                  underlying_allocation->size(), underlying_allocation->place()),
       underlying_allocation_(std::move(underlying_allocation)),
-      owning_stream_(std::move(owning_stream)) {}
+      owning_stream_(std::move(owning_stream)),
+      allocator_(allocator->shared_from_this()) {}
 
 void StreamSafeCUDAAllocation::RecordStream(const gpuStream_t& stream) {
   VLOG(8) << "Try record stream " << stream << " for address " << ptr();
   if (stream == owning_stream_) {
-    VLOG(9) << "Record the same stream of " << stream;
     return;
   }
 
   std::lock_guard<SpinLock> lock_guard(outstanding_event_map_lock_);
-  gpuEvent_t record_event;
-  auto it = outstanding_event_map_.find(stream);
-  if (it == outstanding_event_map_.end()) {
-    gpuEvent_t new_event;
 #ifdef PADDLE_WITH_CUDA
-    PADDLE_ENFORCE_GPU_SUCCESS(
-        cudaEventCreateWithFlags(&new_event, cudaEventDisableTiming));
-#else
-    PADDLE_ENFORCE_GPU_SUCCESS(
-        hipEventCreateWithFlags(&new_event, hipEventDisableTiming));
-#endif
-    outstanding_event_map_[stream] = new_event;
-    record_event = new_event;
-    VLOG(9) << "Create a new event " << new_event;
-  } else {
-    record_event = it->second;
-    VLOG(9) << "Reuse event " << record_event;
+  if (UNLIKELY(platform::CUDAGraph::IsThisThreadCapturing())) {
+    graph_capturing_stream_set_.insert(stream);
+    return;
   }
-
-#ifdef PADDLE_WITH_CUDA
-  PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(record_event, stream));
-#else
-  PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(record_event, stream));
 #endif
-  VLOG(8) << "Record event " << record_event << " to stream " << stream;
+
+  RecordStreamWithNoGraphCapturing(stream);
+  RecordGraphCapturingStreams();
 }
 
 bool StreamSafeCUDAAllocation::CanBeFreed() {
-  // NOTE(Ruibiao): This function will not execute concurrently,
-  // so outstanding_event_lock_ is not required here
+#ifdef PADDLE_WITH_CUDA
+  if (UNLIKELY(platform::CUDAGraph::IsThisThreadCapturing())) {
+    return graph_capturing_stream_set_.empty() &&
+           outstanding_event_map_.empty();
+  }
+#endif
+
+  RecordGraphCapturingStreams();
+
   for (auto it = outstanding_event_map_.begin();
        it != outstanding_event_map_.end(); ++it) {
     gpuEvent_t& event = it->second;
@@ -98,21 +94,62 @@ const gpuStream_t& StreamSafeCUDAAllocation::GetOwningStream() const {
   return owning_stream_;
 }
 
+void StreamSafeCUDAAllocation::RecordGraphCapturingStreams() {
+  for (gpuStream_t stream : graph_capturing_stream_set_) {
+    RecordStreamWithNoGraphCapturing(stream);
+  }
+  graph_capturing_stream_set_.clear();
+}
+
+void StreamSafeCUDAAllocation::RecordStreamWithNoGraphCapturing(
+    const gpuStream_t& stream) {
+  gpuEvent_t record_event;
+  auto it = outstanding_event_map_.find(stream);
+  if (it == outstanding_event_map_.end()) {
+    gpuEvent_t new_event;
+#ifdef PADDLE_WITH_CUDA
+    PADDLE_ENFORCE_GPU_SUCCESS(
+        cudaEventCreateWithFlags(&new_event, cudaEventDisableTiming));
+#else
+    PADDLE_ENFORCE_GPU_SUCCESS(
+        hipEventCreateWithFlags(&new_event, hipEventDisableTiming));
+#endif
+    outstanding_event_map_[stream] = new_event;
+    record_event = new_event;
+    VLOG(9) << "Create a new event " << new_event;
+  } else {
+    record_event = it->second;
+    VLOG(9) << "Reuse event " << record_event;
+  }
+
+#ifdef PADDLE_WITH_CUDA
+  PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(record_event, stream));
+#else
+  PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(record_event, stream));
+#endif
+  VLOG(8) << "Record event " << record_event << " to stream " << stream;
+}
+
 StreamSafeCUDAAllocator::StreamSafeCUDAAllocator(
     std::shared_ptr<Allocator> underlying_allocator, platform::CUDAPlace place,
-    gpuStream_t default_stream)
+    gpuStream_t default_stream, bool in_cuda_graph_capturing)
     : underlying_allocator_(std::move(underlying_allocator)),
       place_(std::move(place)),
-      default_stream_(std::move(default_stream)) {
-  std::lock_guard<SpinLock> lock_guard(allocator_map_lock_);
-  allocator_map_[place].emplace_back(this);
+      default_stream_(std::move(default_stream)),
+      in_cuda_graph_capturing_(in_cuda_graph_capturing) {
+  if (LIKELY(!in_cuda_graph_capturing)) {
+    std::lock_guard<SpinLock> lock_guard(allocator_map_lock_);
+    allocator_map_[place].emplace_back(this);
+  }
 }
 
 StreamSafeCUDAAllocator::~StreamSafeCUDAAllocator() {
-  std::lock_guard<SpinLock> lock_guard(allocator_map_lock_);
-  std::vector<StreamSafeCUDAAllocator*>& allocators = allocator_map_[place_];
-  allocators.erase(std::remove(allocators.begin(), allocators.end(), this),
-                   allocators.end());
+  if (LIKELY(!in_cuda_graph_capturing_)) {
+    std::lock_guard<SpinLock> lock_guard(allocator_map_lock_);
+    std::vector<StreamSafeCUDAAllocator*>& allocators = allocator_map_[place_];
+    allocators.erase(std::remove(allocators.begin(), allocators.end(), this),
+                     allocators.end());
+  }
 }
 
 bool StreamSafeCUDAAllocator::IsAllocThreadSafe() const { return true; }
@@ -140,7 +177,7 @@ phi::Allocation* StreamSafeCUDAAllocator::AllocateImpl(size_t size) {
   }
   StreamSafeCUDAAllocation* allocation = new StreamSafeCUDAAllocation(
       static_unique_ptr_cast<Allocation>(std::move(underlying_allocation)),
-      default_stream_);
+      default_stream_, this);
   VLOG(8) << "Allocate " << allocation->size() << " bytes at address "
           << allocation->ptr();
   return allocation;
@@ -157,22 +194,27 @@ void StreamSafeCUDAAllocator::FreeImpl(phi::Allocation* allocation) {
                               "StreamSafeCUDAAllocation*",
                               allocation));
   VLOG(8) << "Try free allocation " << stream_safe_cuda_allocation->ptr();
-  std::lock_guard<SpinLock> lock_guard(unfreed_allocation_lock_);
   if (stream_safe_cuda_allocation->CanBeFreed()) {
     VLOG(9) << "Directly delete allocation";
     delete stream_safe_cuda_allocation;
   } else {
     VLOG(9) << "Put into unfreed_allocation list";
+    std::lock_guard<SpinLock> lock_guard(unfreed_allocation_lock_);
     unfreed_allocations_.emplace_back(stream_safe_cuda_allocation);
   }
 }
 
 uint64_t StreamSafeCUDAAllocator::ReleaseImpl(const platform::Place& place) {
+  if (UNLIKELY(in_cuda_graph_capturing_)) {
+    VLOG(7) << "Memory release forbidden in CUDA Graph Captruing";
+    return 0;
+  }
+
   std::lock_guard<SpinLock> lock_guard(allocator_map_lock_);
   std::vector<StreamSafeCUDAAllocator*>& allocators = allocator_map_[place];
   uint64_t released_size = 0;
   for (StreamSafeCUDAAllocator* allocator : allocators) {
-    released_size += allocator->ProcessUnfreedAllocationsWithRelease();
+    released_size += allocator->ProcessUnfreedAllocationsAndRelease();
   }
   VLOG(8) << "Release " << released_size << " bytes memory from all streams";
   return released_size;
@@ -191,7 +233,7 @@ void StreamSafeCUDAAllocator::ProcessUnfreedAllocations() {
   }
 }
 
-uint64_t StreamSafeCUDAAllocator::ProcessUnfreedAllocationsWithRelease() {
+uint64_t StreamSafeCUDAAllocator::ProcessUnfreedAllocationsAndRelease() {
   ProcessUnfreedAllocations();
   return underlying_allocator_->Release(place_);
 }
diff --git a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h
index 7354836308cfba0338fb2e146cc14182006876ee..ecddff97c206be968148e32ddf3f9c6623bf8bde 100644
--- a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h
+++ b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h
@@ -14,10 +14,9 @@
 
 #pragma once
 
-#include <deque>
 #include <list>
 #include <map>
-#include <mutex>
+#include <set>
 #include "paddle/fluid/memory/allocation/allocator.h"
 #include "paddle/fluid/memory/allocation/spin_lock.h"
 #include "paddle/fluid/platform/place.h"
@@ -32,27 +31,38 @@ namespace paddle {
 namespace memory {
 namespace allocation {
 
+class StreamSafeCUDAAllocator;
+
 class StreamSafeCUDAAllocation : public Allocation {
  public:
   StreamSafeCUDAAllocation(DecoratedAllocationPtr underlying_allocation,
-                           gpuStream_t owning_stream);
+                           gpuStream_t owning_stream,
+                           StreamSafeCUDAAllocator *allocator);
+
   void RecordStream(const gpuStream_t &stream);
   bool CanBeFreed();
-
   const gpuStream_t &GetOwningStream() const;
 
  private:
+  void RecordGraphCapturingStreams();
+  void RecordStreamWithNoGraphCapturing(const gpuStream_t &stream);
   DecoratedAllocationPtr underlying_allocation_;
+  std::set<gpuStream_t> graph_capturing_stream_set_;
   std::map<gpuStream_t, gpuEvent_t> outstanding_event_map_;
   gpuStream_t owning_stream_;
   SpinLock outstanding_event_map_lock_;
+  // To compatiable with CUDA Graph, hold the allocator shared_ptr so that
+  // Allocator will not deconstruct before Allocation
+  std::shared_ptr<Allocator> allocator_;
 };
 
-class StreamSafeCUDAAllocator : public Allocator {
+class StreamSafeCUDAAllocator
+    : public Allocator,
+      public std::enable_shared_from_this<StreamSafeCUDAAllocator> {
  public:
   StreamSafeCUDAAllocator(std::shared_ptr<Allocator> underlying_allocator,
-                          platform::CUDAPlace place,
-                          gpuStream_t default_stream);
+                          platform::CUDAPlace place, gpuStream_t default_stream,
+                          bool in_cuda_graph_capturing = false);
   ~StreamSafeCUDAAllocator();
   bool IsAllocThreadSafe() const override;
 
@@ -63,7 +73,7 @@ class StreamSafeCUDAAllocator : public Allocator {
 
  private:
   void ProcessUnfreedAllocations();
-  uint64_t ProcessUnfreedAllocationsWithRelease();
+  uint64_t ProcessUnfreedAllocationsAndRelease();
 
   static std::map<platform::Place, std::vector<StreamSafeCUDAAllocator *>>
       allocator_map_;
@@ -74,6 +84,8 @@ class StreamSafeCUDAAllocator : public Allocator {
   gpuStream_t default_stream_;
   std::list<StreamSafeCUDAAllocation *> unfreed_allocations_;
   SpinLock unfreed_allocation_lock_;
+
+  bool in_cuda_graph_capturing_;
 };
 
 }  // namespace allocation
diff --git a/paddle/fluid/memory/detail/buddy_allocator.cc b/paddle/fluid/memory/detail/buddy_allocator.cc
index d7bbfba932cb4a5aab01bc3e2d1276dbe6450b29..076a96139612168f6c3d5d039184ccdb7a536f2e 100644
--- a/paddle/fluid/memory/detail/buddy_allocator.cc
+++ b/paddle/fluid/memory/detail/buddy_allocator.cc
@@ -26,6 +26,7 @@ DECLARE_uint64(reallocate_gpu_memory_in_mb);
 #endif
 
 #include "paddle/fluid/platform/device/device_wrapper.h"
+#include "paddle/fluid/platform/place.h"
 
 namespace paddle {
 namespace memory {
@@ -43,11 +44,11 @@ BuddyAllocator::BuddyAllocator(
 #ifdef PADDLE_WITH_CUSTOM_DEVICE
   if (!dev_type.empty()) {
     init_allocate_size_func_ = [dev_type]() {
-      return platform::DeviceManager::GetInitAllocSize(
+      return phi::DeviceManager::GetInitAllocSize(
           platform::PlaceHelper::CreatePlace(dev_type));
     };
     re_allocate_size_func_ = [dev_type]() {
-      return platform::DeviceManager::GetReallocSize(
+      return phi::DeviceManager::GetReallocSize(
           platform::PlaceHelper::CreatePlace(dev_type));
     };
   } else {
diff --git a/paddle/fluid/memory/detail/system_allocator.cc b/paddle/fluid/memory/detail/system_allocator.cc
index a61f98c4e1a22adcc3684a9e5af190a82e3b5110..37ac0b4483291c8c3a3eeb31883c55c7eda24dc8 100644
--- a/paddle/fluid/memory/detail/system_allocator.cc
+++ b/paddle/fluid/memory/detail/system_allocator.cc
@@ -438,7 +438,7 @@ void* CustomAllocator::Alloc(size_t* index, size_t size) {
 
   void* p;
   auto place = platform::CustomPlace(dev_type_, dev_id_);
-  auto device = platform::DeviceManager::GetDeviceWithPlace(place);
+  auto device = phi::DeviceManager::GetDeviceWithPlace(place);
   p = device->MemoryAllocate(size);
   if (LIKELY(p)) {
     VLOG(4) << "CustomAllocator::Alloc " << p << " size " << size;
@@ -447,7 +447,7 @@ void* CustomAllocator::Alloc(size_t* index, size_t size) {
   } else {
     size_t avail, total;
 
-    platform::DeviceManager::MemoryStats(place, &total, &avail);
+    phi::DeviceManager::MemoryStats(place, &total, &avail);
     PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted(
         "\n\nOut of memory error on %s %d. "
         "total memory is %s, used memory is %s, "
@@ -470,7 +470,7 @@ void CustomAllocator::Free(void* p, size_t size, size_t index) {
                         size, plug_alloc_size));
   plug_alloc_size -= size;
   auto place = platform::CustomPlace(dev_type_, dev_id_);
-  auto device = platform::DeviceManager::GetDeviceWithPlace(place);
+  auto device = phi::DeviceManager::GetDeviceWithPlace(place);
   device->MemoryDeallocate(p, size);
 }
 
diff --git a/paddle/fluid/memory/malloc.cc b/paddle/fluid/memory/malloc.cc
index b60bb4fc1d1bb5e4366625277db8fdb968474891..2bca2c388a05958fda0e891190dcf7e7ddc53b0c 100644
--- a/paddle/fluid/memory/malloc.cc
+++ b/paddle/fluid/memory/malloc.cc
@@ -41,6 +41,11 @@ std::shared_ptr<Allocation> AllocShared(const platform::Place& place,
                                                              stream);
 }
 
+AllocationPtr Alloc(const platform::CUDAPlace& place, size_t size,
+                    const phi::Stream& stream) {
+  return allocation::AllocatorFacade::Instance().Alloc(place, size, stream);
+}
+
 bool InSameStream(const std::shared_ptr<Allocation>& allocation,
                   const phi::Stream& stream) {
   return allocation::AllocatorFacade::Instance().InSameStream(allocation,
@@ -52,11 +57,6 @@ void* GetBasePtr(const std::shared_ptr<Allocation>& allocation) {
 }
 
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-AllocationPtr Alloc(const platform::CUDAPlace& place, size_t size,
-                    const gpuStream_t& stream) {
-  return allocation::AllocatorFacade::Instance().Alloc(place, size, stream);
-}
-
 uint64_t Release(const platform::CUDAPlace& place, const gpuStream_t& stream) {
   return allocation::AllocatorFacade::Instance().Release(place, stream);
 }
diff --git a/paddle/fluid/memory/malloc.h b/paddle/fluid/memory/malloc.h
index 89b4caa5bed26fa9b8d0bf09df702f17a310dff6..601fe3f2a42c391c602887bacccae97125b951e1 100644
--- a/paddle/fluid/memory/malloc.h
+++ b/paddle/fluid/memory/malloc.h
@@ -41,15 +41,15 @@ extern std::shared_ptr<Allocation> AllocShared(const platform::Place& place,
                                                size_t size,
                                                const phi::Stream& stream);
 
+extern AllocationPtr Alloc(const platform::CUDAPlace& place, size_t size,
+                           const phi::Stream& stream);
+
 extern bool InSameStream(const std::shared_ptr<Allocation>& allocation,
                          const phi::Stream& stream);
 
 extern void* GetBasePtr(const std::shared_ptr<Allocation>& allocation);
 
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-extern AllocationPtr Alloc(const platform::CUDAPlace& place, size_t size,
-                           const gpuStream_t& stream);
-
 extern uint64_t Release(const platform::CUDAPlace& place,
                         const gpuStream_t& stream);
 
diff --git a/paddle/fluid/memory/memcpy.cc b/paddle/fluid/memory/memcpy.cc
index 166cdd0b5d6b6a523cfe470662951184ebbfabc5..3198b4f8d935e3815ba94db945a24ab4df4ca97b 100644
--- a/paddle/fluid/memory/memcpy.cc
+++ b/paddle/fluid/memory/memcpy.cc
@@ -44,9 +44,9 @@ void Copy<platform::CPUPlace, platform::CustomPlace>(
   VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
           << dst_place << ", stream=" << stream;
 
-  platform::DeviceManager::SetDevice(src_place);
-  platform::stream::Stream stream_wrapper(src_place, stream);
-  platform::DeviceManager::GetDeviceWithPlace(src_place)->MemoryCopyD2H(
+  phi::DeviceManager::SetDevice(src_place);
+  phi::stream::Stream stream_wrapper(src_place, stream);
+  phi::DeviceManager::GetDeviceWithPlace(src_place)->MemoryCopyD2H(
       dst, src, num, &stream_wrapper);
 }
 
@@ -62,9 +62,9 @@ void Copy<platform::CustomPlace, platform::CPUPlace>(
   VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
           << dst_place << ", stream=" << stream;
 
-  platform::DeviceManager::SetDevice(dst_place);
-  platform::stream::Stream stream_wrapper(dst_place, stream);
-  platform::DeviceManager::GetDeviceWithPlace(dst_place)->MemoryCopyH2D(
+  phi::DeviceManager::SetDevice(dst_place);
+  phi::stream::Stream stream_wrapper(dst_place, stream);
+  phi::DeviceManager::GetDeviceWithPlace(dst_place)->MemoryCopyH2D(
       dst, src, num, &stream_wrapper);
 }
 
@@ -82,16 +82,16 @@ void Copy<platform::CustomPlace, platform::CustomPlace>(
           << dst_place << ", stream=" << stream;
 
   if (src_type == dst_type) {
-    platform::DeviceManager::SetDevice(src_place);
-    platform::stream::Stream stream_wrapper(src_place, stream);
+    phi::DeviceManager::SetDevice(src_place);
+    phi::stream::Stream stream_wrapper(src_place, stream);
 
     auto src_id = platform::PlaceHelper::GetDeviceId(src_place);
     auto dst_id = platform::PlaceHelper::GetDeviceId(dst_place);
     if (src_id == dst_id) {
-      platform::DeviceManager::GetDeviceWithPlace(src_place)->MemoryCopyD2D(
+      phi::DeviceManager::GetDeviceWithPlace(src_place)->MemoryCopyD2D(
           dst, src, num, &stream_wrapper);
     } else {
-      platform::DeviceManager::GetDeviceWithPlace(src_place)->MemoryCopyP2P(
+      phi::DeviceManager::GetDeviceWithPlace(src_place)->MemoryCopyP2P(
           dst_place, dst, src, num, &stream_wrapper);
     }
   } else {
diff --git a/paddle/fluid/memory/stream_safe_cuda_alloc_test.cu b/paddle/fluid/memory/stream_safe_cuda_alloc_test.cu
index 933717f3090c4b25f912e0bbe87922a1494c128a..5e4a4234bb41663f2287203fa9123029e6894036 100644
--- a/paddle/fluid/memory/stream_safe_cuda_alloc_test.cu
+++ b/paddle/fluid/memory/stream_safe_cuda_alloc_test.cu
@@ -12,34 +12,35 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#ifdef PADDLE_WITH_CUDA
-#include <cuda.h>
-#include <cuda_runtime.h>
-#endif
-
-#ifdef PADDLE_WITH_HIP
-#include <hip/hip_runtime.h>
-#endif
-
 #include <thread>  // NOLINT
 #include <vector>
 
 #include "gtest/gtest.h"
 #include "paddle/fluid/memory/allocation/allocator_facade.h"
-#include "paddle/fluid/memory/malloc.h"
-#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h"
+#include "paddle/fluid/memory/memory.h"
 #include "paddle/fluid/platform/device/gpu/gpu_info.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/phi/core/stream.h"
 
+#ifdef PADDLE_WITH_CUDA
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h"
+#endif
+
+#ifdef PADDLE_WITH_HIP
+#include <hip/hip_runtime.h>
+#endif
+
 namespace paddle {
 namespace memory {
 
-__global__ void add_kernel(int *x, int n) {
+// y += (x + 1)
+__global__ void add_kernel(int *x, int *y, int n) {
   int thread_num = gridDim.x * blockDim.x;
   int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
   for (int i = thread_id; i < n; i += thread_num) {
-    atomicAdd(x + i, thread_id);
+    y[i] += x[i] + 1;
   }
 }
 
@@ -51,153 +52,6 @@ void CheckMemLeak(const platform::CUDAPlace &place) {
                                  << " there may be a memory leak problem";
 }
 
-class StreamSafeCUDAAllocTest : public ::testing::Test {
- protected:
-  void SetUp() override {
-    place_ = platform::CUDAPlace();
-    stream_num_ = 64;
-    grid_num_ = 1;
-    block_num_ = 32;
-    data_num_ = 131072;
-    workspace_size_ = data_num_ * sizeof(int);
-
-    // alloc workspace for each stream
-    for (size_t i = 0; i < stream_num_; ++i) {
-      gpuStream_t stream;
-#ifdef PADDLE_WITH_CUDA
-      PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamCreate(&stream));
-#else
-      PADDLE_ENFORCE_GPU_SUCCESS(hipStreamCreate(&stream));
-#endif
-
-      std::shared_ptr<Allocation> allocation =
-          AllocShared(place_, workspace_size_,
-                      phi::Stream(reinterpret_cast<phi::StreamId>(stream)));
-#ifdef PADDLE_WITH_CUDA
-      PADDLE_ENFORCE_GPU_SUCCESS(
-          cudaMemset(allocation->ptr(), 0, allocation->size()));
-#else
-      PADDLE_ENFORCE_GPU_SUCCESS(
-          hipMemset(allocation->ptr(), 0, allocation->size()));
-#endif
-
-      streams_.emplace_back(stream);
-      workspaces_.emplace_back(allocation);
-    }
-
-    result_ = Alloc(place_, stream_num_ * workspace_size_);
-  }
-
-  void SingleStreamRun(size_t idx) {
-    // for all stream i,
-    // stream idx lauch a kernel to add (j % thread_num) to workspaces_[i][j]
-    for (size_t i = 0; i < stream_num_; ++i) {
-      int *x = reinterpret_cast<int *>(workspaces_[i]->ptr());
-      add_kernel<<<grid_num_, block_num_, 0, streams_[idx]>>>(x, data_num_);
-      RecordStream(workspaces_[i], streams_[idx]);
-    }
-  }
-
-  void CopyResultAsync() {
-    for (size_t i = 0; i < stream_num_; ++i) {
-#ifdef PADDLE_WITH_CUDA
-      PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync(
-          reinterpret_cast<int *>(result_->ptr()) + i * data_num_,
-          workspaces_[i]->ptr(), workspace_size_, cudaMemcpyDeviceToDevice));
-#else
-      PADDLE_ENFORCE_GPU_SUCCESS(hipMemcpyAsync(
-          reinterpret_cast<int *>(result_->ptr()) + i * data_num_,
-          workspaces_[i]->ptr(), workspace_size_, hipMemcpyDeviceToDevice));
-#endif
-    }
-  }
-
-  void MultiStreamRun() {
-    for (size_t i = 0; i < stream_num_; ++i) {
-      SingleStreamRun(i);
-    }
-    CopyResultAsync();
-    workspaces_.clear();  // fast_gc
-    cudaDeviceSynchronize();
-  }
-
-  void MultiThreadMUltiStreamRun() {
-    std::vector<std::thread> threads;
-    for (size_t i = 0; i < stream_num_; ++i) {
-      threads.push_back(
-          std::thread(&StreamSafeCUDAAllocTest::SingleStreamRun, this, i));
-    }
-    for (size_t i = 0; i < stream_num_; ++i) {
-      threads[i].join();
-    }
-    CopyResultAsync();
-    workspaces_.clear();  // fast_gc
-    cudaDeviceSynchronize();
-  }
-
-  void CheckResult() {
-    auto result_host = std::unique_ptr<int[]>(new int[result_->size()]);
-#ifdef PADDLE_WITH_CUDA
-    PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpy(result_host.get(), result_->ptr(),
-                                          result_->size(),
-                                          cudaMemcpyDeviceToHost));
-#else
-    PADDLE_ENFORCE_GPU_SUCCESS(hipMemcpy(result_host.get(), result_->ptr(),
-                                         result_->size(),
-                                         hipMemcpyDeviceToHost));
-#endif
-    size_t thread_num = grid_num_ * block_num_;
-    for (size_t i = 0; i < stream_num_; ++i) {
-      for (size_t j = 0; j < data_num_; ++j) {
-        EXPECT_TRUE(result_host[i * stream_num_ + j] ==
-                    (j % thread_num) * stream_num_);
-      }
-    }
-    result_.reset();
-  }
-
-  void TearDown() override {
-#ifdef PADDLE_WITH_CUDA
-    PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize());
-#else
-    PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceSynchronize());
-#endif
-    for (gpuStream_t stream : streams_) {
-      Release(place_, stream);
-    }
-
-    for (size_t i = 1; i < stream_num_; ++i) {
-#ifdef PADDLE_WITH_CUDA
-      PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamDestroy(streams_[i]));
-#else
-      PADDLE_ENFORCE_GPU_SUCCESS(hipStreamDestroy(streams_[i]));
-#endif
-    }
-
-    CheckMemLeak(place_);
-  }
-
-  size_t stream_num_;
-  size_t grid_num_;
-  size_t block_num_;
-  size_t data_num_;
-  size_t workspace_size_;
-  platform::CUDAPlace place_;
-  std::vector<gpuStream_t> streams_;
-  std::vector<std::shared_ptr<Allocation>> workspaces_;
-  allocation::AllocationPtr result_;
-};
-
-TEST_F(StreamSafeCUDAAllocTest, CUDAMutilStreamTest) {
-  MultiStreamRun();
-  CheckResult();
-}
-
-TEST_F(StreamSafeCUDAAllocTest, CUDAMutilThreadMutilStreamTest) {
-  MultiThreadMUltiStreamRun();
-  CheckResult();
-}
-
 TEST(StreamSafeCUDAAllocInterfaceTest, AllocInterfaceTest) {
   platform::CUDAPlace place = platform::CUDAPlace();
   size_t alloc_size = 256;
@@ -214,7 +68,8 @@ TEST(StreamSafeCUDAAllocInterfaceTest, AllocInterfaceTest) {
           paddle::platform::DeviceContextPool::Instance().Get(place))
           ->stream();
   allocation::AllocationPtr allocation_unique =
-      Alloc(place, alloc_size, default_stream);
+      Alloc(place, alloc_size,
+            phi::Stream(reinterpret_cast<phi::StreamId>(default_stream)));
   EXPECT_GE(allocation_unique->size(), alloc_size);
   EXPECT_EQ(allocation_unique->ptr(), address);
   allocation_unique.reset();
@@ -303,36 +158,6 @@ TEST(StreamSafeCUDAAllocInterfaceTest, GetStreamInterfaceTest) {
   CheckMemLeak(place);
 }
 
-#ifdef PADDLE_WITH_CUDA
-TEST(StreamSafeCUDAAllocInterfaceTest, CUDAGraphExceptionTest) {
-  platform::CUDAPlace place = platform::CUDAPlace();
-  size_t alloc_size = 1;
-  std::shared_ptr<Allocation> allocation = AllocShared(place, alloc_size);
-
-  platform::BeginCUDAGraphCapture(place, cudaStreamCaptureModeGlobal);
-  EXPECT_THROW(AllocShared(place, alloc_size), paddle::platform::EnforceNotMet);
-  EXPECT_THROW(Alloc(place, alloc_size), paddle::platform::EnforceNotMet);
-  EXPECT_THROW(Release(place), paddle::platform::EnforceNotMet);
-  EXPECT_THROW(allocation::AllocatorFacade::Instance().GetAllocator(place),
-               paddle::platform::EnforceNotMet);
-  EXPECT_THROW(
-      AllocShared(place, alloc_size,
-                  phi::Stream(reinterpret_cast<phi::StreamId>(nullptr))),
-      paddle::platform::EnforceNotMet);
-  EXPECT_THROW(Alloc(place, alloc_size, nullptr),
-               paddle::platform::EnforceNotMet);
-  EXPECT_THROW(Release(place, nullptr), paddle::platform::EnforceNotMet);
-  EXPECT_THROW(RecordStream(allocation, nullptr),
-               paddle::platform::EnforceNotMet);
-  EXPECT_THROW(GetStream(allocation), paddle::platform::EnforceNotMet);
-  platform::EndCUDAGraphCapture();
-
-  allocation.reset();
-  Release(place);
-  CheckMemLeak(place);
-}
-#endif
-
 TEST(StreamSafeCUDAAllocRetryTest, RetryTest) {
   platform::CUDAPlace place = platform::CUDAPlace();
   gpuStream_t stream1, stream2;
@@ -348,12 +173,14 @@ TEST(StreamSafeCUDAAllocRetryTest, RetryTest) {
   // so the second alloc will fail and retry
   size_t alloc_size = available_size / 4 * 3;
 
-  allocation::AllocationPtr allocation1 = Alloc(place, alloc_size, stream1);
+  allocation::AllocationPtr allocation1 = Alloc(
+      place, alloc_size, phi::Stream(reinterpret_cast<phi::StreamId>(stream1)));
   allocation::AllocationPtr allocation2;
 
   std::thread th([&allocation2, &place, &stream2, alloc_size]() {
     std::this_thread::sleep_for(std::chrono::seconds(1));
-    allocation2 = Alloc(place, alloc_size, stream2);
+    allocation2 = Alloc(place, alloc_size,
+                        phi::Stream(reinterpret_cast<phi::StreamId>(stream2)));
   });
   allocation1.reset();  // free but not release
   th.join();
@@ -371,5 +198,201 @@ TEST(StreamSafeCUDAAllocRetryTest, RetryTest) {
   CheckMemLeak(place);
 }
 
+class StreamSafeCUDAAllocTest : public ::testing::Test {
+ protected:
+  void SetUp() override {
+    place_ = platform::CUDAPlace();
+    stream_num_ = 64;
+    grid_num_ = 1;
+    block_num_ = 32;
+    data_num_ = 131072;
+    workspace_size_ = data_num_ * sizeof(int);
+
+    for (size_t i = 0; i < stream_num_; ++i) {
+      gpuStream_t stream;
+#ifdef PADDLE_WITH_CUDA
+      PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamCreate(&stream));
+#else
+      PADDLE_ENFORCE_GPU_SUCCESS(hipStreamCreate(&stream));
+#endif
+
+      std::shared_ptr<phi::Allocation> workspace_allocation =
+          AllocShared(place_, workspace_size_,
+                      phi::Stream(reinterpret_cast<phi::StreamId>(stream)));
+      std::shared_ptr<phi::Allocation> result_allocation =
+          AllocShared(place_, workspace_size_,
+                      phi::Stream(reinterpret_cast<phi::StreamId>(stream)));
+      std::shared_ptr<phi::Allocation> host_result_allocation =
+          AllocShared(platform::CPUPlace(), workspace_size_);
+
+#ifdef PADDLE_WITH_CUDA
+      PADDLE_ENFORCE_GPU_SUCCESS(cudaMemset(workspace_allocation->ptr(), 0,
+                                            workspace_allocation->size()));
+      PADDLE_ENFORCE_GPU_SUCCESS(
+          cudaMemset(result_allocation->ptr(), 0, result_allocation->size()));
+#else
+      PADDLE_ENFORCE_GPU_SUCCESS(hipMemset(workspace_allocation->ptr(), 0,
+                                           workspace_allocation->size()));
+      PADDLE_ENFORCE_GPU_SUCCESS(
+          hipMemset(result_allocation->ptr(), 0, result_allocation->size()));
+#endif
+
+      streams_.emplace_back(stream);
+      workspaces_.emplace_back(workspace_allocation);
+      results_.emplace_back(result_allocation);
+      host_results_.emplace_back(host_result_allocation);
+    }
+  }
+
+  void SingleStreamRun(size_t idx) {
+    int *y = reinterpret_cast<int *>(results_[idx]->ptr());
+    int neighbouring_idx = idx > 0 ? idx - 1 : idx;
+
+    add_kernel<<<grid_num_, block_num_, 0, streams_[idx]>>>(
+        reinterpret_cast<int *>(workspaces_[idx]->ptr()), y, data_num_);
+    add_kernel<<<grid_num_, block_num_, 0, streams_[idx]>>>(
+        reinterpret_cast<int *>(workspaces_[neighbouring_idx]->ptr()), y,
+        data_num_);
+    RecordStream(workspaces_[neighbouring_idx], streams_[idx]);
+  }
+
+  void MultiStreamRun() {
+    // Must run in reverse order, or the workspace_[i - 1] will be released
+    // before streams_[i]'s kernel launch
+    for (int i = stream_num_ - 1; i >= 0; --i) {
+      SingleStreamRun(i);
+      workspaces_[i].reset();  // fast GC
+    }
+  }
+
+  void MultiThreadMultiStreamRun() {
+    std::vector<std::thread> threads;
+    for (size_t i = 0; i < stream_num_; ++i) {
+      threads.push_back(
+          std::thread(&StreamSafeCUDAAllocTest::SingleStreamRun, this, i));
+    }
+    for (size_t i = 0; i < stream_num_; ++i) {
+      threads[i].join();
+    }
+    workspaces_.clear();
+  }
+
+  void CUDAGraphRun() {
+    testing_cuda_graph_ = true;
+    platform::BeginCUDAGraphCapture(platform::CUDAPlace(),
+                                    cudaStreamCaptureModeGlobal);
+
+    std::shared_ptr<Allocation> data_allocation =
+        AllocShared(platform::CUDAPlace(), workspace_size_);
+    std::shared_ptr<Allocation> result_allocation =
+        AllocShared(platform::CUDAPlace(), workspace_size_);
+
+    int *data = static_cast<int *>(data_allocation->ptr());
+    int *result = static_cast<int *>(result_allocation->ptr());
+
+    gpuStream_t main_stream = GetStream(data_allocation);
+    gpuStream_t other_stream;
+    PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamCreate(&other_stream));
+
+    add_kernel<<<grid_num_, block_num_, 0, main_stream>>>(data, result,
+                                                          data_num_);
+    RecordStream(data_allocation, other_stream);
+
+    std::unique_ptr<platform::CUDAGraph> cuda_graph =
+        platform::EndCUDAGraphCapture();
+
+    int replay_times = 10;
+    for (int i = 0; i < replay_times; ++i) {
+      cuda_graph->Replay();
+    }
+
+    std::shared_ptr<Allocation> host_result_allocation =
+        AllocShared(platform::CPUPlace(), workspace_size_);
+    Copy(host_result_allocation->place(), host_result_allocation->ptr(),
+         result_allocation->place(), result_allocation->ptr(), workspace_size_,
+         main_stream);
+    cudaStreamSynchronize(main_stream);
+
+    int *host_result = static_cast<int *>(host_result_allocation->ptr());
+    for (int i = 0; i < data_num_; ++i) {
+      EXPECT_EQ(host_result[i], replay_times);
+    }
+
+    data_allocation.reset();
+    result_allocation.reset();
+    cuda_graph.release();
+    PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamDestroy(other_stream));
+  }
+
+  void CheckResult() {
+    for (size_t i = 0; i < stream_num_; ++i) {
+      Copy(host_results_[i]->place(), host_results_[i]->ptr(),
+           results_[i]->place(), results_[i]->ptr(), workspace_size_,
+           streams_[i]);
+    }
+    cudaDeviceSynchronize();
+
+    size_t thread_num = grid_num_ * block_num_;
+    for (size_t i = 0; i < stream_num_; ++i) {
+      int *result = static_cast<int *>(host_results_[i]->ptr());
+      for (size_t j = 0; j < data_num_; ++j) {
+        EXPECT_EQ(result[j], 2);
+      }
+    }
+  }
+
+  void TearDown() override {
+    workspaces_.clear();
+    results_.clear();
+    host_results_.clear();
+    for (gpuStream_t stream : streams_) {
+      Release(place_, stream);
+    }
+
+    for (size_t i = 0; i < stream_num_; ++i) {
+#ifdef PADDLE_WITH_CUDA
+      PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamDestroy(streams_[i]));
+#else
+      PADDLE_ENFORCE_GPU_SUCCESS(hipStreamDestroy(streams_[i]));
+#endif
+    }
+
+    // Memory release for CUDA Graph memory pool is forbidden
+    if (!testing_cuda_graph_) {
+      CheckMemLeak(place_);
+    }
+  }
+
+  bool testing_cuda_graph_{0};
+  size_t stream_num_;
+  size_t grid_num_;
+  size_t block_num_;
+  size_t data_num_;
+  size_t workspace_size_;
+  platform::CUDAPlace place_;
+  std::vector<gpuStream_t> streams_;
+  std::vector<std::shared_ptr<phi::Allocation>> workspaces_;
+  std::vector<std::shared_ptr<phi::Allocation>> results_;
+  std::vector<std::shared_ptr<phi::Allocation>> host_results_;
+};
+
+TEST_F(StreamSafeCUDAAllocTest, CUDAMutilStreamTest) {
+  MultiStreamRun();
+  CheckResult();
+}
+
+TEST_F(StreamSafeCUDAAllocTest, CUDAMutilThreadMutilStreamTest) {
+  MultiThreadMultiStreamRun();
+  CheckResult();
+}
+
+#ifdef PADDLE_WITH_CUDA
+TEST_F(StreamSafeCUDAAllocTest, CUDAGraphTest) {
+  MultiStreamRun();
+  CUDAGraphRun();
+  CheckResult();
+}
+#endif
+
 }  // namespace memory
 }  // namespace paddle
diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index 91a0352e1915e95378012aa398ff996cbc10f216..e77be832c0cc8975c3fc2ebb7fad577cdfe919f5 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -161,7 +161,7 @@ cc_library(common_infer_shape_functions SRCS common_infer_shape_functions.cc DEP
 
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} selected_rows_functor selected_rows_utils lapack_function
 lod_tensor maxouting unpooling pooling lod_rank_table context_project
-sequence_pooling segment_pooling executor device_memory_aligment generator)
+sequence_pooling executor device_memory_aligment generator)
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc)
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel_helper concat_and_split cross_entropy softmax vol2col im2col sampler sample_prob tree2col)
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions beam_search fc matrix_inverse matrix_solve)
diff --git a/paddle/fluid/operators/abs_op.cc b/paddle/fluid/operators/abs_op.cc
index c28026a4bd43aac5b0c447e24a164e27233076e8..e1460629fb18a4259731c2c9de4ed8f623b5a1e4 100644
--- a/paddle/fluid/operators/abs_op.cc
+++ b/paddle/fluid/operators/abs_op.cc
@@ -141,8 +141,8 @@ class AbsDoubleGradOp : public framework::OperatorWithKernel {
 }  // namespace operators
 }  // namespace paddle
 
-DELCARE_INFER_SHAPE_FUNCTOR(abs, AbsInferShapeFunctor,
-                            PT_INFER_META(phi::UnchangedInferMeta));
+DECLARE_INFER_SHAPE_FUNCTOR(abs, AbsInferShapeFunctor,
+                            PD_INFER_META(phi::UnchangedInferMeta));
 
 namespace ops = paddle::operators;
 
diff --git a/paddle/fluid/operators/activation_cudnn_op.cu.cc b/paddle/fluid/operators/activation_cudnn_op.cu.cc
index 0ac29e6d3ada7335cab510ef82c9f46d2da7eb05..b4a97e24cf29233776b19aa0ea7764a00435f6fc 100644
--- a/paddle/fluid/operators/activation_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/activation_cudnn_op.cu.cc
@@ -132,7 +132,9 @@ struct CudnnReluGradFunctor : public CudnnActivationGradFunctor<T> {
   explicit CudnnReluGradFunctor(const CUDADeviceContext& ctx)
       : CudnnActivationGradFunctor<T>(ctx, 0.0, GPUDNN_ACTIVATION_RELU) {}
 
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; }
+  static constexpr ActBwdOpFwdDeps FwdDeps() {
+    return ActBwdOpFwdDeps::kDepOut;
+  }
 };
 
 template <typename T>
@@ -146,7 +148,9 @@ struct CudnnRelu6GradFunctor : public CudnnActivationGradFunctor<T> {
       : CudnnActivationGradFunctor<T>(ctx, 6.0,
                                       GPUDNN_ACTIVATION_CLIPPED_RELU) {}
 
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; }
+  static constexpr ActBwdOpFwdDeps FwdDeps() {
+    return ActBwdOpFwdDeps::kDepOut;
+  }
 };
 
 template <typename T>
@@ -159,7 +163,9 @@ struct CudnnSigmoidGradFunctor : public CudnnActivationGradFunctor<T> {
   explicit CudnnSigmoidGradFunctor(const CUDADeviceContext& ctx)
       : CudnnActivationGradFunctor<T>(ctx, 0.0, GPUDNN_ACTIVATION_SIGMOID) {}
 
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; }
+  static constexpr ActBwdOpFwdDeps FwdDeps() {
+    return ActBwdOpFwdDeps::kDepOut;
+  }
 };
 
 template <typename T>
@@ -172,7 +178,9 @@ struct CudnnTanhGradFunctor : public CudnnActivationGradFunctor<T> {
   explicit CudnnTanhGradFunctor(const CUDADeviceContext& ctx)
       : CudnnActivationGradFunctor<T>(ctx, 0.0, GPUDNN_ACTIVATION_TANH) {}
 
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; }
+  static constexpr ActBwdOpFwdDeps FwdDeps() {
+    return ActBwdOpFwdDeps::kDepOut;
+  }
 };
 
 template <typename Functor>
@@ -197,7 +205,8 @@ class CudnnActivationGradKernel
  public:
   using T = typename Functor::ELEMENT_TYPE;
   void Compute(const framework::ExecutionContext& context) const override {
-    static_assert(Functor::FwdDeps() == kDepOut, "Forward deps must be Out.");
+    static_assert(Functor::FwdDeps() == ActBwdOpFwdDeps::kDepOut,
+                  "Forward deps must be Out.");
 
     const framework::Tensor *X, *Out, *dOut;
     X = Out = dOut = nullptr;
diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc
index 73d65b7c6e7e0a5be2d680afba971d54b492c05d..4205f2253a652ccc5f6d4886df1b1194f5e5062f 100644
--- a/paddle/fluid/operators/activation_op.cc
+++ b/paddle/fluid/operators/activation_op.cc
@@ -34,7 +34,8 @@ using paddle::framework::Tensor;
 
 template <typename GradFunctor>
 static constexpr bool CanInplaceAct() {
-  return GradFunctor::FwdDeps() == kDepOut || GradFunctor::FwdDeps() == kNoDeps;
+  return GradFunctor::FwdDeps() == ActBwdOpFwdDeps::kDepOut ||
+         GradFunctor::FwdDeps() == ActBwdOpFwdDeps::kNoDeps;
 }
 
 #define REGISTER_ACTIVATION_OP_MAKER(OP_NAME, OP_COMMENT)                    \
@@ -921,7 +922,8 @@ class ActivationOpDoubleGrad : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
   void InferShape(framework::InferShapeContext* ctx) const override {
-    if (static_cast<int>(kDepValue) & static_cast<int>(kDepX)) {
+    if (static_cast<int>(kDepValue) &
+        static_cast<int>(ActBwdOpFwdDeps::kDepX)) {
       if (ctx->HasOutput("DX")) {
         ctx->ShareDim("X", "DX");
         ctx->ShareLoD("X", "DX");
@@ -931,7 +933,8 @@ class ActivationOpDoubleGrad : public framework::OperatorWithKernel {
         ctx->ShareLoD("X", "DDOut");
       }
     }
-    if (static_cast<int>(kDepValue) & static_cast<int>(kDepOut)) {
+    if (static_cast<int>(kDepValue) &
+        static_cast<int>(ActBwdOpFwdDeps::kDepOut)) {
       if (ctx->HasOutput("DOut")) {
         ctx->ShareDim("Out", "DOut");
         ctx->ShareLoD("Out", "DOut");
@@ -960,13 +963,15 @@ class ActivationOpDoubleGrad2 : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
   void InferShape(framework::InferShapeContext* ctx) const override {
-    if (static_cast<int>(kDepValue) & static_cast<int>(kDepX)) {
+    if (static_cast<int>(kDepValue) &
+        static_cast<int>(ActBwdOpFwdDeps::kDepX)) {
       if (ctx->HasOutput("DDOut")) {
         ctx->ShareDim("X", "DDOut");
         ctx->ShareLoD("X", "DDOut");
       }
     }
-    if (static_cast<int>(kDepValue) & static_cast<int>(kDepOut)) {
+    if (static_cast<int>(kDepValue) &
+        static_cast<int>(ActBwdOpFwdDeps::kDepOut)) {
       if (ctx->HasOutput("DDOut")) {
         ctx->ShareDim("Out", "DDOut");
         ctx->ShareLoD("Out", "DDOut");
@@ -987,7 +992,8 @@ class ActivationOpTripleGrad : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
   void InferShape(framework::InferShapeContext* ctx) const override {
-    if (static_cast<int>(kDepValue) & static_cast<int>(kDepX)) {
+    if (static_cast<int>(kDepValue) &
+        static_cast<int>(ActBwdOpFwdDeps::kDepX)) {
       if (ctx->HasOutput("DX")) {
         ctx->ShareDim("X", "DX");
         ctx->ShareLoD("X", "DX");
@@ -997,7 +1003,8 @@ class ActivationOpTripleGrad : public framework::OperatorWithKernel {
         ctx->ShareLoD("X", "DDOut");
       }
     }
-    if (static_cast<int>(kDepValue) & static_cast<int>(kDepOut)) {
+    if (static_cast<int>(kDepValue) &
+        static_cast<int>(ActBwdOpFwdDeps::kDepOut)) {
       if (ctx->HasOutput("D_DOut")) {
         ctx->ShareDim("Out", "D_DOut");
         ctx->ShareLoD("Out", "D_DOut");
@@ -1464,6 +1471,21 @@ namespace plat = paddle::platform;
 FOR_EACH_ACTIVATION_OP(REGISTER_ACTIVATION_OP);
 FOR_EACH_ACTIVATION_OP(REGISTER_ACTIVATION_CPU_KERNEL);
 
+REGISTER_ACTIVATION_OP(cos, Cos, CosFunctor, CosGradFunctor)
+REGISTER_ACTIVATION_OP(tan, Tan, TanFunctor, TanGradFunctor);
+REGISTER_ACTIVATION_OP(acos, Acos, AcosFunctor, AcosGradFunctor);
+REGISTER_ACTIVATION_OP(sin, Sin, SinFunctor, SinGradFunctor);
+REGISTER_ACTIVATION_OP(asin, Asin, AsinFunctor, AsinGradFunctor);
+REGISTER_ACTIVATION_OP(atan, Atan, AtanFunctor, AtanGradFunctor);
+REGISTER_ACTIVATION_OP(sinh, Sinh, SinhFunctor, SinhGradFunctor);
+REGISTER_ACTIVATION_OP(cosh, Cosh, CoshFunctor, CoshGradFunctor);
+REGISTER_ACTIVATION_OP(asinh, Asinh, AsinhFunctor, AsinhGradFunctor);
+REGISTER_ACTIVATION_OP(acosh, Acosh, AcoshFunctor, AcoshGradFunctor);
+REGISTER_ACTIVATION_OP(atanh, Atanh, AtanhFunctor, AtanhGradFunctor);
+REGISTER_ACTIVATION_OP(brelu, BRelu, BReluFunctor, BReluGradFunctor);
+REGISTER_ACTIVATION_OP(thresholded_relu, ThresholdedRelu,
+                       ThresholdedReluFunctor, ThresholdedReluGradFunctor);
+
 /* ==========================    sigmoid register  =============================
  */
 // 1. Register Sigmoid Operator
@@ -1548,23 +1570,6 @@ REGISTER_OPERATOR(
     ops::ActivationOpTripleGrad<ops::TanhTripleGradFunctor<float>::FwdDeps()>,
     ops::ActivationTripleGradOpInplaceInferer);
 
-REGISTER_ACTIVATION_CPU_KERNEL(tanh, Tanh, TanhFunctor, TanhGradFunctor);
-REGISTER_OP_CPU_KERNEL(
-    tanh_grad_grad, ops::TanhDoubleGradKernel<plat::CPUDeviceContext,
-                                              ops::TanhGradGradFunctor<float>>,
-    ops::TanhDoubleGradKernel<plat::CPUDeviceContext,
-                              ops::TanhGradGradFunctor<double>>,
-    ops::TanhDoubleGradKernel<plat::CPUDeviceContext,
-                              ops::TanhGradGradFunctor<plat::float16>>);
-// Register TripleGrad Kernel
-REGISTER_OP_CPU_KERNEL(
-    tanh_triple_grad,
-    ops::TanhTripeGradKernel<plat::CPUDeviceContext,
-                             ops::TanhTripleGradFunctor<float>>,
-    ops::TanhTripeGradKernel<plat::CPUDeviceContext,
-                             ops::TanhTripleGradFunctor<double>>,
-    ops::TanhTripeGradKernel<plat::CPUDeviceContext,
-                             ops::TanhTripleGradFunctor<plat::float16>>);
 /* ========================================================================== */
 
 /* ==========================    relu register  ============================= */
@@ -1584,16 +1589,6 @@ REGISTER_OPERATOR(
     ops::ActivationOpDoubleGrad2<ops::ReluGradFunctor<float>::FwdDeps()>,
     ops::ActivationDoubleGradOpInplaceInferer);
 
-REGISTER_ACTIVATION_CPU_KERNEL(relu, Relu, ReluCPUFunctor, ReluGradFunctor);
-
-REGISTER_OP_CPU_KERNEL(
-    relu_grad_grad,
-    ops::ActivationDoubleGradKernel<plat::CPUDeviceContext,
-                                    ops::ReluGradGradFunctor<float>>,
-    ops::ActivationDoubleGradKernel<plat::CPUDeviceContext,
-                                    ops::ReluGradGradFunctor<double>>,
-    ops::ActivationDoubleGradKernel<plat::CPUDeviceContext,
-                                    ops::ReluGradGradFunctor<plat::float16>>);
 /* ========================================================================== */
 
 /* ======================== leaky relu register  ============================ */
@@ -1614,16 +1609,6 @@ REGISTER_OPERATOR(
     ops::ActivationOpDoubleGrad2<ops::LeakyReluGradFunctor<float>::FwdDeps()>,
     ops::ActivationDoubleGradOpInplaceInferer);
 
-REGISTER_ACTIVATION_CPU_KERNEL(leaky_relu, LeakyRelu, LeakyReluFunctor,
-                               LeakyReluGradFunctor);
-REGISTER_OP_CPU_KERNEL(
-    leaky_relu_grad_grad,
-    ops::ActivationDoubleGradKernel<plat::CPUDeviceContext,
-                                    ops::LeakyReluGradGradFunctor<float>>,
-    ops::ActivationDoubleGradKernel<plat::CPUDeviceContext,
-                                    ops::LeakyReluGradGradFunctor<double>>,
-    ops::ActivationDoubleGradKernel<
-        plat::CPUDeviceContext, ops::LeakyReluGradGradFunctor<plat::float16>>);
 /* ========================================================================== */
 
 /* ========================    elu  register     ============================ */
diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h
index ff41da86f7bb6ba8406d58804888b5dcd8bc3be0..b076db01c22c62b17fdd85b7208467eea1375fed 100644
--- a/paddle/fluid/operators/activation_op.h
+++ b/paddle/fluid/operators/activation_op.h
@@ -35,16 +35,14 @@ limitations under the License. */
 #include "paddle/fluid/platform/mkldnn_helper.h"
 #endif
 
+#include "paddle/phi/kernels/funcs/activation_functor.h"
+
 namespace paddle {
 namespace operators {
 
 using framework::To32BitIndex;
 
-enum ActBwdOpFwdDeps {
-  kNoDeps = 0x00,  // Do not need any forward input/output
-  kDepX = 0x01,    // Only need forward input X
-  kDepOut = 0x02,  // Only need forward output Out
-};
+using ActBwdOpFwdDeps = phi::funcs::ActBwdOpFwdDeps;
 
 /* The following operator can be used to process SelectedRows, because the
  * output of those operator for zero is zero too.
@@ -89,7 +87,8 @@ inline void ExtractActivationGradTensor(
   auto x_grad_var = context.OutputVar(framework::GradVarName("X"));
   const framework::Variable* out_var = nullptr;
 
-  if (static_cast<int>(kDepValue) & static_cast<int>(kDepOut)) {
+  if (static_cast<int>(kDepValue) &
+      static_cast<int>(ActBwdOpFwdDeps::kDepOut)) {
     out_var = context.InputVar("Out");
     PADDLE_ENFORCE_NOT_NULL(
         out_var, platform::errors::NotFound(
@@ -139,7 +138,7 @@ inline void ExtractActivationGradTensor(
                               "Output(Out), variable name = %s",
                               context.OutputName(framework::GradVarName("X"))));
 
-  if (static_cast<int>(kDepValue) & static_cast<int>(kDepX)) {
+  if (static_cast<int>(kDepValue) & static_cast<int>(ActBwdOpFwdDeps::kDepX)) {
     auto x_var = context.InputVar("X");
     PADDLE_ENFORCE_NOT_NULL(x_var, platform::errors::NotFound(
                                        "Cannot get the tensor from the "
@@ -248,6 +247,39 @@ struct SigmoidFunctor : public BaseActivationFunctor<T> {
   }
 };
 
+#define USE_PHI_FUNCTOR(name)                         \
+  template <typename T>                               \
+  using name##Functor = phi::funcs::name##Functor<T>; \
+  template <typename T>                               \
+  using name##GradFunctor = phi::funcs::name##GradFunctor<T>;
+
+#define USE_PHI_DOUBLE_GRAD_FUNCTOR(name) \
+  template <typename T>                   \
+  using name##GradGradFunctor = phi::funcs::name##GradGradFunctor<T>;
+
+#define USE_PHI_TRIPLE_GRAD_FUNCTOR(name) \
+  template <typename T>                   \
+  using name##TripleGradFunctor = phi::funcs::name##TripleGradFunctor<T>;
+
+USE_PHI_FUNCTOR(Cos)
+USE_PHI_FUNCTOR(Tan)
+USE_PHI_FUNCTOR(Acos)
+USE_PHI_FUNCTOR(Sin)
+USE_PHI_FUNCTOR(Asin)
+USE_PHI_FUNCTOR(Atan)
+USE_PHI_FUNCTOR(Sinh)
+USE_PHI_FUNCTOR(Cosh)
+USE_PHI_FUNCTOR(Asinh)
+USE_PHI_FUNCTOR(Acosh)
+USE_PHI_FUNCTOR(Atanh)
+USE_PHI_FUNCTOR(Tanh)
+USE_PHI_DOUBLE_GRAD_FUNCTOR(Tanh)
+USE_PHI_TRIPLE_GRAD_FUNCTOR(Tanh)
+USE_PHI_FUNCTOR(BRelu)
+USE_PHI_FUNCTOR(ThresholdedRelu)
+USE_PHI_FUNCTOR(LeakyRelu)
+USE_PHI_DOUBLE_GRAD_FUNCTOR(LeakyRelu)
+
 template <typename T>
 struct SigmoidGradFunctor : public BaseActivationFunctor<T> {
   template <typename Device, typename X, typename Out, typename dOut,
@@ -256,7 +288,9 @@ struct SigmoidGradFunctor : public BaseActivationFunctor<T> {
     dx.device(d) = dout * out * (static_cast<T>(1) - out);
   }
 
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; }
+  static constexpr ActBwdOpFwdDeps FwdDeps() {
+    return ActBwdOpFwdDeps::kDepOut;
+  }
 };
 
 /*
@@ -293,7 +327,9 @@ struct SigmoidGradGradFunctor : public BaseActivationFunctor<T> {
       ddout.device(*d) = (static_cast<T>(1) - out) * out * ddx;
     }
   }
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; }
+  static constexpr ActBwdOpFwdDeps FwdDeps() {
+    return ActBwdOpFwdDeps::kDepOut;
+  }
 };
 
 /*
@@ -351,7 +387,9 @@ struct SigmoidTripleGradFunctor : public BaseActivationFunctor<T> {
           (static_cast<T>(1) - static_cast<T>(2) * out) * dout * d_dOutNew;
     }
   }
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; }
+  static constexpr ActBwdOpFwdDeps FwdDeps() {
+    return ActBwdOpFwdDeps::kDepOut;
+  }
 };
 
 // silu(x) = x / (1 + exp(-x))
@@ -376,7 +414,7 @@ struct SiluGradFunctor : public BaseActivationFunctor<T> {
                            (static_cast<T>(1) + (temp2 / temp1)));
   }
 
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
 };
 
 // Originally: logsigmoid(x) = -log (1 + exp(-x))
@@ -414,7 +452,7 @@ struct LogSigmoidGradFunctor : public BaseActivationFunctor<T> {
         dout * ((-x - temp).exp() / ((-temp).exp() + (-x - temp).exp()));
   }
 
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
 };
 
 // exp(x) = e^x
@@ -434,7 +472,9 @@ struct ExpGradFunctor : public BaseActivationFunctor<T> {
     dx.device(d) = dout * out;
   }
 
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; }
+  static constexpr ActBwdOpFwdDeps FwdDeps() {
+    return ActBwdOpFwdDeps::kDepOut;
+  }
 };
 
 // expm1(x) = e^x - 1
@@ -454,143 +494,23 @@ struct Expm1GradFunctor : public BaseActivationFunctor<T> {
     dx.device(d) = dout * out + dout;
   }
 
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; }
+  static constexpr ActBwdOpFwdDeps FwdDeps() {
+    return ActBwdOpFwdDeps::kDepOut;
+  }
 };
 
 // relu(x) = max(x, 0)
-template <typename T>
-struct ReluCPUFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Out>
-  void operator()(Device d, X x, Out out) const {
-    out.device(d) = x.unaryExpr([] HOSTDEVICE(T v) {
-      return v > static_cast<T>(0) ? v : static_cast<T>(0);
-    });
-  }
-};
 
 template <typename T>
-struct ReluCUDAFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Out>
-  void operator()(Device d, X x, Out out) const {
-    out.device(d) = x.cwiseMax(static_cast<T>(0));
-  }
-};
-
+using ReluCPUFunctor = phi::funcs::ReluCPUFunctor<T>;
 template <typename T>
-struct ReluGradFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Out, typename dOut,
-            typename dX>
-  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
-    dx.device(d) = dout * (out > static_cast<T>(0)).template cast<T>();
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; }
-};
+using ReluGradFunctor = phi::funcs::ReluGradFunctor<T>;
 
-// tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x))
 template <typename T>
-struct TanhFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Out>
-  void operator()(Device d, X x, Out out) const {
-    out.device(d) = x.tanh();
-  }
-};
+using ReluGradGradFunctor = phi::funcs::ReluGradGradFunctor<T>;
 
 template <typename T>
-struct TanhGradFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Out, typename dOut,
-            typename dX>
-  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
-    dx.device(d) = dout * (static_cast<T>(1) - out * out);
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; }
-};
-
-template <typename T>
-struct TanhGradGradFunctor : public BaseActivationFunctor<T> {
-  template <typename Device>
-  void operator()(const Device& dev, const framework::Tensor* Out,
-                  const framework::Tensor* ddX, const framework::Tensor* dOut,
-                  framework::Tensor* dOutNew, framework::Tensor* ddOut) const {
-    auto* d = dev.eigen_device();
-    auto ddx = framework::EigenVector<T>::Flatten(
-        GET_DATA_SAFELY(ddX, "Input", "DDX", "TanhGradGrad"));
-    auto out = framework::EigenVector<T>::Flatten(
-        GET_DATA_SAFELY(Out, "Input", "Out", "TanhGradGrad"));
-    // tanh grad grad : ddout = (1 - out^2) * ddx, dout = - (dout_old * 2 * out
-    // * ddx)
-    if (dOutNew) {
-      auto dout = framework::EigenVector<T>::Flatten(
-          GET_DATA_SAFELY(dOut, "Input", "DOut", "TanhGradGrad"));
-      auto dout_new = framework::EigenVector<T>::Flatten(
-          GET_DATA_SAFELY(dOutNew, "Output", "DOutNew", "TanhGradGrad"));
-      dout_new.device(*d) =
-          static_cast<T>(-1) * dout * static_cast<T>(2) * out * ddx;
-    }
-    if (ddOut) {
-      auto ddout = framework::EigenVector<T>::Flatten(
-          GET_DATA_SAFELY(ddOut, "Output", "DDOut", "TanhGradGrad"));
-      ddout.device(*d) = (static_cast<T>(1) - out * out) * ddx;
-    }
-  }
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; }
-};
-/*
-    Out
-    DOut                            D_Dout
-    DDx     -> TanhTripleGrad ->    D_DDx
-    D_DDout                         d_OutNew
-    D_Dout_new
-
-    D_Dout = (-2) * Out * DDx * D_Dout_new
-    D_DDx = (1-Out^2)*D_DDout + (-2) * Out * DOut * D_Dout_new
-    D_OutNew = (-2) * Out * DDx * D_DDout + (-2) * DOut * DDx * D_Dout_new
-
-    Out, DDX, DOut, D_DDOut, D_DOut_New   // input
-    D_OutNew, D_DOut, D_DDx               // output
-*/
-template <typename T>
-struct TanhTripleGradFunctor : public BaseActivationFunctor<T> {
-  template <typename Device>
-  void operator()(const Device& dev, const framework::Tensor* Out,
-                  const framework::Tensor* ddX, const framework::Tensor* dOut,
-                  const framework::Tensor* d_DDOut,
-                  const framework::Tensor* d_dOut_New,
-                  framework::Tensor* d_d_Out, framework::Tensor* d_Out_New,
-                  framework::Tensor* d_DDx) const {
-    auto* d = dev.eigen_device();
-    auto ddx = framework::EigenVector<T>::Flatten(
-        GET_DATA_SAFELY(ddX, "Input", "DDX", "TanhTripleGrad"));
-    auto out = framework::EigenVector<T>::Flatten(
-        GET_DATA_SAFELY(Out, "Input", "Out", "TanhTripleGrad"));
-    auto dout = framework::EigenVector<T>::Flatten(
-        GET_DATA_SAFELY(dOut, "Input", "DOut", "TanhTripleGrad"));
-    auto d_ddOut = framework::EigenVector<T>::Flatten(
-        GET_DATA_SAFELY(d_DDOut, "Input", "D_DDOut", "TanhTripleGrad"));
-    auto d_dOutNew = framework::EigenVector<T>::Flatten(
-        GET_DATA_SAFELY(d_dOut_New, "Input", "D_DOut_New", "TanhTripleGrad"));
-
-    if (d_Out_New) {
-      auto d_OutNew = framework::EigenVector<T>::Flatten(
-          GET_DATA_SAFELY(d_Out_New, "Output", "D_OutNew", "TanhTripleGrad"));
-      d_OutNew.device(*d) = (static_cast<T>(-2) * out * ddx * d_ddOut) -
-                            (static_cast<T>(2) * dout * ddx * d_dOutNew);
-    }
-    if (d_d_Out) {
-      auto d_dOut = framework::EigenVector<T>::Flatten(
-          GET_DATA_SAFELY(d_d_Out, "Output", "D_DOut", "TanhTripleGrad"));
-      d_dOut.device(*d) = static_cast<T>(-2) * out * ddx * d_dOutNew;
-    }
-    if (d_DDx) {
-      auto d_ddx = framework::EigenVector<T>::Flatten(
-          GET_DATA_SAFELY(d_DDx, "Output", "D_DDx", "TanhTripleGrad"));
-      d_ddx.device(*d) = (static_cast<T>(1) - (out * out)) * d_ddOut -
-                         static_cast<T>(2) * out * dout * d_dOutNew;
-    }
-  }
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; }
-};
+using ReluCUDAFunctor = phi::funcs::ReluCUDAFunctor<T>;
 
 // tanhshrink(x) = x - tanh(x)
 // where tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x))
@@ -610,7 +530,7 @@ struct TanhShrinkGradFunctor : public BaseActivationFunctor<T> {
     dx.device(d) = dout * (x.tanh() * x.tanh());
   }
 
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
 };
 
 // tanhshrink(x) = x - tanh(x)
@@ -646,7 +566,7 @@ struct HardShrinkGradFunctor : public BaseActivationFunctor<T> {
     dx.device(d) = dout * (temp1 || temp2).template cast<T>();
   }
 
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
 };
 
 // softshrink(x) = x - lambda, if x > lambda; x + lambda, if x < -lambda; 0
@@ -682,7 +602,7 @@ struct SoftShrinkGradFunctor : public BaseActivationFunctor<T> {
     dx.device(d) = dout * (temp1 + temp2).template cast<T>();
   }
 
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
 };
 
 // sqrt(x) = x^(1/2)
@@ -702,7 +622,9 @@ struct SqrtGradFunctor : public BaseActivationFunctor<T> {
     dx.device(d) = static_cast<T>(0.5) * dout / out;
   }
 
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; }
+  static constexpr ActBwdOpFwdDeps FwdDeps() {
+    return ActBwdOpFwdDeps::kDepOut;
+  }
 };
 
 // rsqrt(x) = x^(-1/2)
@@ -722,7 +644,9 @@ struct RsqrtGradFunctor : public BaseActivationFunctor<T> {
     dx.device(d) = static_cast<T>(-0.5) * dout * out * out * out;
   }
 
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; }
+  static constexpr ActBwdOpFwdDeps FwdDeps() {
+    return ActBwdOpFwdDeps::kDepOut;
+  }
 };
 
 // ceil(x) = ceiling(x)
@@ -742,7 +666,9 @@ struct ZeroGradFunctor : public BaseActivationFunctor<T> {
     dx.device(d) = static_cast<T>(0) * out;
   }
 
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kNoDeps; }
+  static constexpr ActBwdOpFwdDeps FwdDeps() {
+    return ActBwdOpFwdDeps::kNoDeps;
+  }
 };
 
 // floor(x) = flooring(x)
@@ -754,373 +680,6 @@ struct FloorFunctor : public BaseActivationFunctor<T> {
   }
 };
 
-template <typename T>
-struct Sine {
-  HOSTDEVICE T operator()(const T& val) const { return sin(val); }
-};
-
-template <>
-struct Sine<platform::float16> {
-  HOSTDEVICE platform::float16 operator()(const platform::float16& val) const {
-    return platform::float16(sin(static_cast<float>(val)));
-  }
-};
-
-template <typename T>
-struct Cosine {
-  HOSTDEVICE T operator()(const T& val) const { return cos(val); }
-};
-
-template <>
-struct Cosine<platform::float16> {
-  HOSTDEVICE platform::float16 operator()(const platform::float16& val) const {
-    return platform::float16(cos(static_cast<float>(val)));
-  }
-};
-
-// cosine'(x) = -sin(x)
-template <typename T>
-struct CosGradFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Out, typename dOut,
-            typename dX>
-  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
-    dx.device(d) = -dout * x.unaryExpr(Sine<T>());
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
-};
-
-// cosine(x) = cos(x)
-template <typename T>
-struct CosFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Out>
-  void operator()(Device d, X x, Out out) const {
-    out.device(d) = x.unaryExpr(Cosine<T>());
-  }
-};
-
-// sine'(x) = cos(x)
-template <typename T>
-struct SinGradFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Out, typename dOut,
-            typename dX>
-  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
-    dx.device(d) = dout * x.unaryExpr(Cosine<T>());
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
-};
-
-// sine(x) = sin(x)
-template <typename T>
-struct SinFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Out>
-  void operator()(Device d, X x, Out out) const {
-    out.device(d) = x.unaryExpr(Sine<T>());
-  }
-};
-
-template <typename T>
-struct Tangent {
-  HOSTDEVICE T operator()(const T& val) const { return tan(val); }
-};
-
-template <>
-struct Tangent<platform::float16> {
-  HOSTDEVICE platform::float16 operator()(const platform::float16& val) const {
-    return platform::float16(tan(static_cast<float>(val)));
-  }
-};
-
-// Tangent'(x) = -Tangent(x)
-template <typename T>
-struct TanGradFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Out, typename dOut,
-            typename dX>
-  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
-    dx.device(d) = dout / x.unaryExpr(Cosine<T>()).square();
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
-};
-
-// Tangent(x) = tan(x)
-template <typename T>
-struct TanFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Out>
-  void operator()(Device d, X x, Out out) const {
-    out.device(d) = x.unaryExpr(Tangent<T>());
-  }
-};
-
-template <typename T>
-struct Sinh {
-  HOSTDEVICE T operator()(const T& val) const { return sinh(val); }
-};
-
-template <>
-struct Sinh<platform::float16> {
-  HOSTDEVICE platform::float16 operator()(const platform::float16& val) const {
-    return platform::float16(sinhf(static_cast<float>(val)));
-  }
-};
-
-template <typename T>
-struct Cosh {
-  HOSTDEVICE T operator()(const T& val) const { return cosh(val); }
-};
-
-template <>
-struct Cosh<platform::float16> {
-  HOSTDEVICE platform::float16 operator()(const platform::float16& val) const {
-    return platform::float16(coshf(static_cast<float>(val)));
-  }
-};
-
-// sinh(x) = sinh(x)
-template <typename T>
-struct SinhFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Out>
-  void operator()(Device d, X x, Out out) const {
-    out.device(d) = x.unaryExpr(Sinh<T>());
-  }
-};
-
-// cosh(x) = cosh(x)
-template <typename T>
-struct CoshFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Out>
-  void operator()(Device d, X x, Out out) const {
-    out.device(d) = x.unaryExpr(Cosh<T>());
-  }
-};
-
-// sinh'(x) = cosh(x)
-template <typename T>
-struct SinhGradFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Out, typename dOut,
-            typename dX>
-  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
-    dx.device(d) = dout * x.unaryExpr(Cosh<T>());
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
-};
-
-// cosh'(x) = sinh(x)
-template <typename T>
-struct CoshGradFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Out, typename dOut,
-            typename dX>
-  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
-    dx.device(d) = dout * x.unaryExpr(Sinh<T>());
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
-};
-
-template <typename T>
-struct Acos {
-  HOSTDEVICE T operator()(const T& val) const { return acos(val); }
-};
-
-template <>
-struct Acos<platform::float16> {
-  HOSTDEVICE platform::float16 operator()(const platform::float16& val) const {
-    return platform::float16(acos(static_cast<float>(val)));
-  }
-};
-
-// Acos(x) = acos(x)
-template <typename T>
-struct AcosFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Out>
-  void operator()(Device d, X x, Out out) const {
-    out.device(d) = x.unaryExpr(Acos<T>());
-  }
-};
-
-// acos'(x) = -1/sqrt(1-x^2)
-template <typename T>
-struct AcosGradFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Out, typename dOut,
-            typename dX>
-  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
-    dx.device(d) =
-        -dout * static_cast<T>(1) / (static_cast<T>(1) - x.square()).sqrt();
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
-};
-
-template <typename T>
-struct Asin {
-  HOSTDEVICE T operator()(const T& val) const { return asin(val); }
-};
-
-template <>
-struct Asin<platform::float16> {
-  HOSTDEVICE platform::float16 operator()(const platform::float16& val) const {
-    return platform::float16(asin(static_cast<float>(val)));
-  }
-};
-
-// Asin(x) = asin(x)
-template <typename T>
-struct AsinFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Out>
-  void operator()(Device d, X x, Out out) const {
-    out.device(d) = x.unaryExpr(Asin<T>());
-  }
-};
-
-// asin'(x) = 1/sqrt(1-x^2)
-template <typename T>
-struct AsinGradFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Out, typename dOut,
-            typename dX>
-  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
-    dx.device(d) =
-        dout * static_cast<T>(1) / (static_cast<T>(1) - x.square()).sqrt();
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
-};
-
-template <typename T>
-struct Atan {
-  HOSTDEVICE T operator()(const T& val) const { return atan(val); }
-};
-
-template <>
-struct Atan<platform::float16> {
-  HOSTDEVICE platform::float16 operator()(const platform::float16& val) const {
-    return platform::float16(atan(static_cast<float>(val)));
-  }
-};
-
-// Atan(x) = atan(x)
-template <typename T>
-struct AtanFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Out>
-  void operator()(Device d, X x, Out out) const {
-    out.device(d) = x.unaryExpr(Atan<T>());
-  }
-};
-
-// atan'(x) =  1 / (1 + x^2)
-template <typename T>
-struct AtanGradFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Out, typename dOut,
-            typename dX>
-  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
-    dx.device(d) = dout * static_cast<T>(1) / (static_cast<T>(1) + x.square());
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
-};
-
-template <typename T>
-struct Acosh {
-  HOSTDEVICE T operator()(const T& val) const { return acosh(val); }
-};
-
-template <>
-struct Acosh<platform::float16> {
-  HOSTDEVICE platform::float16 operator()(const platform::float16& val) const {
-    return platform::float16(acosh(static_cast<float>(val)));
-  }
-};
-
-// Acosh(x) = acosh(x)
-template <typename T>
-struct AcoshFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Out>
-  void operator()(Device d, X x, Out out) const {
-    out.device(d) = x.unaryExpr(Acosh<T>());
-  }
-};
-
-// acosh'(x) =  1/sqrt(x^2 - 1)
-template <typename T>
-struct AcoshGradFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Out, typename dOut,
-            typename dX>
-  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
-    dx.device(d) =
-        dout * static_cast<T>(1) / (x * x - static_cast<T>(1)).sqrt();
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
-};
-
-template <typename T>
-struct Asinh {
-  HOSTDEVICE T operator()(const T& val) const { return asinh(val); }
-};
-
-template <>
-struct Asinh<platform::float16> {
-  HOSTDEVICE platform::float16 operator()(const platform::float16& val) const {
-    return platform::float16(asinh(static_cast<float>(val)));
-  }
-};
-
-// Asinh(x) = asinh(x)
-template <typename T>
-struct AsinhFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Out>
-  void operator()(Device d, X x, Out out) const {
-    out.device(d) = x.unaryExpr(Asinh<T>());
-  }
-};
-
-// asinh'(x) =  1/sqrt(x^2 + 1)
-template <typename T>
-struct AsinhGradFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Out, typename dOut,
-            typename dX>
-  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
-    dx.device(d) =
-        dout * static_cast<T>(1) / (x.square() + static_cast<T>(1)).sqrt();
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
-};
-
-template <typename T>
-struct Atanh {
-  HOSTDEVICE T operator()(const T& val) const { return atanh(val); }
-};
-
-template <>
-struct Atanh<platform::float16> {
-  HOSTDEVICE platform::float16 operator()(const platform::float16& val) const {
-    return platform::float16(atanh(static_cast<float>(val)));
-  }
-};
-
-// Atanh(x) = atanh(x)
-template <typename T>
-struct AtanhFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Out>
-  void operator()(Device d, X x, Out out) const {
-    out.device(d) = x.unaryExpr(Atanh<T>());
-  }
-};
-
-// atanh'(x) =  1/(1 - x^2)
-template <typename T>
-struct AtanhGradFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Out, typename dOut,
-            typename dX>
-  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
-    dx.device(d) = dout * static_cast<T>(1) / (static_cast<T>(1) - x.square());
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
-};
-
 // round(x) = [x]
 template <typename T>
 struct RoundFunctor : public BaseActivationFunctor<T> {
@@ -1147,7 +706,9 @@ struct ReciprocalGradFunctor : public BaseActivationFunctor<T> {
     dx.device(d) = dout * static_cast<T>(-1) * out * out;
   }
 
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; }
+  static constexpr ActBwdOpFwdDeps FwdDeps() {
+    return ActBwdOpFwdDeps::kDepOut;
+  }
 };
 
 // log(x) = natural logarithm of x
@@ -1167,7 +728,7 @@ struct LogGradFunctor : public BaseActivationFunctor<T> {
     dx.device(d) = dout * (static_cast<T>(1) / x);
   }
 
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
 };
 
 // log2(x) = logarithm to the base 2 of the elements of x
@@ -1188,7 +749,7 @@ struct Log2GradFunctor : public BaseActivationFunctor<T> {
     dx.device(d) = dout * static_cast<T>(1) / (x * static_cast<T>(log(2)));
   }
 
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
 };
 
 // log10(x) = logarithm to the base 10 of the elements of x
@@ -1209,7 +770,7 @@ struct Log10GradFunctor : public BaseActivationFunctor<T> {
     dx.device(d) = dout * static_cast<T>(1) / (x * static_cast<T>(log(10)));
   }
 
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
 };
 
 // log1p(x) = natural logarithm of x+1
@@ -1229,7 +790,7 @@ struct Log1pGradFunctor : public BaseActivationFunctor<T> {
     dx.device(d) = dout * (static_cast<T>(1) / (x + static_cast<T>(1)));
   }
 
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
 };
 
 // square(x) = x^2
@@ -1249,43 +810,7 @@ struct SquareGradFunctor : public BaseActivationFunctor<T> {
     dx.device(d) = dout * static_cast<T>(2) * x;
   }
 
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
-};
-
-template <typename T>
-struct BReluFunctor : public BaseActivationFunctor<T> {
-  float t_min;
-  float t_max;
-
-  // NOTE: Explicit hides the `BaseActivationFunctor<T>::GetAttrs`
-  // not polymorphism for speed.
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"t_min", &t_min}, {"t_max", &t_max}};
-  }
-
-  template <typename Device, typename X, typename Out>
-  void operator()(Device d, X x, Out out) const {
-    out.device(d) =
-        x.cwiseMax(static_cast<T>(t_min)).cwiseMin(static_cast<T>(t_max));
-  }
-};
-
-template <typename T>
-struct BReluGradFunctor : public BaseActivationFunctor<T> {
-  float t_min;
-  float t_max;
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"t_min", &t_min}, {"t_max", &t_max}};
-  }
-  template <typename Device, typename X, typename Out, typename dOut,
-            typename dX>
-  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
-    dx.device(d) = dout *
-                   ((x > static_cast<T>(t_min)) * (x < static_cast<T>(t_max)))
-                       .template cast<T>();
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
 };
 
 // relu6(x) = min(max(0, x), 6)
@@ -1319,7 +844,9 @@ struct Relu6GradFunctor : public BaseActivationFunctor<T> {
             .template cast<T>();
   }
 
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; }
+  static constexpr ActBwdOpFwdDeps FwdDeps() {
+    return ActBwdOpFwdDeps::kDepOut;
+  }
 };
 
 // HardSwish = min(max(0, x+3), 6) * x / 6
@@ -1364,7 +891,7 @@ struct HardSwishGradFunctor : public BaseActivationFunctor<T> {
          static_cast<T>(1) * (static_cast<T>(1) - tmp));
   }
 
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
 };
 
 // For numerical stability, using the following formula instead of softplus(x) =
@@ -1409,7 +936,7 @@ struct SoftplusGradFunctor : public BaseActivationFunctor<T> {
             .select(dout, dout / (static_cast<T>(1) + (-x_beta).exp()));
   }
 
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
 };
 
 // mish(x) = x * tanh(softplus(x))
@@ -1449,7 +976,7 @@ struct MishGradFunctor : public BaseActivationFunctor<T> {
     dx.device(d) = dout * (tsp + x * (static_cast<T>(1) - tsp * tsp) * gsp);
   }
 
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
 };
 
 // softsign(x) = x / (1 + |x|)
@@ -1472,7 +999,7 @@ struct SoftsignGradFunctor : public BaseActivationFunctor<T> {
         dout * (static_cast<T>(1) / (static_cast<T>(1) + x.abs()).square());
   }
 
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
 };
 
 template <typename T>
@@ -1504,42 +1031,9 @@ struct SoftReluGradFunctor : public BaseActivationFunctor<T> {
     dx.device(d) = dout * (static_cast<T>(1) - (-out).exp()) * temp;
   }
 
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; }
-};
-
-template <typename T>
-struct LeakyReluFunctor : public BaseActivationFunctor<T> {
-  float alpha;
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"alpha", &alpha}};
-  }
-
-  template <typename Device, typename X, typename Out>
-  void operator()(Device d, X x, Out out) const {
-    if (alpha < 1.f) {
-      out.device(d) = x.cwiseMax(static_cast<T>(alpha) * x);
-    } else {
-      out.device(d) = x.cwiseMin(static_cast<T>(alpha) * x);
-    }
-  }
-};
-
-template <typename T>
-struct LeakyReluGradFunctor : public BaseActivationFunctor<T> {
-  float alpha;
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"alpha", &alpha}};
+  static constexpr ActBwdOpFwdDeps FwdDeps() {
+    return ActBwdOpFwdDeps::kDepOut;
   }
-  template <typename Device, typename X, typename Out, typename dOut,
-            typename dX>
-  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
-    auto temp1 =
-        static_cast<T>(alpha) * (x < static_cast<T>(0)).template cast<T>();
-    auto temp2 = (x >= static_cast<T>(0)).template cast<T>();
-    dx.device(d) = dout * (temp1 + temp2).template cast<T>();
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
 };
 
 template <typename T>
@@ -1573,7 +1067,7 @@ struct ELUGradFunctor : public BaseActivationFunctor<T> {
                        .select(dout, dout * (out + static_cast<T>(alpha)));
   }
 
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
 };
 
 template <typename T>
@@ -1592,7 +1086,7 @@ struct ELUGradNegativeAlphaFunctor : public BaseActivationFunctor<T> {
                        .select(dout, dout * static_cast<T>(alpha) * x.exp());
   }
 
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
 };
 
 template <typename DeviceContext, typename T>
@@ -1672,7 +1166,7 @@ struct CELUGradFunctor : public BaseActivationFunctor<T> {
         dout * (x / static_cast<T>(alpha)).exp() * temp_a_neg * temp_x_neg;
   }
 
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
 };
 
 // FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/5198
@@ -1701,7 +1195,7 @@ struct PowGradFunctor : public BaseActivationFunctor<T> {
                    x.pow(static_cast<T>(factor) - static_cast<T>(1));
   }
 
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
 };
 
 template <typename T>
@@ -1766,38 +1260,7 @@ struct STanhGradFunctor : public BaseActivationFunctor<T> {
     dx.device(d) = dout * a * b * (static_cast<T>(1) - temp);
   }
 
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
-};
-
-template <typename T>
-struct ThresholdedReluFunctor : public BaseActivationFunctor<T> {
-  float threshold;
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"threshold", &threshold}};
-  }
-
-  template <typename Device, typename X, typename Out>
-  void operator()(Device d, X x, Out out) const {
-    auto th = static_cast<T>(threshold);
-    out.device(d) = (x > th).template cast<T>() * x;
-  }
-};
-
-template <typename T>
-struct ThresholdedReluGradFunctor : public BaseActivationFunctor<T> {
-  float threshold;
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"threshold", &threshold}};
-  }
-
-  template <typename Device, typename X, typename Out, typename dOut,
-            typename dX>
-  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
-    auto th = static_cast<T>(threshold);
-    dx.device(d) = dout * (x > th).template cast<T>();
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
 };
 
 template <typename T>
@@ -1832,7 +1295,9 @@ struct HardSigmoidGradFunctor : public BaseActivationFunctor<T> {
                    static_cast<T>(slope);
   }
 
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; }
+  static constexpr ActBwdOpFwdDeps FwdDeps() {
+    return ActBwdOpFwdDeps::kDepOut;
+  }
 };
 
 template <typename T>
@@ -1865,121 +1330,7 @@ struct SwishGradFunctor : public BaseActivationFunctor<T> {
     dx.device(d) = dout * ((static_cast<T>(beta) * out) + temp2);
   }
 
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
-};
-
-/*
- * in arguments: x, out, ddx
- * out arguments: ddout, dout, dx
- */
-template <ActBwdOpFwdDeps kDepValue>
-inline void ExtractActivationDoubleGradTensor(
-    const framework::ExecutionContext& ctx, const framework::Tensor** X,
-    const framework::Tensor** Out, const framework::Tensor** ddX,
-    framework::Tensor** dX, framework::Tensor** dOut,
-    framework::Tensor** ddOut) {
-  auto ddx_var = ctx.InputVar("DDX");
-  auto ddo_var = ctx.OutputVar("DDOut");
-  PADDLE_ENFORCE_NOT_NULL(
-      ddx_var, platform::errors::NotFound(
-                   "Cannot get input Variable Out, variable name = %s",
-                   ctx.InputName("DDX")));
-  if (CanBeUsedBySelectedRows.count(ctx.Type())) {
-    *ddX = paddle::framework::GetLoDTensorOrSelectedRowsValueFromVar(*ddx_var);
-    if (ddo_var) {
-      *ddOut = paddle::framework::GetMutableLoDTensorOrSelectedRowsValueFromVar(
-          ddo_var);
-    }
-  } else {
-    *ddX = ctx.Input<framework::Tensor>("DDX");
-    if (ddo_var) {
-      *ddOut = ctx.Output<framework::Tensor>("DDOut");
-    }
-  }
-  PADDLE_ENFORCE_NOT_NULL(
-      *ddX,
-      platform::errors::NotFound(
-          "Cannot get the tensor from the Variable Output, variable name = %s",
-          ctx.OutputName("DDX")));
-
-  if (static_cast<int>(kDepValue) & static_cast<int>(kDepX)) {
-    auto x_var = ctx.InputVar("X");
-    PADDLE_ENFORCE_NOT_NULL(
-        x_var, platform::errors::NotFound(
-                   "Cannot get input Variable Out, variable name = %s",
-                   ctx.InputName("X")));
-    auto dx_var = ctx.OutputVar("DX");
-    if (CanBeUsedBySelectedRows.count(ctx.Type())) {
-      *X = paddle::framework::GetLoDTensorOrSelectedRowsValueFromVar(*x_var);
-      if (dx_var) {
-        *dX = paddle::framework::GetMutableLoDTensorOrSelectedRowsValueFromVar(
-            dx_var);
-      }
-    } else {
-      *X = ctx.Input<framework::Tensor>("X");
-      if (dx_var) {
-        *dX = ctx.Output<framework::Tensor>("DX");
-      }
-    }
-  } else {
-    VLOG(10) << "Inplace activation of Op: " << ctx.Type();
-    *X = *ddX;
-  }
-  if (static_cast<int>(kDepValue) & static_cast<int>(kDepOut)) {
-    auto out_var = ctx.InputVar("Out");
-    PADDLE_ENFORCE_NOT_NULL(
-        out_var,
-        platform::errors::NotFound(
-            "Cannot get the tensor from the Variable Out, variable name = %s",
-            ctx.InputName("Out")));
-    auto dout_var = ctx.OutputVar("DOut");
-    if (CanBeUsedBySelectedRows.count(ctx.Type())) {
-      *Out =
-          paddle::framework::GetLoDTensorOrSelectedRowsValueFromVar(*out_var);
-      if (dout_var) {
-        *dOut =
-            paddle::framework::GetMutableLoDTensorOrSelectedRowsValueFromVar(
-                dout_var);
-      }
-    } else {
-      *Out = ctx.Input<framework::Tensor>("Out");
-      if (dout_var) {
-        *dOut = ctx.Output<framework::Tensor>("DOut");
-      }
-    }
-  } else {
-    VLOG(10) << "Inplace activation of Op: " << ctx.Type();
-    *Out = *ddX;
-  }
-}
-
-template <typename DeviceContext, typename Functor>
-class ActivationDoubleGradKernel
-    : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
- public:
-  using T = typename Functor::ELEMENT_TYPE;
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    const framework::Tensor *X, *Out, *ddX;
-    X = Out = ddX = nullptr;
-    framework::Tensor *ddOut, *dOut, *dX;
-    ddOut = dOut = dX = nullptr;
-
-    ExtractActivationDoubleGradTensor<Functor::FwdDeps()>(ctx, &X, &Out, &ddX,
-                                                          &dX, &dOut, &ddOut);
-
-    if (ddOut) ddOut->mutable_data<T>(ctx.GetPlace());
-    if (dOut) dOut->mutable_data<T>(ctx.GetPlace());
-    if (dX) dX->mutable_data<T>(Out->dims(), ctx.GetPlace());
-
-    auto& place = ctx.template device_context<DeviceContext>();
-
-    Functor functor;
-    auto attrs = functor.GetAttrs();
-    for (auto& attr : attrs) {
-      *attr.second = ctx.Attr<float>(attr.first);
-    }
-    functor(place, X, Out, ddX, ddOut, dOut, dX);
-  }
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
 };
 
 template <typename T>
@@ -2000,57 +1351,7 @@ struct AbsGradGradFunctor : public BaseActivationFunctor<T> {
       ddout.device(*d) = ddx * x.sign();
     }
   }
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
-};
-
-template <typename T>
-struct ReluGradGradFunctor : public BaseActivationFunctor<T> {
-  template <typename Device>
-  void operator()(const Device& dev, const framework::Tensor* X,
-                  const framework::Tensor* Out, const framework::Tensor* ddX,
-                  framework::Tensor* ddOut, framework::Tensor* dOut,
-                  framework::Tensor* dX) const {
-    auto* d = dev.eigen_device();
-    auto ddx = framework::EigenVector<T>::Flatten(
-        GET_DATA_SAFELY(ddX, "Input", "DDX", "ReluGradGrad"));
-    auto out = framework::EigenVector<T>::Flatten(
-        GET_DATA_SAFELY(Out, "Output", "Out", "ReluGradGrad"));
-    if (ddOut) {
-      auto ddout = framework::EigenVector<T>::Flatten(
-          GET_DATA_SAFELY(ddOut, "Output", "DDOut", "ReluGradGrad"));
-      ddout.device(*d) = ddx * (out > static_cast<T>(0)).template cast<T>();
-    }
-  }
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; }
-};
-
-template <typename T>
-struct LeakyReluGradGradFunctor : public BaseActivationFunctor<T> {
-  float alpha;
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"alpha", &alpha}};
-  }
-  template <typename Device>
-  void operator()(const Device& dev, const framework::Tensor* X,
-                  const framework::Tensor* Out, const framework::Tensor* ddX,
-                  framework::Tensor* ddOut, framework::Tensor* dOut,
-                  framework::Tensor* dX) const {
-    if (ddOut) {
-      auto* d = dev.eigen_device();
-      auto ddx = framework::EigenVector<T>::Flatten(
-          GET_DATA_SAFELY(ddX, "Input", "DDX", "LeakyReluGradGrad"));
-      auto x = framework::EigenVector<T>::Flatten(
-          GET_DATA_SAFELY(X, "Input", "X", "LeakyReluGradGrad"));
-      auto ddout = framework::EigenVector<T>::Flatten(
-          GET_DATA_SAFELY(ddOut, "Output", "DOut", "LeakyReluGradGrad"));
-      ddout.device(*d) =
-          ddx *
-          ((x > static_cast<T>(0)).template cast<T>() +
-           static_cast<T>(alpha) * (x <= static_cast<T>(0)).template cast<T>())
-              .template cast<T>();
-    }
-  }
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
 };
 
 template <typename T>
@@ -2088,7 +1389,7 @@ struct ELUGradGradFunctor : public BaseActivationFunctor<T> {
                              .template cast<T>();
     }
   }
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
 };
 
 template <typename T>
@@ -2127,7 +1428,7 @@ struct CELUGradGradFunctor : public BaseActivationFunctor<T> {
                              .template cast<T>();
     }
   }
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
 };
 
 template <typename T>
@@ -2156,7 +1457,9 @@ struct SqrtGradGradFunctor : public BaseActivationFunctor<T> {
       ddout.device(*d) = ddx * static_cast<T>(0.5) / out;
     }
   }
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; }
+  static constexpr ActBwdOpFwdDeps FwdDeps() {
+    return ActBwdOpFwdDeps::kDepOut;
+  }
 };
 
 template <typename T>
@@ -2185,7 +1488,9 @@ struct RsqrtGradGradFunctor : public BaseActivationFunctor<T> {
       ddout.device(*d) = ddx * static_cast<T>(-0.5) * out * out * out;
     }
   }
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; }
+  static constexpr ActBwdOpFwdDeps FwdDeps() {
+    return ActBwdOpFwdDeps::kDepOut;
+  }
 };
 
 template <typename T>
@@ -2214,7 +1519,7 @@ struct SquareGradGradFunctor : public BaseActivationFunctor<T> {
       ddout.device(*d) = ddx * static_cast<T>(2) * x;
     }
   }
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
 };
 
 // TODO(dengkaipeng): double gradient calculation for Square/Sqrt need
@@ -2840,7 +2145,7 @@ struct LogGradGradFunctor : public BaseActivationFunctor<T> {
     }
   }
 
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
 };
 
 }  // namespace operators
@@ -2849,26 +2154,14 @@ struct LogGradGradFunctor : public BaseActivationFunctor<T> {
 #define FOR_EACH_ACTIVATION_OP(__macro)                                       \
   __macro(silu, Silu, SiluFunctor, SiluGradFunctor);                          \
   __macro(logsigmoid, LogSigmoid, LogSigmoidFunctor, LogSigmoidGradFunctor);  \
-  __macro(atan, Atan, AtanFunctor, AtanGradFunctor);                          \
   __macro(softshrink, SoftShrink, SoftShrinkFunctor, SoftShrinkGradFunctor);  \
   __macro(ceil, Ceil, CeilFunctor, ZeroGradFunctor);                          \
   __macro(floor, Floor, FloorFunctor, ZeroGradFunctor);                       \
-  __macro(cos, Cos, CosFunctor, CosGradFunctor);                              \
-  __macro(tan, Tan, TanFunctor, TanGradFunctor);                              \
-  __macro(acos, Acos, AcosFunctor, AcosGradFunctor);                          \
-  __macro(sin, Sin, SinFunctor, SinGradFunctor);                              \
-  __macro(asin, Asin, AsinFunctor, AsinGradFunctor);                          \
-  __macro(sinh, Sinh, SinhFunctor, SinhGradFunctor);                          \
-  __macro(cosh, Cosh, CoshFunctor, CoshGradFunctor);                          \
-  __macro(asinh, Asinh, AsinhFunctor, AsinhGradFunctor);                      \
-  __macro(acosh, Acosh, AcoshFunctor, AcoshGradFunctor);                      \
-  __macro(atanh, Atanh, AtanhFunctor, AtanhGradFunctor);                      \
   __macro(round, Round, RoundFunctor, ZeroGradFunctor);                       \
   __macro(reciprocal, Reciprocal, ReciprocalFunctor, ReciprocalGradFunctor);  \
   __macro(log1p, Log1p, Log1pFunctor, Log1pGradFunctor);                      \
   __macro(log2, Log2, Log2Functor, Log2GradFunctor);                          \
   __macro(log10, Log10, Log10Functor, Log10GradFunctor);                      \
-  __macro(brelu, BRelu, BReluFunctor, BReluGradFunctor);                      \
   __macro(soft_relu, SoftRelu, SoftReluFunctor, SoftReluGradFunctor);         \
   __macro(stanh, STanh, STanhFunctor, STanhGradFunctor);                      \
   __macro(softplus, Softplus, SoftplusFunctor, SoftplusGradFunctor);          \
@@ -2879,7 +2172,5 @@ struct LogGradGradFunctor : public BaseActivationFunctor<T> {
   __macro(hard_sigmoid, HardSigmoid, HardSigmoidFunctor,                      \
           HardSigmoidGradFunctor);                                            \
   __macro(swish, Swish, SwishFunctor, SwishGradFunctor);                      \
-  __macro(thresholded_relu, ThresholdedRelu, ThresholdedReluFunctor,          \
-          ThresholdedReluGradFunctor);                                        \
   __macro(mish, Mish, MishFunctor, MishGradFunctor);                          \
   __macro(hard_swish, HardSwish, HardSwishFunctor, HardSwishGradFunctor);
diff --git a/paddle/fluid/operators/activation_op.kps b/paddle/fluid/operators/activation_op.kps
index e1afb3919f813b756e228e37413166ad3f95d6df..256f20db08445e8b8d5933aa0e3151f69fcb5b10 100644
--- a/paddle/fluid/operators/activation_op.kps
+++ b/paddle/fluid/operators/activation_op.kps
@@ -18,60 +18,6 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-template <typename T>
-struct CudaReluFunctor : public BaseActivationFunctor<T> {
-  T zero = static_cast<T>(0.0f);
-
-  // relu(x) = max(x, 0)
-  __device__ __forceinline__ T operator()(const T x) const {
-    return x > zero ? x : zero;
-  }
-};
-
-template <typename T>
-struct CudaReluGradFunctor : public BaseActivationFunctor<T> {
-  T zero = static_cast<T>(0.0f);
-
-  // dx = dout * (out > 0)
-  __device__ __forceinline__ T operator()(const T dout, const T out) const {
-    return out > zero ? dout : zero;
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; }
-};
-
-template <typename T>
-struct CudaLeakyReluFunctor : public BaseActivationFunctor<T> {
-  T zero = static_cast<T>(0.0f);
-  float alpha;
-
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"alpha", &alpha}};
-  }
-
-  // leakyrelu(x) = x > 0 ? x : alpha * x
-  __device__ __forceinline__ T operator()(const T x) const {
-    return x > zero ? x : static_cast<T>(alpha) * x;
-  }
-};
-
-template <typename T>
-struct CudaLeakyReluGradFunctor : public BaseActivationFunctor<T> {
-  T zero = static_cast<T>(0.0f);
-  float alpha;
-
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"alpha", &alpha}};
-  }
-
-  // dx = dout * (x > 0 ? 1 : alpha)
-  __device__ __forceinline__ T operator()(const T dout, const T x) const {
-    return x > zero ? dout : static_cast<T>(alpha) * dout;
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
-};
-
 template <typename T>
 struct CudaSigmoidFunctor : public BaseActivationFunctor<T> {
   using MPType = typename details::MPTypeTrait<T>::Type;
@@ -93,7 +39,9 @@ struct CudaSigmoidGradFunctor : public BaseActivationFunctor<T> {
     return dout * out * (one - out);
   }
 
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; }
+  static constexpr ActBwdOpFwdDeps FwdDeps() {
+    return ActBwdOpFwdDeps::kDepOut;
+  }
 };
 
 template <typename T>
@@ -122,7 +70,7 @@ struct CudaSiluGradFunctor : public BaseActivationFunctor<T> {
     return static_cast<T>(dout * (temp * (one + x * (one - temp))));
   }
 
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
 };
 
 template <typename T>
@@ -159,30 +107,7 @@ struct CudaLogSigmoidGradFunctor : public BaseActivationFunctor<T> {
     return static_cast<T>(dout * (temp2 / (exp(-temp1) + temp2)));
   }
 
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
-};
-
-template <typename T>
-struct CudaAtanFunctor : public BaseActivationFunctor<T> {
-  using MPType = typename details::MPTypeTrait<T>::Type;
-
-  // atan(x) = atan(x)
-  __device__ __forceinline__ T operator()(const T arg_x) const {
-    MPType x = static_cast<MPType>(arg_x);
-    return static_cast<T>(atan(x));
-  }
-};
-
-template <typename T>
-struct CudaAtanGradFunctor : public BaseActivationFunctor<T> {
-  T one = static_cast<T>(1.0f);
-
-  // dx = dout / (1 + x^2)
-  __device__ __forceinline__ T operator()(const T dout, const T x) const {
-    return dout / (one + x * x);
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
 };
 
 template <typename T>
@@ -219,7 +144,7 @@ struct CudaSoftShrinkGradFunctor : public BaseActivationFunctor<T> {
     return (x >= -l && x <= l) ? zero : dout;
   }
 
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
 };
 
 template <typename T>
@@ -262,295 +187,11 @@ struct CudaZeroGradFunctor : public BaseActivationFunctor<T> {
     return static_cast<T>(0.0f);
   }
 
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kNoDeps; }
-};
-
-template <typename T>
-struct CudaCosFunctor : public BaseActivationFunctor<T> {
-  using MPType = typename details::MPTypeTrait<T>::Type;
-
-  // cos(x) = cos(x)
-  __device__ __forceinline__ T operator()(const T arg_x) const {
-    MPType x = static_cast<MPType>(arg_x);
-    return static_cast<T>(cos(x));
-  }
-};
-
-template <typename T>
-struct CudaCosGradFunctor : public BaseActivationFunctor<T> {
-  using MPType = typename details::MPTypeTrait<T>::Type;
-
-  // dx = dout * (-sin(x))
-  __device__ __forceinline__ T operator()(const T arg_dout,
-                                          const T arg_x) const {
-    MPType dout = static_cast<MPType>(arg_dout);
-    MPType x = static_cast<MPType>(arg_x);
-    return static_cast<T>(-dout * sin(x));
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
-};
-
-template <typename T>
-struct CudaSinFunctor : public BaseActivationFunctor<T> {
-  using MPType = typename details::MPTypeTrait<T>::Type;
-
-  // sin(x) = sin(x)
-  __device__ __forceinline__ T operator()(const T arg_x) const {
-    MPType x = static_cast<MPType>(arg_x);
-    return static_cast<T>(sin(x));
-  }
-};
-
-template <typename T>
-struct CudaSinGradFunctor : public BaseActivationFunctor<T> {
-  using MPType = typename details::MPTypeTrait<T>::Type;
-
-  // dx = dout * cos(x)
-  __device__ __forceinline__ T operator()(const T arg_dout,
-                                          const T arg_x) const {
-    MPType dout = static_cast<MPType>(arg_dout);
-    MPType x = static_cast<MPType>(arg_x);
-    return static_cast<T>(dout * cos(x));
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
-};
-
-template <typename T>
-struct CudaTanFunctor : public BaseActivationFunctor<T> {
-  using MPType = typename details::MPTypeTrait<T>::Type;
-
-  // tan(x) = tan(x)
-  __device__ __forceinline__ T operator()(const T arg_x) const {
-    MPType x = static_cast<MPType>(arg_x);
-    return static_cast<T>(tan(x));
-  }
-};
-
-template <typename T>
-struct CudaTanGradFunctor : public BaseActivationFunctor<T> {
-  using MPType = typename details::MPTypeTrait<T>::Type;
-
-  // dx = dout / cos(x)^2
-  __device__ __forceinline__ T operator()(const T arg_dout,
-                                          const T arg_x) const {
-    MPType dout = static_cast<MPType>(arg_dout);
-    MPType x = static_cast<MPType>(arg_x);
-    return static_cast<T>(dout / (cos(x) * cos(x)));
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
-};
-
-template <typename T>
-struct CudaAsinFunctor : public BaseActivationFunctor<T> {
-  using MPType = typename details::MPTypeTrait<T>::Type;
-
-  // asin(x) = asin(x)
-  __device__ __forceinline__ T operator()(const T arg_x) const {
-    MPType x = static_cast<MPType>(arg_x);
-    return static_cast<T>(asin(x));
-  }
-};
-
-template <typename T>
-struct CudaAsinGradFunctor : public BaseActivationFunctor<T> {
-  using MPType = typename details::MPTypeTrait<T>::Type;
-  MPType one = static_cast<MPType>(1.0f);
-
-  // dx = dout / sqrt(1 - x^2)
-  __device__ __forceinline__ T operator()(const T arg_dout,
-                                          const T arg_x) const {
-    MPType dout = static_cast<MPType>(arg_dout);
-    MPType x = static_cast<MPType>(arg_x);
-    return static_cast<T>(dout / sqrt(one - x * x));
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
-};
-
-template <typename T>
-struct CudaAcosFunctor : public BaseActivationFunctor<T> {
-  using MPType = typename details::MPTypeTrait<T>::Type;
-
-  // acos(x) = acos(x)
-  __device__ __forceinline__ T operator()(const T arg_x) const {
-    MPType x = static_cast<MPType>(arg_x);
-    return static_cast<T>(acos(x));
-  }
-};
-
-template <typename T>
-struct CudaAcosGradFunctor : public BaseActivationFunctor<T> {
-  using MPType = typename details::MPTypeTrait<T>::Type;
-  MPType one = static_cast<MPType>(1.0f);
-
-  // dx = -dout / sqrt(1 - x^2)
-  __device__ __forceinline__ T operator()(const T arg_dout,
-                                          const T arg_x) const {
-    MPType dout = static_cast<MPType>(arg_dout);
-    MPType x = static_cast<MPType>(arg_x);
-    return static_cast<T>(-dout / sqrt(one - x * x));
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
-};
-
-template <typename T>
-struct CudaCoshFunctor : public BaseActivationFunctor<T> {
-  using MPType = typename details::MPTypeTrait<T>::Type;
-
-  // cosh(x) = cosh(x)
-  __device__ __forceinline__ T operator()(const T arg_x) const {
-    MPType x = static_cast<MPType>(arg_x);
-    return static_cast<T>(cosh(x));
-  }
-};
-
-template <typename T>
-struct CudaCoshGradFunctor : public BaseActivationFunctor<T> {
-  using MPType = typename details::MPTypeTrait<T>::Type;
-
-  // dx = dout * sinh(x)
-  __device__ __forceinline__ T operator()(const T arg_dout,
-                                          const T arg_x) const {
-    MPType dout = static_cast<MPType>(arg_dout);
-    MPType x = static_cast<MPType>(arg_x);
-    return static_cast<T>(dout * sinh(x));
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
-};
-
-template <typename T>
-struct CudaSinhFunctor : public BaseActivationFunctor<T> {
-  using MPType = typename details::MPTypeTrait<T>::Type;
-
-  // sinh(x) = sinh(x)
-  __device__ __forceinline__ T operator()(const T arg_x) const {
-    MPType x = static_cast<MPType>(arg_x);
-    return static_cast<T>(sinh(x));
-  }
-};
-
-template <typename T>
-struct CudaSinhGradFunctor : public BaseActivationFunctor<T> {
-  using MPType = typename details::MPTypeTrait<T>::Type;
-
-  // dx = dout * cosh(x)
-  __device__ __forceinline__ T operator()(const T arg_dout,
-                                          const T arg_x) const {
-    MPType dout = static_cast<MPType>(arg_dout);
-    MPType x = static_cast<MPType>(arg_x);
-    return static_cast<T>(dout * cosh(x));
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
-};
-
-template <typename T>
-struct CudaTanhFunctor : public BaseActivationFunctor<T> {
-  using MPType = typename details::MPTypeTrait<T>::Type;
-
-  // tanh(x) = tanh(x)
-  __device__ __forceinline__ T operator()(const T arg_x) const {
-    MPType x = static_cast<MPType>(arg_x);
-    return static_cast<T>(tanh(x));
-  }
-};
-
-template <typename T>
-struct CudaTanhGradFunctor : public BaseActivationFunctor<T> {
-  T one = static_cast<T>(1.0f);
-
-  // dx = dout * (1 - out^2)
-  __device__ __forceinline__ T operator()(const T dout, const T out) const {
-    return dout * (one - out * out);
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; }
-};
-
-template <typename T>
-struct CudaAcoshFunctor : public BaseActivationFunctor<T> {
-  using MPType = typename details::MPTypeTrait<T>::Type;
-
-  // Acosh(x) = acosh(x)
-  __device__ __forceinline__ T operator()(const T arg_x) const {
-    MPType x = static_cast<MPType>(arg_x);
-    return static_cast<T>(acosh(x));
-  }
-};
-
-template <typename T>
-struct CudaAcoshGradFunctor : public BaseActivationFunctor<T> {
-  using MPType = typename details::MPTypeTrait<T>::Type;
-  MPType one = static_cast<MPType>(1.0f);
-  // dx = dout * 1 / sqrt(x^2 - 1)
-  __device__ __forceinline__ T operator()(const T arg_dout,
-                                          const T arg_x) const {
-    MPType dout = static_cast<MPType>(arg_dout);
-    MPType x = static_cast<MPType>(arg_x);
-    return static_cast<T>(dout * one / sqrt(x * x - one));
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
-};
-
-template <typename T>
-struct CudaAsinhFunctor : public BaseActivationFunctor<T> {
-  using MPType = typename details::MPTypeTrait<T>::Type;
-
-  // Asinh(x) = asinh(x)
-  __device__ __forceinline__ T operator()(const T arg_x) const {
-    MPType x = static_cast<MPType>(arg_x);
-    return static_cast<T>(asinh(x));
-  }
-};
-
-template <typename T>
-struct CudaAsinhGradFunctor : public BaseActivationFunctor<T> {
-  using MPType = typename details::MPTypeTrait<T>::Type;
-  MPType one = static_cast<MPType>(1.0f);
-
-  // dx = dout * 1/sqrt(x^2 + 1)
-  __device__ __forceinline__ T operator()(const T arg_dout,
-                                          const T arg_x) const {
-    MPType dout = static_cast<MPType>(arg_dout);
-    MPType x = static_cast<MPType>(arg_x);
-    return static_cast<T>(dout * one / sqrt(x * x + one));
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
-};
-
-template <typename T>
-struct CudaAtanhFunctor : public BaseActivationFunctor<T> {
-  using MPType = typename details::MPTypeTrait<T>::Type;
-
-  // Atanh(x) = atanh(x)
-  __device__ __forceinline__ T operator()(const T arg_x) const {
-    MPType x = static_cast<MPType>(arg_x);
-    return static_cast<T>(atanh(x));
+  static constexpr ActBwdOpFwdDeps FwdDeps() {
+    return ActBwdOpFwdDeps::kNoDeps;
   }
 };
 
-template <typename T>
-struct CudaAtanhGradFunctor : public BaseActivationFunctor<T> {
-  using MPType = typename details::MPTypeTrait<T>::Type;
-  MPType one = static_cast<MPType>(1.0f);
-  // dx = dout * 1/(1- x^2)
-  __device__ __forceinline__ T operator()(const T arg_dout,
-                                          const T arg_x) const {
-    MPType dout = static_cast<MPType>(arg_dout);
-    MPType x = static_cast<MPType>(arg_x);
-    return static_cast<T>(dout * one / (one - x * x));
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
-};
-
 template <typename T>
 struct CudaReciprocalFunctor : public BaseActivationFunctor<T> {
   T one = static_cast<T>(1.0f);
@@ -566,7 +207,9 @@ struct CudaReciprocalGradFunctor : public BaseActivationFunctor<T> {
     return -dout * out * out;
   }
 
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; }
+  static constexpr ActBwdOpFwdDeps FwdDeps() {
+    return ActBwdOpFwdDeps::kDepOut;
+  }
 };
 
 template <typename T>
@@ -587,7 +230,9 @@ struct CudaExpGradFunctor : public BaseActivationFunctor<T> {
     return dout * out;
   }
 
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; }
+  static constexpr ActBwdOpFwdDeps FwdDeps() {
+    return ActBwdOpFwdDeps::kDepOut;
+  }
 };
 
 template <typename T>
@@ -608,7 +253,9 @@ struct CudaExpm1GradFunctor : public BaseActivationFunctor<T> {
     return dout * out + dout;
   }
 
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; }
+  static constexpr ActBwdOpFwdDeps FwdDeps() {
+    return ActBwdOpFwdDeps::kDepOut;
+  }
 };
 
 template <typename T>
@@ -629,7 +276,7 @@ struct CudaLogGradFunctor : public BaseActivationFunctor<T> {
     return dout / x;
   }
 
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
 };
 
 template <typename T>
@@ -647,7 +294,7 @@ struct CudaSquareGradFunctor : public BaseActivationFunctor<T> {
     return dout * two * x;
   }
 
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
 };
 
 template <typename T>
@@ -670,7 +317,9 @@ struct CudaSqrtGradFunctor : public BaseActivationFunctor<T> {
     return one_half * dout / out;
   }
 
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; }
+  static constexpr ActBwdOpFwdDeps FwdDeps() {
+    return ActBwdOpFwdDeps::kDepOut;
+  }
 };
 
 template <typename T>
@@ -693,7 +342,9 @@ struct CudaRsqrtGradFunctor : public BaseActivationFunctor<T> {
     return minus_one_half * dout * out * out * out;
   }
 
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; }
+  static constexpr ActBwdOpFwdDeps FwdDeps() {
+    return ActBwdOpFwdDeps::kDepOut;
+  }
 };
 
 template <typename T>
@@ -717,7 +368,7 @@ struct CudaLog1pGradFunctor : public BaseActivationFunctor<T> {
     return dout / (one + x);
   }
 
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
 };
 
 template <typename T>
@@ -741,7 +392,7 @@ struct CudaLog2GradFunctor : public BaseActivationFunctor<T> {
     return dout / (x * log_two);
   }
 
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
 };
 
 template <typename T>
@@ -765,46 +416,7 @@ struct CudaLog10GradFunctor : public BaseActivationFunctor<T> {
     return dout / (x * log_ten);
   }
 
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
-};
-
-template <typename T>
-struct CudaBReluFunctor : public BaseActivationFunctor<T> {
-  float t_min;
-  float t_max;
-
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"t_min", &t_min}, {"t_max", &t_max}};
-  }
-
-  // brelu(x) = min(max(x, t_min), t_max)
-  __device__ __forceinline__ T operator()(const T x) const {
-    T t_min_cast = static_cast<T>(t_min);
-    T t_max_cast = static_cast<T>(t_max);
-    T temp_max = x > t_min_cast ? x : t_min_cast;
-    T temp_min = temp_max < t_max_cast ? temp_max : t_max_cast;
-    return temp_min;
-  }
-};
-
-template <typename T>
-struct CudaBReluGradFunctor : public BaseActivationFunctor<T> {
-  T zero = static_cast<T>(0.0f);
-  float t_min;
-  float t_max;
-
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"t_min", &t_min}, {"t_max", &t_max}};
-  }
-
-  // dx = (x > t_min && x < t_max) ? dout : 0
-  __device__ __forceinline__ T operator()(const T dout, const T x) const {
-    T t_min_cast = static_cast<T>(t_min);
-    T t_max_cast = static_cast<T>(t_max);
-    return (x > t_min_cast && x < t_max_cast) ? dout : zero;
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
 };
 
 template <typename T>
@@ -849,7 +461,9 @@ struct CudaSoftReluGradFunctor : public BaseActivationFunctor<T> {
                                  : static_cast<T>(0.0f);
   }
 
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; }
+  static constexpr ActBwdOpFwdDeps FwdDeps() {
+    return ActBwdOpFwdDeps::kDepOut;
+  }
 };
 
 template <typename T>
@@ -893,7 +507,7 @@ struct CudaSTanhGradFunctor : public BaseActivationFunctor<T> {
     return static_cast<T>(dout * a * b * (one - temp * temp));
   }
 
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
 };
 
 template <typename T>
@@ -939,7 +553,7 @@ struct CudaSoftplusGradFunctor : public BaseActivationFunctor<T> {
     return x_beta > t ? arg_dout : static_cast<T>(dout / (one + exp(-x_beta)));
   }
 
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
 };
 
 template <typename T>
@@ -962,7 +576,7 @@ struct CudaSoftsignGradFunctor : public BaseActivationFunctor<T> {
     return dout / (temp * temp);
   }
 
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
 };
 
 template <typename T>
@@ -996,7 +610,9 @@ struct CudaRelu6GradFunctor : public BaseActivationFunctor<T> {
     return (out > zero && out < t) ? dout : zero;
   }
 
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; }
+  static constexpr ActBwdOpFwdDeps FwdDeps() {
+    return ActBwdOpFwdDeps::kDepOut;
+  }
 };
 
 template <typename T>
@@ -1022,7 +638,7 @@ struct CudaTanhShrinkGradFunctor : public BaseActivationFunctor<T> {
     return static_cast<T>(dout * tanh(x) * tanh(x));
   }
 
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
 };
 
 template <typename T>
@@ -1056,7 +672,7 @@ struct CudaHardShrinkGradFunctor : public BaseActivationFunctor<T> {
     return (x > -t && x < t) ? zero : dout;
   }
 
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
 };
 
 template <typename T>
@@ -1097,7 +713,9 @@ struct CudaHardSigmoidGradFunctor : public BaseActivationFunctor<T> {
     return (out > zero && out < one) ? dout * static_cast<T>(slope) : zero;
   }
 
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; }
+  static constexpr ActBwdOpFwdDeps FwdDeps() {
+    return ActBwdOpFwdDeps::kDepOut;
+  }
 };
 
 template <typename T>
@@ -1141,7 +759,7 @@ struct CudaSwishGradFunctor : public BaseActivationFunctor<T> {
     return static_cast<T>(dout * (temp2 + temp3));
   }
 
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
 };
 
 template <typename T>
@@ -1190,39 +808,7 @@ struct CudaMishGradFunctor : public BaseActivationFunctor<T> {
     return static_cast<T>(dout * (tsp + x * (one - tsp * tsp) * gsp));
   }
 
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
-};
-
-template <typename T>
-struct CudaThresholdedReluFunctor : public BaseActivationFunctor<T> {
-  T zero = static_cast<T>(0.0f);
-  float threshold;
-
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"threshold", &threshold}};
-  }
-
-  // thresholded_relu(x) = x > threshold ? x : 0
-  __device__ __forceinline__ T operator()(const T x) const {
-    return x > static_cast<T>(threshold) ? x : zero;
-  }
-};
-
-template <typename T>
-struct CudaThresholdedReluGradFunctor : public BaseActivationFunctor<T> {
-  T zero = static_cast<T>(0.0f);
-  float threshold;
-
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"threshold", &threshold}};
-  }
-
-  // dx = x > threshold ? dout : 0
-  __device__ __forceinline__ T operator()(const T dout, const T x) const {
-    return x > static_cast<T>(threshold) ? dout : zero;
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
 };
 
 template <typename T>
@@ -1274,7 +860,7 @@ struct CudaHardSwishGradFunctor : public BaseActivationFunctor<T> {
     return dout * (temp1 * temp2 * (two * x + o) / s + one - temp2);
   }
 
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
 };
 
 template <typename T>
@@ -1320,7 +906,9 @@ struct CudaELUGradFunctor : public BaseActivationFunctor<T> {
     return static_cast<T>(dout * (out_pos + out_neg * (out + a)));
   }
 
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; }
+  static constexpr ActBwdOpFwdDeps FwdDeps() {
+    return ActBwdOpFwdDeps::kDepOut;
+  }
 };
 
 template <typename T>
@@ -1347,7 +935,7 @@ struct CudaELUGradNegativeAlphaFunctor : public BaseActivationFunctor<T> {
     return static_cast<T>(dout * (x_pos + x_neg * (out + a)));
   }
 
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
 };
 
 template <typename DeviceContext, typename T>
@@ -1429,7 +1017,7 @@ struct CudaCELUGradFunctor : public BaseActivationFunctor<T> {
          temp_a_neg * temp_x_pos + exp(x / a) * temp_a_neg * temp_x_neg));
   }
 
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
 };
 
 template <typename DeviceContext, typename Functor>
@@ -1477,13 +1065,14 @@ class ActivationGradCudaKernel
     std::vector<const framework::Tensor*> ins = {d_out};
     std::vector<framework::Tensor*> outs = {d_x};
 
-    if (static_cast<int>(Functor::FwdDeps()) == static_cast<int>(kDepOut)) {
+    if (static_cast<int>(Functor::FwdDeps()) ==
+        static_cast<int>(ActBwdOpFwdDeps::kDepOut)) {
       // Only need forward output Out
       ins.push_back(out);
       paddle::operators::LaunchSameDimsElementwiseCudaKernel<T>(dev_ctx, ins,
                                                                 &outs, functor);
     } else if (static_cast<int>(Functor::FwdDeps()) ==
-               static_cast<int>(kDepX)) {
+               static_cast<int>(ActBwdOpFwdDeps::kDepX)) {
       // Only need forward input X
       ins.push_back(x);
       paddle::operators::LaunchSameDimsElementwiseCudaKernel<T>(dev_ctx, ins,
@@ -1495,6 +1084,22 @@ class ActivationGradCudaKernel
   }
 };
 
+USE_PHI_FUNCTOR(CudaCos)
+USE_PHI_FUNCTOR(CudaTan)
+USE_PHI_FUNCTOR(CudaAcos)
+USE_PHI_FUNCTOR(CudaSin)
+USE_PHI_FUNCTOR(CudaAsin)
+USE_PHI_FUNCTOR(CudaAtan)
+USE_PHI_FUNCTOR(CudaSinh)
+USE_PHI_FUNCTOR(CudaCosh)
+USE_PHI_FUNCTOR(CudaAsinh)
+USE_PHI_FUNCTOR(CudaAcosh)
+USE_PHI_FUNCTOR(CudaAtanh)
+USE_PHI_FUNCTOR(CudaTanh)
+USE_PHI_FUNCTOR(CudaBRelu)
+USE_PHI_FUNCTOR(CudaLeakyRelu)
+USE_PHI_FUNCTOR(CudaThresholdedRelu)
+
 }  // namespace operators
 }  // namespace paddle
 
@@ -1509,7 +1114,9 @@ namespace plat = paddle::platform;
       ops::ActivationCudaKernel<paddle::platform::CUDADeviceContext,           \
                                 ops::functor<double>>,                         \
       ops::ActivationCudaKernel<plat::CUDADeviceContext,                       \
-                                ops::functor<plat::float16>>);                 \
+                                ops::functor<plat::float16>>,                  \
+      ops::ActivationCudaKernel<plat::CUDADeviceContext,                       \
+                                ops::functor<plat::bfloat16>>);                \
   REGISTER_OP_CUDA_KERNEL(                                                     \
       act_type##_grad,                                                         \
       ops::ActivationGradCudaKernel<plat::CUDADeviceContext,                   \
@@ -1517,7 +1124,9 @@ namespace plat = paddle::platform;
       ops::ActivationGradCudaKernel<plat::CUDADeviceContext,                   \
                                     ops::grad_functor<double>>,                \
       ops::ActivationGradCudaKernel<plat::CUDADeviceContext,                   \
-                                    ops::grad_functor<plat::float16>>);
+                                    ops::grad_functor<plat::float16>>,         \
+      ops::ActivationGradCudaKernel<plat::CUDADeviceContext,                   \
+                                    ops::grad_functor<plat::bfloat16>>);
 
 #define REGISTER_ACTIVATION_CUDA_KERNEL_INT(act_type, op_name, functor,        \
                                             grad_functor)                      \
@@ -1531,7 +1140,9 @@ namespace plat = paddle::platform;
       ops::ActivationCudaKernel<paddle::platform::CUDADeviceContext,           \
                                 ops::functor<int64_t>>,                        \
       ops::ActivationCudaKernel<plat::CUDADeviceContext,                       \
-                                ops::functor<plat::float16>>);                 \
+                                ops::functor<plat::float16>>,                  \
+      ops::ActivationCudaKernel<plat::CUDADeviceContext,                       \
+                                ops::functor<plat::bfloat16>>);                \
   REGISTER_OP_CUDA_KERNEL(                                                     \
       act_type##_grad,                                                         \
       ops::ActivationGradCudaKernel<plat::CUDADeviceContext,                   \
@@ -1543,21 +1154,9 @@ namespace plat = paddle::platform;
       ops::ActivationGradCudaKernel<plat::CUDADeviceContext,                   \
                                     ops::grad_functor<int64_t>>,               \
       ops::ActivationGradCudaKernel<plat::CUDADeviceContext,                   \
-                                    ops::grad_functor<plat::float16>>);
-
-/* ======================== leaky relu register  ============================ */
-REGISTER_ACTIVATION_CUDA_KERNEL(leaky_relu, LeakyRelu, CudaLeakyReluFunctor,
-                                CudaLeakyReluGradFunctor);
-
-REGISTER_OP_CUDA_KERNEL(
-    leaky_relu_grad_grad,
-    ops::ActivationDoubleGradKernel<plat::CUDADeviceContext,
-                                    ops::LeakyReluGradGradFunctor<float>>,
-    ops::ActivationDoubleGradKernel<plat::CUDADeviceContext,
-                                    ops::LeakyReluGradGradFunctor<double>>,
-    ops::ActivationDoubleGradKernel<
-        plat::CUDADeviceContext, ops::LeakyReluGradGradFunctor<plat::float16>>);
-/* ========================================================================== */
+                                    ops::grad_functor<plat::float16>>,         \
+      ops::ActivationGradCudaKernel<plat::CUDADeviceContext,                   \
+                                    ops::grad_functor<plat::bfloat16>>);
 
 /* ======================== elu register  ============================ */
 REGISTER_OP_CUDA_KERNEL(
@@ -1594,50 +1193,6 @@ REGISTER_OP_CUDA_KERNEL(
                               ops::CELUGradGradFunctor<plat::float16>>);
 /* ========================================================================== */
 
-/* ===========================    relu register  ============================ */
-#ifdef PADDLE_WITH_HIP
-REGISTER_ACTIVATION_CUDA_KERNEL(relu, Relu, CudaReluFunctor,
-                                CudaReluGradFunctor);
-REGISTER_OP_CUDA_KERNEL(
-    relu_grad_grad,
-    ops::ActivationDoubleGradKernel<paddle::platform::CUDADeviceContext,
-                                    ops::ReluGradGradFunctor<float>>,
-    ops::ActivationDoubleGradKernel<paddle::platform::CUDADeviceContext,
-                                    ops::ReluGradGradFunctor<double>>,
-    ops::ActivationDoubleGradKernel<plat::CUDADeviceContext,
-                                    ops::ReluGradGradFunctor<plat::float16>>);
-#else
-REGISTER_OP_CUDA_KERNEL(
-    relu, ops::ActivationCudaKernel<paddle::platform::CUDADeviceContext,
-                                    ops::CudaReluFunctor<float>>,
-    ops::ActivationCudaKernel<paddle::platform::CUDADeviceContext,
-                              ops::CudaReluFunctor<double>>,
-    ops::ActivationCudaKernel<plat::CUDADeviceContext,
-                              ops::CudaReluFunctor<plat::float16>>,
-    ops::ActivationCudaKernel<plat::CUDADeviceContext,
-                              ops::CudaReluFunctor<plat::bfloat16>>);
-REGISTER_OP_CUDA_KERNEL(
-    relu_grad, ops::ActivationGradCudaKernel<plat::CUDADeviceContext,
-                                             ops::CudaReluGradFunctor<float>>,
-    ops::ActivationGradCudaKernel<plat::CUDADeviceContext,
-                                  ops::CudaReluGradFunctor<double>>,
-    ops::ActivationGradCudaKernel<plat::CUDADeviceContext,
-                                  ops::CudaReluGradFunctor<plat::float16>>,
-    ops::ActivationGradCudaKernel<plat::CUDADeviceContext,
-                                  ops::CudaReluGradFunctor<plat::bfloat16>>);
-REGISTER_OP_CUDA_KERNEL(
-    relu_grad_grad,
-    ops::ActivationDoubleGradKernel<paddle::platform::CUDADeviceContext,
-                                    ops::ReluGradGradFunctor<float>>,
-    ops::ActivationDoubleGradKernel<paddle::platform::CUDADeviceContext,
-                                    ops::ReluGradGradFunctor<double>>,
-    ops::ActivationDoubleGradKernel<plat::CUDADeviceContext,
-                                    ops::ReluGradGradFunctor<plat::float16>>,
-    ops::ActivationDoubleGradKernel<plat::CUDADeviceContext,
-                                    ops::ReluGradGradFunctor<plat::bfloat16>>);
-#endif
-/* ========================================================================== */
-
 /* ===========================    sigmoid register  ============================
  */
 REGISTER_ACTIVATION_CUDA_KERNEL(sigmoid, Sigmoid, CudaSigmoidFunctor,
@@ -1650,7 +1205,9 @@ REGISTER_OP_CUDA_KERNEL(
     ops::SigmoidDoubleGradKernel<paddle::platform::CUDADeviceContext,
                                  ops::SigmoidGradGradFunctor<double>>,
     ops::SigmoidDoubleGradKernel<plat::CUDADeviceContext,
-                                 ops::SigmoidGradGradFunctor<plat::float16>>);
+                                 ops::SigmoidGradGradFunctor<plat::float16>>,
+    ops::SigmoidDoubleGradKernel<plat::CUDADeviceContext,
+                                 ops::SigmoidGradGradFunctor<plat::bfloat16>>);
 
 REGISTER_OP_CUDA_KERNEL(
     sigmoid_triple_grad,
@@ -1659,30 +1216,10 @@ REGISTER_OP_CUDA_KERNEL(
     ops::SigmoidTripleGradKernel<paddle::platform::CUDADeviceContext,
                                  ops::SigmoidTripleGradFunctor<double>>,
     ops::SigmoidTripleGradKernel<plat::CUDADeviceContext,
-                                 ops::SigmoidTripleGradFunctor<plat::float16>>);
-/* ========================================================================== */
-
-/* ===========================    tanh register  ============================ */
-REGISTER_ACTIVATION_CUDA_KERNEL(tanh, Tanh, CudaTanhFunctor,
-                                CudaTanhGradFunctor);
-
-REGISTER_OP_CUDA_KERNEL(
-    tanh_grad_grad,
-    ops::TanhDoubleGradKernel<paddle::platform::CUDADeviceContext,
-                              ops::TanhGradGradFunctor<float>>,
-    ops::TanhDoubleGradKernel<paddle::platform::CUDADeviceContext,
-                              ops::TanhGradGradFunctor<double>>,
-    ops::TanhDoubleGradKernel<plat::CUDADeviceContext,
-                              ops::TanhGradGradFunctor<plat::float16>>);
-
-REGISTER_OP_CUDA_KERNEL(
-    tanh_triple_grad,
-    ops::TanhTripeGradKernel<paddle::platform::CUDADeviceContext,
-                             ops::TanhTripleGradFunctor<float>>,
-    ops::TanhTripeGradKernel<paddle::platform::CUDADeviceContext,
-                             ops::TanhTripleGradFunctor<double>>,
-    ops::TanhTripeGradKernel<plat::CUDADeviceContext,
-                             ops::TanhTripleGradFunctor<plat::float16>>);
+                                 ops::SigmoidTripleGradFunctor<plat::float16>>,
+    ops::SigmoidTripleGradKernel<
+        plat::CUDADeviceContext,
+        ops::SigmoidTripleGradFunctor<plat::bfloat16>>);
 /* ========================================================================== */
 
 /* ===========================   sqrt register  ============================= */
@@ -1696,7 +1233,9 @@ REGISTER_OP_CUDA_KERNEL(
     ops::SqrtDoubleGradKernel<paddle::platform::CUDADeviceContext,
                               ops::SqrtGradGradFunctor<double>>,
     ops::SqrtDoubleGradKernel<paddle::platform::CUDADeviceContext,
-                              ops::SqrtGradGradFunctor<plat::float16>>);
+                              ops::SqrtGradGradFunctor<plat::float16>>,
+    ops::SqrtDoubleGradKernel<paddle::platform::CUDADeviceContext,
+                              ops::SqrtGradGradFunctor<plat::bfloat16>>);
 /* ========================================================================== */
 
 /* ===========================   rsqrt register  =============================
@@ -1726,6 +1265,8 @@ REGISTER_OP_CUDA_KERNEL(
                                 ops::SquareGradGradFunctor<double>>,
     ops::SquareDoubleGradKernel<plat::CUDADeviceContext,
                                 ops::SquareGradGradFunctor<plat::float16>>,
+    ops::SquareDoubleGradKernel<plat::CUDADeviceContext,
+                                ops::SquareGradGradFunctor<plat::bfloat16>>,
     ops::SquareDoubleGradKernel<paddle::platform::CUDADeviceContext,
                                 ops::SquareGradGradFunctor<int>>,
     ops::SquareDoubleGradKernel<paddle::platform::CUDADeviceContext,
@@ -1821,28 +1362,16 @@ REGISTER_OP_CUDA_KERNEL(
   __macro(silu, Silu, CudaSiluFunctor, CudaSiluGradFunctor);                  \
   __macro(logsigmoid, LogSigmoid, CudaLogSigmoidFunctor,                      \
           CudaLogSigmoidGradFunctor);                                         \
-  __macro(atan, Atan, CudaAtanFunctor, CudaAtanGradFunctor);                  \
   __macro(softshrink, SoftShrink, CudaSoftShrinkFunctor,                      \
           CudaSoftShrinkGradFunctor);                                         \
   __macro(ceil, Ceil, CudaCeilFunctor, CudaZeroGradFunctor);                  \
   __macro(floor, Floor, CudaFloorFunctor, CudaZeroGradFunctor);               \
-  __macro(cos, Cos, CudaCosFunctor, CudaCosGradFunctor);                      \
-  __macro(tan, Tan, CudaTanFunctor, CudaTanGradFunctor);                      \
-  __macro(acos, Acos, CudaAcosFunctor, CudaAcosGradFunctor);                  \
-  __macro(sin, Sin, CudaSinFunctor, CudaSinGradFunctor);                      \
-  __macro(asin, Asin, CudaAsinFunctor, CudaAsinGradFunctor);                  \
-  __macro(sinh, Sinh, CudaSinhFunctor, CudaSinhGradFunctor);                  \
-  __macro(cosh, Cosh, CudaCoshFunctor, CudaCoshGradFunctor);                  \
-  __macro(asinh, Asinh, CudaAsinhFunctor, CudaAsinhGradFunctor);              \
-  __macro(acosh, Acosh, CudaAcoshFunctor, CudaAcoshGradFunctor);              \
-  __macro(atanh, Atanh, CudaAtanhFunctor, CudaAtanhGradFunctor);              \
   __macro(round, Round, CudaRoundFunctor, CudaZeroGradFunctor);               \
   __macro(reciprocal, Reciprocal, CudaReciprocalFunctor,                      \
           CudaReciprocalGradFunctor);                                         \
   __macro(log1p, Log1p, CudaLog1pFunctor, CudaLog1pGradFunctor);              \
   __macro(log2, Log2, CudaLog2Functor, CudaLog2GradFunctor);                  \
   __macro(log10, Log10, CudaLog10Functor, CudaLog10GradFunctor);              \
-  __macro(brelu, BRelu, CudaBReluFunctor, CudaBReluGradFunctor);              \
   __macro(soft_relu, SoftRelu, CudaSoftReluFunctor, CudaSoftReluGradFunctor); \
   __macro(stanh, STanh, CudaSTanhFunctor, CudaSTanhGradFunctor);              \
   __macro(softplus, Softplus, CudaSoftplusFunctor, CudaSoftplusGradFunctor);  \
@@ -1856,8 +1385,6 @@ REGISTER_OP_CUDA_KERNEL(
           CudaHardSigmoidGradFunctor);                                        \
   __macro(swish, Swish, CudaSwishFunctor, CudaSwishGradFunctor);              \
   __macro(mish, Mish, CudaMishFunctor, CudaMishGradFunctor);                  \
-  __macro(thresholded_relu, ThresholdedRelu, CudaThresholdedReluFunctor,      \
-          CudaThresholdedReluGradFunctor);                                    \
   __macro(hard_swish, HardSwish, CudaHardSwishFunctor,                        \
           CudaHardSwishGradFunctor);
 FOR_EACH_ACTIVATION_CUDA_OP(REGISTER_ACTIVATION_CUDA_KERNEL)
@@ -1874,8 +1401,6 @@ FOR_EACH_ACTIVATION_CUDA_OP(REGISTER_ACTIVATION_CUDA_KERNEL)
 
 REGISTER_ACTIVATION_XPU_KERNEL(leaky_relu, LeakyRelu, CudaLeakyReluFunctor,
                                CudaLeakyReluGradFunctor);
-REGISTER_ACTIVATION_XPU_KERNEL(relu, Relu, CudaReluFunctor,
-                               CudaReluGradFunctor);
 REGISTER_ACTIVATION_XPU_KERNEL(sigmoid, Sigmoid, CudaSigmoidFunctor,
                                CudaSigmoidGradFunctor);
 REGISTER_ACTIVATION_XPU_KERNEL(exp, Exp, CudaExpFunctor, CudaExpGradFunctor);
diff --git a/paddle/fluid/operators/addmm_op.cc b/paddle/fluid/operators/addmm_op.cc
index de4d7818020dd586547ff9eedb53108285048c09..716a2e40179e404c2afcec31fb72cde7172f7e54 100644
--- a/paddle/fluid/operators/addmm_op.cc
+++ b/paddle/fluid/operators/addmm_op.cc
@@ -147,8 +147,8 @@ class AddMMOpGradMaker : public framework::SingleGradOpMaker<T> {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-DELCARE_INFER_SHAPE_FUNCTOR(addmm, AddmmInferShapeFunctor,
-                            PT_INFER_META(phi::AddmmInferMeta));
+DECLARE_INFER_SHAPE_FUNCTOR(addmm, AddmmInferShapeFunctor,
+                            PD_INFER_META(phi::AddmmInferMeta));
 REGISTER_OPERATOR(addmm, ops::AddMMOp, ops::AddMMOpMaker,
                   ops::AddMMOpGradMaker<paddle::framework::OpDesc>,
                   ops::AddMMOpGradMaker<paddle::imperative::OpBase>,
diff --git a/paddle/fluid/operators/allclose_op.cc b/paddle/fluid/operators/allclose_op.cc
index 8fb9929c39e9223303f4427f1a0d7e1ed66134d4..88d7cb7c1f5f4bf47dc82f8632116424253d6d19 100644
--- a/paddle/fluid/operators/allclose_op.cc
+++ b/paddle/fluid/operators/allclose_op.cc
@@ -12,52 +12,20 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/operators/allclose_op.h"
 #include <cmath>
 #include <string>
+
+#include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/platform/enforce.h"
+#include "paddle/phi/core/infermeta_utils.h"
+#include "paddle/phi/infermeta/binary.h"
 
 namespace paddle {
 namespace operators {
 
-template <typename T>
-struct GetTensorValue<platform::CPUDeviceContext, T> {
-  T operator()(const platform::CPUDeviceContext& dev_ctx,
-               const framework::Tensor& tensor) const {
-    return *(tensor.data<T>());
-  }
-};
-
-template <typename T>
-struct AllcloseFunctor<platform::CPUDeviceContext, T> {
-  void operator()(const platform::CPUDeviceContext& ctx,
-                  const framework::Tensor& in, const framework::Tensor& other,
-                  const double rtol, const double atol, bool equal_nan,
-                  framework::Tensor* output) {
-    auto* in_a = in.data<T>();
-    auto* in_b = other.data<T>();
-    auto* out_data = output->mutable_data<bool>(ctx.GetPlace());
-    auto num = in.numel();
-    *out_data = true;
-    for (int i = 0; i < num; i++) {
-      const T a = in_a[i], b = in_b[i];
-      bool val;
-      if (std::isnan(a) || std::isnan(b)) {
-        val = equal_nan && std::isnan(a) == std::isnan(b);
-      } else {
-        T left = (a > b ? a - b : b - a);
-        T right = atol + (b > 0 ? rtol * b : (-rtol) * b);
-        T diff = (left > right ? left - right : right - left);
-        val = a == b || left <= right || diff <= 1e-15;
-      }
-      *out_data &= val;
-    }
-  }
-};
-
 class AllcloseOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() override {
@@ -96,40 +64,6 @@ class AllcloseOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "Allclose");
-    OP_INOUT_CHECK(ctx->HasInput("Other"), "Input", "Other", "Allclose");
-    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Allclose");
-
-    auto input_dim = ctx->GetInputDim("Input");
-    auto other_dim = ctx->GetInputDim("Other");
-    PADDLE_ENFORCE_EQ(input_dim.size(), other_dim.size(),
-                      platform::errors::PreconditionNotMet(
-                          "Input(Input) and Input(Other) must have the same "
-                          "dimension size."));
-    int n = input_dim.size();
-    bool is_runtime = ctx->IsRuntime();
-    for (int i = 0; i < n; i++) {
-      if (is_runtime) {
-        PADDLE_ENFORCE_EQ(input_dim[i], other_dim[i],
-                          platform::errors::PreconditionNotMet(
-                              "The value at dim %d of Input(Input) is not "
-                              "equal to the Input(Other): %ld != %ld.",
-                              i, input_dim[i], other_dim[i]));
-      } else {
-        if (!(input_dim[i] < 0 || other_dim[i] < 0)) {
-          PADDLE_ENFORCE_EQ(input_dim[i], other_dim[i],
-                            platform::errors::PreconditionNotMet(
-                                "The value at dim %d of Input(Input) is not "
-                                "equal to the Input(Other): %ld != %ld.",
-                                i, input_dim[i], other_dim[i]));
-        }
-      }
-    }
-
-    ctx->SetOutputDim("Out", phi::make_ddim({1}));
-  }
-
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
@@ -152,13 +86,13 @@ class AllcloseOpVarTypeInference : public framework::VarTypeInference {
 namespace ops = paddle::operators;
 using CPU = paddle::platform::CPUDeviceContext;
 
+DECLARE_INFER_SHAPE_FUNCTOR(allclose, AllcloseInferShapeFunctor,
+                            PD_INFER_META(phi::AllValueCompareInferMeta));
 REGISTER_OPERATOR(
     allclose, ops::AllcloseOp, ops::AllcloseOpMaker,
     paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
     paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
-    ops::AllcloseOpVarTypeInference);
-REGISTER_OP_CPU_KERNEL(allclose, ops::AllcloseKernel<CPU, float>,
-                       ops::AllcloseKernel<CPU, double>);
+    ops::AllcloseOpVarTypeInference, AllcloseInferShapeFunctor);
 
 /* ==========================  register checkpoint ===========================*/
 REGISTER_OP_VERSION(allclose)
diff --git a/paddle/fluid/operators/allclose_op.cu b/paddle/fluid/operators/allclose_op.cu
deleted file mode 100644
index 32c90ff8fdc109b30b140f0f70b336615ce93c17..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/allclose_op.cu
+++ /dev/null
@@ -1,84 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/operators/allclose_op.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T>
-struct GetTensorValue<platform::CUDADeviceContext, T> {
-  T operator()(const platform::CUDADeviceContext& dev_ctx,
-               const framework::Tensor& tensor) const {
-    const T* data = tensor.data<T>();
-    T value;
-    const auto gpu_place = dev_ctx.GetPlace();
-    memory::Copy(platform::CPUPlace(), &value, gpu_place, data, sizeof(T),
-                 dev_ctx.stream());
-    return value;
-  }
-};
-
-template <typename T>
-__global__ void AllcloseCUDAKernel(const T* in_data, const T* other_data,
-                                   const double rtol, const double atol,
-                                   bool equal_nan, int num, bool* out_data) {
-  unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x;
-  bool val;
-  for (int i = idx; i < num; i += blockDim.x * gridDim.x) {
-    const T a = in_data[i], b = other_data[i];
-    if (isnan(a) || isnan(b)) {
-      val = equal_nan && isnan(a) == isnan(b);
-    } else {
-      T left = (a > b ? a - b : b - a);
-      T right = atol + (b > 0 ? rtol * b : (-rtol) * b);
-      T diff = (left > right ? left - right : right - left);
-      val = a == b || left <= right || diff <= 1e-15;
-    }
-    if (!val) *out_data = false;
-  }
-}
-
-template <typename T>
-struct AllcloseFunctor<platform::CUDADeviceContext, T> {
-  void operator()(const platform::CUDADeviceContext& dev_ctx,
-                  const framework::Tensor& in, const framework::Tensor& other,
-                  const double rtol, const double atol, bool equal_nan,
-                  framework::Tensor* output) {
-    int num = in.numel();
-    const T* in_data = in.data<T>();
-    const T* other_data = other.data<T>();
-    bool* out_data = output->mutable_data<bool>(dev_ctx.GetPlace());
-    int block = 1024;
-    int grid = (block - 1 + num) / block;
-    grid = (grid > block) ? block : grid;
-#ifdef PADDLE_WITH_HIP
-    hipMemset(out_data, true, sizeof(bool));
-#else
-    cudaMemset(out_data, true, sizeof(bool));
-#endif
-    AllcloseCUDAKernel<T><<<grid, block, 0, dev_ctx.stream()>>>(
-        in_data, other_data, rtol, atol, equal_nan, num, out_data);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-using CUDA = paddle::platform::CUDADeviceContext;
-REGISTER_OP_CUDA_KERNEL(allclose, ops::AllcloseKernel<CUDA, float>,
-                        ops::AllcloseKernel<CUDA, double>);
diff --git a/paddle/fluid/operators/allclose_op.h b/paddle/fluid/operators/allclose_op.h
deleted file mode 100644
index 7a36754194ace5fad14d5a77e9d0be7f1c182087..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/allclose_op.h
+++ /dev/null
@@ -1,93 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <string>
-#include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/platform/place.h"
-
-namespace paddle {
-namespace operators {
-using Tensor = framework::Tensor;
-
-template <typename DeviceContext, typename T>
-struct GetTensorValue {
-  T operator()(const platform::DeviceContext& ctx,
-               const framework::Tensor& tensor) const;
-};
-
-template <typename DeviceContext, typename T>
-struct AllcloseFunctor {
-  void operator()(const DeviceContext& ctx, const framework::Tensor& in,
-                  const framework::Tensor& other, const float rtol,
-                  const float atol, bool equal_nan, framework::Tensor* output);
-};
-
-template <typename DeviceContext, typename T>
-class AllcloseKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    // get attrs
-    bool equal_nan = ctx.Attr<bool>("equal_nan");
-    // get input/output
-    const auto* input = ctx.Input<Tensor>("Input");
-    const auto* other = ctx.Input<Tensor>("Other");
-    auto* out = ctx.Output<Tensor>("Out");
-
-    double rtol_v = std::stod(ctx.Attr<std::string>("rtol"));
-    double atol_v = std::stod(ctx.Attr<std::string>("atol"));
-
-    auto& dev_ctx = ctx.template device_context<DeviceContext>();
-    GetTensorValue<DeviceContext, double> get_tensor_value;
-    if (ctx.HasInput("Rtol")) {
-      const auto* rtol = ctx.Input<Tensor>("Rtol");
-      PADDLE_ENFORCE_EQ(
-          rtol->numel(), 1,
-          platform::errors::InvalidArgument(
-              "Input(Rtol) size must be 1, but get %d.", rtol->numel()));
-      PADDLE_ENFORCE_EQ(
-          framework::TransToProtoVarType(rtol->dtype()),
-          framework::proto::VarType::FP64,
-          platform::errors::InvalidArgument(
-              "Input(Rtol) type must be double, but get %s.",
-              framework::DataTypeToString(
-                  framework::TransToProtoVarType(rtol->dtype()))));
-      rtol_v = get_tensor_value(dev_ctx, *rtol);
-    }
-    if (ctx.HasInput("Atol")) {
-      const auto* atol = ctx.Input<Tensor>("Atol");
-      PADDLE_ENFORCE_EQ(
-          atol->numel(), 1,
-          platform::errors::InvalidArgument(
-              "Input(Atol) size must be 1, but get %d", atol->numel()));
-      PADDLE_ENFORCE_EQ(
-          framework::TransToProtoVarType(atol->dtype()),
-          framework::proto::VarType::FP64,
-          platform::errors::InvalidArgument(
-              "Input(Atol) type must be double, but get %s",
-              framework::DataTypeToString(
-                  framework::TransToProtoVarType(atol->dtype()))));
-      atol_v = get_tensor_value(dev_ctx, *atol);
-    }
-
-    AllcloseFunctor<DeviceContext, T>()(dev_ctx, *input, *other, rtol_v, atol_v,
-                                        equal_nan, out);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op_mlu.cc b/paddle/fluid/operators/amp/check_finite_and_unscale_op_mlu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..237cfcc6f1172518097863158ca6dbd595af4186
--- /dev/null
+++ b/paddle/fluid/operators/amp/check_finite_and_unscale_op_mlu.cc
@@ -0,0 +1,88 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/amp/check_finite_and_unscale_op.h"
+#include "paddle/fluid/operators/mlu/mlu_baseop.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+template <typename T>
+class CheckFiniteAndUnscaleMLUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const {
+    auto& dev_ctx = ctx.template device_context<platform::MLUDeviceContext>();
+    const auto xs = ctx.MultiInput<framework::Tensor>("X");
+    const auto* scale = ctx.Input<framework::Tensor>("Scale");
+    auto outs = ctx.MultiOutput<framework::Tensor>("Out");
+    auto* found_inf = ctx.Output<framework::Tensor>("FoundInfinite");
+
+    found_inf->mutable_data<bool>(dev_ctx.GetPlace());
+
+    MLUCnnlTensorDesc scale_desc(*scale);
+    MLUCnnlTensorDesc found_inf_desc(*found_inf, CNNL_LAYOUT_ARRAY,
+                                     ToCnnlDataType<bool>());
+
+    for (size_t i = 0; i < xs.size(); ++i) {
+      const auto* x = xs[i];
+      auto* out = outs[i];
+      out->mutable_data<T>(ctx.GetPlace());
+
+      // check is_finite or is_nan
+      Tensor is_finite(found_inf->type());
+      if (i != 0) {
+        is_finite.Resize(phi::make_ddim({1}));
+        is_finite.mutable_data<bool>(ctx.GetPlace());
+      } else {
+        is_finite.ShareDataWith(*found_inf);
+      }
+
+      MLUCnnlTensorDesc x_desc(*x);
+
+      MLUCnnl::IsNanInf(ctx, x_desc.get(), GetBasePtr(x),
+                        GetBasePtr(&is_finite));
+
+      // save is_finite by logical_and op after checking every input
+      if (i != 0) {
+        MLUCnnlTensorDesc is_finite_desc(is_finite, CNNL_LAYOUT_ARRAY,
+                                         ToCnnlDataType<bool>());
+        MLUCnnl::Logic(ctx, CNNL_LOGIC_OP_OR, found_inf_desc.get(),
+                       GetBasePtr(found_inf), is_finite_desc.get(),
+                       GetBasePtr(&is_finite), found_inf_desc.get(),
+                       GetBasePtr(found_inf));
+      }
+
+      // The normal logic is :
+      // out = in, if found_inf = true
+      // out = in/scale, if found_inf = false
+      // But when found_inf is true, the data of Out should not be used.
+      // So, on MLU, we always compute out with in/scale.
+      MLUCnnlTensorDesc out_desc(*out);
+      MLUCnnl::Div(ctx, CNNL_COMPUTATION_HIGH_PRECISION, x_desc.get(),
+                   GetBasePtr(x), scale_desc.get(), GetBasePtr(scale),
+                   out_desc.get(), GetBasePtr(out));
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+REGISTER_OP_MLU_KERNEL(check_finite_and_unscale,
+                       ops::CheckFiniteAndUnscaleMLUKernel<float>,
+                       ops::CheckFiniteAndUnscaleMLUKernel<plat::float16>);
diff --git a/paddle/fluid/operators/amp/fp16_type_traits.h b/paddle/fluid/operators/amp/fp16_type_traits.h
index f7aa0de97598df67817d81c1d1c1a5e8356f42ea..56aebe90788fbaa6c300ee9ac620c3d7613ff141 100644
--- a/paddle/fluid/operators/amp/fp16_type_traits.h
+++ b/paddle/fluid/operators/amp/fp16_type_traits.h
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #pragma once
 
+#include "paddle/fluid/platform/bfloat16.h"
 #include "paddle/fluid/platform/float16.h"
 
 namespace paddle {
@@ -32,6 +33,12 @@ class MPTypeTrait<platform::float16> {
   using Type = float;
 };
 
+template <>
+class MPTypeTrait<platform::bfloat16> {
+ public:
+  using Type = float;
+};
+
 }  // namespace details
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/arg_max_op.cc b/paddle/fluid/operators/arg_max_op.cc
index 0f5c048b6be9c73ae98181685269592f409196cd..c5e4188ca2d6f749a06127c41da99490a7fb3ffc 100644
--- a/paddle/fluid/operators/arg_max_op.cc
+++ b/paddle/fluid/operators/arg_max_op.cc
@@ -15,23 +15,19 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/operators/arg_min_max_op_base.h"
 
+#include "paddle/fluid/framework/infershape_utils.h"
+#include "paddle/phi/core/infermeta_utils.h"
+#include "paddle/phi/infermeta/unary.h"
+
+DECLARE_INFER_SHAPE_FUNCTOR(arg_max, ArgMaxInferShapeFunctor,
+                            PD_INFER_META(phi::ArgMinMaxInferMeta));
+
 REGISTER_OPERATOR(
     arg_max, paddle::operators::ArgMinMaxOp, paddle::operators::ArgMaxOpMaker,
     paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
-    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
-
-REGISTER_OP_CPU_KERNEL(
-    arg_max,
-    paddle::operators::ArgMaxKernel<paddle::platform::CPUDeviceContext, float>,
-    paddle::operators::ArgMaxKernel<paddle::platform::CPUDeviceContext, double>,
-    paddle::operators::ArgMaxKernel<paddle::platform::CPUDeviceContext,
-                                    int64_t>,
-    paddle::operators::ArgMaxKernel<paddle::platform::CPUDeviceContext,
-                                    int32_t>,
-    paddle::operators::ArgMaxKernel<paddle::platform::CPUDeviceContext,
-                                    int16_t>,
-    paddle::operators::ArgMaxKernel<paddle::platform::CPUDeviceContext,
-                                    uint8_t>);
+    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
+    ArgMaxInferShapeFunctor);
+
 REGISTER_OP_VERSION(arg_max)
     .AddCheckpoint(
         R"ROC(
diff --git a/paddle/fluid/operators/arg_min_max_op_base.cu.h b/paddle/fluid/operators/arg_min_max_op_base.cu.h
deleted file mode 100644
index b77031f7fb4c9d94f30ed06333b9c8766fd2310d..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/arg_min_max_op_base.cu.h
+++ /dev/null
@@ -1,202 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#if defined(__NVCC__) || defined(__HIPCC__)
-
-#ifdef __NVCC__
-#include "cub/cub.cuh"
-#endif
-#ifdef __HIPCC__
-#include <hipcub/hipcub.hpp>
-namespace cub = hipcub;
-#endif
-#include <limits>
-#include <string>
-#include <typeinfo>
-#include <vector>
-#include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/operators/transpose_op.h"
-#include "paddle/fluid/platform/device_context.h"
-#include "paddle/phi/core/ddim.h"
-
-namespace paddle {
-namespace operators {
-
-namespace {  // NOLINT
-template <typename K, typename V>
-using KeyValuePair = cub::KeyValuePair<K, V>;
-using Tensor = framework::Tensor;
-
-}  // end namespace
-
-#define FIXED_BLOCK_DIM_CASE_BASE(log2_block_dim, ...)  \
-  case (1 << (log2_block_dim)): {                       \
-    constexpr auto kBlockDim = (1 << (log2_block_dim)); \
-    __VA_ARGS__;                                        \
-  } break
-
-#define FIXED_BLOCK_DIM_CASE(...)               \
-  FIXED_BLOCK_DIM_CASE_BASE(10, ##__VA_ARGS__); \
-  FIXED_BLOCK_DIM_CASE_BASE(9, ##__VA_ARGS__);  \
-  FIXED_BLOCK_DIM_CASE_BASE(8, ##__VA_ARGS__);  \
-  FIXED_BLOCK_DIM_CASE_BASE(7, ##__VA_ARGS__);  \
-  FIXED_BLOCK_DIM_CASE_BASE(6, ##__VA_ARGS__);  \
-  FIXED_BLOCK_DIM_CASE_BASE(5, ##__VA_ARGS__);  \
-  FIXED_BLOCK_DIM_CASE_BASE(4, ##__VA_ARGS__);  \
-  FIXED_BLOCK_DIM_CASE_BASE(3, ##__VA_ARGS__);
-
-template <typename T, typename IndType, class Reducer, size_t BlockDim>
-__global__ void ArgCUDAKernel(const int64_t height,     // n * h
-                              const int64_t width,      // c
-                              const int64_t post_size,  // h
-                              const Reducer reducer, const T init, const T* in,
-                              IndType* out) {
-  typedef cub::BlockReduce<KeyValuePair<int, T>, BlockDim> BlockReduce;
-  __shared__ typename BlockReduce::TempStorage temp_storage;
-
-  for (int idx = blockIdx.x; idx < height; idx += gridDim.x) {
-    KeyValuePair<int, T> kv_pair = {-1, init};
-    int h = idx / post_size;
-    int w = idx % post_size;
-    for (int k = threadIdx.x; k < width; k += blockDim.x) {
-      kv_pair =
-          reducer({k, in[h * width * post_size + k * post_size + w]}, kv_pair);
-    }
-    kv_pair = BlockReduce(temp_storage).Reduce(kv_pair, reducer);
-    if (threadIdx.x == 0) {
-      out[idx] = static_cast<IndType>(kv_pair.key);
-    }
-    __syncthreads();
-  }
-}
-
-template <typename T, typename IndType, class Reducer>
-void ComputeFullArg(const platform::CUDADeviceContext& ctx, const Tensor& input,
-                    Tensor* indices, const int64_t pre, const int64_t post,
-                    const int64_t n) {
-  auto cu_stream = ctx.stream();
-  auto ComputeBlockSize = [](int64_t col) {
-    auto block_size = 8;
-    if (col > 512)
-      block_size = 1024;
-    else if (col > 256)
-      block_size = 512;
-    else if (col > 128)
-      block_size = 256;
-    else if (col > 64)
-      block_size = 128;
-    else if (col > 32)
-      block_size = 64;
-    else if (col > 16)
-      block_size = 32;
-    else if (col > 8)
-      block_size = 16;
-#ifdef __HIPCC__
-    block_size = std::min(block_size, 256);
-#endif
-    return block_size;
-  };
-
-  int64_t max_grid_dimx = ctx.GetCUDAMaxGridDimSize()[0];
-  int64_t height = pre * post;
-  int64_t width = n;
-  int64_t grid_size = height < max_grid_dimx ? height : max_grid_dimx;
-
-  const T* in_data = input.data<T>();
-  IndType* out_data = indices->mutable_data<IndType>(ctx.GetPlace());
-
-  if (typeid(Reducer) == typeid(cub::ArgMax)) {
-    switch (ComputeBlockSize(width)) {
-      FIXED_BLOCK_DIM_CASE(
-          ArgCUDAKernel<T, IndType, Reducer,
-                        kBlockDim><<<grid_size, kBlockDim, 0, cu_stream>>>(
-              height, width, post, Reducer(), std::numeric_limits<T>::lowest(),
-              in_data, out_data));
-    }
-  } else {
-    switch (ComputeBlockSize(width)) {
-      FIXED_BLOCK_DIM_CASE(
-          ArgCUDAKernel<T, IndType, Reducer,
-                        kBlockDim><<<grid_size, kBlockDim, 0, cu_stream>>>(
-              height, width, post, Reducer(), std::numeric_limits<T>::max(),
-              in_data, out_data));
-    }
-  }
-}
-
-template <typename T, class Reducer>
-struct VisitDataCudaArgMinMaxFunctor {
-  const framework::ExecutionContext& ctx;
-
-  explicit VisitDataCudaArgMinMaxFunctor(const framework::ExecutionContext& ctx)
-      : ctx(ctx) {}
-  template <typename IndType>
-  void apply() const {
-    auto* input = ctx.Input<Tensor>("X");
-    auto* output = ctx.Output<Tensor>("Out");
-    int axis = ctx.Attr<int64_t>("axis");
-    const bool& flatten = ctx.Attr<bool>("flatten");
-
-    framework::DDim input_dims;
-    if (flatten) {
-      input_dims = phi::make_ddim({input->numel()});
-      // if flatten, the axis just as 0
-      axis = 0;
-    } else {
-      input_dims = input->dims();
-      if (axis < 0) axis += input->dims().size();
-    }
-
-    int64_t numel = input->numel();
-    int64_t groups = numel / input_dims[axis];
-    int64_t pre = 1;
-    int64_t post = 1;
-    int64_t n = input_dims[axis];
-
-    for (int i = 0; i < axis; i++) {
-      pre *= input_dims[i];
-    }
-
-    for (int i = axis + 1; i < input_dims.size(); i++) {
-      post *= input_dims[i];
-    }
-
-    const auto& dev_ctx = ctx.cuda_device_context();
-    ComputeFullArg<T, IndType, Reducer>(dev_ctx, *input, output, pre, post, n);
-  }
-};
-template <typename T, class Reducer>
-class ArgMinMaxOpCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto& dtype = ctx.Attr<int>("dtype");
-    if (dtype < 0) {
-      framework::VisitDataTypeTiny(
-          static_cast<framework::proto::VarType::Type>(
-              framework::proto::VarType::INT64),
-          VisitDataCudaArgMinMaxFunctor<T, Reducer>(ctx));
-      return;
-    }
-    framework::VisitDataTypeTiny(
-        static_cast<framework::proto::VarType::Type>(dtype),
-        VisitDataCudaArgMinMaxFunctor<T, Reducer>(ctx));
-  }
-};
-
-#endif
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/arg_min_max_op_base.h b/paddle/fluid/operators/arg_min_max_op_base.h
index d3ce61d183a3d322e40966ce59f9a10320ceab4f..585341beea12c14fbd01a3a47af34ce57def0db5 100644
--- a/paddle/fluid/operators/arg_min_max_op_base.h
+++ b/paddle/fluid/operators/arg_min_max_op_base.h
@@ -27,193 +27,9 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-enum ArgMinMaxType { kArgMin, kArgMax };
-
-template <typename DeviceContext, typename T, typename Tout, int64_t Rank,
-          ArgMinMaxType argMinMaxValue>
-struct ArgMinMaxFunctor {};
-
-#define DECLARE_ARG_MIN_MAX_FUNCTOR(eigen_op_type, enum_argminmax_value)      \
-  template <typename DeviceContext, typename T, typename Tout, int64_t Rank>  \
-  struct ArgMinMaxFunctor<DeviceContext, T, Tout, Rank,                       \
-                          enum_argminmax_value> {                             \
-    void operator()(const DeviceContext& ctx, const framework::LoDTensor& in, \
-                    framework::LoDTensor* out, framework::DDim x_dims,        \
-                    int64_t axis, bool keepdims) {                            \
-      auto in_eigen = framework::EigenTensor<T, Rank>::From(in, x_dims);      \
-      if (keepdims) {                                                         \
-        auto out_eigen = framework::EigenTensor<Tout, Rank>::From(*out);      \
-        out_eigen.device(*(ctx.eigen_device())) =                             \
-            in_eigen.eigen_op_type(axis).template cast<Tout>();               \
-      } else {                                                                \
-        auto out_eigen = framework::EigenTensor<Tout, Rank - 1>::From(*out);  \
-        out_eigen.device(*(ctx.eigen_device())) =                             \
-            in_eigen.eigen_op_type(axis).template cast<Tout>();               \
-      }                                                                       \
-    }                                                                         \
-  }
-
-DECLARE_ARG_MIN_MAX_FUNCTOR(argmin, ArgMinMaxType::kArgMin);
-DECLARE_ARG_MIN_MAX_FUNCTOR(argmax, ArgMinMaxType::kArgMax);
-
-template <typename DeviceContext, typename T, ArgMinMaxType EnumArgMinMaxValue>
-struct VisitDataArgMinMaxFunctor {
-  const framework::ExecutionContext& ctx;
-
-  explicit VisitDataArgMinMaxFunctor(const framework::ExecutionContext& ctx)
-      : ctx(ctx) {}
-  template <typename Tout>
-  void apply() const {
-    auto& x = *(ctx.Input<framework::LoDTensor>("X"));
-    auto& out = *(ctx.Output<framework::LoDTensor>("Out"));
-    out.template mutable_data<Tout>(ctx.GetPlace());
-    auto axis = ctx.Attr<int64_t>("axis");
-    auto keepdims = ctx.Attr<bool>("keepdims");
-    const bool& flatten = ctx.Attr<bool>("flatten");
-    // paddle do not have the scalar tensor, just return the shape [1] tensor
-    if (flatten) keepdims = true;
-
-    // if flatten, will construct the new dims for the cacluate
-    framework::DDim x_dims;
-    if (flatten) {
-      x_dims = phi::make_ddim({x.numel()});
-      // if flatten, the axis just as 0
-      axis = 0;
-    } else {
-      x_dims = x.dims();
-      if (axis < 0) axis += x_dims.size();
-    }
-    auto& dev_ctx = ctx.template device_context<DeviceContext>();
-
-#define CALL_ARG_MINMAX_FUNCTOR(rank)                                \
-  ArgMinMaxFunctor<DeviceContext, T, Tout, rank, EnumArgMinMaxValue> \
-      functor##rank;                                                 \
-  functor##rank(dev_ctx, x, &out, x_dims, axis, keepdims)
-
-    switch (x_dims.size()) {
-      case 1:
-        CALL_ARG_MINMAX_FUNCTOR(1);
-        break;
-      case 2:
-        CALL_ARG_MINMAX_FUNCTOR(2);
-        break;
-      case 3:
-        CALL_ARG_MINMAX_FUNCTOR(3);
-        break;
-      case 4:
-        CALL_ARG_MINMAX_FUNCTOR(4);
-        break;
-      case 5:
-        CALL_ARG_MINMAX_FUNCTOR(5);
-        break;
-      case 6:
-        CALL_ARG_MINMAX_FUNCTOR(6);
-        break;
-      default:
-        PADDLE_ENFORCE_LE(
-            x_dims.size(), 6,
-            platform::errors::InvalidArgument(
-                "%s operator doesn't supports tensors whose ranks are greater "
-                "than 6.",
-                (EnumArgMinMaxValue == kArgMin ? "argmin" : "argmax")));
-        break;
-#undef CALL_ARG_MINMAX_FUNCTOR
-    }
-  }
-};
-
-template <typename DeviceContext, typename T, ArgMinMaxType EnumArgMinMaxValue>
-class ArgMinMaxKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto& dtype = ctx.Attr<int>("dtype");
-    if (dtype < 0) {
-      framework::VisitDataTypeTiny(
-          static_cast<framework::proto::VarType::Type>(
-              framework::proto::VarType::INT64),
-          VisitDataArgMinMaxFunctor<DeviceContext, T, EnumArgMinMaxValue>(ctx));
-      return;
-    }
-    framework::VisitDataTypeTiny(
-        static_cast<framework::proto::VarType::Type>(dtype),
-        VisitDataArgMinMaxFunctor<DeviceContext, T, EnumArgMinMaxValue>(ctx));
-  }
-};
-
-template <typename DeviceContext, typename T>
-using ArgMinKernel = ArgMinMaxKernel<DeviceContext, T, ArgMinMaxType::kArgMin>;
-
-template <typename DeviceContext, typename T>
-using ArgMaxKernel = ArgMinMaxKernel<DeviceContext, T, ArgMinMaxType::kArgMax>;
-
 class ArgMinMaxOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "arg_min_max");
-    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "arg_min_max");
-    const auto& x_dims = ctx->GetInputDim("X");
-    int64_t axis = ctx->Attrs().Get<int64_t>("axis");
-    bool keepdims = ctx->Attrs().Get<bool>("keepdims");
-    const bool& flatten = ctx->Attrs().Get<bool>("flatten");
-
-    PADDLE_ENFORCE_GE(axis, -x_dims.size(),
-                      platform::errors::InvalidArgument(
-                          "'axis'(%d) must be greater than or equal to"
-                          " -Rank(X)(%d).",
-                          axis, -x_dims.size()));
-    PADDLE_ENFORCE_LT(
-        axis, x_dims.size(),
-        platform::errors::InvalidArgument(
-            "'axis'(%d) must be less than Rank(X)(%d) of Input(X).", axis,
-            x_dims.size()));
-
-    const int& dtype = ctx->Attrs().Get<int>("dtype");
-    PADDLE_ENFORCE_EQ(
-        (dtype < 0 || dtype == 2 || dtype == 3), true,
-        platform::errors::InvalidArgument(
-            "The attribute of dtype in argmin/argmax must be [%s] or [%s], but "
-            "received [%s]",
-            paddle::framework::DataTypeToString(
-                framework::proto::VarType::INT32),
-            paddle::framework::DataTypeToString(
-                framework::proto::VarType::INT64),
-            paddle::framework::DataTypeToString(
-                static_cast<framework::proto::VarType::Type>(dtype))));
-
-    auto x_rank = x_dims.size();
-    if (axis < 0) axis += x_rank;
-    if (ctx->IsRuntime()) {
-      if (dtype == framework::proto::VarType::INT32) {
-        int64_t all_element_num = 0;
-        if (flatten) {
-          all_element_num = phi::product(x_dims);
-
-        } else {
-          all_element_num = x_dims[axis];
-        }
-        PADDLE_ENFORCE_LE(
-            all_element_num, INT_MAX,
-            platform::errors::InvalidArgument(
-                "The element num of the argmin/argmax input at axis is "
-                "%d, is larger than int32 maximum value:%d, you must "
-                "set the dtype of argmin/argmax to 'int64'.",
-                all_element_num, INT_MAX));
-      }
-    }
-    std::vector<int64_t> vec;
-    if (flatten) {
-      vec.emplace_back(static_cast<int64_t>(1));
-    } else {
-      for (int64_t i = 0; i < axis; i++) vec.emplace_back(x_dims[i]);
-      if (keepdims) {
-        vec.emplace_back(static_cast<int64_t>(1));
-      }
-      for (int64_t i = axis + 1; i < x_rank; i++) vec.emplace_back(x_dims[i]);
-    }
-    ctx->SetOutputDim("Out", phi::make_ddim(vec));
-  }
 };
 
 class BaseArgMinMaxOpMaker : public framework::OpProtoAndCheckerMaker {
diff --git a/paddle/fluid/operators/arg_min_op.cc b/paddle/fluid/operators/arg_min_op.cc
index 0a4ba6fb0bfdfccfc4eae99da730e96fe5f0a540..fb3abd01af8c396d764f9f1d247f24c41bd15959 100644
--- a/paddle/fluid/operators/arg_min_op.cc
+++ b/paddle/fluid/operators/arg_min_op.cc
@@ -12,26 +12,21 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/operators/arg_min_max_op_base.h"
+#include "paddle/phi/core/infermeta_utils.h"
+#include "paddle/phi/infermeta/unary.h"
+
+DECLARE_INFER_SHAPE_FUNCTOR(arg_min, ArgMinInferShapeFunctor,
+                            PD_INFER_META(phi::ArgMinMaxInferMeta));
 
 REGISTER_OPERATOR(
     arg_min, paddle::operators::ArgMinMaxOp, paddle::operators::ArgMinOpMaker,
     paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
-    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
+    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
+    ArgMinInferShapeFunctor);
 
-REGISTER_OP_CPU_KERNEL(
-    arg_min,
-    paddle::operators::ArgMinKernel<paddle::platform::CPUDeviceContext, float>,
-    paddle::operators::ArgMinKernel<paddle::platform::CPUDeviceContext, double>,
-    paddle::operators::ArgMinKernel<paddle::platform::CPUDeviceContext,
-                                    int64_t>,
-    paddle::operators::ArgMinKernel<paddle::platform::CPUDeviceContext,
-                                    int32_t>,
-    paddle::operators::ArgMinKernel<paddle::platform::CPUDeviceContext,
-                                    int16_t>,
-    paddle::operators::ArgMinKernel<paddle::platform::CPUDeviceContext,
-                                    uint8_t>);
 REGISTER_OP_VERSION(arg_min)
     .AddCheckpoint(
         R"ROC(
diff --git a/paddle/fluid/operators/arg_min_op.cu b/paddle/fluid/operators/arg_min_op.cu
deleted file mode 100644
index 23170bf0087906d752767051ce58874cb3584ee5..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/arg_min_op.cu
+++ /dev/null
@@ -1,21 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/arg_min_max_op_base.cu.h"
-REGISTER_OP_CUDA_KERNEL(
-    arg_min, paddle::operators::ArgMinMaxOpCUDAKernel<float, cub::ArgMin>,
-    paddle::operators::ArgMinMaxOpCUDAKernel<double, cub::ArgMin>,
-    paddle::operators::ArgMinMaxOpCUDAKernel<int64_t, cub::ArgMin>,
-    paddle::operators::ArgMinMaxOpCUDAKernel<int32_t, cub::ArgMin>,
-    paddle::operators::ArgMinMaxOpCUDAKernel<int8_t, cub::ArgMin>);
diff --git a/paddle/fluid/operators/argsort_op.cc b/paddle/fluid/operators/argsort_op.cc
index 9e525c20335d37242d0e239e81d2d2976b92a6b4..1a8aca777370bc140e39b7457702557042541744 100644
--- a/paddle/fluid/operators/argsort_op.cc
+++ b/paddle/fluid/operators/argsort_op.cc
@@ -12,40 +12,19 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/argsort_op.h"
 #include <memory>
 
+#include "paddle/fluid/framework/infershape_utils.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/phi/core/infermeta_utils.h"
+#include "paddle/phi/infermeta/unary.h"
+
 namespace paddle {
 namespace operators {
 
 class ArgsortOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "argsort");
-    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "argsort");
-    OP_INOUT_CHECK(ctx->HasOutput("Indices"), "Output", "Indices", "argsort");
-
-    auto in_dims = ctx->GetInputDim("X");
-    int axis = ctx->Attrs().Get<int>("axis");
-
-    auto num_dims = in_dims.size();
-    PADDLE_ENFORCE_GE(axis, -num_dims,
-                      platform::errors::InvalidArgument(
-                          "'axis'(%d) must be greater than or equal to"
-                          " -num_dims(%d).",
-                          axis, -num_dims));
-    PADDLE_ENFORCE_LT(
-        axis, num_dims,
-        platform::errors::InvalidArgument(
-            "'axis'(%d) must be less than num_dims(%d).", axis, num_dims));
-
-    ctx->ShareDim("X", "Out");
-    ctx->ShareDim("X", "Indices");
-    ctx->ShareLoD("X", "Out");
-    ctx->ShareLoD("X", "Indices");
-  }
 };
 
 class ArgsortGradOp : public framework::OperatorWithKernel {
@@ -122,18 +101,11 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(ArgsortGradNoNeedBufferVarsInferer, "X");
 }  // namespace paddle
 
 namespace ops = paddle::operators;
+DECLARE_INFER_SHAPE_FUNCTOR(argsort, ArgsortInferShapeFunctor,
+                            PD_INFER_META(phi::ArgsortInferMeta));
 REGISTER_OPERATOR(argsort, ops::ArgsortOp, ops::ArgsortOpMaker,
                   ops::ArgsortGradOpMaker<paddle::framework::OpDesc>,
-                  ops::ArgsortGradOpMaker<paddle::imperative::OpBase>);
+                  ops::ArgsortGradOpMaker<paddle::imperative::OpBase>,
+                  ArgsortInferShapeFunctor);
 REGISTER_OPERATOR(argsort_grad, ops::ArgsortGradOp,
                   ops::ArgsortGradNoNeedBufferVarsInferer);
-REGISTER_OP_CPU_KERNEL(argsort,
-                       ops::ArgsortKernel<paddle::platform::CPUPlace, float>,
-                       ops::ArgsortKernel<paddle::platform::CPUPlace, double>,
-                       ops::ArgsortKernel<paddle::platform::CPUPlace, int>,
-                       ops::ArgsortKernel<paddle::platform::CPUPlace, int64_t>);
-REGISTER_OP_CPU_KERNEL(
-    argsort_grad, ops::ArgsortGradientKernel<paddle::platform::CPUPlace, float>,
-    ops::ArgsortGradientKernel<paddle::platform::CPUPlace, double>,
-    ops::ArgsortGradientKernel<paddle::platform::CPUPlace, int>,
-    ops::ArgsortGradientKernel<paddle::platform::CPUPlace, int64_t>);
diff --git a/paddle/fluid/operators/argsort_op.cu b/paddle/fluid/operators/argsort_op.cu
deleted file mode 100644
index 8b7a0b3eadb16bbe0822809748e343dc0d793a0f..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/argsort_op.cu
+++ /dev/null
@@ -1,430 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <thrust/copy.h>
-#include <thrust/execution_policy.h>
-#include <thrust/sequence.h>
-#include <thrust/sort.h>
-#ifdef __NVCC__
-#include "cub/cub.cuh"
-#endif
-#ifdef __HIPCC__
-#include <hipcub/hipcub.hpp>
-namespace cub = hipcub;
-#endif
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/argsort_op.h"
-#include "paddle/fluid/operators/transpose_op.h"
-#include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
-
-#ifdef __HIPCC__
-namespace rocprim {
-namespace detail {
-template <>
-struct radix_key_codec_base<paddle::platform::float16>
-    : radix_key_codec_integral<paddle::platform::float16, uint16_t> {};
-}  // namespace detail
-}  // namespace rocprim
-#else
-// set cub base traits in order to handle float16
-namespace cub {
-template <>
-struct NumericTraits<paddle::platform::float16>
-    : BaseTraits<FLOATING_POINT, true, false, uint16_t,
-                 paddle::platform::float16> {};
-}  // namespace cub
-#endif
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-
-// Iter for move to next row
-struct SegmentOffsetIter {
-  EIGEN_DEVICE_FUNC
-  explicit SegmentOffsetIter(int num_cols) : num_cols_(num_cols) {}
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator()(int idx) const {
-    return idx * num_cols_;
-  }
-
-  int num_cols_;
-};
-
-template <typename T>
-static __global__ void FillIndex(T* indices, T num_rows, T num_cols) {
-  int col_id = threadIdx.x;
-  int row_id = blockIdx.x;
-
-  for (T j = row_id; j < num_rows; j += gridDim.x) {
-    for (T i = col_id; i < num_cols; i += blockDim.x) {
-      indices[j * num_cols + i] = i;
-    }
-  }
-}
-
-template <typename T, typename IndType>
-static __global__ void FillFlattenGrad(const T* dO, const IndType* indices,
-                                       int64_t size, T* dX) {
-  int index = threadIdx.x + blockIdx.x * blockDim.x;
-  int stride = blockDim.x * gridDim.x;
-  for (int i = index; i < size; i += stride) {
-    dX[indices[i]] = dO[i];
-  }
-}
-
-template <typename T, typename IndType>
-static __global__ void FillGrad(const T* dO, const IndType* indices, T* dX,
-                                IndType num_rows, IndType num_cols) {
-  int col_id = threadIdx.x;
-  int row_id = blockIdx.x;
-
-  for (IndType j = row_id; j < num_rows; j += gridDim.x) {
-    for (IndType i = col_id; i < num_cols; i += blockDim.x) {
-      dX[j * num_cols + indices[j * num_cols + i]] = dO[j * num_cols + i];
-    }
-  }
-}
-
-// Sort by flag descending, True: descending. False: Ascending.
-// Default is false.
-template <typename T, typename IndType>
-void ArgFullSort(const platform::CUDADeviceContext& ctx, const Tensor* input,
-                 Tensor* output, Tensor* indices, const IndType num_rows,
-                 const IndType num_cols, const bool descending) {
-  auto cu_stream = ctx.stream();
-
-  Tensor input_indices;
-
-  const std::vector<IndType> dims = {num_rows, num_cols};
-  auto dim = phi::make_ddim(dims);
-  input_indices.Resize(dim);
-  input_indices.mutable_data<IndType>(ctx.GetPlace());
-
-  size_t temp_storage_bytes = -1;
-
-  auto ComputeBlockSize = [](IndType col) {
-    if (col > 512)
-      return 1024;
-    else if (col > 256 && col <= 512)
-      return 512;
-    else if (col > 128 && col <= 256)
-      return 256;
-    else if (col > 64 && col <= 128)
-      return 128;
-    else
-      return 64;
-  };
-
-  int block_size = ComputeBlockSize(num_cols);
-
-  int maxGridDimX = ctx.GetCUDAMaxGridDimSize()[0];
-  // actually, int num_rows < max_grid_size
-  int grid_size = num_rows < maxGridDimX ? num_rows : maxGridDimX;
-  // Init a index array
-  FillIndex<<<grid_size, block_size, 0, cu_stream>>>(
-      input_indices.data<IndType>(), num_rows, num_cols);
-
-  T* sorted_out_ptr;
-  IndType* sorted_indices_ptr;
-
-  const T* inp = input->data<T>();
-  T* out = output->mutable_data<T>(ctx.GetPlace());
-  IndType* ind = indices->mutable_data<IndType>(ctx.GetPlace());
-
-  sorted_out_ptr = out;
-  sorted_indices_ptr = ind;
-
-  // create iter for counting input
-  cub::CountingInputIterator<IndType> counting_iter(0);
-  // segment_offset is used for move to next row
-  cub::TransformInputIterator<IndType, SegmentOffsetIter,
-                              cub::CountingInputIterator<IndType>>
-      segment_offsets_t(counting_iter, SegmentOffsetIter(num_cols));
-
-  gpuError_t err;
-  if (descending) {
-    err = cub::DeviceSegmentedRadixSort::SortPairsDescending(
-        nullptr, temp_storage_bytes, inp, sorted_out_ptr,
-        input_indices.data<IndType>(), sorted_indices_ptr, num_cols * num_rows,
-        num_rows, segment_offsets_t, segment_offsets_t + 1, 0, sizeof(T) * 8,
-        cu_stream);
-  } else {
-    err = cub::DeviceSegmentedRadixSort::SortPairs(
-        nullptr, temp_storage_bytes, inp, sorted_out_ptr,
-        input_indices.data<IndType>(), sorted_indices_ptr, num_cols * num_rows,
-        num_rows, segment_offsets_t, segment_offsets_t + 1, 0, sizeof(T) * 8,
-        cu_stream);
-  }
-  PADDLE_ENFORCE_GPU_SUCCESS(err);
-
-  Tensor temp_storage;
-  temp_storage.mutable_data<uint8_t>(ctx.GetPlace(), temp_storage_bytes);
-
-  if (descending) {
-    err = cub::DeviceSegmentedRadixSort::SortPairsDescending(
-        temp_storage.data<uint8_t>(), temp_storage_bytes, inp, sorted_out_ptr,
-        input_indices.data<IndType>(), sorted_indices_ptr, num_cols * num_rows,
-        num_rows, segment_offsets_t, segment_offsets_t + 1, 0, sizeof(T) * 8,
-        cu_stream);
-  } else {
-    err = cub::DeviceSegmentedRadixSort::SortPairs(
-        temp_storage.data<uint8_t>(), temp_storage_bytes, inp, sorted_out_ptr,
-        input_indices.data<IndType>(), sorted_indices_ptr, num_cols * num_rows,
-        num_rows, segment_offsets_t, segment_offsets_t + 1, 0, sizeof(T) * 8,
-        cu_stream);
-  }
-
-  PADDLE_ENFORCE_GPU_SUCCESS(err);
-}
-
-template <typename T, typename IndType>
-void ArgFullAssign(const platform::CUDADeviceContext& ctx, const Tensor* dO,
-                   const Tensor* indices, Tensor* dX, const IndType num_rows,
-                   const IndType num_cols) {
-  auto cu_stream = ctx.stream();
-
-  auto ComputeBlockSize = [](IndType col) {
-    if (col > 512)
-      return 1024;
-    else if (col > 256 && col <= 512)
-      return 512;
-    else if (col > 128 && col <= 256)
-      return 256;
-    else if (col > 64 && col <= 128)
-      return 128;
-    else
-      return 64;
-  };
-
-  int block_size = ComputeBlockSize(num_cols);
-
-  int maxGridDimX = ctx.GetCUDAMaxGridDimSize()[0];
-  // actually, int num_rows < max_grid_size
-  int grid_size = num_rows < maxGridDimX ? num_rows : maxGridDimX;
-  FillGrad<<<grid_size, block_size, 0, cu_stream>>>(
-      dO->data<T>(), indices->data<IndType>(), dX->data<T>(), num_rows,
-      num_cols);
-}
-
-template <typename T>
-void ArgFlattenAssign(const platform::CUDADeviceContext& ctx, const Tensor* dO,
-                      const Tensor* indices, int64_t size, Tensor* dX) {
-  auto cu_stream = ctx.stream();
-
-  const int64_t block_size =
-      std::min(size, static_cast<int64_t>(ctx.GetMaxThreadsPerBlock()));
-  int64_t max_threads = ctx.GetMaxPhysicalThreadCount();
-  const int64_t max_blocks =
-      std::max(((max_threads - 1) / block_size + 1), static_cast<int64_t>(1));
-  const int64_t grid_size =
-      std::min(max_blocks, (size + block_size - 1) / block_size);
-
-  FillFlattenGrad<<<grid_size, block_size, 0, cu_stream>>>(
-      dO->data<T>(), indices->data<int64_t>(), size, dX->data<T>());
-}
-
-template <typename DeviceContext, typename T>
-class ArgsortOpCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* input = ctx.Input<Tensor>("X");
-    auto* output = ctx.Output<Tensor>("Out");
-    auto* indices = ctx.Output<Tensor>("Indices");
-    int axis = ctx.Attr<int>("axis");
-    bool descending = ctx.Attr<bool>("descending");
-
-    auto in_dims = input->dims();
-    axis = (axis < 0) ? (in_dims.size() + axis) : axis;
-
-    const T* in_data = input->data<T>();
-    auto size = input->numel();
-    T* out_data = output->mutable_data<T>(ctx.GetPlace());
-    int64_t* ids_data = indices->mutable_data<int64_t>(ctx.GetPlace());
-
-    // Use thrust for parallel acceleration when the input size is equal to the
-    // length of the ‘axis’ dimension.
-    // Compared to the following 'Special case for full sort', ascending sort is
-    // 34 times faster and descending sort is 31 times faster.
-    if (size == in_dims[axis]) {
-      thrust::sequence(thrust::device, ids_data, ids_data + size);
-      thrust::copy(thrust::device, in_data, in_data + size, out_data);
-      thrust::sort_by_key(thrust::device, out_data, out_data + size, ids_data);
-      if (descending) {
-        thrust::reverse(thrust::device, out_data, out_data + size);
-        thrust::reverse(thrust::device, ids_data, ids_data + size);
-      }
-      return;
-    }
-
-    // Special case for full sort, speedup ~190x.
-    if (axis == -1 || axis + 1 == in_dims.size()) {
-      const int64_t input_height =
-          phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1));
-      const int64_t input_width = in_dims[in_dims.size() - 1];
-      const auto& dev_ctx = ctx.cuda_device_context();
-      ArgFullSort<T, int64_t>(dev_ctx, input, output, indices, input_height,
-                              input_width, descending);
-    } else {
-      // if not full sort, do transpose first
-      std::vector<int> trans;
-      for (int i = 0; i < axis; i++) {
-        trans.push_back(i);
-      }
-      trans.push_back(in_dims.size() - 1);
-      for (int i = axis + 1; i < in_dims.size() - 1; i++) {
-        trans.push_back(i);
-      }
-      trans.push_back(axis);
-      framework::DDim trans_dims(in_dims);
-      for (int i = 0; i < trans.size(); i++) {
-        trans_dims[i] = in_dims[trans[i]];
-      }
-
-      Tensor trans_inp;
-      T* trans_inp_data = trans_inp.mutable_data<T>(trans_dims, ctx.GetPlace());
-      int ndims = trans.size();
-      const auto& dev_ctx = ctx.cuda_device_context();
-      // Do transpose
-      TransCompute<platform::CUDADeviceContext, T>(ndims, dev_ctx, *input,
-                                                   &trans_inp, trans);
-
-      const int64_t input_height =
-          phi::product(phi::slice_ddim(trans_dims, 0, trans_dims.size() - 1));
-      const int64_t input_width = trans_dims[trans_dims.size() - 1];
-
-      Tensor tmp_out;
-      tmp_out.mutable_data<T>(trans_dims, ctx.GetPlace());
-      T* out_data = output->mutable_data<T>(ctx.GetPlace());
-
-      Tensor tmp_indices;
-      // temp indices for sorting
-      tmp_indices.mutable_data<int64_t>(trans_dims, ctx.GetPlace());
-      indices->mutable_data<int64_t>(ctx.GetPlace());
-
-      ArgFullSort<T, int64_t>(dev_ctx, &trans_inp, &tmp_out, &tmp_indices,
-                              input_height, input_width, descending);
-
-      TransCompute<platform::CUDADeviceContext, int64_t>(
-          ndims, dev_ctx, tmp_indices, indices, trans);
-      // transpose back
-      TransCompute<platform::CUDADeviceContext, T>(ndims, dev_ctx, tmp_out,
-                                                   output, trans);
-      return;
-    }
-  }
-};
-
-template <typename T>
-class ArgsortGradOpCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* indices = ctx.Input<Tensor>("Indices");
-    auto* dX = ctx.Output<Tensor>(framework::GradVarName("X"));
-    auto* dO = ctx.Input<Tensor>(framework::GradVarName("Out"));
-    int axis = ctx.Attr<int>("axis");
-
-    dX->mutable_data<T>(ctx.GetPlace());
-    if (dO->numel() == 0) return;
-
-    auto in_dims = dX->dims();
-    axis = (axis < 0) ? (in_dims.size() + axis) : axis;
-
-    int64_t size = dX->numel();
-    const auto& dev_ctx = ctx.cuda_device_context();
-
-    // Parallel acceleration when the input size is equal to the length of the
-    // ‘axis’ dimension.
-    // Compared to 'special case for full sort' below, the gradient calculation
-    // is 10 times faster.
-    if (size == in_dims[axis]) {
-      ArgFlattenAssign<T>(dev_ctx, dO, indices, size, dX);
-      return;
-    }
-
-    // Special case for full sort, speedup ~190x.
-    if (axis == -1 || axis + 1 == in_dims.size()) {
-      const int64_t input_height =
-          phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1));
-      const int64_t input_width = in_dims[in_dims.size() - 1];
-      ArgFullAssign<T, int64_t>(dev_ctx, dO, indices, dX, input_height,
-                                input_width);
-    } else {
-      // if not full sort, do transpose first
-      std::vector<int> trans;
-      for (int i = 0; i < axis; i++) {
-        trans.push_back(i);
-      }
-      trans.push_back(in_dims.size() - 1);
-      for (int i = axis + 1; i < in_dims.size() - 1; i++) {
-        trans.push_back(i);
-      }
-      trans.push_back(axis);
-      framework::DDim trans_dims(in_dims);
-      for (int i = 0; i < trans.size(); i++) {
-        trans_dims[i] = in_dims[trans[i]];
-      }
-
-      Tensor trans_dO;
-      trans_dO.mutable_data<T>(trans_dims, ctx.GetPlace());
-      Tensor trans_ind;
-      trans_ind.mutable_data<int64_t>(trans_dims, ctx.GetPlace());
-      int ndims = trans.size();
-      // Do transpose
-      TransCompute<platform::CUDADeviceContext, T>(ndims, dev_ctx, *dO,
-                                                   &trans_dO, trans);
-      TransCompute<platform::CUDADeviceContext, int64_t>(
-          ndims, dev_ctx, *indices, &trans_ind, trans);
-
-      const int64_t input_height =
-          phi::product(phi::slice_ddim(trans_dims, 0, trans_dims.size() - 1));
-      const int64_t input_width = trans_dims[trans_dims.size() - 1];
-
-      Tensor tmp_out;
-      tmp_out.mutable_data<T>(trans_dims, ctx.GetPlace());
-
-      ArgFullAssign<T, int64_t>(dev_ctx, &trans_dO, &trans_ind, &tmp_out,
-                                input_height, input_width);
-
-      // transpose back
-      TransCompute<platform::CUDADeviceContext, T>(ndims, dev_ctx, tmp_out, dX,
-                                                   trans);
-      return;
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-REGISTER_OP_CUDA_KERNEL(
-    argsort,
-    paddle::operators::ArgsortOpCUDAKernel<paddle::platform::CUDADeviceContext,
-                                           float>,
-    paddle::operators::ArgsortOpCUDAKernel<paddle::platform::CUDADeviceContext,
-                                           double>,
-    paddle::operators::ArgsortOpCUDAKernel<paddle::platform::CUDADeviceContext,
-                                           int>,
-    paddle::operators::ArgsortOpCUDAKernel<paddle::platform::CUDADeviceContext,
-                                           int64_t>,
-    paddle::operators::ArgsortOpCUDAKernel<paddle::platform::CUDADeviceContext,
-                                           paddle::platform::float16>);
-REGISTER_OP_CUDA_KERNEL(
-    argsort_grad, paddle::operators::ArgsortGradOpCUDAKernel<float>,
-    paddle::operators::ArgsortGradOpCUDAKernel<double>,
-    paddle::operators::ArgsortGradOpCUDAKernel<int>,
-    paddle::operators::ArgsortGradOpCUDAKernel<int64_t>,
-    paddle::operators::ArgsortGradOpCUDAKernel<paddle::platform::float16>);
diff --git a/paddle/fluid/operators/argsort_op.h b/paddle/fluid/operators/argsort_op.h
deleted file mode 100644
index d850e51a4bf061d3e5fc46bd53a2ef56610d6de9..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/argsort_op.h
+++ /dev/null
@@ -1,243 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <algorithm>
-#include <utility>
-#include <vector>
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/transpose_op.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
-
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
-
-using Tensor = framework::Tensor;
-
-template <typename T, typename Type>
-static void FullSort(Type input_height, Type input_width, int input_dim,
-                     const framework::Tensor* input, T* t_out, Type* t_indices,
-                     bool descending) {
-#ifdef PADDLE_WITH_MKLML
-#pragma omp parallel for
-#endif
-  for (Type i = 0; i < input_height; ++i) {
-    std::vector<std::pair<T, Type>> col_vec;
-    col_vec.reserve(input_width);
-    if (input_dim == 1) {
-      auto e_input = EigenVector<T>::Flatten(*input);
-      for (Type j = 0; j < input_width; ++j) {
-        col_vec.push_back(std::pair<T, Type>(e_input(j), j));
-      }
-    } else {
-      auto e_input = EigenMatrix<T>::Reshape(*input, input_dim - 1);
-      for (Type j = 0; j < input_width; ++j) {
-        col_vec.push_back(std::pair<T, Type>(e_input(i, j), j));
-      }
-    }
-    std::sort(col_vec.begin(), col_vec.end(),
-              [&](const std::pair<T, Type>& l, const std::pair<T, Type>& r) {
-                if (descending)
-                  return l.first > r.first;
-                else
-                  return l.first < r.first;
-              });
-
-    for (Type j = 0; j < input_width; ++j) {
-      t_out[i * input_width + j] = col_vec[j].first;
-      t_indices[i * input_width + j] = col_vec[j].second;
-    }
-  }
-}
-
-template <typename T, typename Type>
-static void FullAssign(Type input_height, Type input_width, int input_dim,
-                       const framework::Tensor* input,
-                       const framework::Tensor* indices, T* t_out) {
-#ifdef PADDLE_WITH_MKLML
-#pragma omp parallel for
-#endif
-  for (Type i = 0; i < input_height; ++i) {
-    if (input_dim == 1) {
-      auto e_input = EigenVector<T>::Flatten(*input);
-      auto e_indices = EigenVector<Type>::Flatten(*indices);
-      for (Type j = 0; j < input_width; ++j) {
-        t_out[i * input_width + e_indices(j)] = e_input(j);
-      }
-    } else {
-      auto e_input = EigenMatrix<T>::Reshape(*input, input_dim - 1);
-      auto e_indices = EigenMatrix<Type>::Reshape(*indices, input_dim - 1);
-      for (Type j = 0; j < input_width; ++j) {
-        t_out[i * input_width + e_indices(i, j)] = e_input(i, j);
-      }
-    }
-  }
-}
-
-template <typename DeviceContext, typename T>
-class ArgsortKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* input = ctx.Input<framework::Tensor>("X");
-    auto* output = ctx.Output<framework::Tensor>("Out");
-    auto* indices = ctx.Output<framework::Tensor>("Indices");
-    int axis = ctx.Attr<int>("axis");
-    bool descending = ctx.Attr<bool>("descending");
-
-    auto in_dims = input->dims();
-    axis = (axis < 0) ? (in_dims.size() + axis) : axis;
-
-    T* out_data = output->mutable_data<T>(ctx.GetPlace());
-
-    // Do full sort
-    if (axis == -1 || axis + 1 == in_dims.size()) {
-      const int64_t input_height =
-          phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1));
-      const int64_t input_width = in_dims[in_dims.size() - 1];
-
-      int64_t* ids_data = indices->mutable_data<int64_t>(ctx.GetPlace());
-      FullSort<T, int64_t>(input_height, input_width, in_dims.size(), input,
-                           out_data, ids_data, descending);
-    } else {
-      // If not full sort do transpose
-      std::vector<int> trans;
-      for (int i = 0; i < axis; i++) {
-        trans.push_back(i);
-      }
-      trans.push_back(in_dims.size() - 1);
-      for (int i = axis + 1; i < in_dims.size() - 1; i++) {
-        trans.push_back(i);
-      }
-      trans.push_back(axis);
-      framework::DDim trans_dims(in_dims);
-      for (size_t i = 0; i < trans.size(); i++) {
-        trans_dims[i] = in_dims[trans[i]];
-      }
-
-      Tensor trans_inp;
-      trans_inp.mutable_data<T>(trans_dims, ctx.GetPlace());
-      int ndims = trans.size();
-      auto& dev_ctx = ctx.template device_context<platform::CPUDeviceContext>();
-      // Do transpose
-      TransCompute<platform::CPUDeviceContext, T>(ndims, dev_ctx, *input,
-                                                  &trans_inp, trans);
-
-      const int64_t input_height =
-          phi::product(phi::slice_ddim(trans_dims, 0, trans_dims.size() - 1));
-      const int64_t input_width = trans_dims[trans_dims.size() - 1];
-
-      Tensor tmp_out;
-      T* t_out = tmp_out.mutable_data<T>(trans_dims, ctx.GetPlace());
-      output->mutable_data<T>(ctx.GetPlace());
-
-      Tensor tmp_indices;
-
-      auto* t_ind =
-          tmp_indices.mutable_data<int64_t>(trans_dims, ctx.GetPlace());
-
-      FullSort<T, int64_t>(input_height, input_width, in_dims.size(),
-                           &trans_inp, t_out, t_ind, descending);
-
-      indices->mutable_data<int64_t>(ctx.GetPlace());
-      TransCompute<platform::CPUDeviceContext, int64_t>(
-          ndims, dev_ctx, tmp_indices, indices, trans);
-      // transpose back
-      TransCompute<platform::CPUDeviceContext, T>(ndims, dev_ctx, tmp_out,
-                                                  output, trans);
-    }
-  }
-};
-
-template <typename DeviceContext, typename T>
-class ArgsortGradientKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* indices = ctx.Input<Tensor>("Indices");
-    auto* dX = ctx.Output<Tensor>(framework::GradVarName("X"));
-    auto* dO = ctx.Input<Tensor>(framework::GradVarName("Out"));
-    int axis = ctx.Attr<int>("axis");
-
-    auto in_dims = indices->dims();
-    axis = (axis < 0) ? (in_dims.size() + axis) : axis;
-
-    dX->mutable_data<T>(ctx.GetPlace());
-    auto dxt = framework::EigenVector<T>::Flatten(*dX);
-    auto& place = *ctx.template device_context<platform::CPUDeviceContext>()
-                       .eigen_device();
-    dxt.device(place) = dxt.constant(static_cast<T>(0));
-    if (dO->numel() == 0) return;
-
-    // Do full assign
-    if (axis == -1 || axis + 1 == in_dims.size()) {
-      const int64_t input_height =
-          phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1));
-      const int64_t input_width = in_dims[in_dims.size() - 1];
-
-      FullAssign<T, int64_t>(input_height, input_width, in_dims.size(), dO,
-                             indices, dX->data<T>());
-    } else {
-      // If not full assign do transpose
-      std::vector<int> trans;
-      for (int i = 0; i < axis; i++) {
-        trans.push_back(i);
-      }
-      trans.push_back(in_dims.size() - 1);
-      for (int i = axis + 1; i < in_dims.size() - 1; i++) {
-        trans.push_back(i);
-      }
-      trans.push_back(axis);
-      framework::DDim trans_dims(in_dims);
-      for (size_t i = 0; i < trans.size(); i++) {
-        trans_dims[i] = in_dims[trans[i]];
-      }
-
-      Tensor trans_dO;
-      trans_dO.mutable_data<T>(trans_dims, ctx.GetPlace());
-      Tensor trans_ind;
-      trans_ind.mutable_data<int64_t>(trans_dims, ctx.GetPlace());
-      int ndims = trans.size();
-      auto& dev_ctx = ctx.template device_context<platform::CPUDeviceContext>();
-      // Do transpose
-      TransCompute<platform::CPUDeviceContext, T>(ndims, dev_ctx, *dO,
-                                                  &trans_dO, trans);
-      TransCompute<platform::CPUDeviceContext, int64_t>(
-          ndims, dev_ctx, *indices, &trans_ind, trans);
-
-      const int64_t input_height =
-          phi::product(phi::slice_ddim(trans_dims, 0, trans_dims.size() - 1));
-      const int64_t input_width = trans_dims[trans_dims.size() - 1];
-
-      Tensor tmp_out;
-      T* t_out = tmp_out.mutable_data<T>(trans_dims, ctx.GetPlace());
-
-      FullAssign<T, int64_t>(input_height, input_width, in_dims.size(),
-                             &trans_dO, &trans_ind, t_out);
-
-      // transpose back
-      TransCompute<platform::CPUDeviceContext, T>(ndims, dev_ctx, tmp_out, dX,
-                                                  trans);
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/argsort_op_npu.cc b/paddle/fluid/operators/argsort_op_npu.cc
index 077be715bece0b4119dc0a578a1cba4631eb45f2..c927eec00bc8bf9e84ad1fb53a907ff8ec71acbc 100644
--- a/paddle/fluid/operators/argsort_op_npu.cc
+++ b/paddle/fluid/operators/argsort_op_npu.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/argsort_op.h"
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/device/npu/npu_op_runner.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/argsort_op_xpu.cc b/paddle/fluid/operators/argsort_op_xpu.cc
index 18e81936a16c63a1d2693dfb47dc618c3e707ae0..359b00fcf87ee1bee27e668ae3973fa39be19d76 100644
--- a/paddle/fluid/operators/argsort_op_xpu.cc
+++ b/paddle/fluid/operators/argsort_op_xpu.cc
@@ -14,7 +14,7 @@ limitations under the License. */
 
 #ifdef PADDLE_WITH_XPU
 
-#include "paddle/fluid/operators/argsort_op.h"
+#include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/assign_op_npu_test.cc b/paddle/fluid/operators/assign_op_npu_test.cc
index 72488a932d9c33cbfeddc9f35818e42ebe0137fa..b452dea8536dd98d6d4060d5224e39daf9137c50 100644
--- a/paddle/fluid/operators/assign_op_npu_test.cc
+++ b/paddle/fluid/operators/assign_op_npu_test.cc
@@ -23,7 +23,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/operators/dropout_op.h"
 #include "paddle/fluid/string/printf.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
diff --git a/paddle/fluid/operators/atan2_op.cc b/paddle/fluid/operators/atan2_op.cc
index 71a895c244c54f62c0af1745635c08fea35436c4..0783b30a8580db403255211d879d9400a1e82ab7 100644
--- a/paddle/fluid/operators/atan2_op.cc
+++ b/paddle/fluid/operators/atan2_op.cc
@@ -105,8 +105,8 @@ class Atan2OpVarTypeInference : public framework::VarTypeInference {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-DELCARE_INFER_SHAPE_FUNCTOR(atan2, Atan2InferShapeFunctor,
-                            PT_INFER_META(phi::Atan2InferMeta));
+DECLARE_INFER_SHAPE_FUNCTOR(atan2, Atan2InferShapeFunctor,
+                            PD_INFER_META(phi::Atan2InferMeta));
 REGISTER_OPERATOR(atan2, ops::Atan2Op, ops::Atan2OpMaker,
                   ops::Atan2GradMaker<paddle::framework::OpDesc>,
                   ops::Atan2GradMaker<paddle::imperative::OpBase>,
diff --git a/paddle/fluid/operators/attention_lstm_op.cc b/paddle/fluid/operators/attention_lstm_op.cc
index a23e484d0a88bb87febc6d320f9183ef50ea0ebc..78ea8b6b6fbebd7e0ca5ce14cc2cba6ff197177f 100644
--- a/paddle/fluid/operators/attention_lstm_op.cc
+++ b/paddle/fluid/operators/attention_lstm_op.cc
@@ -14,10 +14,10 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/attention_lstm_op.h"
 #include <string>
-#include "paddle/fluid/operators/math/cpu_vec.h"
 #include "paddle/fluid/operators/math/fc.h"
 #include "paddle/fluid/platform/cpu_info.h"
 #include "paddle/phi/kernels/funcs/blas/blas.h"
+#include "paddle/phi/kernels/funcs/cpu_vec.h"
 
 namespace paddle {
 namespace operators {
@@ -269,10 +269,10 @@ use lstm_x_t as input and compute as standard LSTM.
 template <typename T>
 inline void bias_relu(const int n, const T* x, const T* bias, T* y) {
   if (bias) {
-    math::vec_add_bias<T, platform::avx>(n, *bias, x, y);
-    math::vec_relu<T, platform::avx>(n, y, y);
+    phi::funcs::vec_add_bias<T, platform::avx>(n, *bias, x, y);
+    phi::funcs::vec_relu<T, platform::avx>(n, y, y);
   } else {
-    math::vec_relu<T, platform::avx>(n, x, y);
+    phi::funcs::vec_relu<T, platform::avx>(n, x, y);
   }
 }
 
@@ -283,14 +283,14 @@ inline void vec_softmax(const int n, const T* x, T* y) {
   for (int i = 1; i < n; ++i) {
     scalar = scalar < x[i] ? x[i] : scalar;
   }
-  math::vec_add_bias<T, platform::avx>(n, -scalar, x, y);  // sub
-  math::vec_exp<T>(n, y, y);                               // exp
+  phi::funcs::vec_add_bias<T, platform::avx>(n, -scalar, x, y);  // sub
+  phi::funcs::vec_exp<T>(n, y, y);                               // exp
   // sum
   scalar = T(0);
   for (int i = 0; i < n; ++i) {
     scalar += y[i];
   }
-  math::vec_scal<T>(n, static_cast<T>(1) / scalar, y);  // scale
+  phi::funcs::vec_scal<T>(n, static_cast<T>(1) / scalar, y);  // scale
 }
 
 template <typename T>
@@ -344,12 +344,12 @@ class AttentionLSTMKernel : public framework::OpKernel<T> {
     auto& act_cell_str = ctx.Attr<std::string>("cell_activation");
     auto& act_cand_str = ctx.Attr<std::string>("candidate_activation");
     if (platform::MayIUse(platform::avx)) {
-      math::VecActivations<T, platform::avx> act_functor;
+      phi::funcs::VecActivations<T, platform::avx> act_functor;
       act_gate = act_functor(act_gate_str);
       act_cell = act_functor(act_cell_str);
       act_cand = act_functor(act_cand_str);
     } else {
-      math::VecActivations<T, platform::isa_any> act_functor;
+      phi::funcs::VecActivations<T, platform::isa_any> act_functor;
       act_gate = act_functor(act_gate_str);
       act_cell = act_functor(act_cell_str);
       act_cand = act_functor(act_cand_str);
diff --git a/paddle/fluid/operators/batch_norm_op.cc b/paddle/fluid/operators/batch_norm_op.cc
index 949cf021cf0fa322970c210fa26f698fd2bc45b2..174207deb08b84194d6f20fe04e4c27245295caf 100644
--- a/paddle/fluid/operators/batch_norm_op.cc
+++ b/paddle/fluid/operators/batch_norm_op.cc
@@ -1289,15 +1289,3 @@ REGISTER_OPERATOR(batch_norm_grad, ops::BatchNormGradOp,
                   ops::BatchNormDoubleGradMaker<paddle::imperative::OpBase>);
 REGISTER_OPERATOR(batch_norm_grad_grad, ops::BatchNormDoubleGradOp,
                   ops::BatchNormDoubleGradOpInplaceInferer);
-
-REGISTER_OP_CPU_KERNEL(
-    batch_norm, ops::BatchNormKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::BatchNormKernel<paddle::platform::CPUDeviceContext, double>);
-REGISTER_OP_CPU_KERNEL(
-    batch_norm_grad,
-    ops::BatchNormGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::BatchNormGradKernel<paddle::platform::CPUDeviceContext, double>);
-REGISTER_OP_CPU_KERNEL(
-    batch_norm_grad_grad,
-    ops::BatchNormDoubleGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::BatchNormDoubleGradKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/batch_norm_op.cu b/paddle/fluid/operators/batch_norm_op.cu
index d59396db1517faadaa2dd9e9af770d2e8a23ec56..a19b087245a89a4a12f062b1ce27835b98ecfd66 100644
--- a/paddle/fluid/operators/batch_norm_op.cu
+++ b/paddle/fluid/operators/batch_norm_op.cu
@@ -41,1327 +41,5 @@ using CudnnDataType = platform::CudnnDataType<T>;
 template <typename T>
 using BatchNormParamType = typename CudnnDataType<T>::BatchNormParamType;
 
-template <typename T, framework::DataLayout layout>
-static __global__ void BNForwardInference(
-    const T *x, const BatchNormParamType<T> *mean,
-    const BatchNormParamType<T> *variance, const BatchNormParamType<T> *scale,
-    const BatchNormParamType<T> *bias, const int C, const int N, const int HxW,
-    const double epsilon, T *y) {
-  int gid = blockIdx.x * blockDim.x + threadIdx.x;
-  int stride = blockDim.x * gridDim.x;
-  int num = N * C * HxW;
-  for (int i = gid; i < num; i += stride) {
-    const int c = layout == framework::DataLayout::kNCHW ? i / HxW % C : i % C;
-    BatchNormParamType<T> x_sub_mean =
-        static_cast<BatchNormParamType<T>>(x[i]) - mean[c];
-    BatchNormParamType<T> inv_var = 1 / sqrt(variance[c] + epsilon);
-    y[i] = static_cast<T>(scale[c] * x_sub_mean * inv_var + bias[c]);
-  }
-}
-
-template <typename T, int BlockDim, framework::DataLayout layout>
-static __global__ LAUNCH_BOUNDS(BlockDim) void BNForwardTraining(
-    const T *x, const BatchNormParamType<T> *scale,
-    const BatchNormParamType<T> *bias, const int C, const int N, const int HxW,
-    const double epsilon, double exponentialAverageFactor, T *y,
-    BatchNormParamType<T> *mean, BatchNormParamType<T> *variance,
-    BatchNormParamType<T> *save_mean,
-    BatchNormParamType<T> *save_inv_variance) {
-  int outer_size = C;
-  int inner_size = N * HxW;
-  typedef cub::BlockReduce<BatchNormParamType<T>, BlockDim> BlockReduce;
-  __shared__ typename BlockReduce::TempStorage mean_storage;
-  __shared__ typename BlockReduce::TempStorage variance_storeage;
-  __shared__ BatchNormParamType<T> mean_val;
-  __shared__ BatchNormParamType<T> variance_val;
-  __shared__ BatchNormParamType<T> inv_var_val;
-
-  for (int i = blockIdx.x; i < outer_size; i += gridDim.x) {
-    BatchNormParamType<T> x_sum = static_cast<BatchNormParamType<T>>(0);
-    BatchNormParamType<T> x_square_sum = static_cast<BatchNormParamType<T>>(0);
-
-    for (int j = threadIdx.x; j < inner_size; j += blockDim.x) {
-      const int index = layout == framework::DataLayout::kNCHW
-                            ? (j / HxW * C + i) * HxW + j % HxW
-                            : j * outer_size + i;
-      BatchNormParamType<T> x_i = static_cast<BatchNormParamType<T>>(x[index]);
-      x_sum += x_i;
-      x_square_sum += x_i * x_i;
-    }
-    x_sum = BlockReduce(mean_storage).Reduce(x_sum, cub::Sum());
-    x_square_sum =
-        BlockReduce(variance_storeage).Reduce(x_square_sum, cub::Sum());
-    if (threadIdx.x == 0) {
-      mean_val = x_sum / inner_size;
-      variance_val = x_square_sum / inner_size - mean_val * mean_val;
-      inv_var_val = 1 / sqrt(variance_val + epsilon);
-
-      if (save_mean && save_inv_variance) {
-        save_mean[i] = mean_val;
-        save_inv_variance[i] = inv_var_val;
-      }
-      mean[i] = (1 - exponentialAverageFactor) * mean_val +
-                exponentialAverageFactor * mean[i];
-      variance[i] = (1 - exponentialAverageFactor) * variance_val +
-                    exponentialAverageFactor * variance[i];
-    }
-    __syncthreads();
-
-    for (int j = threadIdx.x; j < inner_size; j += blockDim.x) {
-      const int index = layout == framework::DataLayout::kNCHW
-                            ? (j / HxW * C + i) * HxW + j % HxW
-                            : j * outer_size + i;
-      BatchNormParamType<T> x_sub_mean =
-          static_cast<BatchNormParamType<T>>(x[index]) - mean_val;
-      y[index] = scale[i] * x_sub_mean * inv_var_val + bias[i];
-    }
-  }
-}
-
-template <typename T>
-class BatchNormKernel<platform::CUDADeviceContext, T>
-    : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    PADDLE_ENFORCE_EQ(
-        platform::is_gpu_place(ctx.GetPlace()), true,
-        platform::errors::InvalidArgument("It must use CUDAPlace."));
-    double epsilon = static_cast<double>(ctx.Attr<float>("epsilon"));
-    float momentum = ctx.Attr<float>("momentum");
-    const bool is_test = ctx.Attr<bool>("is_test");
-    const bool use_global_stats = ctx.Attr<bool>("use_global_stats");
-    const bool trainable_stats = ctx.Attr<bool>("trainable_statistics");
-    const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
-    const DataLayout data_layout =
-        framework::StringToDataLayout(data_layout_str);
-
-    bool test_mode = is_test && (!trainable_stats);
-
-    // Get the size for each dimension.
-    // NCHW [batch_size, in_channels, in_height, in_width]
-    const auto *x = ctx.Input<Tensor>("X");
-    const auto &x_dims = x->dims();
-    PADDLE_ENFORCE_EQ(
-        x_dims.size() >= 2 && x_dims.size() <= 5, true,
-        platform::errors::InvalidArgument(
-            "The size of input's dimensions should be between 2 and 5"
-            "But received: the size of input's dimensions is [%d]",
-            x_dims.size()));
-
-    auto *y = ctx.Output<Tensor>("Y");
-    y->mutable_data<T>(ctx.GetPlace());
-
-    int N, C, H, W, D;
-    ExtractNCWHD(x_dims, data_layout, &N, &C, &H, &W, &D);
-
-    auto dtype = platform::CudnnDataType<T>::type;
-
-#ifdef PADDLE_WITH_HIP
-    auto compute_format = data_layout == DataLayout::kNHWC ? DataLayout::kNHWC
-                                                           : DataLayout::kNCHW;
-
-// TODO(wangran16): wait for MIOpen to improve the performance of BN
-// HIP do not support compute format of NHWC
-// auto compute_format = DataLayout::kNCHW;
-#else
-    const bool fast_nhwc_batch_norm =
-        test_mode ||
-        (dtype == CUDNN_DATA_HALF && FLAGS_cudnn_batchnorm_spatial_persistent);
-
-    auto compute_format =
-        fast_nhwc_batch_norm && data_layout == DataLayout::kNHWC
-            ? DataLayout::kNHWC
-            : DataLayout::kNCHW;
-#endif
-
-    Tensor transformed_x(x->type());
-    Tensor transformed_y(y->type());
-    if (data_layout == DataLayout::kNHWC &&
-        compute_format == DataLayout::kNCHW && x_dims.size() > 2) {
-      VLOG(3) << "Transform input tensor from NHWC to NCHW.";
-      ResizeToChannelFirst<platform::CUDADeviceContext, T>(ctx, x,
-                                                           &transformed_x);
-      TransToChannelFirst<platform::CUDADeviceContext, T>(ctx, x,
-                                                          &transformed_x);
-      ResizeToChannelFirst<platform::CUDADeviceContext, T>(ctx, y,
-                                                           &transformed_y);
-    } else {
-      transformed_x.ShareDataWith(*x);
-      transformed_y.ShareDataWith(*y);
-    }
-
-// ------------------- cudnn descriptors ---------------------
-#ifdef PADDLE_WITH_HIP
-// TODO(wangran16): wait for MIOpen to improve the performance of BN
-// miopenTensorDescriptor_t data_desc_;
-// miopenTensorDescriptor_t bn_param_desc_;
-// miopenBatchNormMode_t mode_;
-
-// PADDLE_ENFORCE_GPU_SUCCESS(
-//     platform::dynload::miopenCreateTensorDescriptor(&data_desc_));
-// PADDLE_ENFORCE_GPU_SUCCESS(
-//     platform::dynload::miopenCreateTensorDescriptor(&bn_param_desc_));
-#else
-    cudnnTensorDescriptor_t data_desc_;
-    cudnnTensorDescriptor_t bn_param_desc_;
-    cudnnBatchNormMode_t mode_;
-
-    PADDLE_ENFORCE_GPU_SUCCESS(
-        platform::dynload::cudnnCreateTensorDescriptor(&data_desc_));
-    PADDLE_ENFORCE_GPU_SUCCESS(
-        platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_));
-#endif
-
-    if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) {
-      LOG(ERROR) << "Provided epsilon is smaller than "
-                 << "CUDNN_BN_MIN_EPSILON. Setting it to "
-                 << "CUDNN_BN_MIN_EPSILON instead.";
-    }
-    epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON);
-
-#ifdef PADDLE_WITH_HIP
-// TODO(wangran16): wait for MIOpen to improve the performance of BN
-// mode_ = miopenBNSpatial;
-#elif CUDNN_VERSION_MIN(7, 0, 1)
-    if (FLAGS_cudnn_batchnorm_spatial_persistent) {
-      mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT;
-    } else if (H == 1 && W == 1) {
-      mode_ = CUDNN_BATCHNORM_PER_ACTIVATION;
-    } else {
-      mode_ = CUDNN_BATCHNORM_SPATIAL;
-    }
-#else
-    if (H == 1 && W == 1) {
-      mode_ = CUDNN_BATCHNORM_PER_ACTIVATION;
-    } else {
-      mode_ = CUDNN_BATCHNORM_SPATIAL;
-    }
-#endif  // CUDNN_VERSION_MIN(7, 0, 1)
-
-    VLOG(3) << "Setting descriptors.";
-    std::vector<int> dims;
-    std::vector<int> strides;
-    if (compute_format == DataLayout::kNCHW) {
-      dims = {N, C, H, W, D};
-      strides = {C * H * W * D, H * W * D, W * D, D, 1};
-    } else {
-      dims = {N, C, H, W, D};
-      strides = {H * W * D * C, 1, W * D * C, D * C, C};
-    }
-
-#ifdef PADDLE_WITH_HIP
-// TODO(wangran16): wait for MIOpen to improve the performance of BN
-// PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSetTensorDescriptor(
-//     data_desc_, CudnnDataType<T>::type,
-//     x_dims.size() > 3 ? x_dims.size() : 4, const_cast<int *>(dims.data()),
-//     const_cast<int *>(strides.data())));
-// Note: PERSISTENT not implemented for inference
-// PADDLE_ENFORCE_GPU_SUCCESS(
-//     platform::dynload::miopenDeriveBNTensorDescriptor(
-//         bn_param_desc_, data_desc_, test_mode ? miopenBNSpatial : mode_));
-#else
-    PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
-        data_desc_, CudnnDataType<T>::type,
-        x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data()));
-    // Note: PERSISTENT not implemented for inference
-    PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnDeriveBNTensorDescriptor(
-        bn_param_desc_, data_desc_,
-        test_mode ? CUDNN_BATCHNORM_SPATIAL : mode_));
-#endif
-
-    const auto *scale = ctx.Input<Tensor>("Scale");
-    const auto *bias = ctx.Input<Tensor>("Bias");
-
-    auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
-
-    auto handle = dev_ctx.cudnn_handle();
-
-    // Now, depending on whether we are running test or not, we have two paths.
-    // It is training mode when it's not reference AND not using pre-trained
-    // model.
-    bool training = !test_mode && !use_global_stats;
-    if (!training) {
-      // only when test we use input to do computation.
-      const auto *est_mean = ctx.Input<Tensor>("Mean");
-      const auto *est_var = ctx.Input<Tensor>("Variance");
-      // Run inference mode.
-      PADDLE_ENFORCE_EQ(
-          est_mean->dims().size(), 1UL,
-          platform::errors::InvalidArgument(
-              "The size of mean's dimensions must equal to 1."
-              "But received: the size of mean's dimensions mean is [%d],"
-              "the dimensions of mean is [%s].",
-              est_mean->dims().size(), est_mean->dims()));
-      PADDLE_ENFORCE_EQ(
-          est_var->dims().size(), 1UL,
-          platform::errors::InvalidArgument(
-              "The size of variance's dimensions must equal to 1."
-              "But received: the size of variance's dimensions is [%d],"
-              "the dimensions of variance is [%s].",
-              est_var->dims().size(), est_var->dims()));
-      PADDLE_ENFORCE_EQ(
-          est_mean->dims()[0], C,
-          platform::errors::InvalidArgument(
-              "The first dimension of mean must equal to the number of "
-              "Channels, which is [%d]. But received: the first dimension"
-              "of mean is [%d], the dimensions of mean is [%s].",
-              C, est_mean->dims()[0], est_mean->dims()));
-      PADDLE_ENFORCE_EQ(
-          est_var->dims()[0], C,
-          platform::errors::InvalidArgument(
-              "The first dimension of variance must equal to the number"
-              "of Channels, which is [%d]. But received: the first dimension of"
-              "variance is [%d], the dimensions of variance is [%s].",
-              C, est_var->dims()[0], est_var->dims()));
-
-#ifdef PADDLE_WITH_HIP
-      const int block_size = 256;
-      const int grid_size = (N * C * H * W * D + block_size - 1) / block_size;
-      if (compute_format == DataLayout::kNCHW) {
-        BNForwardInference<
-            T,
-            DataLayout::kNCHW><<<grid_size, block_size, 0, dev_ctx.stream()>>>(
-            transformed_x.template data<T>(),
-            est_mean->template data<BatchNormParamType<T>>(),
-            est_var->template data<BatchNormParamType<T>>(),
-            scale->template data<BatchNormParamType<T>>(),
-            bias->template data<BatchNormParamType<T>>(), C, N, H * W * D,
-            epsilon, transformed_y.template data<T>());
-      } else {
-        BNForwardInference<
-            T,
-            DataLayout::kNHWC><<<grid_size, block_size, 0, dev_ctx.stream()>>>(
-            transformed_x.template data<T>(),
-            est_mean->template data<BatchNormParamType<T>>(),
-            est_var->template data<BatchNormParamType<T>>(),
-            scale->template data<BatchNormParamType<T>>(),
-            bias->template data<BatchNormParamType<T>>(), C, N, H * W * D,
-            epsilon, transformed_y.template data<T>());
-      }
-
-// TODO(wangran16): wait for MIOpen to improve the performance of BN
-// PADDLE_ENFORCE_GPU_SUCCESS(
-//     platform::dynload::miopenBatchNormalizationForwardInference(
-//         handle, miopenBNSpatial,
-//         const_cast<void *>(
-//             static_cast<const void *>(CudnnDataType<T>::kOne())),
-//         const_cast<void *>(
-//             static_cast<const void *>(CudnnDataType<T>::kZero())),
-//         data_desc_,
-//         static_cast<const void *>(transformed_x.template data<T>()),
-//         data_desc_,
-//         static_cast<void *>(
-//             transformed_y.template mutable_data<T>(ctx.GetPlace())),
-//         bn_param_desc_,
-//         const_cast<void *>(static_cast<const void *>(
-//             scale->template data<BatchNormParamType<T>>())),
-//         const_cast<void *>(static_cast<const void *>(
-//             bias->template data<BatchNormParamType<T>>())),
-//         const_cast<void *>(static_cast<const void *>(
-//             est_mean->template data<BatchNormParamType<T>>())),
-//         const_cast<void *>(static_cast<const void *>(
-//             est_var->template data<BatchNormParamType<T>>())),
-//         epsilon));
-#else
-      PADDLE_ENFORCE_GPU_SUCCESS(
-          platform::dynload::cudnnBatchNormalizationForwardInference(
-              handle,
-              // Note: PERSISTENT not implemented for inference
-              CUDNN_BATCHNORM_SPATIAL, CudnnDataType<T>::kOne(),
-              CudnnDataType<T>::kZero(), data_desc_,
-              transformed_x.template data<T>(), data_desc_,
-              transformed_y.template mutable_data<T>(ctx.GetPlace()),
-              bn_param_desc_, scale->template data<BatchNormParamType<T>>(),
-              bias->template data<BatchNormParamType<T>>(),
-              est_mean->template data<BatchNormParamType<T>>(),
-              est_var->template data<BatchNormParamType<T>>(), epsilon));
-#endif
-    } else {
-      // if MomentumTensor is set, use MomentumTensor value, momentum
-      // is only used in this training branch
-      if (ctx.HasInput("MomentumTensor")) {
-        const auto *mom_tensor = ctx.Input<Tensor>("MomentumTensor");
-        Tensor mom_cpu;
-        paddle::framework::TensorCopySync(*mom_tensor, platform::CPUPlace(),
-                                          &mom_cpu);
-        momentum = mom_cpu.data<float>()[0];
-      }
-
-      // Run training mode.
-      // obtain running mean and running inv var, and there is no need
-      // to initialize them.
-
-      auto *mean_out = ctx.Output<Tensor>("MeanOut");
-      auto *variance_out = ctx.Output<Tensor>("VarianceOut");
-      mean_out->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
-      variance_out->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
-
-      auto *saved_mean = ctx.Output<Tensor>("SavedMean");
-      auto *saved_variance = ctx.Output<Tensor>("SavedVariance");
-      saved_mean->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
-      saved_variance->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
-
-      if ((N * H * W * D) == 1) {
-        // Only 1 element in normalization dimension,
-        // skip the batch norm calculation, let y = x.
-        framework::TensorCopy(*x, ctx.GetPlace(), y);
-      } else {
-        double this_factor = 1. - momentum;
-
-        bool called = false;
-#if CUDNN_VERSION_MIN(7, 4, 1)
-        called = true;
-        size_t workspace_size = 0;
-        size_t reserve_space_size = 0;
-        void *reserve_space_ptr = nullptr;
-        void *workspace_ptr = nullptr;
-        Tensor workspace_tensor;
-        // Create reserve space and workspace for batch norm.
-        // Create tensor for each batchnorm op, it will be used in the
-        // backward. Thus this tensor shouldn't be temp.
-        auto *reserve_space = ctx.Output<Tensor>("ReserveSpace");
-        PADDLE_ENFORCE_NOT_NULL(
-            reserve_space,
-            platform::errors::NotFound(
-                "The argument ReserveSpace of batch_norm op is not found."));
-
-        // --------------- cudnn batchnorm workspace ---------------
-        PADDLE_ENFORCE_GPU_SUCCESS(
-            platform::dynload::
-                cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize(
-                    /*handle=*/handle,
-                    /*mode=*/mode_,
-                    /*bnIps=*/CUDNN_BATCHNORM_OPS_BN,
-                    /*xDesc=*/data_desc_,
-                    /*zDesc=*/nullptr,
-                    /*yDesc=*/data_desc_,
-                    /*bnScaleBiasMeanVarDesc=*/bn_param_desc_,
-                    /*activationDesc=*/nullptr,
-                    /*sizeInBytes=*/&workspace_size));
-
-        // -------------- cudnn batchnorm reserve space --------------
-        PADDLE_ENFORCE_GPU_SUCCESS(
-            platform::dynload::
-                cudnnGetBatchNormalizationTrainingExReserveSpaceSize(
-                    /*handle=*/handle,
-                    /*mode=*/mode_,
-                    /*bnOps=*/CUDNN_BATCHNORM_OPS_BN,
-                    /*activationDesc=*/nullptr,
-                    /*xDesc=*/data_desc_,
-                    /*sizeInBytes=*/&reserve_space_size));
-
-        reserve_space_ptr = reserve_space->mutable_data(
-            ctx.GetPlace(), transformed_x.type(), reserve_space_size);
-        workspace_ptr = workspace_tensor.mutable_data(
-            ctx.GetPlace(), transformed_x.type(), workspace_size);
-        PADDLE_ENFORCE_GPU_SUCCESS(
-            platform::dynload::cudnnBatchNormalizationForwardTrainingEx(
-                handle, mode_, CUDNN_BATCHNORM_OPS_BN, CudnnDataType<T>::kOne(),
-                CudnnDataType<T>::kZero(), data_desc_,
-                transformed_x.template data<T>(), nullptr, nullptr, data_desc_,
-                transformed_y.template data<T>(), bn_param_desc_,
-                scale->template data<BatchNormParamType<T>>(),
-                bias->template data<BatchNormParamType<T>>(), this_factor,
-                mean_out->template mutable_data<BatchNormParamType<T>>(
-                    ctx.GetPlace()),
-                variance_out->template mutable_data<BatchNormParamType<T>>(
-                    ctx.GetPlace()),
-                epsilon,
-                saved_mean->template mutable_data<BatchNormParamType<T>>(
-                    ctx.GetPlace()),
-                saved_variance->template mutable_data<BatchNormParamType<T>>(
-                    ctx.GetPlace()),
-                nullptr, workspace_ptr, workspace_size, reserve_space_ptr,
-                reserve_space_size));
-#endif  // CUDNN_VERSION_MIN(7, 4, 1)
-        if (!called) {
-#ifdef PADDLE_WITH_HIP
-          const int num = transformed_x.numel();
-          const int block = 256;
-          const int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
-          const int max_blocks = std::max(max_threads / block, 1);
-          const int grid = std::min(C, max_blocks);
-          if (compute_format == DataLayout::kNCHW) {
-            BNForwardTraining<
-                T, block,
-                DataLayout::kNCHW><<<grid, block, 0, dev_ctx.stream()>>>(
-                transformed_x.template data<T>(),
-                scale->template data<BatchNormParamType<T>>(),
-                bias->template data<BatchNormParamType<T>>(), C, N, H * W * D,
-                epsilon, this_factor, transformed_y.template data<T>(),
-                mean_out->template data<BatchNormParamType<T>>(),
-                variance_out->template data<BatchNormParamType<T>>(),
-                saved_mean->template data<BatchNormParamType<T>>(),
-                saved_variance->template data<BatchNormParamType<T>>());
-          } else {
-            BNForwardTraining<
-                T, block,
-                DataLayout::kNHWC><<<grid, block, 0, dev_ctx.stream()>>>(
-                transformed_x.template data<T>(),
-                scale->template data<BatchNormParamType<T>>(),
-                bias->template data<BatchNormParamType<T>>(), C, N, H * W * D,
-                epsilon, this_factor, transformed_y.template data<T>(),
-                mean_out->template data<BatchNormParamType<T>>(),
-                variance_out->template data<BatchNormParamType<T>>(),
-                saved_mean->template data<BatchNormParamType<T>>(),
-                saved_variance->template data<BatchNormParamType<T>>());
-          }
-
-// TODO(wangran16): wait for MIOpen to improve the performance of BN
-// PADDLE_ENFORCE_GPU_SUCCESS(
-//     platform::dynload::miopenBatchNormalizationForwardTraining(
-//         handle, mode_, const_cast<void *>(static_cast<const void *>(
-//                            CudnnDataType<T>::kOne())),
-//         const_cast<void *>(
-//             static_cast<const void *>(CudnnDataType<T>::kZero())),
-//         data_desc_,
-//         static_cast<const void *>(transformed_x.template data<T>()),
-//         data_desc_,
-//         static_cast<void *>(
-//             transformed_y.template mutable_data<T>(ctx.GetPlace())),
-//         bn_param_desc_,
-//         const_cast<void *>(static_cast<const void *>(
-//             scale->template data<BatchNormParamType<T>>())),
-//         const_cast<void *>(static_cast<const void *>(
-//             bias->template data<BatchNormParamType<T>>())),
-//         this_factor,
-//         static_cast<void *>(
-//             mean_out->template mutable_data<BatchNormParamType<T>>(
-//                 ctx.GetPlace())),
-//         static_cast<void *>(variance_out->template mutable_data<
-//                             BatchNormParamType<T>>(ctx.GetPlace())),
-//         epsilon,
-//         static_cast<void *>(
-//             saved_mean->template mutable_data<BatchNormParamType<T>>(
-//                 ctx.GetPlace())),
-//         static_cast<void *>(saved_variance->template mutable_data<
-//                             BatchNormParamType<T>>(ctx.GetPlace()))));
-#else
-          PADDLE_ENFORCE_GPU_SUCCESS(
-              platform::dynload::cudnnBatchNormalizationForwardTraining(
-                  handle, mode_, CudnnDataType<T>::kOne(),
-                  CudnnDataType<T>::kZero(), data_desc_,
-                  transformed_x.template data<T>(), data_desc_,
-                  transformed_y.template mutable_data<T>(ctx.GetPlace()),
-                  bn_param_desc_, scale->template data<BatchNormParamType<T>>(),
-                  bias->template data<BatchNormParamType<T>>(), this_factor,
-                  mean_out->template mutable_data<BatchNormParamType<T>>(
-                      ctx.GetPlace()),
-                  variance_out->template mutable_data<BatchNormParamType<T>>(
-                      ctx.GetPlace()),
-                  epsilon,
-                  saved_mean->template mutable_data<BatchNormParamType<T>>(
-                      ctx.GetPlace()),
-                  saved_variance->template mutable_data<BatchNormParamType<T>>(
-                      ctx.GetPlace())));
-#endif
-        }
-      }
-    }
-
-    if (data_layout == DataLayout::kNHWC &&
-        compute_format == DataLayout::kNCHW && x_dims.size() > 2) {
-      VLOG(3) << "Transform batchnorm output from NCHW to NHWC";
-      TransToChannelLast<paddle::platform::CUDADeviceContext, T>(
-          ctx, &transformed_y, y);
-    }
-#ifdef PADDLE_WITH_HIP
-// TODO(wangran16): wait for MIOpen to improve the performance of BN
-// clean when exit.
-// PADDLE_ENFORCE_GPU_SUCCESS(
-//     platform::dynload::miopenDestroyTensorDescriptor(data_desc_));
-// PADDLE_ENFORCE_GPU_SUCCESS(
-//     platform::dynload::miopenDestroyTensorDescriptor(bn_param_desc_));
-#else
-    // clean when exit.
-    PADDLE_ENFORCE_GPU_SUCCESS(
-        platform::dynload::cudnnDestroyTensorDescriptor(data_desc_));
-    PADDLE_ENFORCE_GPU_SUCCESS(
-        platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_));
-#endif
-  }
-};
-
-template <typename T, int BlockDim, framework::DataLayout layout>
-static __global__ LAUNCH_BOUNDS(BlockDim) void KeBNBackwardScaleBias(
-    const T *dy, const T *x, const BatchNormParamType<T> *mean,
-    const BatchNormParamType<T> *variance, const double epsilon, const int N,
-    const int C, const int HxW, BatchNormParamType<T> *dscale,
-    BatchNormParamType<T> *dbias) {
-  const int outer_size = C;
-  const int inner_size = N * HxW;
-  typedef cub::BlockReduce<BatchNormParamType<T>, BlockDim> BlockReduce;
-  __shared__ typename BlockReduce::TempStorage ds_storage;
-  __shared__ typename BlockReduce::TempStorage db_storage;
-
-  for (int i = blockIdx.x; i < outer_size; i += gridDim.x) {
-    BatchNormParamType<T> ds_sum = static_cast<BatchNormParamType<T>>(0);
-    BatchNormParamType<T> db_sum = static_cast<BatchNormParamType<T>>(0);
-
-    BatchNormParamType<T> inv_var_i = 1.0 / sqrt(variance[i] + epsilon);
-    BatchNormParamType<T> mean_i = mean[i];
-    for (int j = threadIdx.x; j < inner_size; j += blockDim.x) {
-      const int index = layout == framework::DataLayout::kNCHW
-                            ? (j / HxW * C + i) * HxW + j % HxW
-                            : j * outer_size + i;
-      ds_sum += static_cast<BatchNormParamType<T>>(dy[index]) *
-                (static_cast<BatchNormParamType<T>>(x[index]) - mean_i);
-      db_sum += static_cast<BatchNormParamType<T>>(dy[index]);
-    }
-    ds_sum = BlockReduce(ds_storage).Reduce(ds_sum, cub::Sum());
-    db_sum = BlockReduce(db_storage).Reduce(db_sum, cub::Sum());
-    if (threadIdx.x == 0) {
-      dscale[i] = ds_sum * inv_var_i;
-      dbias[i] = db_sum;
-    }
-    __syncthreads();
-  }
-}
-
-template <typename T, framework::DataLayout layout>
-static __global__ void KeBNBackwardData(const T *dy,
-                                        const BatchNormParamType<T> *scale,
-                                        const BatchNormParamType<T> *variance,
-                                        const double epsilon, const int C,
-                                        const int HxW, const int num, T *dx) {
-  int gid = blockIdx.x * blockDim.x + threadIdx.x;
-  int stride = blockDim.x * gridDim.x;
-  for (int i = gid; i < num; i += stride) {
-    const int c = layout == framework::DataLayout::kNCHW ? i / HxW % C : i % C;
-    BatchNormParamType<T> inv_var = 1.0 / sqrt(variance[c] + epsilon);
-    dx[i] = static_cast<T>(static_cast<BatchNormParamType<T>>(dy[i]) *
-                           scale[c] * inv_var);
-  }
-}
-
-template <typename T>
-static __global__ void KeBNRestoreData(const framework::DataLayout layout, T *x,
-                                       const BatchNormParamType<T> *scale,
-                                       const BatchNormParamType<T> *bias,
-                                       const BatchNormParamType<T> *mean,
-                                       const BatchNormParamType<T> *variance,
-                                       double epsilon, int C, int M,
-                                       const int num, const T *y) {
-  int gid = blockIdx.x * blockDim.x + threadIdx.x;
-  int stride = blockDim.x * gridDim.x;
-  for (int i = gid; i < num; i += stride) {
-    const int c = layout == framework::DataLayout::kNCHW ? (i / M) % C : i % C;
-    auto y_i = static_cast<BatchNormParamType<T>>(y[i]);
-    auto x_i = (y_i - bias[c]) / scale[c] / variance[c] + mean[c];
-    x[i] = static_cast<T>(x_i);
-  }
-}
-
-template <typename T>
-class InplaceHelper {
- public:
-  void operator()(const framework::DataLayout layout, T *x,
-                  const BatchNormParamType<T> *scale,
-                  const BatchNormParamType<T> *bias,
-                  const BatchNormParamType<T> *mean,
-                  const BatchNormParamType<T> *variance, double epsilon, int C,
-                  int M, const int num, const T *y, int grid2, const int block,
-                  const gpuStream_t &stream) {
-    PADDLE_ENFORCE_EQ(x, y, platform::errors::InvalidArgument(
-                                "X and Y should be inplaced in inplace mode"));
-    KeBNRestoreData<<<grid2, block, 0, stream>>>(
-        layout, x, scale, bias, mean, variance, epsilon, C, M, num, y);
-  }
-};
-
-template <typename T, int BlockDim, framework::DataLayout layout>
-static __global__ LAUNCH_BOUNDS(BlockDim) void BNBackward(
-    const T *dy, const T *x, const BatchNormParamType<T> *scale,
-    const BatchNormParamType<T> *saved_mean,
-    const BatchNormParamType<T> *saved_inv_variance, const int C, const int N,
-    const int HxW, const double epsilon, T *dx, BatchNormParamType<T> *dscale,
-    BatchNormParamType<T> *dbias) {
-  const int outer_size = C;
-  const int inner_size = N * HxW;
-  typedef cub::BlockReduce<BatchNormParamType<T>, BlockDim> BlockReduce;
-  __shared__ typename BlockReduce::TempStorage ds_storage;
-  __shared__ typename BlockReduce::TempStorage db_storage;
-  __shared__ typename BlockReduce::TempStorage mean_storage;
-  __shared__ typename BlockReduce::TempStorage variance_storeage;
-  __shared__ BatchNormParamType<T> inv_var_val;
-  __shared__ BatchNormParamType<T> mean_val;
-  __shared__ BatchNormParamType<T> dscale_val;
-  __shared__ BatchNormParamType<T> dbias_val;
-
-  for (int i = blockIdx.x; i < outer_size; i += gridDim.x) {
-    BatchNormParamType<T> ds_sum = static_cast<BatchNormParamType<T>>(0);
-    BatchNormParamType<T> db_sum = static_cast<BatchNormParamType<T>>(0);
-
-    if (saved_mean && saved_inv_variance) {
-      if (threadIdx.x == 0) {
-        inv_var_val = saved_inv_variance[i];
-        mean_val = saved_mean[i];
-      }
-    } else {
-      BatchNormParamType<T> x_sum = static_cast<BatchNormParamType<T>>(0);
-      BatchNormParamType<T> x_square_sum =
-          static_cast<BatchNormParamType<T>>(0);
-
-      for (int j = threadIdx.x; j < inner_size; j += blockDim.x) {
-        const int index = layout == framework::DataLayout::kNCHW
-                              ? (j / HxW * C + i) * HxW + j % HxW
-                              : j * outer_size + i;
-        BatchNormParamType<T> x_i =
-            static_cast<BatchNormParamType<T>>(x[index]);
-        x_sum += x_i;
-        x_square_sum += x_i * x_i;
-      }
-      x_sum = BlockReduce(mean_storage).Reduce(x_sum, cub::Sum());
-      x_square_sum =
-          BlockReduce(variance_storeage).Reduce(x_square_sum, cub::Sum());
-      if (threadIdx.x == 0) {
-        mean_val = x_sum / inner_size;
-        inv_var_val =
-            1 / sqrt(x_square_sum / inner_size - mean_val * mean_val + epsilon);
-      }
-    }
-    __syncthreads();
-
-    for (int j = threadIdx.x; j < inner_size; j += blockDim.x) {
-      const int index = layout == framework::DataLayout::kNCHW
-                            ? (j / HxW * C + i) * HxW + j % HxW
-                            : j * outer_size + i;
-      BatchNormParamType<T> dy_i =
-          static_cast<BatchNormParamType<T>>(dy[index]);
-      ds_sum +=
-          dy_i * (static_cast<BatchNormParamType<T>>(x[index]) - mean_val);
-      db_sum += dy_i;
-    }
-    ds_sum = BlockReduce(ds_storage).Reduce(ds_sum, cub::Sum());
-    db_sum = BlockReduce(db_storage).Reduce(db_sum, cub::Sum());
-    if (threadIdx.x == 0) {
-      dscale_val = ds_sum * inv_var_val;
-      dbias_val = db_sum;
-      dscale[i] = dscale_val;
-      dbias[i] = dbias_val;
-    }
-    __syncthreads();
-
-    for (int j = threadIdx.x; j < inner_size; j += blockDim.x) {
-      const int index = layout == framework::DataLayout::kNCHW
-                            ? (j / HxW * C + i) * HxW + j % HxW
-                            : j * outer_size + i;
-      dx[index] = scale[i] * inv_var_val *
-                  (static_cast<BatchNormParamType<T>>(dy[index]) -
-                   dbias_val / static_cast<BatchNormParamType<T>>(inner_size) -
-                   (static_cast<BatchNormParamType<T>>(x[index]) - mean_val) *
-                       inv_var_val * dscale_val / inner_size);
-    }
-  }
-}
-
-template <typename T, int BlockDim, framework::DataLayout layout>
-static __global__ LAUNCH_BOUNDS(BlockDim) void BNBackwardData(
-    const T *dy, const BatchNormParamType<T> *scale,
-    const BatchNormParamType<T> *mean, const T *x,
-    const BatchNormParamType<T> *variance, const int C, const int N,
-    const int HxW, T *dx) {
-  const int outer_size = C;
-  const int inner_size = N * HxW;
-  typedef cub::BlockReduce<BatchNormParamType<T>, BlockDim> BlockReduce;
-  __shared__ typename BlockReduce::TempStorage dy_storage;
-  __shared__ typename BlockReduce::TempStorage dy_x_sub_mean_storage;
-  __shared__ BatchNormParamType<T> dy_sum_val;
-  __shared__ BatchNormParamType<T> dy_x_sub_mean_sum_val;
-
-  for (int i = blockIdx.x; i < outer_size; i += gridDim.x) {
-    BatchNormParamType<T> inv_var_i = variance[i];
-    BatchNormParamType<T> mean_i = mean[i];
-    BatchNormParamType<T> dy_sum = static_cast<BatchNormParamType<T>>(0);
-    BatchNormParamType<T> dy_x_sub_mean_sum =
-        static_cast<BatchNormParamType<T>>(0);
-    for (int j = threadIdx.x; j < inner_size; j += blockDim.x) {
-      const int index = layout == framework::DataLayout::kNCHW
-                            ? (j / HxW * C + i) * HxW + j % HxW
-                            : j * outer_size + i;
-      BatchNormParamType<T> dy_i =
-          static_cast<BatchNormParamType<T>>(dy[index]);
-      dy_sum += dy_i;
-      dy_x_sub_mean_sum +=
-          dy_i * (static_cast<BatchNormParamType<T>>(x[index]) - mean_i);
-    }
-
-    dy_sum = BlockReduce(dy_storage).Reduce(dy_sum, cub::Sum());
-    dy_x_sub_mean_sum = BlockReduce(dy_x_sub_mean_storage)
-                            .Reduce(dy_x_sub_mean_sum, cub::Sum());
-
-    if (threadIdx.x == 0) {
-      dy_sum_val = dy_sum;
-      dy_x_sub_mean_sum_val = dy_x_sub_mean_sum;
-    }
-    __syncthreads();
-    for (int j = threadIdx.x; j < inner_size; j += blockDim.x) {
-      const int index = layout == framework::DataLayout::kNCHW
-                            ? (j / HxW * C + i) * HxW + j % HxW
-                            : j * outer_size + i;
-      dx[index] =
-          (static_cast<BatchNormParamType<T>>(dy[index]) -
-           dy_sum_val / static_cast<BatchNormParamType<T>>(inner_size) -
-           (static_cast<BatchNormParamType<T>>(x[index]) - mean_i) *
-               dy_x_sub_mean_sum_val * inv_var_i * inv_var_i / inner_size) *
-          scale[i] * inv_var_i;
-    }
-  }
-}
-
-template <typename T>
-class BatchNormGradKernel<platform::CUDADeviceContext, T>
-    : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    PADDLE_ENFORCE_EQ(
-        platform::is_gpu_place(ctx.GetPlace()), true,
-        platform::errors::InvalidArgument("It must use CUDAPlace."));
-    double epsilon = static_cast<double>(ctx.Attr<float>("epsilon"));
-    const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
-    bool use_global_stats = ctx.Attr<bool>("use_global_stats");
-
-    const DataLayout data_layout =
-        framework::StringToDataLayout(data_layout_str);
-    const auto *d_y = ctx.Input<Tensor>(framework::GradVarName("Y"));
-    const auto *scale = ctx.Input<Tensor>("Scale");
-    const auto *bias = ctx.Input<Tensor>("Bias");
-
-    auto *d_x = ctx.Output<Tensor>(framework::GradVarName("X"));
-    auto *d_scale = ctx.Output<Tensor>(framework::GradVarName("Scale"));
-    auto *d_bias = ctx.Output<Tensor>(framework::GradVarName("Bias"));
-
-    // batch_norm with inplace as false will take X as grad input, which
-    // is same as cuDNN batch_norm backward calculation, batch_norm
-    // with inplace as true only take Y as input and X should be calculate
-    // by inverse operation of batch_norm on Y
-    const Tensor *x;
-    bool is_inplace;
-    if (ctx.HasInput("Y")) {
-      x = ctx.Input<Tensor>("Y");
-      is_inplace = true;
-      if (d_x) {
-        PADDLE_ENFORCE_EQ(d_x, d_y,
-                          platform::errors::InvalidArgument(
-                              "X@GRAD and Y@GRAD not inplace in inplace mode"));
-      }
-    } else {
-      x = ctx.Input<Tensor>("X");
-      is_inplace = false;
-      if (d_x) {
-        PADDLE_ENFORCE_NE(
-            d_x, d_y, platform::errors::InvalidArgument(
-                          "X@GRAD and Y@GRAD inplaced in non-inplace mode"));
-      }
-    }
-
-    const bool is_test = ctx.Attr<bool>("is_test");
-    use_global_stats = is_test || use_global_stats;
-
-    const auto &x_dims = x->dims();
-
-    PADDLE_ENFORCE_EQ(
-        x_dims.size() >= 2 && x_dims.size() <= 5, true,
-        platform::errors::InvalidArgument(
-            "The size of input's dimensions should be between 2 and 5."
-            "But received: the size of input's dimensions is [%d],"
-            "the dimensions of input is [%s]",
-            x_dims.size(), x_dims));
-    int N, C, H, W, D;
-    ExtractNCWHD(x_dims, data_layout, &N, &C, &H, &W, &D);
-
-    // init output
-    if (d_x) {
-      d_x->mutable_data<T>(ctx.GetPlace());
-    }
-
-    if (d_scale && d_bias) {
-      d_scale->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
-      d_bias->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
-    }
-    PADDLE_ENFORCE_EQ(
-        scale->dims().size(), 1UL,
-        platform::errors::InvalidArgument(
-            "The size of scale's dimensions must equal to 1. But received: "
-            "the size of scale's dimensions is [%d], the dimensions of scale "
-            "is [%s].",
-            scale->dims().size(), scale->dims()));
-    PADDLE_ENFORCE_EQ(
-        scale->dims()[0], C,
-        platform::errors::InvalidArgument(
-            "The first dimension of scale must equal to Channels[%d]. But "
-            "received: the first dimension of scale is [%d]",
-            C, scale->dims()[0]));
-
-    auto dtype = platform::CudnnDataType<T>::type;
-    const auto *reserve_space = ctx.Input<Tensor>("ReserveSpace");
-#ifdef PADDLE_WITH_HIP
-    auto compute_format = data_layout == DataLayout::kNHWC ? DataLayout::kNHWC
-                                                           : DataLayout::kNCHW;
-
-// TODO(wangran16): wait for MIOpen to improve the performance of BN
-// HIP do not support compute format of NHWC
-// auto compute_format = DataLayout::kNCHW;
-#else
-    const bool fast_nhwc_batch_norm =
-        dtype == CUDNN_DATA_HALF && FLAGS_cudnn_batchnorm_spatial_persistent &&
-        reserve_space != nullptr;
-    auto compute_format =
-        fast_nhwc_batch_norm && data_layout == DataLayout::kNHWC
-            ? DataLayout::kNHWC
-            : DataLayout::kNCHW;
-#endif
-
-    Tensor transformed_x(x->type());
-    Tensor transformed_d_y(d_y->type());
-    Tensor transformed_d_x;
-    if (data_layout == DataLayout::kNHWC &&
-        compute_format == DataLayout::kNCHW && x_dims.size() > 2) {
-      VLOG(3) << "Transform input tensor from NHWC to NCHW.";
-      ResizeToChannelFirst<platform::CUDADeviceContext, T>(ctx, x,
-                                                           &transformed_x);
-      TransToChannelFirst<platform::CUDADeviceContext, T>(ctx, x,
-                                                          &transformed_x);
-      ResizeToChannelFirst<platform::CUDADeviceContext, T>(ctx, d_y,
-                                                           &transformed_d_y);
-      TransToChannelFirst<platform::CUDADeviceContext, T>(ctx, d_y,
-                                                          &transformed_d_y);
-      if (d_x) {
-        ResizeToChannelFirst<platform::CUDADeviceContext, T>(ctx, d_x,
-                                                             &transformed_d_x);
-      }
-    } else {
-      transformed_x.ShareDataWith(*x);
-      transformed_d_y.ShareDataWith(*d_y);
-      if (d_x) {
-        transformed_d_x.ShareDataWith(*d_x);
-      }
-    }
-
-    std::vector<int> dims;
-    std::vector<int> strides;
-    if (compute_format == DataLayout::kNCHW) {
-      dims = {N, C, H, W, D};
-      strides = {C * H * W * D, H * W * D, W * D, D, 1};
-    } else {
-      dims = {N, C, H, W, D};
-      strides = {H * W * C * D, 1, W * D * C, D * C, C};
-    }
-
-    auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
-    const int num = transformed_x.numel();
-#ifdef HIPCC
-    const int block = 256;
-#else
-    const int block = 512;
-#endif
-    int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
-    const int max_blocks = std::max(max_threads / block, 1);
-    int grid1 = (num + block - 1) / block;
-    int grid2 = std::min(C, max_blocks);
-    auto stream = dev_ctx.stream();
-    InplaceHelper<T> inplace_functor;
-
-    if (!use_global_stats) {
-      if ((N * H * W * D) == 1) {
-        if (d_x) {
-          framework::TensorCopy(*d_y, ctx.GetPlace(), d_x);
-        }
-        phi::funcs::SetConstant<platform::CUDADeviceContext,
-                                BatchNormParamType<T>>
-            functor;
-        functor(dev_ctx, d_scale, static_cast<BatchNormParamType<T>>(0));
-        functor(dev_ctx, d_bias, static_cast<BatchNormParamType<T>>(0));
-        return;
-      }
-
-// ------------------- cudnn descriptors ---------------------
-#ifdef PADDLE_WITH_HIP
-// TODO(wangran16): wait for MIOpen to improve the performance of BN
-// miopenTensorDescriptor_t data_desc_;
-// miopenTensorDescriptor_t bn_param_desc_;
-// miopenBatchNormMode_t mode_;
-
-// PADDLE_ENFORCE_GPU_SUCCESS(
-//     platform::dynload::miopenCreateTensorDescriptor(&data_desc_));
-// PADDLE_ENFORCE_GPU_SUCCESS(
-//     platform::dynload::miopenCreateTensorDescriptor(&bn_param_desc_));
-#else
-      cudnnTensorDescriptor_t data_desc_;
-      cudnnTensorDescriptor_t bn_param_desc_;
-      cudnnBatchNormMode_t mode_;
-
-      PADDLE_ENFORCE_GPU_SUCCESS(
-          platform::dynload::cudnnCreateTensorDescriptor(&data_desc_));
-      PADDLE_ENFORCE_GPU_SUCCESS(
-          platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_));
-#endif
-      if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) {
-        LOG(ERROR) << "Provided epsilon is smaller than "
-                   << "CUDNN_BN_MIN_EPSILON. Setting it to "
-                   << "CUDNN_BN_MIN_EPSILON instead.";
-      }
-      epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON);
-#ifdef PADDLE_WITH_HIP
-// TODO(wangran16): wait for MIOpen to improve the performance of BN
-// mode_ = miopenBNSpatial;
-#elif CUDNN_VERSION_MIN(7, 0, 1)
-      if (FLAGS_cudnn_batchnorm_spatial_persistent) {
-        mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT;
-      } else if (H == 1 && W == 1) {
-        mode_ = CUDNN_BATCHNORM_PER_ACTIVATION;
-      } else {
-        mode_ = CUDNN_BATCHNORM_SPATIAL;
-      }
-#else
-      if (H == 1 && W == 1) {
-        mode_ = CUDNN_BATCHNORM_PER_ACTIVATION;
-      } else {
-        mode_ = CUDNN_BATCHNORM_SPATIAL;
-      }
-#endif  // CUDNN_VERSION_MIN(7, 0, 1)
-
-#ifdef PADDLE_WITH_HIP
-// TODO(wangran16): wait for MIOpen to improve the performance of BN
-// PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSetTensorDescriptor(
-//     data_desc_, CudnnDataType<T>::type,
-//     x_dims.size() > 3 ? x_dims.size() : 4, const_cast<int *>(dims.data()),
-//     const_cast<int *>(strides.data())));
-// PADDLE_ENFORCE_GPU_SUCCESS(
-//     platform::dynload::miopenDeriveBNTensorDescriptor(bn_param_desc_,
-//                                                       data_desc_, mode_));
-#else
-      PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
-          data_desc_, CudnnDataType<T>::type,
-          x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data()));
-      PADDLE_ENFORCE_GPU_SUCCESS(
-          platform::dynload::cudnnDeriveBNTensorDescriptor(bn_param_desc_,
-                                                           data_desc_, mode_));
-#endif
-
-      const auto *saved_mean = ctx.Input<Tensor>("SavedMean");
-      const auto *saved_var = ctx.Input<Tensor>("SavedVariance");
-      const auto *saved_mean_data =
-          saved_mean->template data<BatchNormParamType<T>>();
-      const auto *saved_var_data =
-          saved_var->template data<BatchNormParamType<T>>();
-
-      if (is_inplace) {
-        inplace_functor(compute_format, transformed_x.data<T>(),
-                        scale->template data<BatchNormParamType<T>>(),
-                        bias->template data<BatchNormParamType<T>>(),
-                        saved_mean_data, saved_var_data, epsilon, C, H * W * D,
-                        num, transformed_x.data<T>(), grid2, block, stream);
-      }
-
-      // This branch calls CUDNN APIs
-      if (d_x && d_scale && d_bias) {
-        bool called = false;
-#if CUDNN_VERSION_MIN(7, 4, 1)
-        called = true;
-        size_t workspace_size = 0;
-        void *workspace_ptr = nullptr;
-        Tensor workspace_tensor;
-        auto reserve_space_size = reserve_space->memory_size();
-        // --------------- cudnn batchnorm workspace ---------------
-        PADDLE_ENFORCE_GPU_SUCCESS(
-            platform::dynload::
-                cudnnGetBatchNormalizationBackwardExWorkspaceSize(
-                    /*handle=*/dev_ctx.cudnn_handle(),
-                    /*mode=*/mode_,
-                    /*bnIps=*/CUDNN_BATCHNORM_OPS_BN,
-                    /*xDesc=*/data_desc_,
-                    /*yDesc=*/data_desc_,
-                    /*dyDesc=*/data_desc_,
-                    /*dzDesc=*/nullptr,
-                    /*dxDesc=*/data_desc_,
-                    /*bnScaleBiasMeanVarDesc=*/bn_param_desc_,
-                    /*activationDesc=*/nullptr,
-                    /*sizeInBytes=*/&workspace_size));
-
-        workspace_ptr = workspace_tensor.mutable_data(
-            ctx.GetPlace(), transformed_x.type(), workspace_size);
-
-        PADDLE_ENFORCE_GPU_SUCCESS(
-            platform::dynload::cudnnBatchNormalizationBackwardEx(
-                /*handle=*/dev_ctx.cudnn_handle(),
-                /*mode=*/mode_,
-                /*bnOps=*/CUDNN_BATCHNORM_OPS_BN,
-                /*alphaDataDiff=*/CudnnDataType<T>::kOne(),
-                /*betaDataDiff=*/CudnnDataType<T>::kZero(),
-                /*alphaParamDiff=*/CudnnDataType<T>::kOne(),
-                /*betaParamDiff=*/CudnnDataType<T>::kZero(),
-                /*xDesc=*/data_desc_,
-                /*xData=*/transformed_x.template data<T>(),
-                /*yDesc=*/nullptr,
-                /*yData=*/nullptr,
-                /*dyDesc=*/data_desc_,
-                /*dyData=*/transformed_d_y.template data<T>(),
-                /*dzDesc=*/nullptr,
-                /*dzData=*/nullptr,
-                /*dxDesc=*/data_desc_,
-                /*dxData=*/transformed_d_x.template mutable_data<T>(
-                    ctx.GetPlace()),
-                /*dBnScaleBiasDesc=*/bn_param_desc_,
-                /*bnScaleData=*/scale->template data<BatchNormParamType<T>>(),
-                /*bnBiasData=*/nullptr,
-                /*dBnScaleData=*/d_scale
-                    ->template mutable_data<BatchNormParamType<T>>(
-                        ctx.GetPlace()),
-                /*dBnBiasData=*/d_bias
-                    ->template mutable_data<BatchNormParamType<T>>(
-                        ctx.GetPlace()),
-                /*epsilon=*/epsilon,
-                /*savedMean=*/saved_mean_data,
-                /*savedInvVariance=*/saved_var_data,
-                /*activationDesc=*/nullptr,
-                /*workspace=*/workspace_ptr,
-                /*workSpaceSizeInBytes=*/workspace_size,
-                /*reserveSpace=*/const_cast<T *>(
-                    reserve_space->template data<T>()),
-                /*reserveSpaceSizeInBytes=*/reserve_space_size));
-#endif  // CUDNN_VERSION_MIN(7, 4, 1)
-        if (!called) {
-#ifdef PADDLE_WITH_HIP
-          if (compute_format == DataLayout::kNCHW) {
-            BNBackward<
-                T, block,
-                DataLayout::kNCHW><<<grid2, block, 0, dev_ctx.stream()>>>(
-                transformed_d_y.template data<T>(),
-                transformed_x.template data<T>(),
-                scale->template data<BatchNormParamType<T>>(), saved_mean_data,
-                saved_var_data, C, N, H * W * D, epsilon,
-                transformed_d_x.template data<T>(),
-                d_scale->template mutable_data<BatchNormParamType<T>>(
-                    ctx.GetPlace()),
-                d_bias->template mutable_data<BatchNormParamType<T>>(
-                    ctx.GetPlace()));
-          } else {
-            BNBackward<
-                T, block,
-                DataLayout::kNHWC><<<grid2, block, 0, dev_ctx.stream()>>>(
-                transformed_d_y.template data<T>(),
-                transformed_x.template data<T>(),
-                scale->template data<BatchNormParamType<T>>(), saved_mean_data,
-                saved_var_data, C, N, H * W * D, epsilon,
-                transformed_d_x.template data<T>(),
-                d_scale->template mutable_data<BatchNormParamType<T>>(
-                    ctx.GetPlace()),
-                d_bias->template mutable_data<BatchNormParamType<T>>(
-                    ctx.GetPlace()));
-          }
-
-// TODO(wangran16): wait for MIOpen to improve the performance of BN
-// PADDLE_ENFORCE_GPU_SUCCESS(
-//     platform::dynload::miopenBatchNormalizationBackward(
-//         dev_ctx.cudnn_handle(), mode_, CudnnDataType<T>::kOne(),
-//         CudnnDataType<T>::kZero(), CudnnDataType<T>::kOne(),
-//         CudnnDataType<T>::kZero(), data_desc_,
-//         transformed_x.template data<T>(), data_desc_,
-//         transformed_d_y.template data<T>(), data_desc_,
-//         transformed_d_x.template mutable_data<T>(ctx.GetPlace()),
-//         bn_param_desc_, scale->template data<BatchNormParamType<T>>(),
-//         d_scale->template mutable_data<BatchNormParamType<T>>(
-//             ctx.GetPlace()),
-//         d_bias->template mutable_data<BatchNormParamType<T>>(
-//             ctx.GetPlace()),
-//         epsilon, saved_mean_data, saved_var_data));
-#else
-          PADDLE_ENFORCE_GPU_SUCCESS(
-              platform::dynload::cudnnBatchNormalizationBackward(
-                  dev_ctx.cudnn_handle(), mode_, CudnnDataType<T>::kOne(),
-                  CudnnDataType<T>::kZero(), CudnnDataType<T>::kOne(),
-                  CudnnDataType<T>::kZero(), data_desc_,
-                  transformed_x.template data<T>(), data_desc_,
-                  transformed_d_y.template data<T>(), data_desc_,
-                  transformed_d_x.template mutable_data<T>(ctx.GetPlace()),
-                  bn_param_desc_, scale->template data<BatchNormParamType<T>>(),
-                  d_scale->template mutable_data<BatchNormParamType<T>>(
-                      ctx.GetPlace()),
-                  d_bias->template mutable_data<BatchNormParamType<T>>(
-                      ctx.GetPlace()),
-                  epsilon, saved_mean_data, saved_var_data));
-#endif
-        }
-
-        if (data_layout == DataLayout::kNHWC &&
-            compute_format == DataLayout::kNCHW) {
-          VLOG(3) << "Transform batchnorm output from NCHW to NHWC";
-          TransToChannelLast<paddle::platform::CUDADeviceContext, T>(
-              ctx, &transformed_d_x, d_x);
-        }
-      } else {
-        // This branch call CUDA kernels
-        if (compute_format == DataLayout::kNCHW) {
-          if (d_x) {
-            BNBackwardData<T, block, framework::DataLayout::kNCHW><<<
-                grid2, block, 0, dev_ctx.stream()>>>(
-                d_y->data<T>(), scale->data<BatchNormParamType<T>>(),
-                saved_mean_data, x->data<T>(), saved_var_data, C, N, H * W * D,
-                d_x->data<T>());
-          }
-          if (d_scale && d_bias) {
-            KeBNBackwardScaleBias<
-                T, block,
-                framework::DataLayout::kNCHW><<<grid2, block, 0, stream>>>(
-                d_y->data<T>(), x->data<T>(), saved_mean_data, saved_var_data,
-                epsilon, N, C, H * W * D,
-                d_scale->data<BatchNormParamType<T>>(),
-                d_bias->data<BatchNormParamType<T>>());
-          }
-        } else {
-          if (d_x) {
-            BNBackwardData<T, block, framework::DataLayout::kNHWC><<<
-                grid2, block, 0, dev_ctx.stream()>>>(
-                d_y->data<T>(), scale->data<BatchNormParamType<T>>(),
-                saved_mean_data, x->data<T>(), saved_var_data, C, N, H * W * D,
-                d_x->data<T>());
-          }
-          if (d_scale && d_bias) {
-            KeBNBackwardScaleBias<
-                T, block,
-                framework::DataLayout::kNHWC><<<grid2, block, 0, stream>>>(
-                d_y->data<T>(), x->data<T>(), saved_mean_data, saved_var_data,
-                epsilon, N, C, H * W * D,
-                d_scale->data<BatchNormParamType<T>>(),
-                d_bias->data<BatchNormParamType<T>>());
-          }
-        }
-      }
-
-#ifdef PADDLE_WITH_HIP
-// TODO(wangran16): wait for MIOpen to improve the performance of BN
-// clean when exit.
-// PADDLE_ENFORCE_GPU_SUCCESS(
-//     platform::dynload::miopenDestroyTensorDescriptor(data_desc_));
-// PADDLE_ENFORCE_GPU_SUCCESS(
-//     platform::dynload::miopenDestroyTensorDescriptor(bn_param_desc_));
-#else
-      // clean when exit.
-      PADDLE_ENFORCE_GPU_SUCCESS(
-          platform::dynload::cudnnDestroyTensorDescriptor(data_desc_));
-      PADDLE_ENFORCE_GPU_SUCCESS(
-          platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_));
-#endif
-    } else {
-      const auto *running_mean = ctx.Input<Tensor>("Mean");
-      const auto *running_var = ctx.Input<Tensor>("Variance");
-
-      const auto *running_mean_data =
-          running_mean->template data<BatchNormParamType<T>>();
-      const auto *running_var_data =
-          running_var->template data<BatchNormParamType<T>>();
-
-      if (is_inplace) {
-        auto px = *x;
-        inplace_functor(data_layout, px.mutable_data<T>(ctx.GetPlace()),
-                        scale->template data<BatchNormParamType<T>>(),
-                        bias->template data<BatchNormParamType<T>>(),
-                        running_mean_data, running_var_data, epsilon, C,
-                        H * W * D, num, x->data<T>(), grid2, block, stream);
-      }
-
-      if (compute_format == DataLayout::kNCHW) {
-        if (d_x) {
-          KeBNBackwardData<
-              T, framework::DataLayout::kNCHW><<<grid1, block, 0, stream>>>(
-              d_y->data<T>(), scale->data<BatchNormParamType<T>>(),
-              running_var_data, epsilon, C, H * W, num, d_x->data<T>());
-        }
-        if (d_scale && d_bias) {
-          KeBNBackwardScaleBias<
-              T, block,
-              framework::DataLayout::kNCHW><<<grid2, block, 0, stream>>>(
-              d_y->data<T>(), x->data<T>(), running_mean_data, running_var_data,
-              epsilon, N, C, H * W * D, d_scale->data<BatchNormParamType<T>>(),
-              d_bias->data<BatchNormParamType<T>>());
-        }
-      } else {
-        if (d_x) {
-          KeBNBackwardData<
-              T, framework::DataLayout::kNHWC><<<grid1, block, 0, stream>>>(
-              d_y->data<T>(), scale->data<BatchNormParamType<T>>(),
-              running_var_data, epsilon, C, H * W, num, d_x->data<T>());
-        }
-        if (d_scale && d_bias) {
-          KeBNBackwardScaleBias<
-              T, block,
-              framework::DataLayout::kNHWC><<<grid2, block, 0, stream>>>(
-              d_y->data<T>(), x->data<T>(), running_mean_data, running_var_data,
-              epsilon, N, C, H * W * D, d_scale->data<BatchNormParamType<T>>(),
-              d_bias->data<BatchNormParamType<T>>());
-        }
-      }
-    }
-  }
-};
-
-template <typename T>
-class BatchNormDoubleGradKernel<platform::CUDADeviceContext, T>
-    : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    const auto *X = ctx.Input<Tensor>("X");
-    const auto *Scale = ctx.Input<Tensor>("Scale");
-    const auto *dY = ctx.Input<Tensor>("DY");
-    const auto *Saved_mean = ctx.Input<Tensor>("SavedMean");
-    const auto *Saved_variance = ctx.Input<Tensor>("SavedVariance");
-    const double epsilon = static_cast<double>(ctx.Attr<float>("epsilon"));
-    const bool use_global_stats = ctx.Attr<bool>("use_global_stats");
-    const bool is_test = ctx.Attr<bool>("is_test");
-
-    PADDLE_ENFORCE_EQ(
-        is_test, false,
-        platform::errors::InvalidArgument(
-            "`is_test = True` CANNOT be used in train program. If "
-            "you want to use global status in pre_train model, "
-            "please set `use_global_stats = True`"));
-
-    const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
-    const DataLayout data_layout =
-        framework::StringToDataLayout(data_layout_str);
-
-    const auto *ddX = ctx.Input<Tensor>("DDX");
-    const auto *ddScale = ctx.Input<Tensor>("DDScale");
-    const auto *ddBias = ctx.Input<Tensor>("DDBias");
-
-    auto *dX = ctx.Output<Tensor>("DX");
-    auto *dScale = ctx.Output<Tensor>("DScale");
-    auto *ddY = ctx.Output<Tensor>("DDY");
-
-    NormDoubleGradFunctor<platform::CUDADeviceContext, T>(
-        ctx, data_layout, X, Scale, dY, Saved_mean, Saved_variance, epsilon,
-        use_global_stats, ddX, ddScale, ddBias, dX, dScale, ddY);
-  }
-};
-
 }  // namespace operators
 }  // namespace paddle
-
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-#ifdef PADDLE_WITH_HIP
-// MIOPEN do not support double
-REGISTER_OP_CUDA_KERNEL(
-    batch_norm, ops::BatchNormKernel<plat::CUDADeviceContext, float>,
-    ops::BatchNormKernel<plat::CUDADeviceContext, plat::float16>);
-REGISTER_OP_CUDA_KERNEL(
-    batch_norm_grad, ops::BatchNormGradKernel<plat::CUDADeviceContext, float>,
-    ops::BatchNormGradKernel<plat::CUDADeviceContext, plat::float16>);
-REGISTER_OP_CUDA_KERNEL(
-    batch_norm_grad_grad,
-    ops::BatchNormDoubleGradKernel<plat::CUDADeviceContext, float>);
-#else
-REGISTER_OP_CUDA_KERNEL(
-    batch_norm, ops::BatchNormKernel<plat::CUDADeviceContext, float>,
-    ops::BatchNormKernel<plat::CUDADeviceContext, double>,
-    ops::BatchNormKernel<plat::CUDADeviceContext, plat::float16>);
-REGISTER_OP_CUDA_KERNEL(
-    batch_norm_grad, ops::BatchNormGradKernel<plat::CUDADeviceContext, float>,
-    ops::BatchNormGradKernel<plat::CUDADeviceContext, double>,
-    ops::BatchNormGradKernel<plat::CUDADeviceContext, plat::float16>);
-REGISTER_OP_CUDA_KERNEL(
-    batch_norm_grad_grad,
-    ops::BatchNormDoubleGradKernel<plat::CUDADeviceContext, float>,
-    ops::BatchNormDoubleGradKernel<plat::CUDADeviceContext, double>);
-#endif
diff --git a/paddle/fluid/operators/bce_loss_op.cc b/paddle/fluid/operators/bce_loss_op.cc
index 55bb57466c7b5ec4f4ac3c51b1cf84ab5098a0e9..bc9076f4d7c368f60187e9e432dd175d1f5ad45b 100644
--- a/paddle/fluid/operators/bce_loss_op.cc
+++ b/paddle/fluid/operators/bce_loss_op.cc
@@ -138,8 +138,8 @@ DECLARE_INPLACE_OP_INFERER(BCELossGradInplaceInferer,
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-DELCARE_INFER_SHAPE_FUNCTOR(bce_loss, BCELossInferShapeFunctor,
-                            PT_INFER_META(phi::BCELossInferMeta));
+DECLARE_INFER_SHAPE_FUNCTOR(bce_loss, BCELossInferShapeFunctor,
+                            PD_INFER_META(phi::BCELossInferMeta));
 
 REGISTER_OPERATOR(bce_loss, ops::BCELossOp, ops::BCELossOpMaker,
                   ops::BCELossGradOpMaker<paddle::framework::OpDesc>,
diff --git a/paddle/fluid/operators/bilinear_tensor_product_op.cc b/paddle/fluid/operators/bilinear_tensor_product_op.cc
index 4774c0a1dbc3b78607d75efb7bc82d590ca4aa2a..9f6a78ab7a55f32558accd56e69d757003bad89c 100644
--- a/paddle/fluid/operators/bilinear_tensor_product_op.cc
+++ b/paddle/fluid/operators/bilinear_tensor_product_op.cc
@@ -90,12 +90,12 @@ class BilinearTensorProductGradOpMaker
 
 namespace ops = paddle::operators;
 
-DELCARE_INFER_SHAPE_FUNCTOR(bilinear_tensor_product,
+DECLARE_INFER_SHAPE_FUNCTOR(bilinear_tensor_product,
                             BilinearTensorProductInferShapeFunctor,
-                            PT_INFER_META(phi::BilinearTensorProductInferMeta));
-DELCARE_INFER_SHAPE_FUNCTOR(
+                            PD_INFER_META(phi::BilinearTensorProductInferMeta));
+DECLARE_INFER_SHAPE_FUNCTOR(
     bilinear_tensor_product_grad, BilinearTensorProductGradInferShapeFunctor,
-    PT_INFER_META(phi::BilinearTensorProductGradInferMeta));
+    PD_INFER_META(phi::BilinearTensorProductGradInferMeta));
 
 REGISTER_OPERATOR(
     bilinear_tensor_product, ops::BilinearTensorProductOp,
diff --git a/paddle/fluid/operators/bincount_op.cc b/paddle/fluid/operators/bincount_op.cc
index b37334a14bad4fdc342d8fba13c117bfad5bd65c..062e7d510d54c0f657582d48844093d94732971e 100644
--- a/paddle/fluid/operators/bincount_op.cc
+++ b/paddle/fluid/operators/bincount_op.cc
@@ -12,12 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/bincount_op.h"
-
 #include <string>
 #include <unordered_map>
 #include <vector>
 
+#include "paddle/fluid/framework/infershape_utils.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/phi/core/infermeta_utils.h"
+#include "paddle/phi/infermeta/binary.h"
+
 namespace paddle {
 namespace operators {
 
@@ -28,51 +31,6 @@ class BincountOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true,
-                      platform::errors::InvalidArgument(
-                          "Input(X) of BincountOp should not be null."));
-    PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true,
-                      platform::errors::InvalidArgument(
-                          "Output(Out) of BincountOp should not be null."));
-
-    auto input_dim = ctx->GetInputDim("X");
-    auto minlength = ctx->Attrs().Get<int>("minlength");
-
-    PADDLE_ENFORCE_GE(minlength, 0,
-                      platform::errors::InvalidArgument(
-                          "The minlength should be greater than or equal to 0."
-                          "But received minlength is %d",
-                          minlength));
-
-    PADDLE_ENFORCE_EQ(input_dim.size(), 1,
-                      platform::errors::InvalidArgument(
-                          "The 'shape' of Input(X) must be 1-D tensor."
-                          "But the dimension of Input(X) is [%d]",
-                          input_dim.size()));
-
-    if (ctx->HasInput("Weights")) {
-      auto weights_dim = ctx->GetInputDim("Weights");
-      PADDLE_ENFORCE_EQ(weights_dim.size(), 1,
-                        platform::errors::InvalidArgument(
-                            "The 'shape' of Input(Weights) must be 1-D tensor."
-                            "But the dimension of Input(Weights) is [%d]",
-                            weights_dim.size()));
-
-      PADDLE_ENFORCE_EQ(
-          weights_dim[0], input_dim[0],
-          platform::errors::InvalidArgument(
-              "The 'shape' of Input(Weights) must be equal to the 'shape' of "
-              "Input(X)."
-              "But received: the 'shape' of Input(Weights) is [%s],"
-              "the 'shape' of Input(X) is [%s]",
-              weights_dim, input_dim));
-    }
-
-    ctx->SetOutputDim("Out", phi::make_ddim({-1}));
-    ctx->ShareLoD("X", /*->*/ "Out");
-  }
-
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const {
     auto data_type =
@@ -105,12 +63,10 @@ class BincountOpMaker : public framework::OpProtoAndCheckerMaker {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
+DECLARE_INFER_SHAPE_FUNCTOR(bincount, BincountInferShapeFunctor,
+                            PD_INFER_META(phi::BincountInferMeta));
 REGISTER_OPERATOR(
     bincount, ops::BincountOp, ops::BincountOpMaker,
     paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
-    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
-REGISTER_OP_CPU_KERNEL(
-    bincount, ops::BincountKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::BincountKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::BincountKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::BincountKernel<paddle::platform::CPUDeviceContext, int64_t>);
+    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
+    BincountInferShapeFunctor);
diff --git a/paddle/fluid/operators/bincount_op.cu b/paddle/fluid/operators/bincount_op.cu
deleted file mode 100644
index cc576d0af92877dff44d672597596036be0defbc..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/bincount_op.cu
+++ /dev/null
@@ -1,162 +0,0 @@
-/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/operators/bincount_op.h"
-#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
-#include "paddle/phi/core/hostdevice.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-using platform::PADDLE_CUDA_NUM_THREADS;
-
-inline int GET_BLOCKS(const int N) {
-  return (N + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS;
-}
-
-template <typename T, typename InputT, typename OutT>
-__global__ void KernelBincount(const InputT* input, const int total_elements,
-                               const bool has_weights, const T* weights,
-                               OutT* output) {
-  if (!has_weights) {
-    for (int i = threadIdx.x; i < total_elements; i += blockDim.x) {
-      paddle::platform::CudaAtomicAdd(&output[input[i]], 1L);
-    }
-  } else {
-    for (int i = threadIdx.x; i < total_elements; i += blockDim.x) {
-      paddle::platform::CudaAtomicAdd(&output[input[i]],
-                                      static_cast<OutT>(weights[i]));
-    }
-  }
-}
-
-template <typename DeviceContext, typename T, typename InputT>
-void BincountCUDAInner(const framework::ExecutionContext& context) {
-  const Tensor* input = context.Input<framework::Tensor>("X");
-  const Tensor* weights = context.Input<framework::Tensor>("Weights");
-  Tensor* output = context.Output<framework::Tensor>("Out");
-  auto& minlength = context.Attr<int>("minlength");
-
-  const InputT* input_data = input->data<InputT>();
-
-  const int input_numel = input->numel();
-
-  if (input_data == nullptr) {
-    framework::DDim out_dim{0};
-    output->Resize(out_dim);
-    output->mutable_data<T>(context.GetPlace());
-    return;
-  }
-  auto input_x = framework::EigenVector<InputT>::Flatten(*input);
-
-  framework::Tensor input_min_t, input_max_t;
-  auto* input_max_data =
-      input_max_t.mutable_data<InputT>({1}, context.GetPlace());
-  auto* input_min_data =
-      input_min_t.mutable_data<InputT>({1}, context.GetPlace());
-
-  auto input_max_scala = framework::EigenScalar<InputT>::From(input_max_t);
-  auto input_min_scala = framework::EigenScalar<InputT>::From(input_min_t);
-
-  auto* place = context.template device_context<DeviceContext>().eigen_device();
-  input_max_scala.device(*place) = input_x.maximum();
-  input_min_scala.device(*place) = input_x.minimum();
-
-  Tensor input_min_cpu, input_max_cpu;
-  paddle::framework::TensorCopySync(input_max_t, platform::CPUPlace(),
-                                    &input_max_cpu);
-  paddle::framework::TensorCopySync(input_min_t, platform::CPUPlace(),
-                                    &input_min_cpu);
-
-  InputT input_min = input_min_cpu.data<InputT>()[0];
-
-  PADDLE_ENFORCE_GE(
-      input_min, static_cast<InputT>(0),
-      platform::errors::InvalidArgument(
-          "The elements in input tensor must be non-negative ints"));
-
-  int64_t output_size =
-      static_cast<int64_t>(input_max_cpu.data<InputT>()[0]) + 1L;
-
-  output_size = std::max(output_size, static_cast<int64_t>(minlength));
-  framework::DDim out_dim{output_size};
-  output->Resize(out_dim);
-
-  bool has_weights = (weights != nullptr);
-
-  const T* weights_data = has_weights ? weights->data<T>() : nullptr;
-
-  auto stream =
-      context.template device_context<platform::CUDADeviceContext>().stream();
-
-  if (!has_weights) {
-    int64_t* output_data = output->mutable_data<int64_t>(context.GetPlace());
-    phi::funcs::SetConstant<DeviceContext, int64_t>()(
-        context.template device_context<DeviceContext>(), output, 0L);
-
-    KernelBincount<T, InputT, int64_t><<<GET_BLOCKS(input_numel),
-                                         PADDLE_CUDA_NUM_THREADS, 0, stream>>>(
-        input_data, input_numel, has_weights, weights_data, output_data);
-  } else {
-    const auto& weights_type = framework::TransToProtoVarType(weights->dtype());
-
-    if (weights_type == framework::proto::VarType::FP32) {
-      float* output_data = output->mutable_data<float>(context.GetPlace());
-      phi::funcs::SetConstant<DeviceContext, float>()(
-          context.template device_context<DeviceContext>(), output,
-          static_cast<float>(0));
-
-      KernelBincount<T, InputT, float><<<GET_BLOCKS(input_numel),
-                                         PADDLE_CUDA_NUM_THREADS, 0, stream>>>(
-          input_data, input_numel, has_weights, weights_data, output_data);
-    } else {
-      double* output_data = output->mutable_data<double>(context.GetPlace());
-      phi::funcs::SetConstant<DeviceContext, double>()(
-          context.template device_context<DeviceContext>(), output,
-          static_cast<double>(0));
-
-      KernelBincount<T, InputT, double><<<GET_BLOCKS(input_numel),
-                                          PADDLE_CUDA_NUM_THREADS, 0, stream>>>(
-          input_data, input_numel, has_weights, weights_data, output_data);
-    }
-  }
-}
-
-template <typename DeviceContext, typename T>
-class BincountCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    const Tensor* input = context.Input<framework::Tensor>("X");
-    const auto& input_type = framework::TransToProtoVarType(input->dtype());
-
-    if (input_type == framework::proto::VarType::INT32) {
-      BincountCUDAInner<DeviceContext, T, int>(context);
-    } else if (input_type == framework::proto::VarType::INT64) {
-      BincountCUDAInner<DeviceContext, T, int64_t>(context);
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    bincount, ops::BincountCUDAKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::BincountCUDAKernel<paddle::platform::CUDADeviceContext, int64_t>,
-    ops::BincountCUDAKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::BincountCUDAKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/bincount_op.h b/paddle/fluid/operators/bincount_op.h
deleted file mode 100644
index 84256bf78e4a1901b76b356c5e3274541dc0dd59..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/bincount_op.h
+++ /dev/null
@@ -1,109 +0,0 @@
-/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <algorithm>
-
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/phi/kernels/funcs/math_function.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-
-template <typename DeviceContext, typename T, typename InputT>
-void BincountInner(const framework::ExecutionContext& context) {
-  const Tensor* input = context.Input<framework::Tensor>("X");
-  const Tensor* weights = context.Input<framework::Tensor>("Weights");
-  Tensor* output = context.Output<framework::Tensor>("Out");
-  auto& minlength = context.Attr<int>("minlength");
-
-  const InputT* input_data = input->data<InputT>();
-
-  auto input_numel = input->numel();
-
-  if (input_data == nullptr) {
-    framework::DDim out_dim{0};
-    output->Resize(out_dim);
-    output->mutable_data<InputT>(context.GetPlace());
-    return;
-  }
-
-  PADDLE_ENFORCE_GE(
-      *std::min_element(input_data, input_data + input_numel),
-      static_cast<InputT>(0),
-      platform::errors::InvalidArgument(
-          "The elements in input tensor must be non-negative ints"));
-
-  int64_t output_size = static_cast<int64_t>(*std::max_element(
-                            input_data, input_data + input_numel)) +
-                        1L;
-  output_size = std::max(output_size, static_cast<int64_t>(minlength));
-
-  framework::DDim out_dim{output_size};
-  output->Resize(out_dim);
-
-  bool has_weights = (weights != nullptr);
-
-  if (has_weights) {
-    const T* weights_data = weights->data<T>();
-    const auto& weights_type = framework::TransToProtoVarType(weights->dtype());
-    if (weights_type == framework::proto::VarType::FP32) {
-      float* output_data = output->mutable_data<float>(context.GetPlace());
-      phi::funcs::SetConstant<DeviceContext, float>()(
-          context.template device_context<DeviceContext>(), output,
-          static_cast<float>(0));
-      for (int64_t i = 0; i < input_numel; i++) {
-        output_data[input_data[i]] += static_cast<float>(weights_data[i]);
-      }
-    } else {
-      double* output_data = output->mutable_data<double>(context.GetPlace());
-      phi::funcs::SetConstant<DeviceContext, double>()(
-          context.template device_context<DeviceContext>(), output,
-          static_cast<double>(0));
-      for (int64_t i = 0; i < input_numel; i++) {
-        output_data[input_data[i]] += static_cast<double>(weights_data[i]);
-      }
-    }
-
-  } else {
-    int64_t* output_data = output->mutable_data<int64_t>(context.GetPlace());
-    phi::funcs::SetConstant<DeviceContext, int64_t>()(
-        context.template device_context<DeviceContext>(), output, 0L);
-    for (int64_t i = 0; i < input_numel; i++) {
-      output_data[input_data[i]] += 1L;
-    }
-  }
-}
-
-template <typename DeviceContext, typename T>
-class BincountKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    const Tensor* input = context.Input<framework::Tensor>("X");
-    const auto& input_type = framework::TransToProtoVarType(input->dtype());
-
-    if (input_type == framework::proto::VarType::INT32) {
-      BincountInner<DeviceContext, T, int>(context);
-    } else if (input_type == framework::proto::VarType::INT64) {
-      BincountInner<DeviceContext, T, int64_t>(context);
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/broadcast_tensors_op.cc b/paddle/fluid/operators/broadcast_tensors_op.cc
index c3917fad555cb4633d4d958abcde0244fae13cae..1063a8b7992153dbedcdc0442ac3d8038c5e171b 100644
--- a/paddle/fluid/operators/broadcast_tensors_op.cc
+++ b/paddle/fluid/operators/broadcast_tensors_op.cc
@@ -167,9 +167,9 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(BroadcastTensorsGradNoNeedBufVarsInferer,
 namespace ops = paddle::operators;
 namespace plat = paddle::platform;
 
-DELCARE_INFER_SHAPE_FUNCTOR(broadcast_tensors,
+DECLARE_INFER_SHAPE_FUNCTOR(broadcast_tensors,
                             BroadcastTensorsInferShapeFunctor,
-                            PT_INFER_META(phi::BroadcastTensorsInferMeta));
+                            PD_INFER_META(phi::BroadcastTensorsInferMeta));
 
 REGISTER_OPERATOR(broadcast_tensors, ops::BroadcastTensorsOp,
                   ops::BroadcastTensorsOpMaker,
diff --git a/paddle/fluid/operators/cholesky_op.cc b/paddle/fluid/operators/cholesky_op.cc
index 09e915a6bafd4a8b72f35995b3ebbfeafa00476a..ed80ac076c0af7fc8922f095d4be4613bc5057ec 100644
--- a/paddle/fluid/operators/cholesky_op.cc
+++ b/paddle/fluid/operators/cholesky_op.cc
@@ -90,8 +90,8 @@ class CholeskyGradOpMaker : public framework::SingleGradOpMaker<T> {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-DELCARE_INFER_SHAPE_FUNCTOR(cholesky, CholeskyInferShapeFunctor,
-                            PT_INFER_META(phi::CholeskyInferMeta));
+DECLARE_INFER_SHAPE_FUNCTOR(cholesky, CholeskyInferShapeFunctor,
+                            PD_INFER_META(phi::CholeskyInferMeta));
 REGISTER_OPERATOR(cholesky, ops::CholeskyOp, ops::CholeskyOpMaker,
                   ops::CholeskyGradOpMaker<paddle::framework::OpDesc>,
                   ops::CholeskyGradOpMaker<paddle::imperative::OpBase>,
diff --git a/paddle/fluid/operators/cholesky_solve_op.cc b/paddle/fluid/operators/cholesky_solve_op.cc
index 6b5bae8fc73fe2b71212a93144d89144dd0268c6..5403e2440ee58f1cf7cbad107f4d3e174655ed3b 100644
--- a/paddle/fluid/operators/cholesky_solve_op.cc
+++ b/paddle/fluid/operators/cholesky_solve_op.cc
@@ -12,8 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/cholesky_solve_op.h"
-#include "paddle/fluid/operators/solve_op.h"
+#include "paddle/fluid/framework/infershape_utils.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/phi/infermeta/binary.h"
 
 namespace paddle {
 namespace operators {
@@ -39,50 +40,6 @@ class CholeskySolveOpMaker : public framework::OpProtoAndCheckerMaker {
 class CholeskySolveOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
-  void InferShape(framework::InferShapeContext *context) const override {
-    OP_INOUT_CHECK(context->HasInput("X"), "Input", "X", "CholeskySolve");
-    OP_INOUT_CHECK(context->HasInput("Y"), "Input", "Y", "CholeskySolve");
-    OP_INOUT_CHECK(context->HasOutput("Out"), "Output", "Out", "CholeskySolve");
-    auto u_dims = context->GetInputDim("Y");
-    auto b_dims = context->GetInputDim("X");
-    int u_rank = u_dims.size();
-    int b_rank = b_dims.size();
-    PADDLE_ENFORCE_GE(u_rank, 2,
-                      platform::errors::InvalidArgument(
-                          "the rank of input Y must greater or equal to 2"));
-    PADDLE_ENFORCE_GE(b_rank, 2,
-                      platform::errors::InvalidArgument(
-                          "the rank of input X must greater or equal to 2"));
-    PADDLE_ENFORCE_EQ(u_dims[u_rank - 1], u_dims[u_rank - 2],
-                      platform::errors::InvalidArgument(
-                          "input Matrix Y should be square matrix,"
-                          "But Got last shape of %ld x %ld",
-                          u_dims[u_rank - 1], u_dims[u_rank - 2]));
-    PADDLE_ENFORCE_EQ(
-        b_dims[b_rank - 2], u_dims[u_rank - 2],
-        platform::errors::InvalidArgument(
-            "the first dim of input X must equal to the dim of input Y,"
-            "But Got %ld and %ld",
-            b_dims[b_rank - 2], u_dims[u_rank - 2]));
-
-    std::vector<int64_t> u_dims_vec = phi::vectorize(u_dims);
-    std::vector<int64_t> b_dims_vec = phi::vectorize(b_dims);
-
-    std::vector<int64_t> u_dims_vec_cut(u_dims_vec.begin(),
-                                        u_dims_vec.end() - 2);
-    std::vector<int64_t> b_dims_vec_cut(b_dims_vec.begin(),
-                                        b_dims_vec.end() - 2);
-
-    std::vector<int64_t> expand_batch_portion =
-        get_broadcast_batch_portion(u_dims_vec_cut, b_dims_vec_cut);
-
-    std::vector<int64_t> b_broadcast_dims({expand_batch_portion});
-    b_broadcast_dims.insert(b_broadcast_dims.end(),
-                            {b_dims_vec[b_rank - 2], b_dims_vec[b_rank - 1]});
-
-    // dim of 'Out' is the same with 'Y' after broadcast
-    context->SetOutputDim("Out", phi::make_ddim(b_broadcast_dims));
-  }
 
  protected:
   framework::OpKernelType GetExpectedKernelType(
@@ -151,22 +108,15 @@ class CholeskySolveGradOp : public framework::OperatorWithKernel {
 }  // namespace operators
 }  // namespace paddle
 namespace ops = paddle::operators;
+
+DECLARE_INFER_SHAPE_FUNCTOR(cholesky_solve, CholeskySolveInferShapeFunctor,
+                            PD_INFER_META(phi::CholeskySolveInferMeta));
+
 REGISTER_OPERATOR(cholesky_solve, ops::CholeskySolveOp,
                   ops::CholeskySolveOpMaker,
                   ops::CholeskySolveOpVarTypeInference,
                   ops::CholeskySolveOpGradMaker<paddle::framework::OpDesc>,
-                  ops::CholeskySolveOpGradMaker<paddle::imperative::OpBase>);
+                  ops::CholeskySolveOpGradMaker<paddle::imperative::OpBase>,
+                  CholeskySolveInferShapeFunctor);
 
 REGISTER_OPERATOR(cholesky_solve_grad, ops::CholeskySolveGradOp);
-
-REGISTER_OP_CPU_KERNEL(
-    cholesky_solve,
-    ops::CholeskySolveKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::CholeskySolveKernel<paddle::platform::CPUDeviceContext, double>);
-
-REGISTER_OP_CPU_KERNEL(
-    cholesky_solve_grad,
-    ops::CholeskySolveGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::CholeskySolveGradKernel<paddle::platform::CPUDeviceContext, double>);
-// Complex<> is not supported because of TensorExpand, which used to boardcast
-// input Tensor
diff --git a/paddle/fluid/operators/cholesky_solve_op.cu b/paddle/fluid/operators/cholesky_solve_op.cu
deleted file mode 100644
index 1b551a7cd0343db32a84e962212a25e1ff5a4893..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/cholesky_solve_op.cu
+++ /dev/null
@@ -1,136 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef PADDLE_WITH_HIP
-// HIP not support cusolver
-
-#include "paddle/fluid/memory/memory.h"
-#include "paddle/fluid/operators/cholesky_solve_op.h"
-#include "paddle/fluid/platform/dynload/cusolver.h"
-#include "paddle/fluid/platform/enforce.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-using CUDADeviceContext = paddle::platform::CUDADeviceContext;
-
-template <typename T>
-void cusolver_potrs(const cusolverDnHandle_t &cusolverH, cublasFillMode_t uplo,
-                    int n, int nrhs, T *Adata, int lda, T *Bdata, int ldb,
-                    int *devInfo);
-
-template <>
-void cusolver_potrs<float>(const cusolverDnHandle_t &cusolverH,
-                           cublasFillMode_t uplo, int n, int nrhs, float *Adata,
-                           int lda, float *Bdata, int ldb, int *devInfo) {
-  PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnSpotrs(
-      cusolverH, uplo, n, nrhs, Adata, lda, Bdata, ldb, devInfo));
-}
-
-template <>
-void cusolver_potrs<double>(const cusolverDnHandle_t &cusolverH,
-                            cublasFillMode_t uplo, int n, int nrhs,
-                            double *Adata, int lda, double *Bdata, int ldb,
-                            int *devInfo) {
-  PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnDpotrs(
-      cusolverH, uplo, n, nrhs, Adata, lda, Bdata, ldb, devInfo));
-}
-
-template <>
-void cusolver_potrs<platform::complex<float>>(
-    const cusolverDnHandle_t &cusolverH, cublasFillMode_t uplo, int n, int nrhs,
-    platform::complex<float> *Adata, int lda, platform::complex<float> *Bdata,
-    int ldb, int *devInfo) {
-  PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnCpotrs(
-      cusolverH, uplo, n, nrhs, reinterpret_cast<const cuComplex *>(Adata), lda,
-      reinterpret_cast<cuComplex *>(Bdata), ldb, devInfo));
-}
-
-template <>
-void cusolver_potrs<platform::complex<double>>(
-    const cusolverDnHandle_t &cusolverH, cublasFillMode_t uplo, int n, int nrhs,
-    platform::complex<double> *Adata, int lda, platform::complex<double> *Bdata,
-    int ldb, int *devInfo) {
-  PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnZpotrs(
-      cusolverH, uplo, n, nrhs,
-      reinterpret_cast<const cuDoubleComplex *>(Adata), lda,
-      reinterpret_cast<cuDoubleComplex *>(Bdata), ldb, devInfo));
-}
-
-template <typename T>
-class CholeskySolveFunctor<paddle::platform::CUDADeviceContext, T> {
- public:
-  void operator()(const platform::CUDADeviceContext &dev_ctx, bool upper, int n,
-                  int nrhs, T *Adata, int lda, T *Bdata, int *devInfo) {
-    cublasFillMode_t uplo =
-        upper ? CUBLAS_FILL_MODE_UPPER : CUBLAS_FILL_MODE_LOWER;
-
-    /* step 1: get cusolver handle*/
-    auto cusolverH = dev_ctx.cusolver_dn_handle();
-
-    /* step 2: solve A0*X0 = B0  */
-    cusolver_potrs<T>(cusolverH, uplo, n, nrhs, Adata, lda, Bdata, lda,
-                      devInfo);
-
-    PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize());
-  }
-};
-
-template <typename T>
-class MatrixReduceSumFunctor<platform::CUDADeviceContext, T> {
- public:
-  void operator()(const Tensor &in, Tensor *out,
-                  const framework::ExecutionContext &ctx) {
-    // For example: in's dim = [5, 3, 2, 7, 3] ; out's dim = [3, 1, 7, 3]
-    // out_reduce_dim should be [0, 2]
-    const std::vector<std::int64_t> in_dims = phi::vectorize(in.dims());
-    auto in_size = in_dims.size();
-    const std::vector<std::int64_t> out_dims = phi::vectorize(out->dims());
-    auto out_size = out_dims.size();
-
-    std::vector<std::int64_t> out_bst_dims(in_size);
-
-    std::fill(out_bst_dims.data(), out_bst_dims.data() + in_size - out_size, 1);
-    std::copy(out_dims.data(), out_dims.data() + out_size,
-              out_bst_dims.data() + in_size - out_size);
-
-    std::vector<int> out_reduce_dims;
-    for (size_t idx = 0; idx <= in_size - 3; idx++) {
-      if (in_dims[idx] != 1 && out_bst_dims[idx] == 1) {
-        out_reduce_dims.push_back(idx);
-      }
-    }
-    gpuStream_t stream = ctx.cuda_device_context().stream();
-    TensorReduceImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
-        ctx.cuda_device_context(), in, out, kps::IdentityFunctor<T>(),
-        out_reduce_dims, stream);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    cholesky_solve,
-    ops::CholeskySolveKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::CholeskySolveKernel<paddle::platform::CUDADeviceContext, double>);
-
-REGISTER_OP_CUDA_KERNEL(
-    cholesky_solve_grad,
-    ops::CholeskySolveGradKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::CholeskySolveGradKernel<paddle::platform::CUDADeviceContext, double>);
-
-#endif  // not PADDLE_WITH_HIP
diff --git a/paddle/fluid/operators/cholesky_solve_op.h b/paddle/fluid/operators/cholesky_solve_op.h
deleted file mode 100644
index f25fbbb0c698036951c4b9ae8e9ad2778786a1a2..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/cholesky_solve_op.h
+++ /dev/null
@@ -1,248 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/solve_op.h"
-#include "paddle/fluid/operators/svd_helper.h"
-#include "paddle/fluid/operators/triangular_solve_op.h"
-#include "paddle/fluid/platform/complex.h"
-#include "paddle/phi/kernels/funcs/lapack/lapack_function.h"
-#include "paddle/phi/kernels/math_kernel.h"
-
-namespace paddle {
-namespace operators {  // namespace operators
-
-template <typename DeviceContext, typename T>
-class CholeskySolveFunctor {
- public:
-  void operator()(const platform::DeviceContext &dev_ctx, bool upper, int n,
-                  int nrhs, T *Adata, int lda, T *Bdata, int *devInfo);
-};
-
-template <typename T>
-class CholeskySolveFunctor<paddle::platform::CPUDeviceContext, T> {
- public:
-  void operator()(const platform::CPUDeviceContext &dev_ctx, bool upper, int n,
-                  int nrhs, T *Adata, int lda, T *Bdata, int *devInfo) {
-    char uplo = upper ? 'U' : 'L';
-    phi::funcs::lapackCholeskySolve<T>(uplo, n, nrhs, Adata, lda, Bdata, lda,
-                                       devInfo);
-  }
-};
-
-template <typename DeviceContext, typename T>
-void cholesky_solve_fn(const paddle::framework::ExecutionContext &ctx,
-                       const framework::Tensor &uin,
-                       const framework::Tensor &bin, framework::Tensor *out,
-                       bool upper) {
-  const auto &dev_ctx = ctx.template device_context<DeviceContext>();
-  // framework::Tensor broadcast
-  std::vector<int64_t> u_bst_dims_vec;
-  std::vector<int64_t> b_bst_dims_vec;
-  std::tie(u_bst_dims_vec, b_bst_dims_vec) = get_broadcast_dims(uin, bin);
-  framework::Tensor u_bst(uin.type());
-  TensorExpand<T, DeviceContext>(dev_ctx, uin, &u_bst, u_bst_dims_vec);
-
-  framework::Tensor b_bst(bin.type());
-  TensorExpand<T, DeviceContext>(dev_ctx, bin, &b_bst, b_bst_dims_vec);
-
-  math::DeviceIndependenceTensorOperations<DeviceContext, T> helper(ctx);
-
-  // calculate u's conjugate for complex
-  framework::Tensor u_conj(u_bst.type());
-  platform::ForRange<DeviceContext> u_for_range(dev_ctx, u_bst.numel());
-  phi::funcs::ConjFunctor<T> u_functor(
-      u_bst.data<T>(), u_bst.numel(),
-      u_conj.mutable_data<T>(u_bst.dims(), dev_ctx.GetPlace()));
-  u_for_range(u_functor);
-  u_conj = helper.Transpose(u_conj);
-
-  // calculate b's conjugate for complex
-  framework::Tensor b_conj(b_bst.type());
-  platform::ForRange<DeviceContext> b_for_range(dev_ctx, b_bst.numel());
-  phi::funcs::ConjFunctor<T> b_functor(
-      b_bst.data<T>(), b_bst.numel(),
-      b_conj.mutable_data<T>(b_bst.dims(), dev_ctx.GetPlace()));
-  b_for_range(b_functor);
-  b_conj = helper.Transpose(b_conj);
-
-  auto ut_data = u_conj.mutable_data<T>(dev_ctx.GetPlace());
-  auto uindims = u_bst.dims();
-  auto bindims = b_bst.dims();
-  int uinrank = uindims.size();
-  int binrank = bindims.size();
-
-  int n = uindims[uinrank - 2];
-  int nrhs = bindims[binrank - 1];
-  int ldab = std::max(1, n);
-
-  // framework::Tensor out_copy(b_conj.type());
-  // out_copy.Resize(b_conj.dims());
-  framework::TensorCopy(b_conj, dev_ctx.GetPlace(), out);
-  T *out_data = out->mutable_data<T>(dev_ctx.GetPlace());
-
-  auto info_dims = phi::slice_ddim(bindims, 0, binrank - 2);
-  auto batchsize = product(info_dims);
-
-  framework::Tensor tmp;
-  std::vector<int> tmpdim(1, batchsize);
-  tmp.Resize(phi::make_ddim(tmpdim));
-  int *info = tmp.mutable_data<int>(dev_ctx.GetPlace());
-
-  CholeskySolveFunctor<DeviceContext, T> functor;
-  for (int b = 0; b < batchsize; b++) {
-    auto uin_data_item = &ut_data[b * n * n];
-    auto out_data_item = &out_data[b * n * nrhs];
-    auto info_item = &info[b];
-    functor(dev_ctx, upper, n, nrhs, uin_data_item, ldab, out_data_item,
-            info_item);
-  }
-
-  // calculate out's conjugate for complex
-  platform::ForRange<DeviceContext> out_for_range(dev_ctx, out->numel());
-  phi::funcs::ConjFunctor<T> out_functor(
-      out->data<T>(), out->numel(),
-      out->mutable_data<T>(out->dims(), dev_ctx.GetPlace()));
-  out_for_range(out_functor);
-  *out = helper.Transpose(*out);
-}
-
-template <typename DeviceContext, typename T>
-class CholeskySolveKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const paddle::framework::ExecutionContext &ctx) const override {
-    auto *uin = ctx.Input<framework::Tensor>("Y");
-    auto *bin = ctx.Input<framework::Tensor>("X");
-    auto *out = ctx.Output<framework::Tensor>("Out");
-    auto upper = ctx.Attr<bool>("upper");
-    cholesky_solve_fn<DeviceContext, T>(ctx, *uin, *bin, out, upper);
-  }
-};
-
-template <typename DeviceContext, typename T>
-class CholeskySolveGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    auto *bin = ctx.Input<framework::Tensor>("X");
-    auto *uin = ctx.Input<framework::Tensor>("Y");
-    auto *out = ctx.Input<framework::Tensor>("Out");
-    auto *dout = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
-    auto *db = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
-    auto *du = ctx.Output<framework::Tensor>(framework::GradVarName("Y"));
-    auto upper = ctx.Attr<bool>("upper");
-
-    const auto &dev_ctx = ctx.template device_context<DeviceContext>();
-    math::DeviceIndependenceTensorOperations<DeviceContext, T> helper(ctx);
-
-    std::vector<int64_t> u_bst_dims_vec;
-    std::vector<int64_t> b_bst_dims_vec;
-    std::tie(u_bst_dims_vec, b_bst_dims_vec) = get_broadcast_dims(*uin, *bin);
-    framework::Tensor u_bst(uin->type());
-    TensorExpand<T, DeviceContext>(dev_ctx, *uin, &u_bst, u_bst_dims_vec);
-
-    framework::Tensor db_bst(bin->type());
-    TensorExpand<T, DeviceContext>(dev_ctx, *bin, &db_bst, b_bst_dims_vec);
-
-    if (dout) {
-      db->mutable_data<T>(dev_ctx.GetPlace());
-      cholesky_solve_fn<DeviceContext, T>(ctx, u_bst, *dout, &db_bst, upper);
-
-      if (db_bst.dims() == db->dims()) {
-        framework::TensorCopy(db_bst, dev_ctx.GetPlace(), dev_ctx, db);
-      } else {
-        MatrixReduceSumFunctor<DeviceContext, T> functor;
-        functor(db_bst, db, ctx);
-        db->Resize(bin->dims());
-      }
-
-      auto blas = phi::funcs::GetBlas<DeviceContext, T>(ctx);
-
-      // calculate out's conjugate for complex
-      framework::Tensor out_conj(out->type());
-      platform::ForRange<DeviceContext> out_for_range(dev_ctx, out->numel());
-      phi::funcs::ConjFunctor<T> out_functor(
-          out->data<T>(), out->numel(),
-          out_conj.mutable_data<T>(out->dims(), dev_ctx.GetPlace()));
-      out_for_range(out_functor);
-      out_conj = helper.Transpose(out_conj);
-
-      framework::Tensor commonterm(out->type());
-      auto outdims = out_conj.dims();
-      auto dbdims = db_bst.dims();
-      auto mat_dim_a = phi::funcs::CreateMatrixDescriptor(outdims, 0, false);
-      auto mat_dim_b = phi::funcs::CreateMatrixDescriptor(dbdims, 0, false);
-      auto cmtdim = outdims;
-      cmtdim[cmtdim.size() - 2] = dbdims[dbdims.size() - 2];
-      commonterm.Resize(cmtdim);
-      commonterm.mutable_data<T>(dev_ctx.GetPlace());
-      blas.MatMul(db_bst, mat_dim_b, out_conj, mat_dim_a, static_cast<T>(1),
-                  &commonterm, static_cast<T>(0));
-
-      // calculate commonterm's conjugate for complex
-      framework::Tensor commonterm_conj(commonterm.type());
-      platform::ForRange<DeviceContext> commonterm_for_range(
-          dev_ctx, commonterm.numel());
-      phi::funcs::ConjFunctor<T> commonterm_functor(
-          commonterm.data<T>(), commonterm.numel(),
-          commonterm_conj.mutable_data<T>(commonterm.dims(),
-                                          dev_ctx.GetPlace()));
-      commonterm_for_range(commonterm_functor);
-      commonterm_conj = helper.Transpose(commonterm_conj);
-
-      phi::AddRawKernel<T>(
-          static_cast<const typename paddle::framework::ConvertToPhiContext<
-              DeviceContext>::TYPE &>(dev_ctx),
-          commonterm, commonterm_conj, -1, &commonterm);
-
-      auto mat_dim_u =
-          phi::funcs::CreateMatrixDescriptor(u_bst.dims(), 0, false);
-      auto mat_dim_c =
-          phi::funcs::CreateMatrixDescriptor(commonterm.dims(), 0, false);
-
-      Tensor du_bst(uin->type());
-      // get upper or lower triangular
-      du_bst.Resize(u_bst.dims());
-      du_bst.mutable_data<T>(dev_ctx.GetPlace());
-      if (upper) {
-        blas.MatMul(u_bst, mat_dim_u, commonterm, mat_dim_c, static_cast<T>(-1),
-                    &du_bst, static_cast<T>(0));
-      } else {
-        blas.MatMul(commonterm, mat_dim_c, u_bst, mat_dim_u, static_cast<T>(-1),
-                    &du_bst, static_cast<T>(0));
-      }
-
-      const auto &udims = u_bst.dims();
-      const auto H = udims[udims.size() - 2];
-      const auto W = udims[udims.size() - 1];
-      platform::ForRange<DeviceContext> x_for_range(dev_ctx, u_bst.numel());
-      TrilTriuCompute<T> tril_triu_computer(du_bst.data<T>(), 0, !upper, H, W,
-                                            u_bst.data<T>());
-      x_for_range(tril_triu_computer);
-
-      du->mutable_data<T>(dev_ctx.GetPlace());
-      if (u_bst.dims() == du->dims()) {
-        framework::TensorCopy(u_bst, dev_ctx.GetPlace(), dev_ctx, du);
-      } else {
-        MatrixReduceSumFunctor<DeviceContext, T> functor;
-        functor(u_bst, du, ctx);
-        du->Resize(uin->dims());
-      }
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/cinn/CMakeLists.txt b/paddle/fluid/operators/cinn/CMakeLists.txt
index f1247ebdf23c8e00cdbfd662a160912a769d7558..2092f65212a6a71534e1ea9a6977abc94bf97b6a 100644
--- a/paddle/fluid/operators/cinn/CMakeLists.txt
+++ b/paddle/fluid/operators/cinn/CMakeLists.txt
@@ -1,9 +1,9 @@
 include(operators)
 
 cc_library(cinn_op_helper SRCS cinn_op_helper.cc DEPS operator device_context)
-cc_library(cinn_launch_context SRCS cinn_launch_context.cc DEPS ddim lod_tensor scope proto_desc graph build_strategy parallel_executor cinn)
+cc_library(cinn_launch_context SRCS cinn_launch_context.cc DEPS ddim lod_tensor scope proto_desc graph build_strategy device_context parallel_executor cinn)
 
-SET(CINN_OP_DEPS string_helper cinn cinn_compiler cinn_op_helper cinn_launch_context)
+SET(CINN_OP_DEPS parallel_executor string_helper cinn cinn_compiler cinn_op_helper cinn_launch_context)
 register_operators(DEPS ${CINN_OP_DEPS})
 
 if (WITH_TESTING)
@@ -11,7 +11,7 @@ if (WITH_TESTING)
   set_tests_properties(cinn_launch_context_test PROPERTIES LABELS "RUN_TYPE=CINN")
 
   SET(CINN_RUN_ENVIRONMENT "OMP_NUM_THREADS=1;runtime_include_dir=${PADDLE_BINARY_DIR}/third_party/CINN/src/external_cinn/cinn/runtime/cuda")
-  cc_test(cinn_launch_op_test SRCS cinn_launch_op_test.cc DEPS cinn_compiler cinn_launch_op elementwise_add_op)
+  cc_test(cinn_launch_op_test SRCS cinn_launch_op_test.cc DEPS cinn_compiler cinn_launch_op cinn_instruction_run_op elementwise_add_op gflags)
   set_tests_properties(cinn_launch_op_test PROPERTIES LABELS "RUN_TYPE=CINN" ENVIRONMENT "${CINN_RUN_ENVIRONMENT}")
 
   cc_test(cinn_instruction_run_op_test SRCS cinn_instruction_run_op_test.cc DEPS cinn_compiler cinn_launch_op cinn_instruction_run_op elementwise_add_op)
diff --git a/paddle/fluid/operators/cinn/cinn_instruction_run_op_test.cc b/paddle/fluid/operators/cinn/cinn_instruction_run_op_test.cc
index 7c4bdc09a569e455b20febef278003ada923dd79..0edbee534c0b5d680717250e7702f272eacd0272 100644
--- a/paddle/fluid/operators/cinn/cinn_instruction_run_op_test.cc
+++ b/paddle/fluid/operators/cinn/cinn_instruction_run_op_test.cc
@@ -22,11 +22,17 @@ limitations under the License. */
 #include "paddle/fluid/platform/cpu_helper.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/init.h"
+#include "paddle/phi/core/kernel_registry.h"
 
 USE_OP(cinn_launch);
 USE_OP(cinn_instruction_run);
 USE_OP_ITSELF(elementwise_add);
 
+PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT);
+#ifdef PADDLE_WITH_CUDA
+PD_DECLARE_KERNEL(add, GPU, ALL_LAYOUT);
+#endif
+
 namespace paddle::operators {
 
 using framework::paddle2cinn::CinnCompiler;
@@ -50,7 +56,7 @@ TEST(CinnInstructionOpTest, TestWithElementwiseAdd) {
   auto cinn_instruction_run_op = paddle::framework::OpRegistry::CreateOp(
       "cinn_instruction_run", {{"X", {"x", "y"}}},
       {{"Out", {test_op_out_name}}},
-      {{"cached_index", 0}, {"instruction_index", 1}});
+      {{"cached_index", 0}, {"instruction_index", 0}});
   auto elementwise_add_op = paddle::framework::OpRegistry::CreateOp(
       "elementwise_add", {{"X", {"x"}}, {"Y", {"y"}}},
       {{"Out", {add_op_out_name}}}, {{}});
diff --git a/paddle/fluid/operators/cinn/cinn_launch_context.cc b/paddle/fluid/operators/cinn/cinn_launch_context.cc
index 0a21d937aa1a70120e6112cdb291aa41eb222bb3..b76dd60409221eef9204f26319dabb20db4a36ac 100644
--- a/paddle/fluid/operators/cinn/cinn_launch_context.cc
+++ b/paddle/fluid/operators/cinn/cinn_launch_context.cc
@@ -31,6 +31,7 @@
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/operators/cinn/cinn_op_helper.h"
+#include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/string/printf.h"
 #include "paddle/phi/core/ddim.h"
@@ -90,9 +91,30 @@ CinnLaunchContext::CinnLaunchContext(const framework::ir::Graph& graph,
   // Convert the CINN runtime program to a Paddle graph
   runtime_graph_ = std::make_unique<framework::ir::Graph>(
       BuildCompiledProgram(graph, compiled_obj));
-  runtime_graph_->SetNotOwned<Name2VarInfoMap>(
-      kMemOptVarInfoFromMainGraph,
-      &graph.Get<Name2VarInfoMap>(kMemOptVarInfoFromMainGraph));
+  auto& outer_varinfo = graph.Get<Name2VarInfoMap>(kMemOptVarInfoFromMainGraph);
+  runtime_graph_->SetNotOwned<Name2VarInfoMap>(kMemOptVarInfoFromMainGraph,
+                                               &outer_varinfo);
+  // collect skip_eager_vars
+  skip_eager_vars_.reserve(input_var_names.size() + output_var_names.size());
+  auto add_skip_var_fn = [&outer_varinfo, this](const std::string& var_name) {
+    // if a var exists at outer_varinfo map,
+    // that means it can be erased after graph execution
+    if (!outer_varinfo.count(var_name)) {
+      skip_eager_vars_.emplace_back(var_name);
+    }
+  };
+  std::for_each(input_var_names.begin(), input_var_names.end(),
+                add_skip_var_fn);
+  std::for_each(output_var_names.begin(), output_var_names.end(),
+                add_skip_var_fn);
+  VLOG(4) << string::Sprintf(
+      "Distribution of variables in the graph compiled:"
+      "input[%lu],internal[%lu],output[%lu],"
+      "outer_eager_deletion[%lu],skip_eager_deletion[%lu],"
+      "initialized_beforehand[%lu]",
+      input_var_names.size(), internal_var_names_.size(),
+      output_var_names.size(), outer_varinfo.size(), skip_eager_vars_.size(),
+      initialized_beforehand_vars_.size());
 }
 
 void CinnLaunchContext::BuildVarNameMap(
@@ -288,6 +310,7 @@ framework::ProgramDesc CinnLaunchContext::BuildCompiledProgram(
   //   are set by values of the corresponding compiled tensors,
   //   including the in/out variables where the equiality between their tensors
   //   and the CINN compiled ones is verified in corresponding cinn_launch_op.
+  std::unordered_set<std::string> has_refer_vars;
   for (auto&& arg : cinn_argument_names_) {
     const std::string& var_name = cinn2paddle_varmap_.at(arg);
     framework::VarDesc* var_desc = block->Var(var_name);
@@ -298,6 +321,7 @@ framework::ProgramDesc CinnLaunchContext::BuildCompiledProgram(
       auto* ori_desc = res->second;
       var_desc->SetPersistable(ori_desc->Persistable());
       var_desc->SetIsParameter(ori_desc->IsParameter());
+      has_refer_vars.insert(var_name);
     }
 
     auto cinn_tensor = GetCinnTensorOfVar(var_name);
@@ -331,6 +355,12 @@ framework::ProgramDesc CinnLaunchContext::BuildCompiledProgram(
     auto* ins = instructions.at(ins_idx).get();
     auto in_args = trans_and_pack_args_fn(ins->GetInArgs());
     auto out_args = trans_and_pack_args_fn(ins->GetOutArgs());
+    for (auto&& var_name : in_args) {
+      if (!has_refer_vars.count(var_name)) {
+        initialized_beforehand_vars_.emplace_back(var_name);
+      }
+    }
+    has_refer_vars.insert(out_args.begin(), out_args.end());
 
     auto* op_desc = block->AppendOp();
     op_desc->SetType("cinn_instruction_run");
@@ -348,16 +378,26 @@ ParallelExecutor* CinnLaunchContext::InitializePE(const platform::Place& place,
                                                   framework::Scope* scope) {
   if (!parallel_executor_) {
     framework::details::ExecutionStrategy exec_strategy;
+    exec_strategy.num_threads_ = 1;
+    exec_strategy.use_device_ = platform::Place2DeviceType(place);
     framework::details::BuildStrategy build_strategy;
     parallel_executor_ = std::make_unique<ParallelExecutor>(
         place, scope, exec_strategy, build_strategy, runtime_graph_.get());
   }
 
   // update the scope bound to an OpHandle and rebuild temporary variables
+  VLOG(4) << "Reset scope and initialize temporary variables";
   std::unordered_map<Scope*, Scope*> scope_map = {
       {parallel_executor_->GetLocalScopes().front(), scope}};
   parallel_executor_->ResetOpHandleScopeMapOfGraphs(scope_map);
   parallel_executor_->PrepareVariables(scope);
+  for (auto&& var_name : initialized_beforehand_vars_) {
+    auto* var = scope->GetVar(var_name);
+    auto* buffer = GetCinnBufferOfVar(var_name);
+    auto dim = framework::DDim(buffer->dims, buffer->dimensions);
+    var->GetMutable<LoDTensor>()->Resize(dim);
+    var->GetMutable<LoDTensor>()->mutable_data<float>(place);
+  }
   return parallel_executor_.get();
 }
 
diff --git a/paddle/fluid/operators/cinn/cinn_launch_context.h b/paddle/fluid/operators/cinn/cinn_launch_context.h
index a4d613ea618a886d99344a34ad80aa02e88c10e7..ed5e4383d83d23322860e3f554160013fd5532c9 100644
--- a/paddle/fluid/operators/cinn/cinn_launch_context.h
+++ b/paddle/fluid/operators/cinn/cinn_launch_context.h
@@ -86,6 +86,11 @@ class CinnLaunchContext {
   void CheckTensorEquivalent(const std::string& var_name,
                              const framework::LoDTensor& paddle_tensor);
 
+  // Return the name list of variables skipped eager deletion
+  const std::vector<std::string>& GetSkipEagerVars() const {
+    return skip_eager_vars_;
+  }
+
   // Return internal variable names list
   const std::unordered_set<std::string>& GetInternalVarNames() const {
     return internal_var_names_;
@@ -143,6 +148,9 @@ class CinnLaunchContext {
   std::unordered_set<std::string> internal_var_names_;
   // the names of the cinn arguments used in compiled executable program
   std::unordered_set<std::string> cinn_argument_names_;
+  // TODO(CtfGo): remove this list after fixing batch_norm bug
+  // due to duplicate association in the same variable.
+  std::vector<std::string> initialized_beforehand_vars_;
   // the variable scope compiled from cinn
   const std::shared_ptr<CinnScope> cinn_scope_;
 
@@ -150,6 +158,8 @@ class CinnLaunchContext {
   std::unique_ptr<framework::ir::Graph> runtime_graph_;
   // a ParallelExecutor to execute the runtime graph
   std::unique_ptr<framework::ParallelExecutor> parallel_executor_;
+  // the name list of skip_eager_vars in runtime
+  std::vector<std::string> skip_eager_vars_;
 
   // because a cinn_pod_value_t does not own a cinn_buffer_t object,
   // an extra stroage is necessary to keep those objects and they can
diff --git a/paddle/fluid/operators/cinn/cinn_launch_op.h b/paddle/fluid/operators/cinn/cinn_launch_op.h
index cf3b98c6679b80acad8da69c91addadb9f66ce44..5263aae03ed3f1ab6afa4eb9e6bd38f61858b397 100644
--- a/paddle/fluid/operators/cinn/cinn_launch_op.h
+++ b/paddle/fluid/operators/cinn/cinn_launch_op.h
@@ -103,8 +103,8 @@ class CinnLaunchOpKernel : public framework::OpKernel<T> {
     details::DebugCinnCompiledResult(cinn_compiled_object);
 
     auto* launch_context = cinn_compiled_object.launch_context.get();
-    // Step 3. Prepare arguments needed for the compiled executable program.
-    launch_context->UpdateCapturedEnv(scope, place);
+    // Step 3. check the computational consistency of the subgraph
+    //         before and after the compilation
     // 3.1 Input variables: tensors of input variables have
     //     been initialized before graph compiled, just check the
     //     equiality between tensors of paddle and cinn.
@@ -120,20 +120,15 @@ class CinnLaunchOpKernel : public framework::OpKernel<T> {
                                             *inputs_name2tensor.at(var_name));
     }
 
-    // 3.2 Output variables: the output variables will be initialized
-    //     and allocated buffer in callbacks which are defined in the
-    //     external_malloc/free interface of cinn_buffer_t
-    //     in their corresponding arguments.
-    // 3.3 Internal variables: A temporary scope is created in
-    //     UpdateCapturedEnv to keep the internal variables and
-    //     they are also initialized through callbacks
-
     // Step 4. Set CINN runtime FLAGS, such as FLAGS_cinn_cudnn_deterministic.
     details::SetCinnRuntimeFlags();
 
-    // Step 5. Launch CINN to execute the compiled executable program
-    VLOG(4) << "Run Cinn compiled executable program with stream: " << stream;
-    details::LaunchCinnExecution(cinn_compiled_object, *launch_context, stream);
+    // Step 5. use PE to execute the compiled CINN instructions
+    //         in nodes of the runtime graph
+    VLOG(4) << "Execute the runtime graph by PE";
+    framework::Scope& exec_scope = scope.NewScope();
+    auto* pe = launch_context->InitializePE(place, &exec_scope);
+    pe->RunWithoutFetch(launch_context->GetSkipEagerVars());
     VLOG(4) << "CinnLaunchOp launch execution done.";
   }
 };
diff --git a/paddle/fluid/operators/cinn/cinn_launch_op_test.cc b/paddle/fluid/operators/cinn/cinn_launch_op_test.cc
index f5b6161ff3462cc1f12df7f59b4709bf19032df2..585f1caabed051134fd5ce7624c17b741b487ef0 100644
--- a/paddle/fluid/operators/cinn/cinn_launch_op_test.cc
+++ b/paddle/fluid/operators/cinn/cinn_launch_op_test.cc
@@ -17,6 +17,7 @@ limitations under the License. */
 #include <mutex>
 #include <random>
 #include <string>
+#include "gflags/gflags.h"
 #include "gtest/gtest.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/paddle2cinn/cinn_compiler.h"
@@ -25,9 +26,17 @@ limitations under the License. */
 #include "paddle/fluid/platform/cpu_helper.h"
 #include "paddle/fluid/platform/init.h"
 #include "paddle/phi/core/ddim.h"
+#include "paddle/phi/core/kernel_registry.h"
 
 USE_OP(cinn_launch);
+USE_OP(cinn_instruction_run);
 USE_OP_ITSELF(elementwise_add);
+DECLARE_double(eager_delete_tensor_gb);
+
+PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT);
+#ifdef PADDLE_WITH_CUDA
+PD_DECLARE_KERNEL(add, GPU, ALL_LAYOUT);
+#endif
 
 namespace paddle::operators {
 
@@ -61,6 +70,7 @@ TEST(CinnLaunchOpTest, TestWithElementwiseAdd) {
     CompareOpResult<float>(scope.GetVar(test_op_out_name),
                            scope.GetVar(add_op_out_name));
   };
+  FLAGS_eager_delete_tensor_gb = -1;
 
   // CPU
   run_and_check_fn(platform::CPUPlace());
diff --git a/paddle/fluid/operators/collective/c_allgather_op_mlu.cc b/paddle/fluid/operators/collective/c_allgather_op_mlu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f29bc57c9a5f4dbbfd53220ce187b386b3025e55
--- /dev/null
+++ b/paddle/fluid/operators/collective/c_allgather_op_mlu.cc
@@ -0,0 +1,81 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/collective/c_allgather_op.h"
+
+#if defined(PADDLE_WITH_CNCL)
+#include "paddle/fluid/platform/collective_helper.h"
+#include "paddle/fluid/platform/device/mlu/cncl_helper.h"
+#endif
+#include "paddle/fluid/framework/convert_utils.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+class CAllGatherOpMLUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+#if defined(PADDLE_WITH_CNCL)
+    auto x = ctx.Input<framework::Tensor>("X");
+    auto out = ctx.Output<framework::Tensor>("Out");
+    cnclDataType_t dtype =
+        platform::ToCNCLDataType(framework::TransToProtoVarType(x->dtype()));
+
+    int nranks = ctx.Attr<int>("nranks");
+    int rid = ctx.Attr<int>("ring_id");
+    auto place = ctx.GetPlace();
+    auto comm = platform::CNCLCommContext::Instance().Get(rid, place);
+    PADDLE_ENFORCE_EQ(
+        nranks, comm->nranks(),
+        platform::errors::InvalidArgument("nranks: %s should equal to %s",
+                                          nranks, comm->nranks()));
+
+    framework::DDim out_dims = x->dims();
+    out_dims[0] *= nranks;
+    out->mutable_data<T>(out_dims, place);
+
+    uint32_t send_numel = x->numel();
+    void* send_buff = reinterpret_cast<void*>(const_cast<T*>(x->data<T>()));
+    void* recv_buff = reinterpret_cast<void*>(out->data<T>());
+
+    mluStream stream = nullptr;
+    if (ctx.Attr<bool>("use_calc_stream")) {
+      auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
+      stream = static_cast<platform::MLUDeviceContext*>(dev_ctx)->stream();
+    } else {
+      stream = comm->stream();
+    }
+
+    PADDLE_ENFORCE_MLU_SUCCESS(cnclAllGather(send_buff, recv_buff, send_numel,
+                                             dtype, comm->comm(), stream));
+#else
+    PADDLE_THROW(platform::errors::PreconditionNotMet(
+        "PaddlePaddle should compile with MLU."));
+#endif
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+REGISTER_OP_MLU_KERNEL(c_allgather, ops::CAllGatherOpMLUKernel<float>,
+                       ops::CAllGatherOpMLUKernel<uint8_t>,
+                       ops::CAllGatherOpMLUKernel<int>,
+                       ops::CAllGatherOpMLUKernel<int8_t>,
+                       ops::CAllGatherOpMLUKernel<int16_t>,
+                       ops::CAllGatherOpMLUKernel<plat::float16>);
diff --git a/paddle/fluid/operators/collective/c_allgather_op_npu_test.cc b/paddle/fluid/operators/collective/c_allgather_op_npu_test.cc
index c0968581acda9950aaa8ee2b8f3af15e1db59a67..7206dd01bcaa3e588cc275c2fdf25e70aacc1663 100644
--- a/paddle/fluid/operators/collective/c_allgather_op_npu_test.cc
+++ b/paddle/fluid/operators/collective/c_allgather_op_npu_test.cc
@@ -26,7 +26,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/operators/dropout_op.h"
 #include "paddle/fluid/string/printf.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
diff --git a/paddle/fluid/operators/collective/c_allreduce_max_op_npu_test.cc b/paddle/fluid/operators/collective/c_allreduce_max_op_npu_test.cc
index 31b00a93f1396564907a7872e919ba6c96f666d8..0946ad8aca65e28835ea1d139fb94c309ce840a1 100644
--- a/paddle/fluid/operators/collective/c_allreduce_max_op_npu_test.cc
+++ b/paddle/fluid/operators/collective/c_allreduce_max_op_npu_test.cc
@@ -26,7 +26,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/operators/dropout_op.h"
 #include "paddle/fluid/string/printf.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
diff --git a/paddle/fluid/operators/collective/c_allreduce_op.h b/paddle/fluid/operators/collective/c_allreduce_op.h
index 7e5120cd2b392b1eb0698727ccebac485193f6d9..2c4e85400ca4adadce5db1fd318ce2273caa201f 100644
--- a/paddle/fluid/operators/collective/c_allreduce_op.h
+++ b/paddle/fluid/operators/collective/c_allreduce_op.h
@@ -413,7 +413,7 @@ class CAllReduceOpMLUKernel : public framework::OpKernel<T> {
 
     auto place = ctx.GetPlace();
     cnclDataType_t dtype =
-        platform::ToCNCLDataType(framework::TransToProtoVarType(in->type()));
+        platform::ToCNCLDataType(framework::TransToProtoVarType(in->dtype()));
     int64_t numel = in->numel();
     const void* sendbuff = in->data<T>();
     out->Resize(in->dims());
diff --git a/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc b/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc
index 9c11704704ed420b14a6ccd9873e0bfbe143b4fe..61e5f27903477972ef10465ccfd6f8de8ce8fba6 100644
--- a/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc
+++ b/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc
@@ -26,7 +26,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/operators/dropout_op.h"
 #include "paddle/fluid/string/printf.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
diff --git a/paddle/fluid/operators/collective/c_broadcast_op_mlu.cc b/paddle/fluid/operators/collective/c_broadcast_op_mlu.cc
index d315f211709e4f76c2d5c685721961a91c2102fe..d1e269fb5a4fe9505acf7043bc7a2cea36823ffa 100644
--- a/paddle/fluid/operators/collective/c_broadcast_op_mlu.cc
+++ b/paddle/fluid/operators/collective/c_broadcast_op_mlu.cc
@@ -31,7 +31,7 @@ class CBroadcastOPMLUKernel : public framework::OpKernel<T> {
     auto out = ctx.Output<framework::LoDTensor>("Out");
     int numel = x->numel();
     cnclDataType_t dtype =
-        platform::ToCNCLDataType(framework::TransToProtoVarType(x->type()));
+        platform::ToCNCLDataType(framework::TransToProtoVarType(x->dtype()));
 
     int rid = ctx.Attr<int>("ring_id");
     auto place = ctx.GetPlace();
diff --git a/paddle/fluid/operators/collective/c_broadcast_op_npu_test.cc b/paddle/fluid/operators/collective/c_broadcast_op_npu_test.cc
index 5787090e6a52f2f37bd504a904108cd1d24caf5f..cf4d6a28744b368212fe8bcb0924001aa53b5a4e 100644
--- a/paddle/fluid/operators/collective/c_broadcast_op_npu_test.cc
+++ b/paddle/fluid/operators/collective/c_broadcast_op_npu_test.cc
@@ -26,7 +26,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/operators/dropout_op.h"
 #include "paddle/fluid/string/printf.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
diff --git a/paddle/fluid/operators/collective/c_reduce_sum_op_npu_test.cc b/paddle/fluid/operators/collective/c_reduce_sum_op_npu_test.cc
index c79b2f92b69a1e6cc5c6f1cf17fa402c671a1997..c4e410d04da5fb5e9b6bfe4d7d5c263084889f54 100644
--- a/paddle/fluid/operators/collective/c_reduce_sum_op_npu_test.cc
+++ b/paddle/fluid/operators/collective/c_reduce_sum_op_npu_test.cc
@@ -26,7 +26,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/operators/dropout_op.h"
 #include "paddle/fluid/string/printf.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
diff --git a/paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc b/paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc
index d9a7a4abb08fc883b9b9210fcdefd56af127263a..8b498787c69db0f978acaa68ba63883270e11eb4 100644
--- a/paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc
+++ b/paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc
@@ -26,7 +26,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/operators/dropout_op.h"
 #include "paddle/fluid/string/printf.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
diff --git a/paddle/fluid/operators/collective/c_sync_comm_stream_op_npu_test.cc b/paddle/fluid/operators/collective/c_sync_comm_stream_op_npu_test.cc
index b8abf458c1c6d395fef08238abaa114ff5dc6e9e..133085ad3f3b0ffd00dbf4d026687b0311116951 100644
--- a/paddle/fluid/operators/collective/c_sync_comm_stream_op_npu_test.cc
+++ b/paddle/fluid/operators/collective/c_sync_comm_stream_op_npu_test.cc
@@ -26,7 +26,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/operators/dropout_op.h"
 #include "paddle/fluid/string/printf.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
diff --git a/paddle/fluid/operators/collective/checknumeric_npu_test.cc b/paddle/fluid/operators/collective/checknumeric_npu_test.cc
index bb78971734bf05e94f7b0ebc1f1540b254f98067..36c6f4fadd0fcc9b06c61d5c45ce6829f2d3d977 100644
--- a/paddle/fluid/operators/collective/checknumeric_npu_test.cc
+++ b/paddle/fluid/operators/collective/checknumeric_npu_test.cc
@@ -27,7 +27,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/operators/dropout_op.h"
 #include "paddle/fluid/string/printf.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
diff --git a/paddle/fluid/operators/collective/recv_v2_op_npu_test.cc b/paddle/fluid/operators/collective/recv_v2_op_npu_test.cc
index 8f7b8c4a9040be3a2b4540c693c128e92c06a180..6e02d362156970cdee7257c7d00b70cef0519757 100644
--- a/paddle/fluid/operators/collective/recv_v2_op_npu_test.cc
+++ b/paddle/fluid/operators/collective/recv_v2_op_npu_test.cc
@@ -26,7 +26,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/operators/dropout_op.h"
 #include "paddle/fluid/string/printf.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
diff --git a/paddle/fluid/operators/collective/send_v2_op_npu_test.cc b/paddle/fluid/operators/collective/send_v2_op_npu_test.cc
index c40b2c3e76a02ce6e5e754b2dc4280d6917145e7..57e3dd53cc7748fa0fb66e7e934a1c9cd764a15f 100644
--- a/paddle/fluid/operators/collective/send_v2_op_npu_test.cc
+++ b/paddle/fluid/operators/collective/send_v2_op_npu_test.cc
@@ -25,7 +25,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/operators/dropout_op.h"
 #include "paddle/fluid/string/printf.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
diff --git a/paddle/fluid/operators/concat_op.cc b/paddle/fluid/operators/concat_op.cc
index 1da7798ea2696516759ac49b8ce459459e74066b..059fafa3e7f4d4ff0dac7541038d62e03865529f 100644
--- a/paddle/fluid/operators/concat_op.cc
+++ b/paddle/fluid/operators/concat_op.cc
@@ -205,8 +205,8 @@ class ConcatDoubleGradOpMaker : public framework::SingleGradOpMaker<T> {
 
 namespace ops = paddle::operators;
 
-DELCARE_INFER_SHAPE_FUNCTOR(concat, ConcatInferShapeFunctor,
-                            PT_INFER_META(phi::ConcatInferMeta));
+DECLARE_INFER_SHAPE_FUNCTOR(concat, ConcatInferShapeFunctor,
+                            PD_INFER_META(phi::ConcatInferMeta));
 
 REGISTER_OPERATOR(concat, ops::ConcatOp, ops::ConcatOpMaker,
                   ops::ConcatGradOpMaker<paddle::framework::OpDesc>,
diff --git a/paddle/fluid/operators/conj_op.cc b/paddle/fluid/operators/conj_op.cc
index 95135ba3b1a3db156cd80629296481470b11f937..cbec1182f20b886fb4a77847abf7213aec9990a5 100644
--- a/paddle/fluid/operators/conj_op.cc
+++ b/paddle/fluid/operators/conj_op.cc
@@ -66,8 +66,8 @@ class ConjGradMaker : public framework::SingleGradOpMaker<T> {
 
 namespace ops = paddle::operators;
 
-DELCARE_INFER_SHAPE_FUNCTOR(conj, ConjInferShapeFunctor,
-                            PT_INFER_META(phi::UnchangedInferMeta));
+DECLARE_INFER_SHAPE_FUNCTOR(conj, ConjInferShapeFunctor,
+                            PD_INFER_META(phi::UnchangedInferMeta));
 REGISTER_OPERATOR(conj, ops::ConjOp, ops::ConjOpMaker,
                   ops::ConjGradMaker<paddle::framework::OpDesc>,
                   ops::ConjGradMaker<paddle::imperative::OpBase>,
diff --git a/paddle/fluid/operators/controlflow/CMakeLists.txt b/paddle/fluid/operators/controlflow/CMakeLists.txt
index a974f2ec335487e0fbc12a578c0d80d6856e418e..0c18522fa32eae5f357da062fbd25fa92878cc08 100644
--- a/paddle/fluid/operators/controlflow/CMakeLists.txt
+++ b/paddle/fluid/operators/controlflow/CMakeLists.txt
@@ -19,6 +19,6 @@ else()
     target_link_libraries(conditional_block_infer_op conditional_block_op)
 endif()
 
-file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(equal_all);\nUSE_NO_KERNEL_OP(read_from_array);\n")
+file(APPEND ${pybind_file} "USE_OP_ITSELF(less_than);\nUSE_OP_ITSELF(equal_all);\nUSE_NO_KERNEL_OP(read_from_array);\n")
 file(APPEND ${pybind_file} "USE_OP_ITSELF(logical_and);\nUSE_OP_ITSELF(logical_or);\nUSE_OP_ITSELF(logical_xor);\nUSE_OP_ITSELF(logical_not);\n")
-file(APPEND ${pybind_file} "USE_OP(bitwise_and);\nUSE_OP(bitwise_or);\nUSE_OP(bitwise_xor);\nUSE_OP(bitwise_not);\n")
+file(APPEND ${pybind_file} "USE_OP_ITSELF(bitwise_and);\nUSE_OP_ITSELF(bitwise_or);\nUSE_OP_ITSELF(bitwise_xor);\nUSE_OP_ITSELF(bitwise_not);\n")
diff --git a/paddle/fluid/operators/controlflow/bitwise_op.cc b/paddle/fluid/operators/controlflow/bitwise_op.cc
index 55cab03ea9e3f18f36043848914ac11fac1027c9..4dcbbc8568ff18a1313171f8f66f276d77f019a1 100644
--- a/paddle/fluid/operators/controlflow/bitwise_op.cc
+++ b/paddle/fluid/operators/controlflow/bitwise_op.cc
@@ -12,11 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/controlflow/bitwise_op.h"
 #include <algorithm>
 #include <string>
 #include <vector>
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
 
 namespace paddle {
 namespace operators {
@@ -75,11 +75,19 @@ It operates ``%s`` on Tensor ``X`` .
   }
 };
 
-class BitwiseOp : public framework::OperatorWithKernel {
+template <typename OpComment>
+class UnaryBitwiseOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
  protected:
+  void InferShape(framework::InferShapeContext *context) const override {
+    OpComment comment;
+    OP_INOUT_CHECK(context->HasInput("X"), "Input", "X", comment.type);
+    context->SetOutputDim("Out", context->GetInputDim("X"));
+    context->ShareLoD("X", "Out");
+  }
+
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
     framework::OpKernelType kt = OperatorWithKernel::GetExpectedKernelType(ctx);
@@ -90,23 +98,9 @@ class BitwiseOp : public framework::OperatorWithKernel {
 };
 
 template <typename OpComment>
-class UnaryBitwiseOp : public BitwiseOp {
- public:
-  using BitwiseOp::BitwiseOp;
-
- protected:
-  void InferShape(framework::InferShapeContext *context) const override {
-    OpComment comment;
-    OP_INOUT_CHECK(context->HasInput("X"), "Input", "X", comment.type);
-    context->SetOutputDim("Out", context->GetInputDim("X"));
-    context->ShareLoD("X", "Out");
-  }
-};
-
-template <typename OpComment>
-class BinaryBitwiseOp : public BitwiseOp {
+class BinaryBitwiseOp : public framework::OperatorWithKernel {
  public:
-  using BitwiseOp::BitwiseOp;
+  using framework::OperatorWithKernel::OperatorWithKernel;
 
  protected:
   void InferShape(framework::InferShapeContext *context) const override {
@@ -130,6 +124,14 @@ class BinaryBitwiseOp : public BitwiseOp {
     }
     context->ShareLoD("X", "Out");
   }
+
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    framework::OpKernelType kt = OperatorWithKernel::GetExpectedKernelType(ctx);
+    // BitwiseOp kernel's device type is decided by input tensor place
+    kt.place_ = ctx.Input<framework::LoDTensor>("X")->place();
+    return kt;
+  }
 };
 
 }  // namespace operators
@@ -167,8 +169,3 @@ REGISTER_BINARY_BITWISE_OP(bitwise_and, "Out = X \\& Y");
 REGISTER_BINARY_BITWISE_OP(bitwise_or, "Out = X | Y");
 REGISTER_BINARY_BITWISE_OP(bitwise_xor, "Out = X ^\\wedge Y");
 REGISTER_UNARY_BITWISE_OP(bitwise_not, "Out = \\sim X");
-
-REGISTER_BINARY_BITWISE_KERNEL(bitwise_and, CPU, ops::BitwiseAndFunctor);
-REGISTER_BINARY_BITWISE_KERNEL(bitwise_or, CPU, ops::BitwiseOrFunctor);
-REGISTER_BINARY_BITWISE_KERNEL(bitwise_xor, CPU, ops::BitwiseXorFunctor);
-REGISTER_UNARY_BITWISE_KERNEL(bitwise_not, CPU, ops::BitwiseNotFunctor);
diff --git a/paddle/fluid/operators/controlflow/bitwise_op.cu b/paddle/fluid/operators/controlflow/bitwise_op.cu
deleted file mode 100644
index 5d98da2c027fb6ee681bbea3980f1dbf631d6431..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/controlflow/bitwise_op.cu
+++ /dev/null
@@ -1,74 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/controlflow/bitwise_op.h"
-#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename Functor>
-class BinaryBitwiseOpKernel<platform::CUDADeviceContext, Functor>
-    : public framework::OpKernel<typename Functor::ELEM_TYPE> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    using T = typename Functor::ELEM_TYPE;
-
-    auto* x = ctx.Input<framework::Tensor>("X");
-    auto* y = ctx.Input<framework::Tensor>("Y");
-    auto* out = ctx.Output<framework::Tensor>("Out");
-    out->mutable_data<T>(ctx.GetPlace());
-
-    auto functor = Functor();
-    std::vector<const framework::Tensor*> ins = {x, y};
-    std::vector<framework::Tensor*> outs = {out};
-    const auto& cuda_ctx =
-        ctx.template device_context<platform::CUDADeviceContext>();
-    paddle::operators::LaunchElementwiseCudaKernel<ElementwiseType::kBinary, T,
-                                                   T>(cuda_ctx, ins, &outs, -1,
-                                                      functor);
-  }
-};
-
-template <typename Functor>
-class UnaryBitwiseOpKernel<platform::CUDADeviceContext, Functor>
-    : public framework::OpKernel<typename Functor::ELEM_TYPE> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    using T = typename Functor::ELEM_TYPE;
-
-    auto* x = ctx.Input<framework::Tensor>("X");
-    auto* out = ctx.Output<framework::Tensor>("Out");
-    out->mutable_data<T>(ctx.GetPlace());
-
-    auto functor = Functor();
-    std::vector<const framework::Tensor*> ins = {x};
-    std::vector<framework::Tensor*> outs = {out};
-    const auto& cuda_ctx =
-        ctx.template device_context<platform::CUDADeviceContext>();
-    paddle::operators::LaunchSameDimsElementwiseCudaKernel<T>(cuda_ctx, ins,
-                                                              &outs, functor);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = ::paddle::operators;
-namespace plat = ::paddle::platform;
-
-REGISTER_BINARY_BITWISE_KERNEL(bitwise_and, CUDA, ops::BitwiseAndFunctor);
-REGISTER_BINARY_BITWISE_KERNEL(bitwise_or, CUDA, ops::BitwiseOrFunctor);
-REGISTER_BINARY_BITWISE_KERNEL(bitwise_xor, CUDA, ops::BitwiseXorFunctor);
-REGISTER_UNARY_BITWISE_KERNEL(bitwise_not, CUDA, ops::BitwiseNotFunctor);
diff --git a/paddle/fluid/operators/controlflow/bitwise_op.h b/paddle/fluid/operators/controlflow/bitwise_op.h
deleted file mode 100644
index 9e652f92007479684fcf8ec5e539312d8d729107..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/controlflow/bitwise_op.h
+++ /dev/null
@@ -1,112 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <math.h>
-#include <type_traits>
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
-#include "paddle/fluid/platform/transform.h"
-
-namespace paddle {
-namespace operators {
-
-#define BITWISE_BINARY_FUNCTOR(func, expr, bool_expr)                        \
-  template <typename T>                                                      \
-  struct Bitwise##func##Functor {                                            \
-    using ELEM_TYPE = T;                                                     \
-    HOSTDEVICE T operator()(const T a, const T b) const { return a expr b; } \
-  };                                                                         \
-                                                                             \
-  template <>                                                                \
-  struct Bitwise##func##Functor<bool> {                                      \
-    using ELEM_TYPE = bool;                                                  \
-    HOSTDEVICE bool operator()(const bool a, const bool b) const {           \
-      return a bool_expr b;                                                  \
-    }                                                                        \
-  };
-
-BITWISE_BINARY_FUNCTOR(And, &, &&)
-BITWISE_BINARY_FUNCTOR(Or, |, ||)
-BITWISE_BINARY_FUNCTOR(Xor, ^, !=)
-#undef BITWISE_BINARY_FUNCTOR
-
-template <typename T>
-struct BitwiseNotFunctor {
-  using ELEM_TYPE = T;
-  HOSTDEVICE T operator()(const T a) const { return ~a; }
-};
-
-template <>
-struct BitwiseNotFunctor<bool> {
-  using ELEM_TYPE = bool;
-  HOSTDEVICE bool operator()(const bool a) const { return !a; }
-};
-
-template <typename DeviceContext, typename Functor>
-class BinaryBitwiseOpKernel
-    : public framework::OpKernel<typename Functor::ELEM_TYPE> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    using T = typename Functor::ELEM_TYPE;
-    auto func = Functor();
-    auto* x = context.Input<framework::Tensor>("X");
-    auto* y = context.Input<framework::Tensor>("Y");
-    auto* out = context.Output<framework::Tensor>("Out");
-    ElementwiseComputeEx<Functor, DeviceContext, T>(context, x, y, -1, func,
-                                                    out);
-  }
-};
-
-template <typename DeviceContext, typename Functor>
-class UnaryBitwiseOpKernel
-    : public framework::OpKernel<typename Functor::ELEM_TYPE> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    using T = typename Functor::ELEM_TYPE;
-    auto func = Functor();
-    auto* x = context.Input<framework::Tensor>("X");
-    auto* out = context.Output<framework::Tensor>("Out");
-    platform::Transform<DeviceContext> trans;
-    trans(context.template device_context<DeviceContext>(), x->data<T>(),
-          x->data<T>() + x->numel(), out->mutable_data<T>(context.GetPlace()),
-          func);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = ::paddle::operators;
-namespace plat = ::paddle::platform;
-
-#define REGISTER_BINARY_BITWISE_KERNEL(op_type, dev, functor)                 \
-  REGISTER_OP_##dev##_KERNEL(                                                 \
-      op_type,                                                                \
-      ops::BinaryBitwiseOpKernel<plat::dev##DeviceContext, functor<bool>>,    \
-      ops::BinaryBitwiseOpKernel<plat::dev##DeviceContext, functor<uint8_t>>, \
-      ops::BinaryBitwiseOpKernel<plat::dev##DeviceContext, functor<int8_t>>,  \
-      ops::BinaryBitwiseOpKernel<plat::dev##DeviceContext, functor<int16_t>>, \
-      ops::BinaryBitwiseOpKernel<plat::dev##DeviceContext, functor<int>>,     \
-      ops::BinaryBitwiseOpKernel<plat::dev##DeviceContext, functor<int64_t>>);
-
-#define REGISTER_UNARY_BITWISE_KERNEL(op_type, dev, functor)                 \
-  REGISTER_OP_##dev##_KERNEL(                                                \
-      op_type,                                                               \
-      ops::UnaryBitwiseOpKernel<plat::dev##DeviceContext, functor<bool>>,    \
-      ops::UnaryBitwiseOpKernel<plat::dev##DeviceContext, functor<uint8_t>>, \
-      ops::UnaryBitwiseOpKernel<plat::dev##DeviceContext, functor<int8_t>>,  \
-      ops::UnaryBitwiseOpKernel<plat::dev##DeviceContext, functor<int16_t>>, \
-      ops::UnaryBitwiseOpKernel<plat::dev##DeviceContext, functor<int>>,     \
-      ops::UnaryBitwiseOpKernel<plat::dev##DeviceContext, functor<int64_t>>);
diff --git a/paddle/fluid/operators/controlflow/compare_all_op.cc b/paddle/fluid/operators/controlflow/compare_all_op.cc
index ede349f737d899e5f04cb5e35d1dbc0c0abc2403..dd407f4f6f3c51ef99cb09f08ef7fdca5b1e10bc 100644
--- a/paddle/fluid/operators/controlflow/compare_all_op.cc
+++ b/paddle/fluid/operators/controlflow/compare_all_op.cc
@@ -12,49 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/controlflow/compare_all_op.h"
-#include <string>
+#include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/phi/core/infermeta_utils.h"
+#include "paddle/phi/infermeta/binary.h"
 
 namespace paddle {
 namespace operators {
 
-template <typename DeviceContext, typename Functor>
-class CompareReduceOpKernel
-    : public framework::OpKernel<typename Functor::ELEM_TYPE> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    using T = typename Functor::ELEM_TYPE;
-    using Tensor = framework::Tensor;
-
-    auto* x = context.Input<Tensor>("X");
-    auto* y = context.Input<Tensor>("Y");
-    auto* z = context.Output<Tensor>("Out");
-    Tensor tmp;
-    bool* z_data = z->mutable_data<bool>(context.GetPlace());
-
-    if (x->dims() != y->dims()) {
-      z_data[0] = false;
-    } else {
-      tmp.mutable_data<bool>(x->dims(), context.GetPlace());
-      if (x->numel() == 1 && y->numel() == 1) {
-        bool* z_data = tmp.mutable_data<bool>(context.GetPlace());
-        z_data[0] = Functor()(x->data<T>()[0], y->data<T>()[0]);
-      } else {
-        ElementwiseComputeEx<Functor, platform::CPUDeviceContext, T, bool>(
-            context, x, y, 0, Functor(), &tmp);
-      }
-      auto ipt = framework::EigenVector<bool>::Flatten(tmp);
-      auto out = framework::EigenScalar<bool>::From(*z);
-      auto& place =
-          *context.template device_context<platform::CPUDeviceContext>()
-               .eigen_device();
-      auto reduce_dim = Eigen::array<int, 1>({{0}});
-      out.device(place) = ipt.all(reduce_dim);
-    }
-  }
-};
-
 template <typename OpComment>
 class CompareReduceOpProtoMaker : public framework::OpProtoAndCheckerMaker {
  public:
@@ -81,26 +46,6 @@ template <typename OpComment>
 class CompareReduceOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
-
- protected:
-  void InferShape(framework::InferShapeContext* context) const override {
-    OpComment comment;
-    PADDLE_ENFORCE_EQ(context->HasInput("X"), true,
-                      platform::errors::InvalidArgument(
-                          "%s operator must have input X", comment.type));
-    PADDLE_ENFORCE_EQ(context->HasInput("Y"), true,
-                      platform::errors::InvalidArgument(
-                          "%s operator must have input Y", comment.type));
-    auto dim_x = context->GetInputDim("X");
-    auto dim_y = context->GetInputDim("Y");
-    PADDLE_ENFORCE_GE(
-        dim_x.size(), dim_y.size(),
-        platform::errors::InvalidArgument(
-            "The size of dim_y should not be greater than dim_x's."));
-
-    context->SetOutputDim("Out", {1});
-    context->ShareLoD("X", "Out");
-  }
 };
 
 }  // namespace operators
@@ -113,25 +58,13 @@ class CompareReduceOp : public framework::OperatorWithKernel {
   };                                                                       \
   char _##op_type##Comment::type[]{#op_type};                              \
   char _##op_type##Comment::equation[]{_equation};                         \
+  DECLARE_INFER_SHAPE_FUNCTOR(op_type, op_type##_InferShapeFunctor,        \
+                              PD_INFER_META(phi::CompareAllInferMeta));    \
   REGISTER_OPERATOR(                                                       \
       op_type, ::paddle::operators::CompareReduceOp<_##op_type##Comment>,  \
       ::paddle::operators::CompareReduceOpProtoMaker<_##op_type##Comment>, \
       ::paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,    \
-      ::paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
+      ::paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,   \
+      op_type##_InferShapeFunctor);
 
-#define REGISTER_COMPARE_REDUCE_CPU_KERNEL(op_type, functor)             \
-  REGISTER_OP_CPU_KERNEL(                                                \
-      op_type, ::paddle::operators::CompareReduceOpKernel<               \
-                   ::paddle::platform::CPUDeviceContext, functor<bool>>, \
-      ::paddle::operators::CompareReduceOpKernel<                        \
-          ::paddle::platform::CPUDeviceContext, functor<int>>,           \
-      ::paddle::operators::CompareReduceOpKernel<                        \
-          ::paddle::platform::CPUDeviceContext, functor<int64_t>>,       \
-      ::paddle::operators::CompareReduceOpKernel<                        \
-          ::paddle::platform::CPUDeviceContext, functor<float>>,         \
-      ::paddle::operators::CompareReduceOpKernel<                        \
-          ::paddle::platform::CPUDeviceContext, functor<double>>);
 REGISTER_COMPARE_REDUCE_OP(equal_all, "X == Y");
-
-REGISTER_COMPARE_REDUCE_CPU_KERNEL(equal_all,
-                                   paddle::operators::EqualReduceFunctor);
diff --git a/paddle/fluid/operators/controlflow/compare_all_op.cu b/paddle/fluid/operators/controlflow/compare_all_op.cu
deleted file mode 100644
index d96dcebe51f97f1a3a954966aeb3663ff1f7a819..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/controlflow/compare_all_op.cu
+++ /dev/null
@@ -1,92 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <thrust/fill.h>
-#include "paddle/fluid/operators/controlflow/compare_all_op.h"
-#include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h"
-#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T>
-struct BitwiseAdd {
-  // Bitwise add operator, returns <tt>a + b</tt>
-  inline T initial() { return static_cast<T>(true); }
-
-  __host__ __device__ __forceinline__ T operator()(const T& a,
-                                                   const T& b) const {
-    return a & b;
-  }
-};
-
-template <typename DeviceContext, typename Functor>
-class CompareReduceOpKernel
-    : public framework::OpKernel<typename Functor::ELEM_TYPE> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    using T = typename Functor::ELEM_TYPE;
-    using Tensor = framework::Tensor;
-
-    auto* x = context.Input<Tensor>("X");
-    auto* y = context.Input<Tensor>("Y");
-    auto* z = context.Output<Tensor>("Out");
-    bool* z_data = z->mutable_data<bool>(context.GetPlace());
-    Tensor tmp;
-
-    if (x->dims() != y->dims()) {
-      thrust::device_ptr<bool> z_dev_ptr(z_data);
-      thrust::fill(z_dev_ptr, z_dev_ptr + 1, false);
-      return;
-    } else {
-      tmp.mutable_data<bool>(x->dims(), context.GetPlace());
-      const auto& cuda_ctx =
-          context.template device_context<platform::CUDADeviceContext>();
-      std::vector<const framework::Tensor*> ins = {x, y};
-      std::vector<framework::Tensor*> outs = {&tmp};
-      paddle::operators::LaunchSameDimsElementwiseCudaKernel<bool>(
-          cuda_ctx, ins, &outs, Functor());
-
-      // Reduce by 'bitwise and' operator
-      std::vector<int> reduce_dims;
-      reduce_dims.resize(tmp.dims().size());
-      for (int i = 0; i < reduce_dims.size(); ++i) reduce_dims[i] = i;
-      auto stream = context.cuda_device_context().stream();
-      TensorReduceImpl<bool, bool, BitwiseAdd, kps::IdentityFunctor<bool>>(
-          context.cuda_device_context(), tmp, z, kps::IdentityFunctor<bool>(),
-          reduce_dims, stream);
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-
-#define REGISTER_COMPARE_REDUCE_CUDA_KERNEL(op_type, functor)                  \
-  REGISTER_OP_CUDA_KERNEL(                                                     \
-      op_type,                                                                 \
-      ops::CompareReduceOpKernel<plat::CUDADeviceContext, ops::functor<bool>>, \
-      ops::CompareReduceOpKernel<plat::CUDADeviceContext, ops::functor<int>>,  \
-      ops::CompareReduceOpKernel<plat::CUDADeviceContext,                      \
-                                 ops::functor<int64_t>>,                       \
-      ops::CompareReduceOpKernel<plat::CUDADeviceContext,                      \
-                                 ops::functor<float>>,                         \
-      ops::CompareReduceOpKernel<plat::CUDADeviceContext,                      \
-                                 ops::functor<double>>);
-
-REGISTER_COMPARE_REDUCE_CUDA_KERNEL(equal_all, EqualReduceFunctor)
-#undef REGISTER_COMPARE_REDUCE_CUDA_KERNEL
diff --git a/paddle/fluid/operators/controlflow/compare_all_op.h b/paddle/fluid/operators/controlflow/compare_all_op.h
deleted file mode 100644
index 78a7b76e3fd9d03f2381dfb13f90c191d1dca4f8..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/controlflow/compare_all_op.h
+++ /dev/null
@@ -1,43 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <math.h>
-#include <algorithm>
-#include <type_traits>
-#include <vector>
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
-#include "paddle/fluid/platform/transform.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T>
-struct EqualReduceFunctor {
-  using ELEM_TYPE = T;
-  HOSTDEVICE bool operator()(const T a, const T b) const {
-    if (std::is_floating_point<T>::value) {
-      // This branch will be optimized while compiling if T is integer. It is
-      // safe to cast a and b to double.
-      return fabs(static_cast<double>(a - b)) < 1e-8;
-    } else {
-      return (a == b);
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/controlflow/compare_op.cc b/paddle/fluid/operators/controlflow/compare_op.cc
index 657e74398bb24bb4c2a5514bbb1656126591ee4e..72d81d8c3fdf2827da9b8362cee80ecbb16e4484 100644
--- a/paddle/fluid/operators/controlflow/compare_op.cc
+++ b/paddle/fluid/operators/controlflow/compare_op.cc
@@ -12,14 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/controlflow/compare_op.h"
-#include <algorithm>
-#include <string>
-#include <vector>
+#include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/op_version_registry.h"
-#include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
 #include "paddle/phi/common/place.h"
+#include "paddle/phi/core/infermeta_utils.h"
+#include "paddle/phi/infermeta/binary.h"
 
 namespace paddle {
 namespace operators {
@@ -60,31 +58,6 @@ class CompareOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
  protected:
-  void InferShape(framework::InferShapeContext* context) const override {
-    OpComment comment;
-    OP_INOUT_CHECK(context->HasInput("X"), "Input", "X", comment.type);
-    OP_INOUT_CHECK(context->HasInput("Y"), "Input", "Y", comment.type);
-    auto dim_x = context->GetInputDim("X");
-    auto dim_y = context->GetInputDim("Y");
-
-    if (context->GetInputDim("X") == context->GetInputDim("Y")) {
-      context->ShareDim("X", /*->*/ "Out");
-      context->ShareLoD("X", /*->*/ "Out");
-    } else {
-      int max_dim = std::max(dim_x.size(), dim_y.size());
-      int axis = std::abs(dim_x.size() - dim_y.size());
-      std::vector<int> x_dims_array(max_dim);
-      std::vector<int> y_dims_array(max_dim);
-      std::vector<int> out_dims_array(max_dim);
-      GetBroadcastDimsArrays(dim_x, dim_y, x_dims_array.data(),
-                             y_dims_array.data(), out_dims_array.data(),
-                             max_dim, axis);
-      context->SetOutputDim("Out", phi::make_ddim(out_dims_array));
-      // to do
-      context->ShareLoD("X", /*->*/ "Out");
-    }
-  }
-
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
     framework::OpKernelType kt = OperatorWithKernel::GetExpectedKernelType(ctx);
@@ -116,37 +89,31 @@ class CompareOp : public framework::OperatorWithKernel {
               "In order to force fill output variable to gpu memory.",     \
               false));
 
-#define REGISTER_COMPARE_OP(op_type, _equation)                           \
-  struct _##op_type##Comment {                                            \
-    static char type[];                                                   \
-    static char equation[];                                               \
-  };                                                                      \
-  char _##op_type##Comment::type[]{#op_type};                             \
-  char _##op_type##Comment::equation[]{_equation};                        \
-  REGISTER_OPERATOR(                                                      \
-      op_type, ::paddle::operators::CompareOp<_##op_type##Comment>,       \
-      ::paddle::operators::CompareOpProtoMaker<_##op_type##Comment>,      \
-      ::paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,   \
-      ::paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>); \
+#define REGISTER_COMPARE_OP(op_type, _equation)                          \
+  struct _##op_type##Comment {                                           \
+    static char type[];                                                  \
+    static char equation[];                                              \
+  };                                                                     \
+  char _##op_type##Comment::type[]{#op_type};                            \
+  char _##op_type##Comment::equation[]{_equation};                       \
+  DECLARE_INFER_SHAPE_FUNCTOR(op_type, op_type##_InferShapeFunctor,      \
+                              PD_INFER_META(phi::CompareInferMeta));     \
+  REGISTER_OPERATOR(                                                     \
+      op_type, ::paddle::operators::CompareOp<_##op_type##Comment>,      \
+      ::paddle::operators::CompareOpProtoMaker<_##op_type##Comment>,     \
+      ::paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,  \
+      ::paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>, \
+      op_type##_InferShapeFunctor);                                      \
   REGISTER_COMPARE_OP_VERSION(op_type);
 
 REGISTER_COMPARE_OP(less_than, "Out = X < Y");
-REGISTER_COMPARE_KERNEL(less_than, CPU, paddle::operators::LessThanFunctor,
-                        paddle::operators::GreaterThanFunctor);
+
 REGISTER_COMPARE_OP(less_equal, "Out = X <= Y");
-REGISTER_COMPARE_KERNEL(less_equal, CPU, paddle::operators::LessEqualFunctor,
-                        paddle::operators::GreaterEqualFunctor);
+
 REGISTER_COMPARE_OP(greater_than, "Out = X > Y");
-REGISTER_COMPARE_KERNEL(greater_than, CPU,
-                        paddle::operators::GreaterThanFunctor,
-                        paddle::operators::LessThanFunctor);
+
 REGISTER_COMPARE_OP(greater_equal, "Out = X >= Y");
-REGISTER_COMPARE_KERNEL(greater_equal, CPU,
-                        paddle::operators::GreaterEqualFunctor,
-                        paddle::operators::LessEqualFunctor);
+
 REGISTER_COMPARE_OP(equal, "Out = X == Y");
-REGISTER_COMPARE_KERNEL(equal, CPU, paddle::operators::EqualFunctor,
-                        paddle::operators::EqualFunctor);
+
 REGISTER_COMPARE_OP(not_equal, "Out = X != Y");
-REGISTER_COMPARE_KERNEL(not_equal, CPU, paddle::operators::NotEqualFunctor,
-                        paddle::operators::NotEqualFunctor);
diff --git a/paddle/fluid/operators/controlflow/compare_op.cu b/paddle/fluid/operators/controlflow/compare_op.cu
deleted file mode 100644
index 4b9452d0f60e0396e4bc50bb5ea56e2f3131098e..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/controlflow/compare_op.cu
+++ /dev/null
@@ -1,63 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/controlflow/compare_op.h"
-#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h"
-
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-
-namespace paddle {
-namespace operators {
-
-template <typename Functor, typename InverseFunctor>
-class CompareOpKernel<platform::CUDADeviceContext, Functor, InverseFunctor>
-    : public framework::OpKernel<typename Functor::ELEM_TYPE> {
- public:
-  using InT = typename Functor::ELEM_TYPE;
-  using OutT = bool;
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto functor = Functor();
-    std::vector<const framework::Tensor*> ins;
-    std::vector<framework::Tensor*> outs;
-    const auto& cuda_ctx =
-        ctx.template device_context<platform::CUDADeviceContext>();
-
-    int axis = PackTensorsIntoVector<OutT>(ctx, &ins, &outs);
-    paddle::operators::LaunchElementwiseCudaKernel<ElementwiseType::kBinary,
-                                                   InT, OutT>(
-        cuda_ctx, ins, &outs, axis, functor);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-#define REGISTER_CUDA_COMPARE_KERNEL(op_type, func)                            \
-  REGISTER_OP_CUDA_KERNEL(                                                     \
-      op_type,                                                                 \
-      ops::CompareOpKernel<plat::CUDADeviceContext, ops::func<bool>, void>,    \
-      ops::CompareOpKernel<plat::CUDADeviceContext, ops::func<int16_t>, void>, \
-      ops::CompareOpKernel<plat::CUDADeviceContext, ops::func<int>, void>,     \
-      ops::CompareOpKernel<plat::CUDADeviceContext, ops::func<int64_t>, void>, \
-      ops::CompareOpKernel<plat::CUDADeviceContext, ops::func<float>, void>,   \
-      ops::CompareOpKernel<plat::CUDADeviceContext, ops::func<double>, void>);
-
-REGISTER_CUDA_COMPARE_KERNEL(equal, EqualFunctor)
-REGISTER_CUDA_COMPARE_KERNEL(not_equal, NotEqualFunctor)
-REGISTER_CUDA_COMPARE_KERNEL(less_than, LessThanFunctor)
-REGISTER_CUDA_COMPARE_KERNEL(less_equal, LessEqualFunctor)
-REGISTER_CUDA_COMPARE_KERNEL(greater_than, GreaterThanFunctor)
-REGISTER_CUDA_COMPARE_KERNEL(greater_equal, GreaterEqualFunctor)
-#undef REGISTER_CUDA_COMPARE_KERNEL
diff --git a/paddle/fluid/operators/controlflow/compare_op.h b/paddle/fluid/operators/controlflow/compare_op.h
deleted file mode 100644
index be017a01ef3237fd8572e248d691daa97c999509..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/controlflow/compare_op.h
+++ /dev/null
@@ -1,109 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <math.h>
-#include <type_traits>
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
-#include "paddle/fluid/platform/transform.h"
-
-namespace paddle {
-namespace operators {
-
-#define COMPARE_FUNCTOR(func_name, op)                           \
-  template <typename InT, typename OutT = bool>                  \
-  struct func_name {                                             \
-    using ELEM_TYPE = InT;                                       \
-    HOSTDEVICE OutT operator()(const InT a, const InT b) const { \
-      return static_cast<OutT>(a op b);                          \
-    }                                                            \
-  };
-
-COMPARE_FUNCTOR(LessThanFunctor, <)
-COMPARE_FUNCTOR(LessEqualFunctor, <=)
-COMPARE_FUNCTOR(GreaterThanFunctor, >)
-COMPARE_FUNCTOR(GreaterEqualFunctor, >=)
-#undef COMPARE_FUNCTOR
-
-template <typename InT, typename OutT = bool>
-struct EqualFunctor {
-  using ELEM_TYPE = InT;
-  HOSTDEVICE OutT operator()(const InT a, const InT b) const {
-    if (std::is_floating_point<InT>::value) {
-      // This branch will be optimized while compiling if T is integer. It is
-      // safe to cast a and b to double.
-      return static_cast<OutT>(fabs(static_cast<double>(a - b)) < 1e-8);
-    } else {
-      return static_cast<OutT>(a == b);
-    }
-  }
-};
-
-template <typename InT, typename OutT = bool>
-struct NotEqualFunctor {
-  using ELEM_TYPE = InT;
-  HOSTDEVICE bool operator()(const InT a, const InT b) const {
-    return !EqualFunctor<InT, OutT>()(a, b);
-  }
-};
-
-template <typename DeviceContext, typename Functor, typename InverseFunctor>
-class CompareOpKernel
-    : public framework::OpKernel<typename Functor::ELEM_TYPE> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    using T = typename Functor::ELEM_TYPE;
-    using Tensor = framework::Tensor;
-
-    auto* x = context.Input<Tensor>("X");
-    auto* y = context.Input<Tensor>("Y");
-    auto* z = context.Output<Tensor>("Out");
-    int axis = context.Attr<int>("axis");
-
-    auto x_dims = x->dims();
-    auto y_dims = y->dims();
-    if (x_dims.size() >= y_dims.size()) {
-      ElementwiseComputeEx<Functor, DeviceContext, T, bool>(context, x, y, axis,
-                                                            Functor(), z);
-    } else {
-      ElementwiseComputeEx<InverseFunctor, DeviceContext, T, bool>(
-          context, x, y, axis, InverseFunctor(), z);
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-#define REGISTER_COMPARE_KERNEL(op_type, dev, functor, inverse_functor)       \
-  REGISTER_OP_##dev##_KERNEL(op_type,                                         \
-                             ::paddle::operators::CompareOpKernel<            \
-                                 ::paddle::platform::dev##DeviceContext,      \
-                                 functor<bool>, inverse_functor<bool>>,       \
-                             ::paddle::operators::CompareOpKernel<            \
-                                 ::paddle::platform::dev##DeviceContext,      \
-                                 functor<int>, inverse_functor<int>>,         \
-                             ::paddle::operators::CompareOpKernel<            \
-                                 ::paddle::platform::dev##DeviceContext,      \
-                                 functor<int16_t>, inverse_functor<int16_t>>, \
-                             ::paddle::operators::CompareOpKernel<            \
-                                 ::paddle::platform::dev##DeviceContext,      \
-                                 functor<int64_t>, inverse_functor<int64_t>>, \
-                             ::paddle::operators::CompareOpKernel<            \
-                                 ::paddle::platform::dev##DeviceContext,      \
-                                 functor<float>, inverse_functor<float>>,     \
-                             ::paddle::operators::CompareOpKernel<            \
-                                 ::paddle::platform::dev##DeviceContext,      \
-                                 functor<double>, inverse_functor<double>>);
diff --git a/paddle/fluid/operators/controlflow/compare_op_mlu.cc b/paddle/fluid/operators/controlflow/compare_op_mlu.cc
index 9dc287ab76a67c6026ec8794793e77179063af3d..c39743ef9914c039f13428d43a66b1aa66ada0ed 100644
--- a/paddle/fluid/operators/controlflow/compare_op_mlu.cc
+++ b/paddle/fluid/operators/controlflow/compare_op_mlu.cc
@@ -11,7 +11,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/controlflow/compare_op.h"
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/mlu/mlu_baseop.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/controlflow/compare_op_npu.cc b/paddle/fluid/operators/controlflow/compare_op_npu.cc
index 7bc4ca09771355361d8106421dc57601b94c88f1..7377d7cf8d312c4f4f405235b21b372b1a7a738c 100644
--- a/paddle/fluid/operators/controlflow/compare_op_npu.cc
+++ b/paddle/fluid/operators/controlflow/compare_op_npu.cc
@@ -11,7 +11,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/controlflow/compare_op.h"
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
 #include "paddle/fluid/platform/device/npu/npu_op_runner.h"
diff --git a/paddle/fluid/operators/controlflow/compare_op_xpu.cc b/paddle/fluid/operators/controlflow/compare_op_xpu.cc
index 698bd0516133861523f8d2b353abfeace4665840..2de8b4c9ba880e089bb4eaa4fa8df3bedb69b55b 100644
--- a/paddle/fluid/operators/controlflow/compare_op_xpu.cc
+++ b/paddle/fluid/operators/controlflow/compare_op_xpu.cc
@@ -12,7 +12,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #ifdef PADDLE_WITH_XPU
 
-#include "paddle/fluid/operators/controlflow/compare_op.h"
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/op_version_registry.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/conv_cudnn_helper.h b/paddle/fluid/operators/conv_cudnn_helper.h
index 3bbb284ca821b8576f2752446555f146c16bb189..4e6fda3d09a071f59c97c87315619d126497a756 100644
--- a/paddle/fluid/operators/conv_cudnn_helper.h
+++ b/paddle/fluid/operators/conv_cudnn_helper.h
@@ -26,6 +26,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/eigen/eigen_function.h"
 #include "paddle/fluid/platform/cuda_graph_with_memory_pool.h"
 #include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
+#include "paddle/phi/backends/gpu/gpu_context.h"
 
 namespace paddle {
 namespace operators {
@@ -53,12 +54,11 @@ static inline void GetNCDHW(const framework::DDim& dims,
 }
 
 template <typename DeviceContext, typename T, size_t D>
-static void RemovePaddingSlice(const framework::ExecutionContext& context,
+static void RemovePaddingSlice(const phi::GPUContext& context,
                                const Tensor* input, Tensor* out,
                                const std::vector<int>& starts,
                                const std::vector<int>& axes) {
-  auto& place =
-      *context.template device_context<DeviceContext>().eigen_device();
+  auto& place = *context.eigen_device();
   auto in_dims = input->dims();
   auto new_out_dims = out->dims();
   auto offsets = Eigen::DSizes<Eigen::DenseIndex, D>();
@@ -171,11 +171,10 @@ void ChooseAlgo(const std::vector<PerfType>& perf_results,
 
 using framework::ConvSearchCache;
 
-static void SetConvMathType(const framework::ExecutionContext& ctx,
-                            cudnnDataType_t dtype,
+static void SetConvMathType(const phi::GPUContext& ctx, cudnnDataType_t dtype,
                             const platform::ConvolutionDescriptor& cdesc) {
 #if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1)
-  auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
+  auto& dev_ctx = ctx;
   if (dev_ctx.GetComputeCapability() >= 70 && dtype == CUDNN_DATA_HALF) {
     PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType(
         cdesc.desc(), CUDNN_TENSOR_OP_MATH));
@@ -231,8 +230,7 @@ struct SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t> {
 
   template <typename T>
   static algo_t Find(const ConvArgs& args, bool exhaustive_search,
-                     bool deterministic,
-                     const framework::ExecutionContext& ctx) {
+                     bool deterministic, const phi::GPUContext& ctx) {
     auto dtype = platform::CudnnDataType<T>::type;
     bool has_got_workspace_size = true;
     size_t workspace_size_limit = FLAGS_conv_workspace_size_limit * 1024 * 1024;
@@ -284,8 +282,7 @@ struct SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t> {
     } else if (deterministic) {
       algo = static_cast<cudnnConvolutionFwdAlgo_t>(1);
     } else {
-      auto& dev_ctx =
-          ctx.template device_context<platform::CUDADeviceContext>();
+      auto& dev_ctx = ctx;
       auto workspace_handle = dev_ctx.cudnn_workspace_handle();
 
       AlgorithmsCache<algo_t>& algo_cache =
@@ -346,8 +343,7 @@ struct SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t> {
 
   template <typename T>
   static algo_t Find(const ConvArgs& args, bool exhaustive_search,
-                     bool deterministic,
-                     const framework::ExecutionContext& ctx) {
+                     bool deterministic, const phi::GPUContext& ctx) {
     auto dtype = platform::CudnnDataType<T>::type;
     size_t workspace_size_limit = FLAGS_conv_workspace_size_limit * 1024 * 1024;
     size_t workspace_size = 0;
@@ -413,8 +409,7 @@ struct SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t> {
     } else if (deterministic) {
       return CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
     } else {
-      auto& dev_ctx =
-          ctx.template device_context<platform::CUDADeviceContext>();
+      auto& dev_ctx = ctx;
       auto workspace_handle = dev_ctx.cudnn_workspace_handle();
 
       AlgorithmsCache<algo_t>& algo_cache =
@@ -478,8 +473,7 @@ struct SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t> {
 
   template <typename T>
   static algo_t Find(const ConvArgs& args, bool exhaustive_search,
-                     bool deterministic,
-                     const framework::ExecutionContext& ctx) {
+                     bool deterministic, const phi::GPUContext& ctx) {
     platform::CUDAGraphCaptureModeGuard guard;
     auto dtype = platform::CudnnDataType<T>::type;
     size_t workspace_size_limit = FLAGS_conv_workspace_size_limit * 1024 * 1024;
@@ -534,8 +528,7 @@ struct SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t> {
     } else if (deterministic) {
       return CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1;
     } else {
-      auto& dev_ctx =
-          ctx.template device_context<platform::CUDADeviceContext>();
+      auto& dev_ctx = ctx;
       auto workspace_handle = dev_ctx.cudnn_workspace_handle();
       AlgorithmsCache<algo_t>& algo_cache =
           *(framework::ConvSearchCache::Instance().GetBackwardFilter());
diff --git a/paddle/fluid/operators/conv_cudnn_op.cu b/paddle/fluid/operators/conv_cudnn_op.cu
deleted file mode 100644
index dff60afd74c02f458b5b3c7428c2703197b61af0..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/conv_cudnn_op.cu
+++ /dev/null
@@ -1,1476 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the spopecific language governing permissions and
-limitations under the License. */
-
-#include <utility>
-#include <vector>
-
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/memory/memory.h"
-#ifdef PADDLE_WITH_HIP
-#include "paddle/fluid/operators/conv_miopen_helper.h"
-#else
-#include "paddle/fluid/operators/conv_cudnn_helper.h"
-#endif
-#include "paddle/fluid/operators/conv_op.h"
-#include "paddle/fluid/operators/math/padding.h"
-#include "paddle/fluid/platform/cudnn_workspace_helper.h"
-#include "paddle/fluid/platform/float16.h"
-#include "paddle/fluid/platform/profiler/event_tracing.h"
-
-DECLARE_bool(cudnn_deterministic);
-DECLARE_uint64(conv_workspace_size_limit);
-DECLARE_bool(cudnn_exhaustive_search);
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-using ScopedTensorDescriptor = platform::ScopedTensorDescriptor;
-using ScopedFilterDescriptor = platform::ScopedFilterDescriptor;
-using ScopedConvolutionDescriptor = platform::ScopedConvolutionDescriptor;
-using DataLayout = platform::DataLayout;
-
-static inline bool IsVoltaOrLater(const platform::CUDADeviceContext& dev_ctx) {
-  return dev_ctx.GetComputeCapability() >= 70;
-}
-
-template <typename T>
-class CUDNNConvOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
-    PADDLE_ENFORCE_EQ(
-        platform::is_gpu_place(ctx.GetPlace()), true,
-        paddle::platform::errors::PreconditionNotMet("It must use CUDAPlace."));
-    const Tensor* input = ctx.Input<Tensor>("Input");
-    auto* filter = ctx.Input<Tensor>("Filter");
-    auto* output = ctx.Output<Tensor>("Output");
-    output->mutable_data<T>(ctx.GetPlace());
-    const std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
-    std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
-    std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
-    int groups = ctx.Attr<int>("groups");
-
-    bool exhaustive_search =
-        FLAGS_cudnn_exhaustive_search || (ctx.HasAttr("exhaustive_search") &&
-                                          ctx.Attr<bool>("exhaustive_search"));
-    bool deterministic = FLAGS_cudnn_deterministic;
-    auto exhaustive_deterministic = exhaustive_search && deterministic;
-    PADDLE_ENFORCE_EQ(exhaustive_deterministic, false,
-                      platform::errors::InvalidArgument(
-                          "Cann't set exhaustive_search True and "
-                          "FLAGS_cudnn_deterministic True at same time."));
-
-    const std::string padding_algorithm =
-        ctx.Attr<std::string>("padding_algorithm");
-    const std::string data_format = ctx.Attr<std::string>("data_format");
-    const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC");
-
-    auto dtype = platform::CudnnDataType<T>::type;
-
-#ifdef PADDLE_WITH_HIP
-    // HIP MIOPEN ONLY SUPPORT NCHW format
-    auto compute_format = DataLayout::kNCHW;
-#else
-    // Tensor Core introduced from Volta GPUs supports more faster conv op
-    // with FP16 in NHWC data format.
-    const bool compute_in_nhwc =
-        dtype == CUDNN_DATA_HALF && IsVoltaOrLater(dev_ctx);
-    // We will only do data format conversion from NHWC to NCHW.
-    // cudnn will convert NCHW to NHWC automatically on Tensor Core.
-    auto compute_format =
-        compute_in_nhwc && channel_last ? DataLayout::kNHWC : DataLayout::kNCHW;
-#endif
-    VLOG(3) << "Compute ConvOp with cuDNN:"
-            << " data_format=" << data_format << " compute_format="
-            << (compute_format == DataLayout::kNHWC ? "NHWC" : "NCHW");
-
-    // ------------ transformed tensor -----------
-    Tensor transformed_input_channel(input->type());
-    Tensor transformed_output(output->type());
-    Tensor transformed_filter_channel(filter->type());
-    T* output_data = nullptr;
-    if (channel_last && compute_format == DataLayout::kNCHW) {
-      VLOG(3) << "Transform input tensor from NHWC to NCHW.";
-      ResizeToChannelFirst<platform::CUDADeviceContext, T>(
-          ctx, input, &transformed_input_channel);
-      TransToChannelFirst<platform::CUDADeviceContext, T>(
-          ctx, input, &transformed_input_channel);
-
-      ResizeToChannelFirst<platform::CUDADeviceContext, T>(ctx, output,
-                                                           &transformed_output);
-
-    } else {
-      transformed_input_channel.ShareDataWith(*input);
-      transformed_output.ShareDataWith(*output);
-    }
-    if (compute_format == DataLayout::kNHWC) {
-      VLOG(3) << "Transform filter tensor from NCHW to NHWC.";
-      ResizeToChannelLast<platform::CUDADeviceContext, T>(
-          ctx, filter, &transformed_filter_channel);
-      TransToChannelLast<platform::CUDADeviceContext, T>(
-          ctx, filter, &transformed_filter_channel);
-    } else {
-      transformed_filter_channel.ShareDataWith(*filter);
-    }
-    output_data = transformed_output.data<T>();
-
-    // update padding and dilation
-    auto in_dims = transformed_input_channel.dims();
-    auto filter_dims = transformed_filter_channel.dims();
-    framework::DDim in_data_dims;
-    framework::DDim filter_data_dims;
-
-    if (compute_format == DataLayout::kNCHW) {
-      in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size());
-      filter_data_dims = phi::slice_ddim(filter_dims, 2, filter_dims.size());
-    } else {
-      in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1);
-      filter_data_dims =
-          phi::slice_ddim(filter_dims, 1, filter_dims.size() - 1);
-    }
-
-    std::vector<int> ksize = phi::vectorize<int>(filter_data_dims);
-    UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm,
-                             in_data_dims, strides, ksize);
-
-    int data_dim = strides.size();  // 2d or 3d
-    bool is_sys_pad = math::IsSymmetricPadding(paddings, data_dim);
-
-    Tensor transformed_input;
-    std::vector<int> padding_common(data_dim, 0);
-    if (!is_sys_pad) {
-      std::vector<int> padding_diff(data_dim);
-      std::vector<int> new_input_shape_vec(data_dim + 2);
-      new_input_shape_vec[0] = transformed_input_channel.dims()[0];
-
-      if (compute_format == DataLayout::kNCHW) {
-        new_input_shape_vec[1] = transformed_input_channel.dims()[1];
-      } else {
-        new_input_shape_vec[data_dim + 1] =
-            transformed_input_channel.dims()[data_dim + 1];
-      }
-
-      std::vector<int> input_pad(transformed_input_channel.dims().size() * 2,
-                                 0);
-      for (size_t i = 0; i < data_dim; ++i) {
-        padding_diff[i] = std::abs(paddings[2 * i] - paddings[2 * i + 1]);
-        padding_common[i] = std::min(paddings[2 * i], paddings[2 * i + 1]);
-        if (compute_format == DataLayout::kNCHW) {
-          new_input_shape_vec[i + 2] =
-              transformed_input_channel.dims()[i + 2] + padding_diff[i];
-        } else {
-          new_input_shape_vec[i + 1] =
-              transformed_input_channel.dims()[i + 1] + padding_diff[i];
-        }
-        if (compute_format == DataLayout::kNCHW) {
-          input_pad[2 * i + 4] = paddings[2 * i] - padding_common[i];
-          input_pad[2 * i + 4 + 1] = paddings[2 * i + 1] - padding_common[i];
-        } else {
-          input_pad[2 * i + 2] = paddings[2 * i] - padding_common[i];
-          input_pad[2 * i + 2 + 1] = paddings[2 * i + 1] - padding_common[i];
-        }
-      }
-      framework::DDim new_input_shape(phi::make_ddim(new_input_shape_vec));
-      transformed_input.Resize(new_input_shape);
-      auto& dev_ctx =
-          ctx.template device_context<paddle::platform::CUDADeviceContext>();
-
-      transformed_input =
-          ctx.AllocateTmpTensor<T, paddle::platform::CUDADeviceContext>(
-              new_input_shape, dev_ctx);
-      const int rank = transformed_input_channel.dims().size();
-      T pad_value(0.0);
-      switch (rank) {
-        case 4: {
-          math::PadFunction<paddle::platform::CUDADeviceContext, T, 4>(
-              ctx, input_pad, transformed_input_channel, pad_value,
-              &transformed_input);
-        } break;
-        case 5: {
-          math::PadFunction<paddle::platform::CUDADeviceContext, T, 5>(
-              ctx, input_pad, transformed_input_channel, pad_value,
-              &transformed_input);
-        } break;
-        default:
-          PADDLE_THROW(platform::errors::InvalidArgument(
-              "ConvOp only support tensors with 4 or 5 dimensions."));
-      }
-
-    } else {
-      transformed_input.ShareDataWith(transformed_input_channel);
-      if (paddings.size() == data_dim) {
-        for (size_t i = 0; i < data_dim; ++i) {
-          padding_common[i] = paddings[i];
-        }
-      } else {
-        for (size_t i = 0; i < data_dim; ++i) {
-          padding_common[i] = paddings[2 * i];
-        }
-      }
-    }
-
-    const T* input_data = transformed_input.data<T>();
-    const T* filter_data = transformed_filter_channel.data<T>();
-
-    // ------------------- cudnn descriptors ---------------------
-    ConvArgs args{&transformed_input,
-                  &transformed_filter_channel,
-                  &transformed_output,
-                  strides,
-                  padding_common,
-                  dilations,
-                  dtype};
-
-    auto handle = dev_ctx.cudnn_handle();
-    auto workspace_handle = dev_ctx.cudnn_workspace_handle();
-    DataLayout layout = compute_format == DataLayout::kNHWC ? DataLayout::kNHWC
-                                                            : DataLayout::kNCHW;
-    if (transformed_input.dims().size() == 5) {
-      layout = compute_format == DataLayout::kNHWC ? DataLayout::kNDHWC
-                                                   : DataLayout::kNCDHW;
-    }
-    auto layout_format = GetCudnnTensorFormat(layout);
-
-    args.handle = handle;
-
-#ifdef PADDLE_WITH_HIP
-    // MIOPEN need to set groups in cdesc in miopen_desc.h
-    args.cdesc.set(dtype, padding_common, strides, dilations,
-                   platform::AllowTF32Cudnn(), groups);
-#else
-    args.cdesc.set(dtype, padding_common, strides, dilations,
-                   platform::AllowTF32Cudnn());
-#endif
-
-#if defined(PADDLE_WITH_CUDA) && CUDNN_VERSION_MIN(7, 0, 1)
-    // cudnn 7 can support groups, no need to do it manually
-    // FIXME(typhoonzero): find a better way to disable groups
-    // rather than setting it to 1.
-    PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionGroupCount(
-        args.cdesc.desc(), groups));
-    groups = 1;
-#endif
-#ifdef PADDLE_WITH_HIP
-    // MIOPEN do not set groups in wdesc after set groups in cdesc
-    groups = 1;
-#endif
-    args.idesc.set(transformed_input, layout_format);
-    args.wdesc.set(transformed_filter_channel, layout_format, groups);
-    args.odesc.set(transformed_output, layout_format);
-    int i_n, i_c, i_d, i_h, i_w;
-    int o_n, o_c, o_d, o_h, o_w;
-
-    if (compute_format == DataLayout::kNHWC) {
-      GetNCDHW(transformed_input.dims(), DataLayout::kNHWC, &i_n, &i_c, &i_d,
-               &i_h, &i_w);
-      GetNCDHW(transformed_output.dims(), DataLayout::kNHWC, &o_n, &o_c, &o_d,
-               &o_h, &o_w);
-    } else {
-      GetNCDHW(transformed_input.dims(), DataLayout::kNCHW, &i_n, &i_c, &i_d,
-               &i_h, &i_w);
-      GetNCDHW(transformed_output.dims(), DataLayout::kNCHW, &o_n, &o_c, &o_d,
-               &o_h, &o_w);
-    }
-
-    int group_offset_in = i_c / groups * i_h * i_w * i_d;
-    int group_offset_out = o_c / groups * o_h * o_w * o_d;
-    int group_offset_filter = transformed_filter_channel.numel() / groups;
-    // ------------------- cudnn conv workspace ---------------------
-    size_t workspace_size = 0;  // final workspace to allocate.
-// ------------------- cudnn conv algorithm ---------------------
-#ifdef PADDLE_WITH_HIP
-    miopenConvFwdAlgorithm_t algo{};
-    using search = SearchAlgorithm<miopenConvFwdAlgorithm_t>;
-    workspace_size = search::GetWorkspaceSize(args);
-    algo = search::Find<T>(args, exhaustive_search, deterministic,
-                           workspace_size, ctx);
-#else
-    cudnnConvolutionFwdAlgo_t algo{};
-    using search = SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>;
-    algo = search::Find<T>(args, exhaustive_search, deterministic, ctx);
-    workspace_size = search::GetWorkspaceSize(args, algo);
-#endif
-
-#if defined(PADDLE_WITH_CUDA) && CUDNN_VERSION_MIN(7, 0, 1)
-    // when groups > 1, SearchAlgorithm find algo is CUDNN_CONVOLUTION_\
-    // FWD_ALGO_WINOGRAD_NONFUSED, but this kind of algorithm is unstable
-    // in forward computation, so change the algorithm to CUDNN_CONVOLUTION_\
-    // FWD_ALGO_IMPLICIT_GEMM manually.
-    if (ctx.Attr<int>("groups") > 1) {
-      algo = static_cast<cudnnConvolutionFwdAlgo_t>(0);
-    }
-#endif
-
-    // ------------------- cudnn conv forward ---------------------
-    ScalingParamType<T> alpha = 1.0f;
-    ScalingParamType<T> beta = 0.0f;
-
-// NOTE(zhiqiu): inplace addto is not supportted in double grad yet.
-// ScalingParamType<T> beta = ctx.Attr<bool>("use_addto") ? 1.0f : 0.0f;
-// VLOG(4) << "Conv: use_addto = " << ctx.Attr<bool>("use_addto");
-
-#ifdef PADDLE_WITH_HIP
-    workspace_handle.RunFunc(
-        [&](void* workspace_ptr) {
-          PADDLE_ENFORCE_GPU_SUCCESS(
-              platform::dynload::miopenConvolutionForward(
-                  handle, &alpha, args.idesc.desc(), input_data,
-                  args.wdesc.desc(), filter_data, args.cdesc.desc(), algo,
-                  &beta, args.odesc.desc(), output_data, workspace_ptr,
-                  workspace_size));
-        },
-        workspace_size);
-#else
-    for (int i = 0; i < groups; i++) {
-      workspace_handle.RunFunc(
-          [&](void* workspace_ptr) {
-            PADDLE_ENFORCE_GPU_SUCCESS(
-                platform::dynload::cudnnConvolutionForward(
-                    handle, &alpha, args.idesc.desc(),
-                    input_data + i * group_offset_in, args.wdesc.desc(),
-                    filter_data + i * group_offset_filter, args.cdesc.desc(),
-                    algo, workspace_ptr, workspace_size, &beta,
-                    args.odesc.desc(), output_data + i * group_offset_out));
-          },
-          workspace_size);
-    }
-#endif
-
-    if (channel_last && compute_format == DataLayout::kNCHW) {
-      TransToChannelLast<paddle::platform::CUDADeviceContext, T>(
-          ctx, &transformed_output, output);
-    }
-  }
-};
-
-template <typename T>
-class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
-    PADDLE_ENFORCE_EQ(
-        platform::is_gpu_place(ctx.GetPlace()), true,
-        paddle::platform::errors::PreconditionNotMet("It must use CUDAPlace."));
-    auto input = ctx.Input<Tensor>("Input");
-    auto filter = ctx.Input<Tensor>("Filter");
-    auto output_grad = ctx.Input<Tensor>(framework::GradVarName("Output"));
-    auto input_grad = ctx.Output<Tensor>(framework::GradVarName("Input"));
-    auto filter_grad = ctx.Output<Tensor>(framework::GradVarName("Filter"));
-
-    if (input_grad) {
-      input_grad->mutable_data<T>(ctx.GetPlace());
-    }
-    if (filter_grad) {
-      filter_grad->mutable_data<T>(ctx.GetPlace());
-    }
-
-    std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
-    std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
-    std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
-    std::string padding_algorithm = ctx.Attr<std::string>("padding_algorithm");
-    int groups = ctx.Attr<int>("groups");
-
-    bool exhaustive_search =
-        FLAGS_cudnn_exhaustive_search || (ctx.HasAttr("exhaustive_search") &&
-                                          ctx.Attr<bool>("exhaustive_search"));
-    bool deterministic = FLAGS_cudnn_deterministic;
-    auto exhaustive_deterministic = exhaustive_search && deterministic;
-    PADDLE_ENFORCE_EQ(exhaustive_deterministic, false,
-                      platform::errors::InvalidArgument(
-                          "Cann't set exhaustive_search True and "
-                          "FLAGS_cudnn_deterministic True at same time."));
-
-    const std::string data_format = ctx.Attr<std::string>("data_format");
-    const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC");
-
-    auto dtype = platform::CudnnDataType<T>::type;
-
-#ifdef PADDLE_WITH_HIP
-    // HIP MIOPEN ONLY SUPPORT NCHW format
-    auto compute_format = DataLayout::kNCHW;
-#else
-    const bool compute_in_nhwc =
-        dtype == CUDNN_DATA_HALF && IsVoltaOrLater(dev_ctx);
-    auto compute_format =
-        compute_in_nhwc && channel_last ? DataLayout::kNHWC : DataLayout::kNCHW;
-#endif
-    VLOG(3) << "Compute ConvGradOp with cuDNN:"
-            << " data_format=" << data_format << " compute_format="
-            << (compute_format == DataLayout::kNHWC ? "NHWC" : "NCHW");
-
-    // transform Tensor
-    Tensor transformed_input_channel(input->type());
-    Tensor transformed_output_grad_channel(output_grad->type());
-    Tensor transformed_input_grad_channel(input->type());
-    Tensor transformed_filter_channel(filter->type());
-    Tensor transformed_filter_grad_channel(filter->type());
-
-    if (channel_last && compute_format == DataLayout::kNCHW) {
-      VLOG(3) << "Transform input, output_grad, input_grad and tensor from "
-                 "NHWC to NCHW.";
-      ResizeToChannelFirst<platform::CUDADeviceContext, T>(
-          ctx, input, &transformed_input_channel);
-      TransToChannelFirst<platform::CUDADeviceContext, T>(
-          ctx, input, &transformed_input_channel);
-
-      ResizeToChannelFirst<platform::CUDADeviceContext, T>(
-          ctx, output_grad, &transformed_output_grad_channel);
-      TransToChannelFirst<platform::CUDADeviceContext, T>(
-          ctx, output_grad, &transformed_output_grad_channel);
-
-      if (input_grad) {
-        ResizeToChannelFirst<platform::CUDADeviceContext, T>(
-            ctx, input_grad, &transformed_input_grad_channel);
-        // NOTE(zhiqiu): If inplace_addto strategy is enabled, we need to copy
-        // the data of input_grad to transformed_input_grad_channel.
-        if (ctx.HasAttr("use_addto") && ctx.Attr<bool>("use_addto")) {
-          TransToChannelFirst<platform::CUDADeviceContext, T>(
-              ctx, input_grad, &transformed_input_grad_channel);
-        }
-      }
-    } else {
-      transformed_input_channel.ShareDataWith(*input);
-      transformed_output_grad_channel.ShareDataWith(*output_grad);
-      if (input_grad) {
-        transformed_input_grad_channel.ShareDataWith(*input_grad);
-      }
-    }
-
-    if (compute_format == DataLayout::kNHWC) {
-      VLOG(3) << "Transform filter and filter_grad tensor from NCHW to NHWC.";
-      ResizeToChannelLast<platform::CUDADeviceContext, T>(
-          ctx, filter, &transformed_filter_channel);
-      TransToChannelLast<platform::CUDADeviceContext, T>(
-          ctx, filter, &transformed_filter_channel);
-
-      if (filter_grad) {
-        ResizeToChannelLast<platform::CUDADeviceContext, T>(
-            ctx, filter_grad, &transformed_filter_grad_channel);
-      }
-    } else {
-      transformed_filter_channel.ShareDataWith(*filter);
-      if (filter_grad) {
-        transformed_filter_grad_channel.ShareDataWith(*filter_grad);
-      }
-    }
-
-    //  update paddings
-    auto in_dims = transformed_input_channel.dims();
-    auto filter_dims = transformed_filter_channel.dims();
-    framework::DDim in_data_dims;
-    framework::DDim filter_data_dims;
-    if (compute_format == DataLayout::kNCHW) {
-      in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size());
-      filter_data_dims = phi::slice_ddim(filter_dims, 2, filter_dims.size());
-    } else {
-      in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1);
-      filter_data_dims =
-          phi::slice_ddim(filter_dims, 1, filter_dims.size() - 1);
-    }
-    std::vector<int> ksize = phi::vectorize<int>(filter_data_dims);
-    UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm,
-                             in_data_dims, strides, ksize);
-
-    // cuDNN only supports padding the same amount on every dimension.
-    // So we create a new padded input tensor.
-    int data_dim = strides.size();  // 2d or 3d
-    bool is_sys_pad = math::IsSymmetricPadding(paddings, data_dim);
-    Tensor transformed_input(input->type());
-    Tensor transformed_input_grad(input->type());
-    std::vector<int> padding_common(data_dim, 0);
-    std::vector<int> input_pad(transformed_input_channel.dims().size() * 2, 0);
-
-    if (!is_sys_pad) {
-      // get pad
-      std::vector<int> padding_diff(data_dim);
-      std::vector<int> new_input_shape_vec(data_dim + 2);
-      new_input_shape_vec[0] = transformed_input_channel.dims()[0];
-      if (compute_format == DataLayout::kNCHW) {
-        new_input_shape_vec[1] = transformed_input_channel.dims()[1];
-      } else {
-        new_input_shape_vec[data_dim + 1] =
-            transformed_input_channel.dims()[data_dim + 1];
-      }
-
-      for (size_t i = 0; i < data_dim; ++i) {
-        padding_diff[i] = std::abs(paddings[2 * i] - paddings[2 * i + 1]);
-        padding_common[i] = std::min(paddings[2 * i], paddings[2 * i + 1]);
-        if (compute_format == DataLayout::kNCHW) {
-          new_input_shape_vec[i + 2] =
-              transformed_input_channel.dims()[i + 2] + padding_diff[i];
-        } else {
-          new_input_shape_vec[i + 1] =
-              transformed_input_channel.dims()[i + 1] + padding_diff[i];
-        }
-        if (compute_format == DataLayout::kNCHW) {
-          input_pad[2 * i + 4] = paddings[2 * i] - padding_common[i];
-          input_pad[2 * i + 4 + 1] = paddings[2 * i + 1] - padding_common[i];
-        } else {
-          input_pad[2 * i + 2] = paddings[2 * i] - padding_common[i];
-          input_pad[2 * i + 2 + 1] = paddings[2 * i + 1] - padding_common[i];
-        }
-      }
-      framework::DDim new_input_shape(phi::make_ddim(new_input_shape_vec));
-      transformed_input.Resize(new_input_shape);
-
-      transformed_input_grad.Resize(new_input_shape);
-      auto& dev_ctx =
-          ctx.template device_context<paddle::platform::CUDADeviceContext>();
-
-      transformed_input =
-          ctx.AllocateTmpTensor<T, paddle::platform::CUDADeviceContext>(
-              new_input_shape, dev_ctx);
-      if (input_grad) {
-        transformed_input_grad =
-            ctx.AllocateTmpTensor<T, paddle::platform::CUDADeviceContext>(
-                new_input_shape, dev_ctx);
-      }
-      // pad for input
-      const int rank = transformed_input_channel.dims().size();
-      T pad_value(0.0);
-      switch (rank) {
-        case 4: {
-          math::PadFunction<paddle::platform::CUDADeviceContext, T, 4>(
-              ctx, input_pad, transformed_input_channel, pad_value,
-              &transformed_input);
-        } break;
-        case 5: {
-          math::PadFunction<paddle::platform::CUDADeviceContext, T, 5>(
-              ctx, input_pad, transformed_input_channel, pad_value,
-              &transformed_input);
-        } break;
-        default:
-          PADDLE_THROW(platform::errors::InvalidArgument(
-              "ConvOp only support tensors with 4 or 5 dimensions."));
-      }
-    } else {
-      transformed_input.ShareDataWith(transformed_input_channel);
-      if (input_grad) {
-        transformed_input_grad.ShareDataWith(transformed_input_grad_channel);
-      }
-      if (paddings.size() == data_dim) {
-        for (size_t i = 0; i < data_dim; ++i) {
-          padding_common[i] = paddings[i];
-        }
-      } else {
-        for (size_t i = 0; i < data_dim; ++i) {
-          padding_common[i] = paddings[2 * i];
-        }
-      }
-    }
-
-    const T* input_data = transformed_input.data<T>();
-    const T* output_grad_data = transformed_output_grad_channel.data<T>();
-    const T* filter_data = transformed_filter_channel.data<T>();
-    T* filter_grad_data = nullptr;
-    T* input_grad_data = nullptr;
-    T* transformed_input_grad_data = nullptr;
-
-    ConvArgs args1{&transformed_input_grad,
-                   &transformed_filter_channel,
-                   &transformed_output_grad_channel,
-                   strides,
-                   padding_common,
-                   dilations,
-                   dtype};
-    ConvArgs args2{&transformed_input,
-                   &transformed_filter_grad_channel,
-                   &transformed_output_grad_channel,
-                   strides,
-                   padding_common,
-                   dilations,
-                   dtype};
-
-    auto handle = dev_ctx.cudnn_handle();
-    DataLayout layout = compute_format == DataLayout::kNHWC ? DataLayout::kNHWC
-                                                            : DataLayout::kNCHW;
-    if (transformed_input.dims().size() == 5) {
-      layout = compute_format == DataLayout::kNHWC ? DataLayout::kNDHWC
-                                                   : DataLayout::kNCDHW;
-    }
-    auto layout_tensor = GetCudnnTensorFormat(layout);
-    auto workspace_handle = dev_ctx.cudnn_workspace_handle();
-
-    int i_n, i_c, i_d, i_h, i_w;
-    int o_n, o_c, o_d, o_h, o_w;
-    if (compute_format == DataLayout::kNHWC) {
-      GetNCDHW(transformed_input.dims(), DataLayout::kNHWC, &i_n, &i_c, &i_d,
-               &i_h, &i_w);
-      GetNCDHW(transformed_output_grad_channel.dims(), DataLayout::kNHWC, &o_n,
-               &o_c, &o_d, &o_h, &o_w);
-    } else {
-      GetNCDHW(transformed_input.dims(), DataLayout::kNCHW, &i_n, &i_c, &i_d,
-               &i_h, &i_w);
-      GetNCDHW(transformed_output_grad_channel.dims(), DataLayout::kNCHW, &o_n,
-               &o_c, &o_d, &o_h, &o_w);
-    }
-
-    int group_offset_in = i_c / groups * i_h * i_w * i_d;
-    int group_offset_out = o_c / groups * o_h * o_w * o_d;
-    int group_offset_filter = transformed_filter_channel.numel() / groups;
-// ------------------- cudnn backward algorithm ---------------------
-#ifdef PADDLE_WITH_HIP
-    miopenConvBwdDataAlgorithm_t data_algo =
-        static_cast<miopenConvBwdDataAlgorithm_t>(0);
-    miopenConvBwdWeightsAlgorithm_t filter_algo =
-        static_cast<miopenConvBwdWeightsAlgorithm_t>(0);
-#else
-    cudnnConvolutionBwdDataAlgo_t data_algo =
-        static_cast<cudnnConvolutionBwdDataAlgo_t>(0);
-    cudnnConvolutionBwdFilterAlgo_t filter_algo =
-        static_cast<cudnnConvolutionBwdFilterAlgo_t>(0);
-#endif
-    // input data workspace_size
-    size_t workspace_size_d = 0;
-    // weight workspace_size
-    size_t workspace_size_w = 0;
-    int iwo_groups = groups;
-    int c_groups = 1;
-
-#if defined(PADDLE_WITH_HIP) || CUDNN_VERSION_MIN(7, 0, 1)
-    iwo_groups = 1;
-    c_groups = groups;
-    groups = 1;
-#endif
-
-    if (input_grad) {
-      // ------------------- cudnn descriptors ---------------------
-      input_grad_data = input_grad->data<T>();
-      transformed_input_grad_data = transformed_input_grad.data<T>();
-      args1.handle = handle;
-      args1.idesc.set(transformed_input_grad, layout_tensor);
-      args1.wdesc.set(transformed_filter_channel, layout_tensor, iwo_groups);
-      args1.odesc.set(transformed_output_grad_channel, layout_tensor);
-      args1.cdesc.set(dtype, padding_common, strides, dilations,
-                      platform::AllowTF32Cudnn(), c_groups);
-
-#ifdef PADDLE_WITH_HIP
-      using search1 = SearchAlgorithm<miopenConvBwdDataAlgorithm_t>;
-      workspace_size_d =
-          std::max(workspace_size_d, search1::GetWorkspaceSize(args1));
-      data_algo = search1::Find<T>(args1, exhaustive_search, deterministic,
-                                   workspace_size_d, ctx);
-#else
-      using search1 = SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>;
-      data_algo =
-          search1::Find<T>(args1, exhaustive_search, deterministic, ctx);
-      workspace_size_d = std::max(workspace_size_d,
-                                  search1::GetWorkspaceSize(args1, data_algo));
-#endif
-    }
-
-    if (filter_grad) {
-      // ------------------- cudnn descriptors ---------------------
-      filter_grad_data = transformed_filter_grad_channel.data<T>();
-      args2.handle = handle;
-      args2.idesc.set(transformed_input, layout_tensor);
-      args2.wdesc.set(transformed_filter_grad_channel, layout_tensor,
-                      iwo_groups);
-      args2.odesc.set(transformed_output_grad_channel, layout_tensor);
-      args2.cdesc.set(dtype, padding_common, strides, dilations,
-                      platform::AllowTF32Cudnn(), c_groups);
-#ifdef PADDLE_WITH_HIP
-      using search2 = SearchAlgorithm<miopenConvBwdWeightsAlgorithm_t>;
-      workspace_size_w =
-          std::max(workspace_size_w, search2::GetWorkspaceSize(args2));
-      filter_algo = search2::Find<T>(args2, exhaustive_search, deterministic,
-                                     workspace_size_w, ctx);
-#else
-      using search2 = SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t>;
-      filter_algo =
-          search2::Find<T>(args2, exhaustive_search, deterministic, ctx);
-      workspace_size_w = std::max(
-          workspace_size_w, search2::GetWorkspaceSize(args2, filter_algo));
-#endif
-    }
-
-    // ------------------- cudnn conv backward data ---------------------
-    ScalingParamType<T> alpha = 1.0f;
-#ifdef PADDLE_WITH_HIP
-    // MIOPEN ONLY support beta to be 0.0f
-    ScalingParamType<T> beta = 0.0f;
-#else
-    ScalingParamType<T> beta =
-        (ctx.HasAttr("use_addto") && ctx.Attr<bool>("use_addto")) ? 1.0f : 0.0f;
-#endif
-    VLOG(4) << "Conv_grad: use_addto = "
-            << (ctx.HasAttr("use_addto") && ctx.Attr<bool>("use_addto"));
-
-    if (input_grad) {
-// When beta is 0, it is unnecessary to reset input_grad.
-// When beta is 1, the output cannot be reset since addt strategy used.
-#ifdef PADDLE_WITH_HIP
-      if (ctx.HasAttr("use_addto") && ctx.Attr<bool>("use_addto")) {
-        Tensor temp_tensor(transformed_input_grad.type());
-        temp_tensor.Resize(transformed_input_grad.dims());
-        T* temp_tensor_data = temp_tensor.mutable_data<T>(ctx.GetPlace());
-        workspace_handle.RunFunc(
-            [&](void* cudnn_workspace_ptr) {
-              PADDLE_ENFORCE_GPU_SUCCESS(
-                  platform::dynload::miopenConvolutionBackwardData(
-                      handle, &alpha, args1.odesc.desc(), output_grad_data,
-                      args1.wdesc.desc(), filter_data, args1.cdesc.desc(),
-                      data_algo, &beta, args1.idesc.desc(), temp_tensor_data,
-                      cudnn_workspace_ptr, workspace_size_d));
-            },
-            workspace_size_d);
-        PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenOpTensor(
-            handle, miopenTensorOpAdd, &alpha, args1.idesc.desc(),
-            transformed_input_grad_data, &alpha, args1.idesc.desc(),
-            temp_tensor_data, &beta, args1.idesc.desc(),
-            transformed_input_grad_data));
-      } else {
-        workspace_handle.RunFunc(
-            [&](void* cudnn_workspace_ptr) {
-              PADDLE_ENFORCE_GPU_SUCCESS(
-                  platform::dynload::miopenConvolutionBackwardData(
-                      handle, &alpha, args1.odesc.desc(), output_grad_data,
-                      args1.wdesc.desc(), filter_data, args1.cdesc.desc(),
-                      data_algo, &beta, args1.idesc.desc(),
-                      transformed_input_grad_data, cudnn_workspace_ptr,
-                      workspace_size_d));
-            },
-            workspace_size_d);
-      }
-
-#else
-      for (int i = 0; i < groups; i++) {
-        workspace_handle.RunFunc(
-            [&](void* cudnn_workspace_ptr) {
-              PADDLE_ENFORCE_GPU_SUCCESS(
-                  platform::dynload::cudnnConvolutionBackwardData(
-                      handle, &alpha, args1.wdesc.desc(),
-                      filter_data + i * group_offset_filter, args1.odesc.desc(),
-                      output_grad_data + i * group_offset_out,
-                      args1.cdesc.desc(), data_algo, cudnn_workspace_ptr,
-                      workspace_size_d, &beta, args1.idesc.desc(),
-                      transformed_input_grad_data + i * group_offset_in));
-            },
-            workspace_size_d);
-      }
-#endif
-      if (!is_sys_pad) {
-        std::vector<int> starts(transformed_input_channel.dims().size(), 0);
-        std::vector<int> axes(transformed_input_channel.dims().size(), 0);
-
-        for (size_t i = 0; i < transformed_input_channel.dims().size(); ++i) {
-          starts[i] = input_pad[2 * i];
-          axes[i] = i;
-        }
-
-        transformed_input_grad_channel.mutable_data(ctx.GetPlace());
-        if (transformed_input_channel.dims().size() == 4) {
-          RemovePaddingSlice<paddle::platform::CUDADeviceContext, T, 4>(
-              ctx, &transformed_input_grad, &transformed_input_grad_channel,
-              starts, axes);
-        } else {
-          RemovePaddingSlice<paddle::platform::CUDADeviceContext, T, 5>(
-              ctx, &transformed_input_grad, &transformed_input_grad_channel,
-              starts, axes);
-        }
-      }
-
-      if (channel_last && compute_format == DataLayout::kNCHW) {
-        TransToChannelLast<paddle::platform::CUDADeviceContext, T>(
-            ctx, &transformed_input_grad_channel, input_grad);
-      }
-    }
-
-    // filter_grad do not use inplace addto.
-    ScalingParamType<T> beta_filter = 0.0f;
-    // ------------------- cudnn conv backward filter ---------------------
-    if (filter_grad) {
-// Because beta is zero, it is unnecessary to reset filter_grad.
-#ifdef PADDLE_WITH_HIP
-      workspace_handle.RunFunc(
-          [&](void* cudnn_workspace_ptr) {
-            PADDLE_ENFORCE_GPU_SUCCESS(
-                platform::dynload::miopenConvolutionBackwardWeights(
-                    handle, &alpha, args2.odesc.desc(), output_grad_data,
-                    args2.idesc.desc(), input_data, args2.cdesc.desc(),
-                    filter_algo, &beta, args2.wdesc.desc(), filter_grad_data,
-                    cudnn_workspace_ptr, workspace_size_w));
-          },
-          workspace_size_w);
-#else
-      for (int i = 0; i < groups; i++) {
-        workspace_handle.RunFunc(
-            [&](void* cudnn_workspace_ptr) {
-              PADDLE_ENFORCE_GPU_SUCCESS(
-                  platform::dynload::cudnnConvolutionBackwardFilter(
-                      handle, &alpha, args2.idesc.desc(),
-                      input_data + i * group_offset_in, args2.odesc.desc(),
-                      output_grad_data + i * group_offset_out,
-                      args2.cdesc.desc(), filter_algo, cudnn_workspace_ptr,
-                      workspace_size_w, &beta_filter, args2.wdesc.desc(),
-                      filter_grad_data + i * group_offset_filter));
-            },
-            workspace_size_w);
-      }
-#endif
-
-      if (compute_format == DataLayout::kNHWC) {
-        TransToChannelFirst<paddle::platform::CUDADeviceContext, T>(
-            ctx, &transformed_filter_grad_channel, filter_grad);
-      }
-    }
-  }
-};
-
-/*
- * Inputs:  I, W, dO, ddI, ddW
- * Outputs: ddO, dW, dI
- * ddo = conv(ddI, W) + conv(I, ddW)
- * dW = conv_bp_filter(ddI, dO)
- * dI = conv_bp_data(ddW, dO)
- */
-template <typename T>
-class CUDNNConvDoubleGradOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
-    PADDLE_ENFORCE_EQ(
-        platform::is_gpu_place(ctx.GetPlace()), true,
-        paddle::platform::errors::PreconditionNotMet("It must use CUDAPlace."));
-    auto X = ctx.Input<Tensor>("Input");
-    auto W = ctx.Input<Tensor>("Filter");
-    auto dO = ctx.Input<Tensor>("DOutput");
-    auto ddX = ctx.Input<Tensor>("DDInput");
-    auto ddW = ctx.Input<Tensor>("DDFilter");
-
-    auto ddO = ctx.Output<Tensor>("DDOutput");
-    auto dW = ctx.Output<Tensor>("DFilter");
-    auto dX = ctx.Output<Tensor>("DInput");
-    if (ddO) {
-      ddO->mutable_data<T>(ctx.GetPlace());
-      phi::funcs::SetConstant<platform::CUDADeviceContext, T> set_zero;
-      set_zero(dev_ctx, ddO, static_cast<T>(0));
-    }
-    if (dW) {
-      dW->mutable_data<T>(ctx.GetPlace());
-    }
-    if (dX) {
-      dX->mutable_data<T>(ctx.GetPlace());
-    }
-
-    // const T* x = X->data<T>();
-    const T* dy = dO->data<T>();
-    const T* w = W->data<T>();
-
-    const T* ddx = nullptr;
-    const T* ddw = nullptr;
-    T *dw, *dx, *ddy;
-    dw = dx = ddy = nullptr;
-    T* transformed_dx = nullptr;
-    const std::vector<int>& strides = ctx.Attr<std::vector<int>>("strides");
-    std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
-    int groups = ctx.Attr<int>("groups");
-
-    bool exhaustive_search =
-        FLAGS_cudnn_exhaustive_search || (ctx.HasAttr("exhaustive_search") &&
-                                          ctx.Attr<bool>("exhaustive_search"));
-    bool deterministic = FLAGS_cudnn_deterministic;
-    auto exhaustive_deterministic = exhaustive_search && deterministic;
-    PADDLE_ENFORCE_EQ(exhaustive_deterministic, false,
-                      platform::errors::InvalidArgument(
-                          "Cann't set exhaustive_search True and "
-                          "FLAGS_cudnn_deterministic True at same time."));
-
-    std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
-
-    std::string padding_algorithm = ctx.Attr<std::string>("padding_algorithm");
-    const std::string data_format = ctx.Attr<std::string>("data_format");
-    const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC");
-
-    // transform Tensors to channel first-----------
-    Tensor transformed_X_channel(X->type());
-    Tensor transformed_dO_channel(dO->type());
-    Tensor transformed_ddX_channel(X->type());
-
-    Tensor transformed_ddO_channel(dO->type());
-    Tensor transformed_dX_channel(X->type());
-
-    if (channel_last) {
-      ResizeToChannelFirst<platform::CUDADeviceContext, T>(
-          ctx, X, &transformed_X_channel);
-      TransToChannelFirst<platform::CUDADeviceContext, T>(
-          ctx, X, &transformed_X_channel);
-
-      ResizeToChannelFirst<platform::CUDADeviceContext, T>(
-          ctx, dO, &transformed_dO_channel);
-      TransToChannelFirst<platform::CUDADeviceContext, T>(
-          ctx, dO, &transformed_dO_channel);
-
-      if (ddX) {
-        ResizeToChannelFirst<platform::CUDADeviceContext, T>(
-            ctx, ddX, &transformed_ddX_channel);
-        TransToChannelFirst<platform::CUDADeviceContext, T>(
-            ctx, ddX, &transformed_ddX_channel);
-      }
-
-      if (ddO) {
-        ResizeToChannelFirst<platform::CUDADeviceContext, T>(
-            ctx, ddO, &transformed_ddO_channel);
-      }
-      if (dX) {
-        ResizeToChannelFirst<platform::CUDADeviceContext, T>(
-            ctx, dX, &transformed_dX_channel);
-        transformed_dX_channel.mutable_data<T>(ctx.GetPlace());
-      }
-
-    } else {
-      transformed_X_channel = *X;
-      transformed_dO_channel = *dO;
-      if (ddX) {
-        transformed_ddX_channel = *ddX;
-      }
-      if (ddO) {
-        transformed_ddO_channel.ShareDataWith(*ddO);
-      }
-      if (dX) {
-        transformed_dX_channel.ShareDataWith(*dX);
-      }
-    }
-
-    auto in_dims = transformed_X_channel.dims();
-    auto filter_dims = W->dims();
-    framework::DDim in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size());
-    framework::DDim filter_data_dims =
-        phi::slice_ddim(filter_dims, 2, filter_dims.size());
-    std::vector<int> ksize = phi::vectorize<int>(filter_data_dims);
-    UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm,
-                             in_data_dims, strides, ksize);
-
-    int data_dim = strides.size();  // 2d or 3d
-    bool is_sys_pad = math::IsSymmetricPadding(paddings, data_dim);
-    Tensor transformed_X(X->type());
-    Tensor transformed_ddX(X->type());
-
-    Tensor transformed_dX(X->type());
-
-    std::vector<int> padding_common(data_dim, 0);
-    std::vector<int> input_pad(X->dims().size() * 2, 0);
-
-    if (!is_sys_pad) {
-      // get pad
-      std::vector<int> padding_diff(data_dim);
-      std::vector<int> new_input_shape_vec(data_dim + 2);
-      new_input_shape_vec[0] = transformed_X_channel.dims()[0];
-      new_input_shape_vec[1] = transformed_X_channel.dims()[1];
-
-      for (size_t i = 0; i < data_dim; ++i) {
-        padding_diff[i] = std::abs(paddings[2 * i] - paddings[2 * i + 1]);
-        padding_common[i] = std::min(paddings[2 * i], paddings[2 * i + 1]);
-        new_input_shape_vec[i + 2] =
-            transformed_X_channel.dims()[i + 2] + padding_diff[i];
-        input_pad[2 * i + 4] = paddings[2 * i] - padding_common[i];
-        input_pad[2 * i + 4 + 1] = paddings[2 * i + 1] - padding_common[i];
-      }
-      framework::DDim new_input_shape(phi::make_ddim(new_input_shape_vec));
-      transformed_X.Resize(new_input_shape);
-      transformed_ddX.Resize(new_input_shape);
-      transformed_dX.Resize(new_input_shape);
-
-      transformed_X =
-          ctx.AllocateTmpTensor<T, paddle::platform::CUDADeviceContext>(
-              new_input_shape, dev_ctx);
-      if (ddX) {
-        transformed_ddX =
-            ctx.AllocateTmpTensor<T, paddle::platform::CUDADeviceContext>(
-                new_input_shape, dev_ctx);
-      }
-      if (dX) {
-        transformed_dX =
-            ctx.AllocateTmpTensor<T, paddle::platform::CUDADeviceContext>(
-                new_input_shape, dev_ctx);
-      }
-
-      // pad for input
-      const int rank = X->dims().size();
-      T pad_value(0.0);
-      switch (rank) {
-        case 4: {
-          math::PadFunction<paddle::platform::CUDADeviceContext, T, 4>(
-              ctx, input_pad, transformed_X_channel, pad_value, &transformed_X);
-          if (ddX) {
-            math::PadFunction<paddle::platform::CUDADeviceContext, T, 4>(
-                ctx, input_pad, transformed_ddX_channel, pad_value,
-                &transformed_ddX);
-          }
-        } break;
-        case 5: {
-          math::PadFunction<paddle::platform::CUDADeviceContext, T, 5>(
-              ctx, input_pad, transformed_X_channel, pad_value, &transformed_X);
-          if (ddX) {
-            math::PadFunction<paddle::platform::CUDADeviceContext, T, 5>(
-                ctx, input_pad, transformed_ddX_channel, pad_value,
-                &transformed_ddX);
-          }
-        } break;
-        default:
-          PADDLE_THROW(platform::errors::InvalidArgument(
-              "ConvOp only support tensors with 4 or 5 dimensions."));
-      }
-
-    } else {
-      transformed_X.ShareDataWith(transformed_X_channel);
-      if (ddX) {
-        transformed_ddX.ShareDataWith(transformed_ddX_channel);
-      }
-      if (dX) {
-        transformed_dX.ShareDataWith(transformed_dX_channel);
-      }
-
-      if (paddings.size() == data_dim) {
-        for (size_t i = 0; i < data_dim; ++i) {
-          padding_common[i] = paddings[i];
-        }
-      } else {
-        for (size_t i = 0; i < data_dim; ++i) {
-          padding_common[i] = paddings[2 * i];
-        }
-      }
-    }
-
-    const T* x = transformed_X.data<T>();
-
-    int iwo_group = groups;
-    int c_group = 1;
-#if defined(PADDLE_WITH_HIP) || CUDNN_VERSION_MIN(7, 0, 1)
-    iwo_group = 1;
-    c_group = groups;
-    groups = 1;
-#endif
-    auto dtype = platform::CudnnDataType<T>::type;
-
-    auto handle = dev_ctx.cudnn_handle();
-
-    ConvArgs args1{&transformed_ddX,
-                   W,
-                   &transformed_ddO_channel,
-                   strides,
-                   padding_common,
-                   dilations,
-                   dtype};
-    ConvArgs args2{
-        &transformed_X, ddW,  &transformed_ddO_channel, strides, padding_common,
-        dilations,      dtype};
-    ConvArgs args3{&transformed_ddX,
-                   dW,
-                   &transformed_dO_channel,
-                   strides,
-                   padding_common,
-                   dilations,
-                   dtype};
-    ConvArgs args4{
-        &transformed_dX, ddW,  &transformed_dO_channel, strides, padding_common,
-        dilations,       dtype};
-
-#ifdef PADDLE_WITH_HIP
-    miopenConvFwdAlgorithm_t fwd_algo1 =
-        static_cast<miopenConvFwdAlgorithm_t>(0);
-    miopenConvFwdAlgorithm_t fwd_algo2 =
-        static_cast<miopenConvFwdAlgorithm_t>(0);
-    miopenConvBwdDataAlgorithm_t data_algo =
-        static_cast<miopenConvBwdDataAlgorithm_t>(0);
-    miopenConvBwdWeightsAlgorithm_t filter_algo =
-        static_cast<miopenConvBwdWeightsAlgorithm_t>(0);
-#else
-    cudnnConvolutionFwdAlgo_t fwd_algo1 =
-        static_cast<cudnnConvolutionFwdAlgo_t>(0);
-    cudnnConvolutionFwdAlgo_t fwd_algo2 =
-        static_cast<cudnnConvolutionFwdAlgo_t>(0);
-    cudnnConvolutionBwdDataAlgo_t data_algo =
-        static_cast<cudnnConvolutionBwdDataAlgo_t>(0);
-    cudnnConvolutionBwdFilterAlgo_t filter_algo =
-        static_cast<cudnnConvolutionBwdFilterAlgo_t>(0);
-#endif
-
-    auto layout = GetCudnnTensorFormat(DataLayout::kNCHW);
-
-    // ddo = conv(ddI, W) + conv(I, ddW)
-    size_t workspace_size = 0;
-
-    T* transformed_ddy_channel = nullptr;
-    if (ddO) {
-      ddy = ddO->data<T>();
-      transformed_ddy_channel = transformed_ddO_channel.data<T>();
-      if (ddX) {
-        args1.handle = handle;
-        args1.idesc.set(transformed_ddX, iwo_group);
-        args1.wdesc.set(*W, layout, iwo_group);
-        args1.odesc.set(transformed_ddO_channel, iwo_group);
-        args1.cdesc.set(dtype, padding_common, strides, dilations,
-                        platform::AllowTF32Cudnn(), c_group);
-
-#ifdef PADDLE_WITH_HIP
-        using search1 = SearchAlgorithm<miopenConvFwdAlgorithm_t>;
-        workspace_size = search1::GetWorkspaceSize(args1);
-        fwd_algo1 = search1::Find<T>(args1, exhaustive_search, false,
-                                     workspace_size, ctx);
-#else
-        using search1 = SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>;
-        fwd_algo1 = search1::Find<T>(args1, exhaustive_search, false, ctx);
-        workspace_size = search1::GetWorkspaceSize(args1, fwd_algo1);
-#endif
-      }
-
-      if (ddW) {
-        ddw = ddW->data<T>();
-        args2.handle = handle;
-        args2.idesc.set(transformed_X, iwo_group);
-        args2.wdesc.set(*ddW, layout, iwo_group);
-        args2.odesc.set(transformed_ddO_channel, iwo_group);
-        args2.cdesc.set(dtype, padding_common, strides, dilations,
-                        platform::AllowTF32Cudnn(), c_group);
-
-#ifdef PADDLE_WITH_HIP
-        using search2 = SearchAlgorithm<miopenConvFwdAlgorithm_t>;
-        workspace_size =
-            std::max(workspace_size, search2::GetWorkspaceSize(args2));
-        fwd_algo2 = search2::Find<T>(args2, exhaustive_search, false,
-                                     workspace_size, ctx);
-#else
-        using search2 = SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>;
-        fwd_algo2 = search2::Find<T>(args2, exhaustive_search, false, ctx);
-        workspace_size = std::max(workspace_size,
-                                  search2::GetWorkspaceSize(args2, fwd_algo2));
-#endif
-      }
-    }
-
-    if (dW && ddX) {
-      dw = dW->data<T>();
-      args3.handle = handle;
-      args3.idesc.set(transformed_ddX, iwo_group);
-      args3.wdesc.set(*dW, layout, iwo_group);
-      args3.odesc.set(transformed_dO_channel, iwo_group);
-      args3.cdesc.set(dtype, padding_common, strides, dilations,
-                      platform::AllowTF32Cudnn(), c_group);
-
-#ifdef PADDLE_WITH_HIP
-      using search3 = SearchAlgorithm<miopenConvBwdWeightsAlgorithm_t>;
-      workspace_size =
-          std::max(workspace_size, search3::GetWorkspaceSize(args3));
-      filter_algo = search3::Find<T>(args3, exhaustive_search, deterministic,
-                                     workspace_size, ctx);
-#else
-      using search3 = SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t>;
-      filter_algo =
-          search3::Find<T>(args3, exhaustive_search, deterministic, ctx);
-      workspace_size = std::max(workspace_size,
-                                search3::GetWorkspaceSize(args3, filter_algo));
-#endif
-    }
-
-    if (ddW && dX) {
-      transformed_dx = transformed_dX.data<T>();
-
-      args4.handle = handle;
-      args4.idesc.set(transformed_dX, iwo_group);
-      args4.wdesc.set(*ddW, layout, iwo_group);
-      args4.odesc.set(transformed_dO_channel, iwo_group);
-      args4.cdesc.set(dtype, padding_common, strides, dilations,
-                      platform::AllowTF32Cudnn(), c_group);
-
-#ifdef PADDLE_WITH_HIP
-      using search4 = SearchAlgorithm<miopenConvBwdDataAlgorithm_t>;
-      workspace_size =
-          std::max(workspace_size, search4::GetWorkspaceSize(args4));
-      data_algo = search4::Find<T>(args4, exhaustive_search, deterministic,
-                                   workspace_size, ctx);
-#else
-      using search4 = SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>;
-      data_algo =
-          search4::Find<T>(args4, exhaustive_search, deterministic, ctx);
-      workspace_size =
-          std::max(workspace_size, search4::GetWorkspaceSize(args4, data_algo));
-#endif
-    }
-
-    int i_n, i_c, i_d, i_h, i_w;
-    GetNCDHW(transformed_X.dims(), DataLayout::kNCHW, &i_n, &i_c, &i_d, &i_h,
-             &i_w);
-
-    int o_n, o_c, o_d, o_h, o_w;
-    GetNCDHW(transformed_dO_channel.dims(), DataLayout::kNCHW, &o_n, &o_c, &o_d,
-             &o_h, &o_w);
-
-    int group_offset_in = i_c / groups * i_h * i_w * i_d;
-    int group_offset_out = o_c / groups * o_h * o_w * o_d;
-    int group_offset_filter = W->numel() / groups;
-
-    ScalingParamType<T> alpha = 1.0f;
-    ScalingParamType<T> beta = 0.0f;
-
-    // NOTE(zhiqiu): inplace addto is not supportted in double grad yet.
-    // ScalingParamType<T> beta = ctx.Attr<bool>("use_addto") ? 1.0f :
-    // 0.0f;
-    // VLOG(4) << "Conv_grad_grad: use_addto = " << ctx.Attr<bool>("use_addto");
-    auto wkspace_handle = dev_ctx.cudnn_workspace_handle();
-
-    if (ddO) {
-      if (ddX) {
-        ddx = transformed_ddX.data<T>();
-#ifdef PADDLE_WITH_HIP
-        wkspace_handle.RunFunc(
-            [&](void* workspace_ptr) {
-              PADDLE_ENFORCE_GPU_SUCCESS(
-                  platform::dynload::miopenConvolutionForward(
-                      handle, &alpha, args1.idesc.desc(), ddx,
-                      args1.wdesc.desc(), w, args1.cdesc.desc(), fwd_algo1,
-                      &beta, args1.odesc.desc(), transformed_ddy_channel,
-                      workspace_ptr, workspace_size));
-            },
-            workspace_size);
-#else
-        for (int i = 0; i < groups; i++) {
-          wkspace_handle.RunFunc(
-              [&](void* workspace_ptr) {
-                PADDLE_ENFORCE_GPU_SUCCESS(
-                    platform::dynload::cudnnConvolutionForward(
-                        handle, &alpha, args1.idesc.desc(),
-                        ddx + i * group_offset_in, args1.wdesc.desc(),
-                        w + i * group_offset_filter, args1.cdesc.desc(),
-                        fwd_algo1, workspace_ptr, workspace_size, &beta,
-                        args1.odesc.desc(),
-                        transformed_ddy_channel + i * group_offset_out));
-              },
-              workspace_size);
-        }
-#endif
-      }
-      if (ddW) {
-#ifdef PADDLE_WITH_HIP
-        // MIOPEN ONLY support beta to be 0.0f
-        wkspace_handle.RunFunc(
-            [&](void* workspace_ptr) {
-              PADDLE_ENFORCE_GPU_SUCCESS(
-                  platform::dynload::miopenConvolutionForward(
-                      handle, &alpha, args2.idesc.desc(), x, args2.wdesc.desc(),
-                      ddw, args2.cdesc.desc(), fwd_algo2, &beta,
-                      args2.odesc.desc(), transformed_ddy_channel,
-                      workspace_ptr, workspace_size));
-            },
-            workspace_size);
-#else
-        for (int i = 0; i < groups; i++) {
-          wkspace_handle.RunFunc(
-              [&](void* workspace_ptr) {
-                PADDLE_ENFORCE_GPU_SUCCESS(
-                    platform::dynload::cudnnConvolutionForward(
-                        handle, &alpha, args2.idesc.desc(),
-                        x + i * group_offset_in, args2.wdesc.desc(),
-                        ddw + i * group_offset_filter, args2.cdesc.desc(),
-                        fwd_algo2, workspace_ptr, workspace_size, &alpha,
-                        args2.odesc.desc(),
-                        transformed_ddy_channel + i * group_offset_out));
-              },
-              workspace_size);
-        }
-#endif
-      }
-      if (channel_last) {
-        TransToChannelLast<paddle::platform::CUDADeviceContext, T>(
-            ctx, &transformed_ddO_channel, ddO);
-      }
-    }
-    T* transformed_dy_channel = transformed_dO_channel.data<T>();
-    if (dW && ddX) {
-      ddx = transformed_ddX.data<T>();
-#ifdef PADDLE_WITH_HIP
-      wkspace_handle.RunFunc(
-          [&](void* workspace_ptr) {
-            PADDLE_ENFORCE_GPU_SUCCESS(
-                platform::dynload::miopenConvolutionBackwardWeights(
-                    handle, &alpha, args3.odesc.desc(), transformed_dy_channel,
-                    args3.idesc.desc(), ddx, args3.cdesc.desc(), filter_algo,
-                    &beta, args3.wdesc.desc(), dw, workspace_ptr,
-                    workspace_size));
-          },
-          workspace_size);
-#else
-      for (int i = 0; i < groups; i++) {
-        wkspace_handle.RunFunc(
-            [&](void* workspace_ptr) {
-              PADDLE_ENFORCE_GPU_SUCCESS(
-                  platform::dynload::cudnnConvolutionBackwardFilter(
-                      handle, &alpha, args3.idesc.desc(),
-                      ddx + i * group_offset_in, args3.odesc.desc(),
-                      transformed_dy_channel + i * group_offset_out,
-                      args3.cdesc.desc(), filter_algo, workspace_ptr,
-                      workspace_size, &beta, args3.wdesc.desc(),
-                      dw + i * group_offset_filter));
-            },
-            workspace_size);
-      }
-#endif
-    }
-
-    if (dX && ddW) {
-      ddw = ddW->data<T>();
-#ifdef PADDLE_WITH_HIP
-      wkspace_handle.RunFunc(
-          [&](void* workspace_ptr) {
-            PADDLE_ENFORCE_GPU_SUCCESS(
-                platform::dynload::miopenConvolutionBackwardData(
-                    handle, &alpha, args4.odesc.desc(), transformed_dy_channel,
-                    args4.wdesc.desc(), ddw, args4.cdesc.desc(), data_algo,
-                    &beta, args4.idesc.desc(), transformed_dx, workspace_ptr,
-                    workspace_size));
-          },
-          workspace_size);
-#else
-      for (int i = 0; i < groups; i++) {
-        wkspace_handle.RunFunc(
-            [&](void* workspace_ptr) {
-              PADDLE_ENFORCE_GPU_SUCCESS(
-                  platform::dynload::cudnnConvolutionBackwardData(
-                      handle, &alpha, args4.wdesc.desc(),
-                      ddw + i * group_offset_filter, args4.odesc.desc(),
-                      transformed_dy_channel + i * group_offset_out,
-                      args4.cdesc.desc(), data_algo, workspace_ptr,
-                      workspace_size, &beta, args4.idesc.desc(),
-                      transformed_dx + i * group_offset_in));
-            },
-            workspace_size);
-      }
-#endif
-
-      if (!is_sys_pad) {
-        // reverse padded input
-        std::vector<int> starts(X->dims().size(), 0);
-        std::vector<int> axes(X->dims().size(), 0);
-
-        for (size_t i = 0; i < X->dims().size(); ++i) {
-          starts[i] = input_pad[2 * i];
-          axes[i] = i;
-        }
-        if (X->dims().size() == 4) {
-          RemovePaddingSlice<paddle::platform::CUDADeviceContext, T, 4>(
-              ctx, &transformed_dX, &transformed_dX_channel, starts, axes);
-        } else {
-          RemovePaddingSlice<paddle::platform::CUDADeviceContext, T, 5>(
-              ctx, &transformed_dX, &transformed_dX_channel, starts, axes);
-        }
-      }
-      if (channel_last) {
-        TransToChannelLast<paddle::platform::CUDADeviceContext, T>(
-            ctx, &transformed_dX_channel, dX);
-      }
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace plat = paddle::platform;
-#ifdef PADDLE_WITH_HIP
-// MIOPEN do not support double
-REGISTER_OP_KERNEL(conv2d, CUDNN, plat::CUDAPlace,
-                   paddle::operators::CUDNNConvOpKernel<float>,
-                   paddle::operators::CUDNNConvOpKernel<plat::float16>);
-REGISTER_OP_KERNEL(conv2d_grad, CUDNN, plat::CUDAPlace,
-                   paddle::operators::CUDNNConvGradOpKernel<float>,
-                   paddle::operators::CUDNNConvGradOpKernel<plat::float16>);
-REGISTER_OP_KERNEL(
-    conv2d_grad_grad, CUDNN, plat::CUDAPlace,
-    paddle::operators::CUDNNConvDoubleGradOpKernel<float>,
-    paddle::operators::CUDNNConvDoubleGradOpKernel<plat::float16>);
-// ROCM has limit thread in depthwise_conv.cu and willl result in accuracy issue
-// Use depthwise_conv2d in MIOPEN to resolve this issue
-REGISTER_OP_KERNEL(depthwise_conv2d, CUDNN, plat::CUDAPlace,
-                   paddle::operators::CUDNNConvOpKernel<float>,
-                   paddle::operators::CUDNNConvOpKernel<plat::float16>);
-REGISTER_OP_KERNEL(depthwise_conv2d_grad, CUDNN, plat::CUDAPlace,
-                   paddle::operators::CUDNNConvGradOpKernel<float>,
-                   paddle::operators::CUDNNConvGradOpKernel<plat::float16>);
-REGISTER_OP_CUDA_KERNEL(
-    depthwise_conv2d_grad_grad,
-    paddle::operators::CUDNNConvDoubleGradOpKernel<float>,
-    paddle::operators::CUDNNConvDoubleGradOpKernel<plat::float16>);
-
-REGISTER_OP_KERNEL(conv3d, CUDNN, plat::CUDAPlace,
-                   paddle::operators::CUDNNConvOpKernel<float>,
-                   paddle::operators::CUDNNConvOpKernel<plat::float16>);
-REGISTER_OP_KERNEL(conv3d_grad, CUDNN, plat::CUDAPlace,
-                   paddle::operators::CUDNNConvGradOpKernel<float>);
-REGISTER_OP_KERNEL(
-    conv3d_grad_grad, CUDNN, plat::CUDAPlace,
-    paddle::operators::CUDNNConvDoubleGradOpKernel<float>,
-    paddle::operators::CUDNNConvDoubleGradOpKernel<plat::float16>);
-#else
-#if CUDNN_VERSION_MIN(8, 1, 0)
-REGISTER_OP_KERNEL(conv2d, CUDNN, plat::CUDAPlace,
-                   paddle::operators::CUDNNConvOpKernel<float>,
-                   paddle::operators::CUDNNConvOpKernel<double>,
-                   paddle::operators::CUDNNConvOpKernel<plat::float16>,
-                   paddle::operators::CUDNNConvOpKernel<plat::bfloat16>);
-REGISTER_OP_KERNEL(conv2d_grad, CUDNN, plat::CUDAPlace,
-                   paddle::operators::CUDNNConvGradOpKernel<float>,
-                   paddle::operators::CUDNNConvGradOpKernel<double>,
-                   paddle::operators::CUDNNConvGradOpKernel<plat::float16>,
-                   paddle::operators::CUDNNConvGradOpKernel<plat::bfloat16>);
-REGISTER_OP_KERNEL(
-    conv2d_grad_grad, CUDNN, plat::CUDAPlace,
-    paddle::operators::CUDNNConvDoubleGradOpKernel<float>,
-    paddle::operators::CUDNNConvDoubleGradOpKernel<double>,
-    paddle::operators::CUDNNConvDoubleGradOpKernel<plat::float16>,
-    paddle::operators::CUDNNConvDoubleGradOpKernel<plat::bfloat16>);
-
-REGISTER_OP_CUDA_KERNEL(
-    depthwise_conv2d_grad_grad,
-    paddle::operators::CUDNNConvDoubleGradOpKernel<float>,
-    paddle::operators::CUDNNConvDoubleGradOpKernel<double>,
-    paddle::operators::CUDNNConvDoubleGradOpKernel<plat::float16>,
-    paddle::operators::CUDNNConvDoubleGradOpKernel<plat::bfloat16>);
-#else
-REGISTER_OP_KERNEL(conv2d, CUDNN, plat::CUDAPlace,
-                   paddle::operators::CUDNNConvOpKernel<float>,
-                   paddle::operators::CUDNNConvOpKernel<double>,
-                   paddle::operators::CUDNNConvOpKernel<plat::float16>);
-REGISTER_OP_KERNEL(conv2d_grad, CUDNN, plat::CUDAPlace,
-                   paddle::operators::CUDNNConvGradOpKernel<float>,
-                   paddle::operators::CUDNNConvGradOpKernel<double>,
-                   paddle::operators::CUDNNConvGradOpKernel<plat::float16>);
-REGISTER_OP_KERNEL(
-    conv2d_grad_grad, CUDNN, plat::CUDAPlace,
-    paddle::operators::CUDNNConvDoubleGradOpKernel<float>,
-    paddle::operators::CUDNNConvDoubleGradOpKernel<double>,
-    paddle::operators::CUDNNConvDoubleGradOpKernel<plat::float16>);
-
-REGISTER_OP_CUDA_KERNEL(
-    depthwise_conv2d_grad_grad,
-    paddle::operators::CUDNNConvDoubleGradOpKernel<float>,
-    paddle::operators::CUDNNConvDoubleGradOpKernel<double>,
-    paddle::operators::CUDNNConvDoubleGradOpKernel<plat::float16>);
-#endif
-
-REGISTER_OP_KERNEL(conv3d, CUDNN, plat::CUDAPlace,
-                   paddle::operators::CUDNNConvOpKernel<float>,
-                   paddle::operators::CUDNNConvOpKernel<double>,
-                   paddle::operators::CUDNNConvOpKernel<plat::float16>);
-REGISTER_OP_KERNEL(conv3d_grad, CUDNN, plat::CUDAPlace,
-                   paddle::operators::CUDNNConvGradOpKernel<float>,
-                   paddle::operators::CUDNNConvGradOpKernel<double>);
-REGISTER_OP_KERNEL(
-    conv3d_grad_grad, CUDNN, plat::CUDAPlace,
-    paddle::operators::CUDNNConvDoubleGradOpKernel<float>,
-    paddle::operators::CUDNNConvDoubleGradOpKernel<double>,
-    paddle::operators::CUDNNConvDoubleGradOpKernel<plat::float16>);
-#endif
diff --git a/paddle/fluid/operators/conv_miopen_helper.h b/paddle/fluid/operators/conv_miopen_helper.h
index 9c9795143eb78dc5c1b22ec792d8753f915c976e..66f718693847837a4d169a5cab9629a1f668244f 100644
--- a/paddle/fluid/operators/conv_miopen_helper.h
+++ b/paddle/fluid/operators/conv_miopen_helper.h
@@ -24,6 +24,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/operator_kernel_configs.h"
 #include "paddle/fluid/operators/conv_cudnn_op_cache.h"
 #include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
+#include "paddle/phi/backends/gpu/gpu_context.h"
 
 namespace paddle {
 namespace operators {
@@ -51,12 +52,11 @@ static inline void GetNCDHW(const framework::DDim& dims,
 }
 
 template <typename DeviceContext, typename T, size_t D>
-static void RemovePaddingSlice(const framework::ExecutionContext& context,
+static void RemovePaddingSlice(const phi::GPUContext& context,
                                const Tensor* input, Tensor* out,
                                const std::vector<int>& starts,
                                const std::vector<int>& axes) {
-  auto& place =
-      *context.template device_context<DeviceContext>().eigen_device();
+  auto& place = *context.eigen_device();
   auto in_dims = input->dims();
   auto new_out_dims = out->dims();
   auto offsets = Eigen::array<int, D>();
@@ -128,11 +128,10 @@ struct SearchAlgorithm<miopenConvFwdAlgorithm_t> {
   template <typename T>
   static algo_t Find(const ConvArgs& args, bool exhaustive_search,
                      bool deterministic, size_t workspace_size,
-                     const framework::ExecutionContext& ctx) {
+                     const phi::GPUContext& ctx) {
     algo_t algo;
 
-    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
-    auto workspace_handle = dev_ctx.cudnn_workspace_handle();
+    auto workspace_handle = ctx.cudnn_workspace_handle();
 
     int find_count;
     miopenConvAlgoPerf_t find_result;
@@ -170,11 +169,10 @@ struct SearchAlgorithm<miopenConvBwdDataAlgorithm_t> {
   template <typename T>
   static algo_t Find(const ConvArgs& args, bool exhaustive_search,
                      bool deterministic, size_t workspace_size,
-                     const framework::ExecutionContext& ctx) {
+                     const phi::GPUContext& ctx) {
     algo_t algo;
 
-    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
-    auto workspace_handle = dev_ctx.cudnn_workspace_handle();
+    auto workspace_handle = ctx.cudnn_workspace_handle();
 
     int find_count;
     miopenConvAlgoPerf_t find_result;
@@ -212,11 +210,10 @@ struct SearchAlgorithm<miopenConvBwdWeightsAlgorithm_t> {
   template <typename T>
   static algo_t Find(const ConvArgs& args, bool exhaustive_search,
                      bool deterministic, size_t workspace_size,
-                     const framework::ExecutionContext& ctx) {
+                     const phi::GPUContext& ctx) {
     algo_t algo;
 
-    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
-    auto workspace_handle = dev_ctx.cudnn_workspace_handle();
+    auto workspace_handle = ctx.cudnn_workspace_handle();
 
     int find_count;
     miopenConvAlgoPerf_t find_result;
diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc
index e345a4d2603b630508e299207984f4708217a1d8..8213e877f722433488cd826bb63cba376972c57a 100644
--- a/paddle/fluid/operators/conv_op.cc
+++ b/paddle/fluid/operators/conv_op.cc
@@ -205,14 +205,14 @@ framework::OpKernelType ConvOp::GetExpectedKernelType(
             paddle::framework::DataTypeToString(input_data_type),
             paddle::framework::DataTypeToString(filter_data_type)));
   }
-#ifndef PADDLE_WITH_ASCEND_CL
-  if (input_data_type == framework::proto::VarType::FP16) {
-    PADDLE_ENFORCE_EQ(
-        library, framework::LibraryType::kCUDNN,
-        platform::errors::InvalidArgument(
-            "float16 can only be used when CUDNN or NPU is used"));
-  }
-#endif
+// #ifndef PADDLE_WITH_ASCEND_CL
+//   if (input_data_type == framework::proto::VarType::FP16) {
+//     PADDLE_ENFORCE_EQ(
+//         library, framework::LibraryType::kCUDNN,
+//         platform::errors::InvalidArgument(
+//             "float16 can only be used when CUDNN or NPU is used"));
+//   }
+// #endif
 #if PADDLE_WITH_CUDA
   if (input_data_type == framework::proto::VarType::BF16 &&
       library == framework::LibraryType::kCUDNN) {
@@ -869,42 +869,6 @@ REGISTER_OPERATOR(conv3d_grad, ops::ConvOpGrad,
                   ops::Conv3DDoubleGradMaker<paddle::imperative::OpBase>);
 REGISTER_OPERATOR(conv3d_grad_grad, ops::ConvOpDoubleGrad);
 
-// depthwise conv kernel
-// TODO(xingzhaolong): neon kernel for mobile
-REGISTER_OP_CPU_KERNEL(
-    depthwise_conv2d,
-    ops::GemmConvKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::GemmConvKernel<paddle::platform::CPUDeviceContext, double>);
-
-REGISTER_OP_CPU_KERNEL(
-    depthwise_conv2d_grad,
-    ops::GemmConvGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::GemmConvGradKernel<paddle::platform::CPUDeviceContext, double>);
-
-REGISTER_OP_CPU_KERNEL(
-    conv2d, ops::GemmConvKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::GemmConvKernel<paddle::platform::CPUDeviceContext, double>);
-REGISTER_OP_CPU_KERNEL(
-    conv2d_grad,
-    ops::GemmConvGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::GemmConvGradKernel<paddle::platform::CPUDeviceContext, double>);
-REGISTER_OP_CPU_KERNEL(
-    conv2d_grad_grad,
-    ops::GemmConvDoubleGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::GemmConvDoubleGradKernel<paddle::platform::CPUDeviceContext, double>);
-
-REGISTER_OP_CPU_KERNEL(
-    conv3d, ops::GemmConvKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::GemmConvKernel<paddle::platform::CPUDeviceContext, double>);
-REGISTER_OP_CPU_KERNEL(
-    conv3d_grad,
-    ops::GemmConvGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::GemmConvGradKernel<paddle::platform::CPUDeviceContext, double>);
-REGISTER_OP_CPU_KERNEL(
-    conv3d_grad_grad,
-    ops::GemmConvDoubleGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::GemmConvDoubleGradKernel<paddle::platform::CPUDeviceContext, double>);
-
 REGISTER_OP_VERSION(conv2d)
     .AddCheckpoint(
         R"ROC(
diff --git a/paddle/fluid/operators/conv_op.cu.cc b/paddle/fluid/operators/conv_op.cu.cc
deleted file mode 100644
index d07593f5c02e9129c1f333667baccb0531bc31f9..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/conv_op.cu.cc
+++ /dev/null
@@ -1,43 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/conv_op.h"
-
-namespace ops = paddle::operators;
-
-REGISTER_OP_CUDA_KERNEL(
-    depthwise_conv2d,
-    ops::DepthwiseConvKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::DepthwiseConvKernel<paddle::platform::CUDADeviceContext, double>);
-
-REGISTER_OP_CUDA_KERNEL(
-    depthwise_conv2d_grad,
-    ops::DepthwiseConvGradKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::DepthwiseConvGradKernel<paddle::platform::CUDADeviceContext, double>);
-
-REGISTER_OP_CUDA_KERNEL(
-    conv2d, ops::GemmConvKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::GemmConvKernel<paddle::platform::CUDADeviceContext, double>);
-REGISTER_OP_CUDA_KERNEL(
-    conv2d_grad,
-    ops::GemmConvGradKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::GemmConvGradKernel<paddle::platform::CUDADeviceContext, double>);
-
-REGISTER_OP_CUDA_KERNEL(
-    conv3d, ops::GemmConvKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::GemmConvKernel<paddle::platform::CUDADeviceContext, double>);
-REGISTER_OP_CUDA_KERNEL(
-    conv3d_grad,
-    ops::GemmConvGradKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::GemmConvGradKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/conv_op.h b/paddle/fluid/operators/conv_op.h
index 26166362da8a2984dc3c0670b186b85800767fb7..a5d888765bf37d45d501a3dbe5437f7c2ab5fc51 100644
--- a/paddle/fluid/operators/conv_op.h
+++ b/paddle/fluid/operators/conv_op.h
@@ -21,7 +21,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/layout_utils.h"
-#include "paddle/fluid/operators/math/depthwise_conv.h"
 #include "paddle/fluid/operators/math/im2col.h"
 #include "paddle/fluid/operators/math/vol2col.h"
 #include "paddle/phi/kernels/funcs/blas/blas.h"
@@ -214,817 +213,5 @@ class ConvOpDoubleGrad : public framework::OperatorWithKernel {
       const framework::ExecutionContext& ctx) const override;
 };
 
-template <typename DeviceContext, typename T>
-class GemmConvKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    const Tensor* input = context.Input<Tensor>("Input");
-    // The filter will be reshaped in the calculations,
-    // so here use an assignment operation,
-    // that avoids modifying the variable in the Scope.
-    Tensor filter = *context.Input<Tensor>("Filter");
-    Tensor* output = context.Output<Tensor>("Output");
-    output->mutable_data<T>(context.GetPlace());
-
-    const int groups = context.Attr<int>("groups");
-    const std::vector<int> strides = context.Attr<std::vector<int>>("strides");
-    std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
-    std::vector<int> dilations = context.Attr<std::vector<int>>("dilations");
-    const std::string padding_algorithm =
-        context.Attr<std::string>("padding_algorithm");
-    const std::string data_format = context.Attr<std::string>("data_format");
-    const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC");
-
-    Tensor transformed_input(input->dtype());
-    Tensor transformed_output(output->dtype());
-
-    if (channel_last) {
-      ResizeToChannelFirst<DeviceContext, T>(context, input,
-                                             &transformed_input);
-      TransToChannelFirst<DeviceContext, T>(context, input, &transformed_input);
-
-      ResizeToChannelFirst<DeviceContext, T>(context, output,
-                                             &transformed_output);
-
-    } else {
-      transformed_input = *input;
-      transformed_output = *output;
-    }
-
-    // update padding and dilation
-    auto trans_in_dims = transformed_input.dims();
-    auto filter_dims = filter.dims();
-
-    framework::DDim in_data_dims =
-        phi::slice_ddim(trans_in_dims, 2, trans_in_dims.size());
-    framework::DDim filter_data_dims =
-        phi::slice_ddim(filter_dims, 2, filter_dims.size());
-
-    std::vector<int> ksize = phi::vectorize<int>(filter_data_dims);
-    UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm,
-                             in_data_dims, strides, ksize);
-
-    auto& dev_ctx = context.template device_context<DeviceContext>();
-
-    const int batch_size = static_cast<int>(transformed_input.dims()[0]);
-
-    // filter_shape_vec:
-    // {k_o, k_i, k_h, k_w} or {k_o, k_i, k_d, k_h, k_w}
-    std::vector<int64_t> filter_shape_vec(phi::vectorize(filter.dims()));
-
-    // output_shape_vec:
-    // {o_n, o_c, o_h, o_w} or {o_n, o_c, o_d, o_h, o_w}
-    std::vector<int64_t> output_shape_vec(
-        phi::vectorize(transformed_output.dims()));
-
-    // use col_shape in the im2col calculation
-    // col_shape_vec:
-    // {i_c/g, k_h, k_w, o_h, o_w} or {i_c/g, k_d, k_h, k_w,
-    // o_d,o_h, o_w}
-    size_t data_dim = filter_shape_vec.size() - 2;
-
-    std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
-    col_shape_vec[0] = trans_in_dims[1] / groups;
-    for (size_t j = 0; j < data_dim; ++j) {
-      col_shape_vec[j + 1] = filter_shape_vec[j + 2];
-      col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2];
-    }
-
-    framework::DDim col_shape(phi::make_ddim(col_shape_vec));
-
-    // use col_matrix_shape in the gemm calculation
-    // size:
-    // (i_c/g * k_h * k_w, o_h * o_w) or (i_c/g * k_d * k_h * k_w, o_d * o_h *
-    // o_w)
-
-    framework::DDim col_matrix_shape = phi::flatten_to_2d(col_shape, data_dim);
-
-    bool is_expand = IsExpand(filter_shape_vec, strides, paddings, dilations);
-
-    Tensor col;
-    // col_matrix shares the same piece of data with col,
-    // but will be reshaped into a two-dimensional matrix shape
-    // to call the matrix multiplication interface.
-    Tensor col_matrix;
-    if (is_expand) {
-      col = context.AllocateTmpTensor<T, DeviceContext>(col_shape, dev_ctx);
-      col_matrix.ShareDataWith(col);
-      col_matrix.Resize(col_matrix_shape);
-    }
-
-    framework::DDim in_matrix_shape = phi::slice_ddim(
-        transformed_input.dims(), 1, transformed_input.dims().size());
-
-    framework::DDim filter_matrix_shape = {filter.dims()[0],
-                                           filter.numel() / filter.dims()[0]};
-    filter.Resize(filter_matrix_shape);
-
-    framework::DDim output_matrix_shape = {
-        transformed_output.dims()[1],
-        transformed_output.numel() /
-            (transformed_output.dims()[0] * transformed_output.dims()[1])};
-
-    // convolution operator: im2col(or vol2col) + gemm
-    int in_step = static_cast<int>(transformed_input.dims()[1]) / groups;
-    int out_step = static_cast<int>(transformed_output.dims()[1]) / groups;
-
-    math::Vol2ColFunctor<DeviceContext, T> vol2col;
-    math::Im2ColFunctor<math::ColFormat::kCFO, DeviceContext, T> im2col;
-
-    auto blas = phi::funcs::GetBlas<DeviceContext, T>(dev_ctx);
-    for (int i = 0; i < batch_size; i++) {
-      Tensor in_batch =
-          transformed_input.Slice(i, i + 1).Resize(in_matrix_shape);
-      Tensor out_batch =
-          transformed_output.Slice(i, i + 1).Resize(output_matrix_shape);
-
-      for (int g = 0; g < groups; g++) {
-        Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
-
-        if (!is_expand) {
-          col.ShareDataWith(in_slice);
-          col_matrix.ShareDataWith(col);
-          col_matrix.Resize(col_matrix_shape);
-        } else if (data_dim == 2U) {
-          im2col(dev_ctx, in_slice, dilations, strides,
-                 std::vector<int>{paddings[0], paddings[2], paddings[1],
-                                  paddings[3]},
-                 &col);
-
-        } else if (data_dim == 3U) {
-          vol2col(dev_ctx, in_slice, dilations, strides, paddings, &col);
-        }
-
-        // gemm
-        Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
-        Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
-        blas.MatMul(filter_slice, false, col_matrix, false, T(1.0), &out_slice,
-                    T(0.0));
-      }
-    }
-    if (channel_last) {
-      TransToChannelLast<DeviceContext, T>(context, &transformed_output,
-                                           output);
-    }
-  }
-};
-
-template <typename DeviceContext, typename T>
-class GemmConvGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    const Tensor* input = context.Input<Tensor>("Input");
-    const Tensor* output_grad =
-        context.Input<Tensor>(framework::GradVarName("Output"));
-    Tensor* input_grad =
-        context.Output<Tensor>(framework::GradVarName("Input"));
-    Tensor* filter_grad =
-        context.Output<Tensor>(framework::GradVarName("Filter"));
-    // The filter and filter_grad will be reshaped in the calculations,
-    // so here use an assignment operation,
-    // that avoids modifying the variable in the Scope.
-    Tensor filter = *context.Input<Tensor>("Filter");
-
-    if (!input_grad && !filter_grad) return;
-
-    int groups = context.Attr<int>("groups");
-    const std::vector<int> strides = context.Attr<std::vector<int>>("strides");
-    std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
-    std::vector<int> dilations = context.Attr<std::vector<int>>("dilations");
-    const std::string padding_algorithm =
-        context.Attr<std::string>("padding_algorithm");
-    const std::string data_format = context.Attr<std::string>("data_format");
-
-    const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC");
-
-    Tensor transformed_input(input->dtype());
-    Tensor transformed_output_grad(output_grad->dtype());
-
-    if (channel_last) {
-      ResizeToChannelFirst<DeviceContext, T>(context, input,
-                                             &transformed_input);
-      TransToChannelFirst<DeviceContext, T>(context, input, &transformed_input);
-
-      ResizeToChannelFirst<DeviceContext, T>(context, output_grad,
-                                             &transformed_output_grad);
-      TransToChannelFirst<DeviceContext, T>(context, output_grad,
-                                            &transformed_output_grad);
-    } else {
-      transformed_input = *input;
-      transformed_output_grad = *output_grad;
-    }
-
-    // update padding and dilation
-    auto in_dims = transformed_input.dims();
-    auto filter_dims = filter.dims();
-    framework::DDim in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size());
-    framework::DDim filter_data_dims =
-        phi::slice_ddim(filter_dims, 2, filter_dims.size());
-    std::vector<int> ksize = phi::vectorize<int>(filter_data_dims);
-    UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm,
-                             in_data_dims, strides, ksize);
-
-    const int batch_size = static_cast<int>(transformed_input.dims()[0]);
-
-    auto& dev_ctx = context.template device_context<DeviceContext>();
-
-    // filter_shape_vec: {k_o, k_i, k_h, k_w} or {k_o, k_i, k_d, k_h, k_w}
-    std::vector<int64_t> filter_shape_vec(phi::vectorize(filter.dims()));
-    // output_shape_vec: {o_n, o_c, o_h, o_w} or {o_n, o_c, o_d, o_h, o_w}
-    std::vector<int64_t> output_shape_vec(
-        phi::vectorize(transformed_output_grad.dims()));
-
-    // use col_shape in the im2col calculation
-    // col_shape_vec: {i_c/g, k_h, k_w, o_h, o_w} or {i_c/g, k_d, k_h, k_w, o_d,
-    // o_h, o_w}
-    size_t data_dim = filter_shape_vec.size() - 2;
-    std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
-    col_shape_vec[0] = transformed_input.dims()[1] / groups;
-    for (size_t j = 0; j < data_dim; ++j) {
-      col_shape_vec[j + 1] = filter_shape_vec[j + 2];
-      col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2];
-    }
-    framework::DDim col_shape(phi::make_ddim(col_shape_vec));
-
-    // use col_matrix_shape in the gemm calculation
-    // size: (i_c/g * k_h * k_w, o_h * o_w)
-    // or
-    // (i_c/g * k_d * k_h * k_w, o_d * o_h * o_w)
-    framework::DDim col_matrix_shape =
-        phi::flatten_to_2d(col_shape, data_dim + 1);
-
-    framework::DDim input_shape = phi::slice_ddim(
-        transformed_input.dims(), 1, transformed_input.dims().size());
-
-    framework::DDim filter_matrix_shape = {filter.dims()[0],
-                                           filter.numel() / filter.dims()[0]};
-    filter.Resize(filter_matrix_shape);
-
-    framework::DDim output_matrix_shape = {
-        transformed_output_grad.dims()[1],
-        transformed_output_grad.numel() / (transformed_output_grad.dims()[0] *
-                                           transformed_output_grad.dims()[1])};
-
-    // convolution backward input operator:  gemm + col2im(or col2vol)
-    // convolution backward weight operator: im2col(or vol2col) + gemm
-    int in_step = static_cast<int>(transformed_input.dims()[1]) / groups;
-    int out_step = static_cast<int>(transformed_output_grad.dims()[1]) / groups;
-
-    bool is_expand = IsExpand(filter_shape_vec, strides, paddings, dilations);
-
-    Tensor col;
-    // col_matrix shares the same piece of data with col,
-    // but will be reshaped into a two-dimensional matrix shape
-    // to call the matrix multiplication interface.
-    Tensor col_matrix;
-    if (is_expand) {
-      col = context.AllocateTmpTensor<T, DeviceContext>(col_shape, dev_ctx);
-      col_matrix.ShareDataWith(col);
-      col_matrix.Resize(col_matrix_shape);
-    }
-
-    phi::funcs::SetConstant<DeviceContext, T> set_zero;
-    auto blas = phi::funcs::GetBlas<DeviceContext, T>(dev_ctx);
-
-    if (input_grad) {
-      input_grad->mutable_data<T>(context.GetPlace());
-      Tensor transformed_input_grad(input_grad->dtype());
-      if (channel_last) {
-        ResizeToChannelFirst<DeviceContext, T>(context, input_grad,
-                                               &transformed_input_grad);
-
-      } else {
-        transformed_input_grad = *input_grad;
-      }
-      // if is_expand is false, the operation of set_zero is unnecessary,
-      // because math::matmul will reset input_grad.
-      if (is_expand) {
-        set_zero(dev_ctx, &transformed_input_grad, static_cast<T>(0));
-      }
-      math::Col2VolFunctor<DeviceContext, T> col2vol;
-      math::Col2ImFunctor<math::ColFormat::kCFO, DeviceContext, T> col2im;
-
-      for (int i = 0; i < batch_size; i++) {
-        Tensor out_grad_batch =
-            transformed_output_grad.Slice(i, i + 1).Resize(output_matrix_shape);
-        Tensor in_grad_batch =
-            transformed_input_grad.Slice(i, i + 1).Resize(input_shape);
-        for (int g = 0; g < groups; g++) {
-          // gemm
-          Tensor out_grad_slice =
-              out_grad_batch.Slice(g * out_step, (g + 1) * out_step);
-          Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
-
-          Tensor in_grad_slice =
-              in_grad_batch.Slice(g * in_step, (g + 1) * in_step);
-
-          if (!is_expand) {
-            col_matrix.ShareDataWith(in_grad_slice);
-            col_matrix.Resize(col_matrix_shape);
-          }
-          blas.MatMul(filter_slice, true, out_grad_slice, false, T(1.0),
-                      &col_matrix, T(0.0));
-
-          if (is_expand && data_dim == 2U) {
-            col2im(dev_ctx, col, dilations, strides,
-                   std::vector<int>{paddings[0], paddings[2], paddings[1],
-                                    paddings[3]},
-                   &in_grad_slice);
-          } else if (is_expand && data_dim == 3U) {
-            col2vol(dev_ctx, col, dilations, strides, paddings, &in_grad_slice);
-          }
-        }
-      }
-      if (channel_last) {
-        TransToChannelLast<DeviceContext, T>(context, &transformed_input_grad,
-                                             input_grad);
-      }
-    }
-
-    if (filter_grad) {
-      filter_grad->mutable_data<T>(context.GetPlace());
-      Tensor filter_grad_ = *filter_grad;
-      filter_grad_.Resize(filter_matrix_shape);
-      set_zero(dev_ctx, filter_grad, static_cast<T>(0));
-      math::Im2ColFunctor<math::ColFormat::kCFO, DeviceContext, T> im2col;
-      math::Vol2ColFunctor<DeviceContext, T> vol2col;
-      for (int i = 0; i < batch_size; i++) {
-        Tensor out_grad_batch =
-            transformed_output_grad.Slice(i, i + 1).Resize(output_matrix_shape);
-        Tensor in_batch = transformed_input.Slice(i, i + 1).Resize(input_shape);
-        for (int g = 0; g < groups; g++) {
-          // im2col
-          Tensor out_grad_slice =
-              out_grad_batch.Slice(g * out_step, (g + 1) * out_step);
-          Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
-
-          if (!is_expand) {
-            col.ShareDataWith(in_slice);
-            col_matrix.ShareDataWith(col);
-            col_matrix.Resize(col_matrix_shape);
-          } else if (data_dim == 2U) {
-            im2col(dev_ctx, in_slice, dilations, strides,
-                   std::vector<int>{paddings[0], paddings[2], paddings[1],
-                                    paddings[3]},
-                   &col);
-
-          } else if (data_dim == 3U) {
-            vol2col(dev_ctx, in_slice, dilations, strides, paddings, &col);
-          }
-
-          // gemm
-          Tensor filter_grad_slice =
-              filter_grad_.Slice(g * out_step, (g + 1) * out_step);
-          blas.MatMul(out_grad_slice, false, col_matrix, true, T(1.0),
-                      &filter_grad_slice, T(1.0));
-        }
-      }
-    }
-  }
-};
-
-template <typename DeviceContext, typename T>
-class GemmConvDoubleGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto& dev_ctx = ctx.template device_context<platform::CPUDeviceContext>();
-    PADDLE_ENFORCE_EQ(
-        platform::is_cpu_place(ctx.GetPlace()), true,
-        paddle::platform::errors::PreconditionNotMet("It must use CPUPlace."));
-    const Tensor* X = ctx.Input<Tensor>("Input");
-    const Tensor* dY = ctx.Input<Tensor>("DOutput");
-    const Tensor* ddX = ctx.Input<Tensor>("DDInput");
-    const Tensor* ddW_in = ctx.Input<Tensor>("DDFilter");
-
-    Tensor* ddY = ctx.Output<Tensor>("DDOutput");
-    Tensor* dW = ctx.Output<Tensor>("DFilter");
-    Tensor* dX = ctx.Output<Tensor>("DInput");
-    Tensor W = GET_DATA_SAFELY(ctx.Input<Tensor>("Filter"), "Input", "Filter",
-                               "GemmConvDoubleGrad");
-    if (!ddY && !dW && !dX) return;
-
-    const int groups = ctx.Attr<int>("groups");
-    const std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
-    std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
-    std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
-    const std::string padding_algorithm =
-        ctx.Attr<std::string>("padding_algorithm");
-    const std::string data_format = ctx.Attr<std::string>("data_format");
-
-    const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC");
-
-    // transform Tensor
-    Tensor transformed_X(X->dtype());
-    Tensor transformed_dY(dY->dtype());
-    Tensor transformed_ddX(X->dtype());
-
-    if (channel_last) {
-      ResizeToChannelFirst<DeviceContext, T>(ctx, X, &transformed_X);
-      TransToChannelFirst<DeviceContext, T>(ctx, X, &transformed_X);
-
-      ResizeToChannelFirst<DeviceContext, T>(ctx, dY, &transformed_dY);
-      TransToChannelFirst<DeviceContext, T>(ctx, dY, &transformed_dY);
-
-      if (ddX) {
-        ResizeToChannelFirst<DeviceContext, T>(ctx, ddX, &transformed_ddX);
-        TransToChannelFirst<DeviceContext, T>(ctx, ddX, &transformed_ddX);
-      }
-    } else {
-      transformed_X = *X;
-      transformed_dY = *dY;
-      if (ddX) {
-        transformed_ddX = *ddX;
-      }
-    }
-
-    // update padding and dilation
-    auto in_dims = transformed_X.dims();
-    auto filter_dims = W.dims();
-
-    framework::DDim in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size());
-    framework::DDim filter_data_dims =
-        phi::slice_ddim(filter_dims, 2, filter_dims.size());
-    std::vector<int> ksize = phi::vectorize<int>(filter_data_dims);
-    UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm,
-                             in_data_dims, strides, ksize);
-
-    const int batch_size = static_cast<int>(transformed_X.dims()[0]);
-    std::vector<int64_t> filter_shape_vec(phi::vectorize(W.dims()));
-    std::vector<int64_t> output_shape_vec(
-        phi::vectorize(transformed_dY.dims()));
-
-    size_t data_dim = filter_shape_vec.size() - 2;
-    std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
-    // col_shape [in_channel/group, kh, kw, oh, ow]
-    col_shape_vec[0] = transformed_X.dims()[1] / groups;
-    for (size_t j = 0; j < data_dim; ++j) {
-      col_shape_vec[j + 1] = filter_shape_vec[j + 2];
-      col_shape_vec[j + data_dim + 1] = output_shape_vec[j + 2];
-    }
-    framework::DDim col_shape(phi::make_ddim(col_shape_vec));
-    // col_matrix_shape [in_channel/group * kh * kw, oh * ow]
-    framework::DDim col_matrix_shape =
-        phi::flatten_to_2d(col_shape, data_dim + 1);
-    // input_shape [Cin, H, W]
-    framework::DDim input_shape =
-        phi::slice_ddim(transformed_X.dims(), 1, transformed_X.dims().size());
-    // filter_matrix_shape [Cout, Cin * kh * kw]
-    framework::DDim filter_matrix_shape = {W.dims()[0],
-                                           W.numel() / W.dims()[0]};
-
-    W.Resize(filter_matrix_shape);
-    framework::DDim output_matrix_shape = {
-        transformed_dY.dims()[1],
-        transformed_dY.numel() /
-            (transformed_dY.dims()[0] * transformed_dY.dims()[1])};
-    int in_step = static_cast<int>(transformed_X.dims()[1]) / groups;
-    int out_step = static_cast<int>(transformed_dY.dims()[1]) / groups;
-
-    bool is_expand = IsExpand(filter_shape_vec, strides, paddings, dilations);
-    Tensor col;
-    Tensor col_matrix;
-    if (is_expand) {
-      col = ctx.AllocateTmpTensor<T, DeviceContext>(col_shape, dev_ctx);
-      col_matrix.ShareDataWith(col);
-      col_matrix.Resize(col_matrix_shape);
-    }
-
-    phi::funcs::SetConstant<DeviceContext, T> set_zero;
-    auto blas = phi::funcs::GetBlas<DeviceContext, T>(dev_ctx);
-
-    // dx convolution double grad:  gemm + col2im(col2vol)
-    // dx = ddw * dy  ==> dx(N, Cin, H, W), ddw(Cout, Cin, kh, kw), dy(N, Cout,
-    // oH, oW)
-    if (dX && ddW_in) {
-      Tensor ddW;
-      ddW.ShareDataWith(*ddW_in).Resize(filter_matrix_shape);
-      dX->mutable_data<T>(ctx.GetPlace());
-
-      Tensor transformed_dX(dX->dtype());
-
-      if (channel_last) {
-        ResizeToChannelFirst<DeviceContext, T>(ctx, dX, &transformed_dX);
-
-      } else {
-        transformed_dX = *dX;
-      }
-      // if is_expand is false, the operation of set_zero is unnecessary
-      // because math::matmul will reset dx
-      if (is_expand) {
-        set_zero(dev_ctx, &transformed_dX, static_cast<T>(0));
-      }
-      math::Col2VolFunctor<DeviceContext, T> col2vol;
-      math::Col2ImFunctor<math::ColFormat::kCFO, DeviceContext, T> col2im;
-
-      for (int i = 0; i < batch_size; i++) {
-        Tensor dy_batch =
-            transformed_dY.Slice(i, i + 1).Resize(output_matrix_shape);
-        Tensor dx_batch = transformed_dX.Slice(i, i + 1).Resize(input_shape);
-        for (int g = 0; g < groups; g++) {
-          // gemm
-          Tensor dy_slice = dy_batch.Slice(g * out_step, (g + 1) * out_step);
-          Tensor ddw_slice = ddW.Slice(g * out_step, (g + 1) * out_step);
-          Tensor dx_slice = dx_batch.Slice(g * in_step, (g + 1) * in_step);
-          if (!is_expand) {
-            col_matrix.ShareDataWith(dx_slice);
-            col_matrix.Resize(col_matrix_shape);
-          }
-          blas.MatMul(ddw_slice, true, dy_slice, false, T(1.0), &col_matrix,
-                      T(0.0));
-
-          if (is_expand && data_dim == 2U) {
-            col2im(dev_ctx, col, dilations, strides,
-                   std::vector<int>{paddings[0], paddings[2], paddings[1],
-                                    paddings[3]},
-                   &dx_slice);
-          } else if (is_expand && data_dim == 3U) {
-            col2vol(dev_ctx, col, dilations, strides, paddings, &dx_slice);
-          }
-        }
-      }
-      if (channel_last) {
-        TransToChannelLast<DeviceContext, T>(ctx, &transformed_dX, dX);
-      }
-    }
-
-    // dw = ddx * dy  ==> dw(Cout, Cin, kh, kw), ddx(N, Cin, H, W), dy(N, Cout,
-    // oH, oW)
-    // dw convolution double grad:  im2col(vol2col) + gemm
-    if (dW && ddX) {
-      dW->mutable_data<T>(ctx.GetPlace());
-      set_zero(dev_ctx, dW, static_cast<T>(0));
-      Tensor dW_arr = *dW;
-      dW_arr.Resize(filter_matrix_shape);
-      math::Im2ColFunctor<math::ColFormat::kCFO, DeviceContext, T> im2col;
-      math::Vol2ColFunctor<DeviceContext, T> vol2col;
-      for (int i = 0; i < batch_size; ++i) {
-        Tensor dy_batch =
-            transformed_dY.Slice(i, i + 1).Resize(output_matrix_shape);
-        Tensor ddx_batch = transformed_ddX.Slice(i, i + 1).Resize(input_shape);
-        for (int g = 0; g < groups; ++g) {
-          // im2col
-          Tensor dy_slice = dy_batch.Slice(g * out_step, (g + 1) * out_step);
-          Tensor ddx_slice = ddx_batch.Slice(g * in_step, (g + 1) * in_step);
-          if (!is_expand) {
-            col.ShareDataWith(ddx_slice);
-            col_matrix.ShareDataWith(col);
-            col_matrix.Resize(col_matrix_shape);
-          } else if (data_dim == 2U) {
-            im2col(dev_ctx, ddx_slice, dilations, strides,
-                   std::vector<int>{paddings[0], paddings[2], paddings[1],
-                                    paddings[3]},
-                   &col);
-          } else if (data_dim == 3U) {
-            vol2col(dev_ctx, ddx_slice, dilations, strides, paddings, &col);
-          }
-
-          Tensor dw_slice = dW_arr.Slice(g * out_step, (g + 1) * out_step);
-          blas.MatMul(dy_slice, false, col_matrix, true, T(1.0), &dw_slice,
-                      T(1.0));
-        }
-      }
-    }
-
-    // ddy = w * ddx + x * ddw ==> ddy(N, Cout, oH, oW), x/ddx(N, Cin, H, W),
-    // w/ddw(Cout, Cin, kh, kw)
-    // ddy convolution double grad: im2col(vol2col) + gemm
-    if (ddY) {
-      ddY->mutable_data<T>(ctx.GetPlace());
-
-      Tensor transformed_ddY(ddY->dtype());
-      if (channel_last) {
-        ResizeToChannelFirst<DeviceContext, T>(ctx, ddY, &transformed_ddY);
-      } else {
-        transformed_ddY = *ddY;
-      }
-
-      set_zero(dev_ctx, &transformed_ddY, static_cast<T>(0));
-      math::Im2ColFunctor<math::ColFormat::kCFO, DeviceContext, T> im2col;
-      math::Vol2ColFunctor<DeviceContext, T> vol2col;
-      for (int i = 0; i < batch_size; ++i) {
-        Tensor ddy_batch =
-            transformed_ddY.Slice(i, i + 1).Resize(output_matrix_shape);
-        for (int g = 0; g < groups; ++g) {
-          // gemm
-          Tensor ddy_slice = ddy_batch.Slice(g * out_step, (g + 1) * out_step);
-
-          if (ddX) {
-            Tensor ddx_batch =
-                transformed_ddX.Slice(i, i + 1).Resize(input_shape);
-            Tensor ddx_slice = ddx_batch.Slice(g * in_step, (g + 1) * in_step);
-            if (!is_expand) {
-              col.ShareDataWith(ddx_slice);
-              col_matrix.ShareDataWith(col);
-              col_matrix.Resize(col_matrix_shape);
-            } else if (data_dim == 2U) {
-              im2col(dev_ctx, ddx_slice, dilations, strides,
-                     std::vector<int>{paddings[0], paddings[2], paddings[1],
-                                      paddings[3]},
-                     &col);
-            } else if (data_dim == 3U) {
-              vol2col(dev_ctx, ddx_slice, dilations, strides, paddings, &col);
-            }
-            Tensor w_slice = W.Slice(g * out_step, (g + 1) * out_step);
-            blas.MatMul(w_slice, false, col_matrix, false, T(1.0), &ddy_slice,
-                        T(0.0));
-          }
-
-          if (ddW_in) {
-            Tensor x_batch = transformed_X.Slice(i, i + 1).Resize(input_shape);
-            Tensor x_slice = x_batch.Slice(g * in_step, (g + 1) * in_step);
-
-            Tensor ddW;
-            ddW.ShareDataWith(*ddW_in).Resize(filter_matrix_shape);
-            if (!is_expand) {
-              col.ShareDataWith(x_slice);
-              col_matrix.ShareDataWith(col);
-              col_matrix.Resize(col_matrix_shape);
-            } else if (data_dim == 2U) {
-              im2col(dev_ctx, x_slice, dilations, strides,
-                     std::vector<int>{paddings[0], paddings[2], paddings[1],
-                                      paddings[3]},
-                     &col);
-            } else if (data_dim == 3U) {
-              vol2col(dev_ctx, x_slice, dilations, strides, paddings, &col);
-            }
-
-            // gemm
-            Tensor ddw_slice = ddW.Slice(g * out_step, (g + 1) * out_step);
-            blas.MatMul(ddw_slice, false, col_matrix, false, T(1.0), &ddy_slice,
-                        T(1.0));
-          }
-        }
-      }
-      if (channel_last) {
-        TransToChannelLast<DeviceContext, T>(ctx, &transformed_ddY, ddY);
-      }
-    }
-  }
-};
-
-template <typename DeviceContext, typename T>
-class DepthwiseConvKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    const Tensor* input = context.Input<Tensor>("Input");
-    Tensor filter = *context.Input<Tensor>("Filter");
-    Tensor* output = context.Output<Tensor>("Output");
-    output->mutable_data<T>(context.GetPlace());
-
-    const std::vector<int> strides = context.Attr<std::vector<int>>("strides");
-    std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
-    std::vector<int> dilations = context.Attr<std::vector<int>>("dilations");
-    bool fuse_relu = context.Attr<bool>("fuse_relu_before_depthwise_conv");
-
-    const std::string padding_algorithm =
-        context.Attr<std::string>("padding_algorithm");
-    const std::string data_format = context.Attr<std::string>("data_format");
-
-    const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC");
-    if (channel_last) {
-      PADDLE_ENFORCE_EQ(
-          output->dims()[output->dims().size() - 1] %
-              input->dims()[input->dims().size() - 1],
-          0, platform::errors::InvalidArgument(
-                 "ShapeError: The output channels must be a multiple of the "
-                 "input channels. But receivced output channel number is %d "
-                 "and input channel number is %d",
-                 output->dims()[output->dims().size() - 1],
-                 input->dims()[input->dims().size() - 1]));
-    } else {
-      PADDLE_ENFORCE_EQ(
-          output->dims()[1] % input->dims()[1], 0,
-          platform::errors::InvalidArgument(
-              "ShapeError: The output channels must be a multiple of the "
-              "input channels. But receivced output channel number is %d "
-              "and input channel number is %d",
-              output->dims()[1], input->dims()[1]));
-    }
-
-    // update padding and dilation
-    auto in_dims = input->dims();
-    auto filter_dims = filter.dims();
-
-    framework::DDim in_data_dims;
-    const framework::DataLayout data_layout =
-        framework::StringToDataLayout(data_format);
-    if (data_layout != framework::DataLayout::kNHWC) {
-      in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size());
-    } else {
-      in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1);
-    }
-
-    framework::DDim filter_data_dims =
-        phi::slice_ddim(filter_dims, 2, filter_dims.size());
-    std::vector<int> ksize = phi::vectorize<int>(filter_data_dims);
-    UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm,
-                             in_data_dims, strides, ksize);
-
-    bool is_sys_pad = strides.size() * 2 == paddings.size() ? false : true;
-    if (!is_sys_pad) {
-      for (size_t i = 0; i < strides.size(); ++i) {
-        paddings.erase(paddings.begin() + i + 1);
-      }
-    }
-
-    auto& dev_ctx = context.template device_context<DeviceContext>();
-
-    if (fuse_relu) {
-      math::DepthwiseConvFunctor<DeviceContext, T, true> depthwiseConv;
-      depthwiseConv(dev_ctx, *input, filter, strides, paddings, dilations,
-                    output, data_layout);
-    } else {
-      math::DepthwiseConvFunctor<DeviceContext, T, false> depthwiseConv;
-      depthwiseConv(dev_ctx, *input, filter, strides, paddings, dilations,
-                    output, data_layout);
-    }
-  }
-};
-
-template <typename DeviceContext, typename T>
-class DepthwiseConvGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    const Tensor* input = context.Input<Tensor>("Input");
-    const Tensor* output_grad =
-        context.Input<Tensor>(framework::GradVarName("Output"));
-    Tensor* input_grad =
-        context.Output<Tensor>(framework::GradVarName("Input"));
-    Tensor* filter_grad =
-        context.Output<Tensor>(framework::GradVarName("Filter"));
-    Tensor filter = *context.Input<Tensor>("Filter");
-
-    if (!input_grad && !filter_grad) return;
-
-    std::vector<int> strides = context.Attr<std::vector<int>>("strides");
-    std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
-    std::vector<int> dilations = context.Attr<std::vector<int>>("dilations");
-    bool fuse_relu = context.Attr<bool>("fuse_relu_before_depthwise_conv");
-    const std::string padding_algorithm =
-        context.Attr<std::string>("padding_algorithm");
-    const std::string data_format = context.Attr<std::string>("data_format");
-
-    // update padding and dilation
-    auto in_dims = input->dims();
-    auto filter_dims = filter.dims();
-
-    framework::DDim in_data_dims;
-    const framework::DataLayout data_layout =
-        framework::StringToDataLayout(data_format);
-    if (data_layout != framework::DataLayout::kNHWC) {
-      in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size());
-    } else {
-      in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1);
-    }
-    framework::DDim filter_data_dims =
-        phi::slice_ddim(filter_dims, 2, filter_dims.size());
-    std::vector<int> ksize = phi::vectorize<int>(filter_data_dims);
-    UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm,
-                             in_data_dims, strides, ksize);
-
-    bool is_sys_pad = strides.size() * 2 == paddings.size() ? false : true;
-    if (!is_sys_pad) {
-      for (size_t i = 0; i < strides.size(); ++i) {
-        paddings.erase(paddings.begin() + i + 1);
-      }
-    }
-    phi::funcs::SetConstant<DeviceContext, T> set_zero;
-    auto& dev_ctx = context.template device_context<DeviceContext>();
-
-    if (input_grad) {
-      input_grad->mutable_data<T>(context.GetPlace());
-      set_zero(dev_ctx, input_grad, static_cast<T>(0));
-
-      if (fuse_relu) {
-        math::DepthwiseConvInputGradFunctor<DeviceContext, T, true>
-            depthwiseConvInputGrad;
-        depthwiseConvInputGrad(dev_ctx, *input, filter, *output_grad, strides,
-                               paddings, dilations, input_grad, data_layout);
-      } else {
-        math::DepthwiseConvInputGradFunctor<DeviceContext, T, false>
-            depthwiseConvInputGrad;
-        depthwiseConvInputGrad(dev_ctx, *input, filter, *output_grad, strides,
-                               paddings, dilations, input_grad, data_layout);
-      }
-    }
-
-    if (filter_grad) {
-      filter_grad->mutable_data<T>(context.GetPlace());
-      set_zero(dev_ctx, filter_grad, static_cast<T>(0));
-      if (fuse_relu) {
-        math::DepthwiseConvFilterGradFunctor<DeviceContext, T, true>
-            depthwiseConvFilterGrad;
-        depthwiseConvFilterGrad(dev_ctx, *input, *output_grad, strides,
-                                paddings, dilations, filter_grad, data_layout);
-      } else {
-        math::DepthwiseConvFilterGradFunctor<DeviceContext, T, false>
-            depthwiseConvFilterGrad;
-        depthwiseConvFilterGrad(dev_ctx, *input, *output_grad, strides,
-                                paddings, dilations, filter_grad, data_layout);
-      }
-    }
-  }
-};
-
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/conv_transpose_cudnn_op.cu b/paddle/fluid/operators/conv_transpose_cudnn_op.cu
index 4b8f9d7e6ca8d2f1dae99f1d034c53daf948f922..1841b78af32dd95d6884d5eb78ad30322ba7723e 100644
--- a/paddle/fluid/operators/conv_transpose_cudnn_op.cu
+++ b/paddle/fluid/operators/conv_transpose_cudnn_op.cu
@@ -21,8 +21,8 @@ limitations under the License. */
 #include "paddle/fluid/operators/conv_cudnn_helper.h"
 #endif
 #include "paddle/fluid/operators/conv_transpose_op.h"
-#include "paddle/fluid/operators/math/padding.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
+#include "paddle/phi/kernels/funcs/padding.h"
 
 namespace paddle {
 namespace operators {
@@ -108,7 +108,7 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel<T> {
                              in_data_dims, strides, ksize);
 
     int data_dim = strides.size();  // 2d or 3d
-    bool is_sys_pad = math::IsSymmetricPadding(paddings, data_dim);
+    bool is_sys_pad = phi::funcs::IsSymmetricPadding(paddings, data_dim);
 
     std::vector<int> input_pad(input_transpose.dims().size() * 2, 0);
     Tensor transformed_input;
@@ -139,12 +139,14 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel<T> {
       T pad_value(0.0);
       switch (rank) {
         case 4: {
-          math::PadFunction<paddle::platform::CUDADeviceContext, T, 4>(
-              ctx, input_pad, input_transpose, pad_value, &transformed_input);
+          phi::funcs::PadFunction<paddle::platform::CUDADeviceContext, T, 4>(
+              dev_ctx, input_pad, input_transpose, pad_value,
+              &transformed_input);
         } break;
         case 5: {
-          math::PadFunction<paddle::platform::CUDADeviceContext, T, 5>(
-              ctx, input_pad, input_transpose, pad_value, &transformed_input);
+          phi::funcs::PadFunction<paddle::platform::CUDADeviceContext, T, 5>(
+              dev_ctx, input_pad, input_transpose, pad_value,
+              &transformed_input);
         } break;
         default:
           PADDLE_THROW(platform::errors::InvalidArgument(
@@ -242,10 +244,14 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel<T> {
 #ifdef PADDLE_WITH_HIP
     using search = SearchAlgorithm<miopenConvBwdDataAlgorithm_t>;
     workspace_size = std::max(workspace_size, search::GetWorkspaceSize(args));
-    algo = search::Find<T>(args, false, deterministic, workspace_size, ctx);
+    algo = search::Find<T>(
+        args, false, deterministic, workspace_size,
+        ctx.template device_context<platform::CUDADeviceContext>());
 #else
     using search = SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>;
-    algo = search::Find<T>(args, false, deterministic, ctx);
+    algo = search::Find<T>(
+        args, false, deterministic,
+        ctx.template device_context<platform::CUDADeviceContext>());
     workspace_size =
         std::max(workspace_size, search::GetWorkspaceSize(args, algo));
 #endif
@@ -375,7 +381,7 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
                              in_data_dims, strides, ksize);
 
     int data_dim = strides.size();  // 2d or 3d
-    bool is_sys_pad = math::IsSymmetricPadding(paddings, data_dim);
+    bool is_sys_pad = phi::funcs::IsSymmetricPadding(paddings, data_dim);
 
     std::vector<int> input_pad(input_transpose.dims().size() * 2, 0);
     Tensor transformed_output_grad;
@@ -407,13 +413,13 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
       T pad_value(0.0);
       switch (rank) {
         case 4: {
-          math::PadFunction<paddle::platform::CUDADeviceContext, T, 4>(
-              ctx, input_pad, output_grad_transpose, pad_value,
+          phi::funcs::PadFunction<paddle::platform::CUDADeviceContext, T, 4>(
+              dev_ctx, input_pad, output_grad_transpose, pad_value,
               &transformed_output_grad);
         } break;
         case 5: {
-          math::PadFunction<paddle::platform::CUDADeviceContext, T, 5>(
-              ctx, input_pad, output_grad_transpose, pad_value,
+          phi::funcs::PadFunction<paddle::platform::CUDADeviceContext, T, 5>(
+              dev_ctx, input_pad, output_grad_transpose, pad_value,
               &transformed_output_grad);
         } break;
         default:
@@ -499,11 +505,14 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
       using search1 = SearchAlgorithm<miopenConvFwdAlgorithm_t>;
       workspace_size =
           std::max(workspace_size, search1::GetWorkspaceSize(args1));
-      data_algo =
-          search1::Find<T>(args1, false, deterministic, workspace_size, ctx);
+      data_algo = search1::Find<T>(
+          args1, false, deterministic, workspace_size,
+          ctx.template device_context<platform::CUDADeviceContext>());
 #else
       using search1 = SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>;
-      data_algo = search1::Find<T>(args1, false, deterministic, ctx);
+      data_algo = search1::Find<T>(
+          args1, false, deterministic,
+          ctx.template device_context<platform::CUDADeviceContext>());
       workspace_size =
           std::max(workspace_size, search1::GetWorkspaceSize(args1, data_algo));
 #endif
@@ -521,11 +530,14 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
       using search2 = SearchAlgorithm<miopenConvBwdWeightsAlgorithm_t>;
       workspace_size =
           std::max(workspace_size, search2::GetWorkspaceSize(args2));
-      filter_algo =
-          search2::Find<T>(args2, false, deterministic, workspace_size, ctx);
+      filter_algo = search2::Find<T>(
+          args2, false, deterministic, workspace_size,
+          ctx.template device_context<platform::CUDADeviceContext>());
 #else
       using search2 = SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t>;
-      filter_algo = search2::Find<T>(args2, false, deterministic, ctx);
+      filter_algo = search2::Find<T>(
+          args2, false, deterministic,
+          ctx.template device_context<platform::CUDADeviceContext>());
       workspace_size = std::max(workspace_size,
                                 search2::GetWorkspaceSize(args2, filter_algo));
 #endif
@@ -735,7 +747,7 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel<T> {
                              in_data_dims, strides, ksize);
 
     int data_dim = strides.size();  // 2d or 3d
-    bool is_sys_pad = math::IsSymmetricPadding(paddings, data_dim);
+    bool is_sys_pad = phi::funcs::IsSymmetricPadding(paddings, data_dim);
     Tensor transformed_X(X->type());
     Tensor transformed_ddX(X->type());
 
@@ -794,26 +806,28 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel<T> {
       T pad_value(0.0);
       switch (rank) {
         case 4: {
-          math::PadFunction<paddle::platform::CUDADeviceContext, T, 4>(
-              ctx, input_pad, transformed_X_channel, pad_value, &transformed_X);
+          phi::funcs::PadFunction<paddle::platform::CUDADeviceContext, T, 4>(
+              dev_ctx, input_pad, transformed_X_channel, pad_value,
+              &transformed_X);
           if (dO) {
-            math::PadFunction<paddle::platform::CUDADeviceContext, T, 4>(
-                ctx, input_pad, transformed_dO_channel, pad_value,
+            phi::funcs::PadFunction<paddle::platform::CUDADeviceContext, T, 4>(
+                dev_ctx, input_pad, transformed_dO_channel, pad_value,
                 &transformed_dO);
           }
 
           if (ddX) {
-            math::PadFunction<paddle::platform::CUDADeviceContext, T, 4>(
-                ctx, input_pad, transformed_ddX_channel, pad_value,
+            phi::funcs::PadFunction<paddle::platform::CUDADeviceContext, T, 4>(
+                dev_ctx, input_pad, transformed_ddX_channel, pad_value,
                 &transformed_ddX);
           }
         } break;
         case 5: {
-          math::PadFunction<paddle::platform::CUDADeviceContext, T, 5>(
-              ctx, input_pad, transformed_X_channel, pad_value, &transformed_X);
+          phi::funcs::PadFunction<paddle::platform::CUDADeviceContext, T, 5>(
+              dev_ctx, input_pad, transformed_X_channel, pad_value,
+              &transformed_X);
           if (ddX) {
-            math::PadFunction<paddle::platform::CUDADeviceContext, T, 5>(
-                ctx, input_pad, transformed_ddX_channel, pad_value,
+            phi::funcs::PadFunction<paddle::platform::CUDADeviceContext, T, 5>(
+                dev_ctx, input_pad, transformed_ddX_channel, pad_value,
                 &transformed_ddX);
           }
         } break;
@@ -940,11 +954,14 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel<T> {
 #ifdef PADDLE_WITH_HIP
         using search1 = SearchAlgorithm<miopenConvBwdDataAlgorithm_t>;
         workspace_size = search1::GetWorkspaceSize(args1);
-        bwd_algo1 =
-            search1::Find<T>(args1, false, deterministic, workspace_size, ctx);
+        bwd_algo1 = search1::Find<T>(
+            args1, false, deterministic, workspace_size,
+            ctx.template device_context<platform::CUDADeviceContext>());
 #else
         using search1 = SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>;
-        bwd_algo1 = search1::Find<T>(args1, false, deterministic, ctx);
+        bwd_algo1 = search1::Find<T>(
+            args1, false, deterministic,
+            ctx.template device_context<platform::CUDADeviceContext>());
         workspace_size = search1::GetWorkspaceSize(args1, bwd_algo1);
 #endif
       }
@@ -961,11 +978,14 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel<T> {
         using search2 = SearchAlgorithm<miopenConvBwdDataAlgorithm_t>;
         workspace_size =
             std::max(workspace_size, search2::GetWorkspaceSize(args2));
-        bwd_algo2 =
-            search2::Find<T>(args2, false, deterministic, workspace_size, ctx);
+        bwd_algo2 = search2::Find<T>(
+            args2, false, deterministic, workspace_size,
+            ctx.template device_context<platform::CUDADeviceContext>());
 #else
         using search2 = SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>;
-        bwd_algo2 = search2::Find<T>(args2, false, deterministic, ctx);
+        bwd_algo2 = search2::Find<T>(
+            args2, false, deterministic,
+            ctx.template device_context<platform::CUDADeviceContext>());
         workspace_size = std::max(workspace_size,
                                   search2::GetWorkspaceSize(args2, bwd_algo2));
 #endif
@@ -986,11 +1006,14 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel<T> {
       using search3 = SearchAlgorithm<miopenConvBwdWeightsAlgorithm_t>;
       workspace_size =
           std::max(workspace_size, search3::GetWorkspaceSize(args3));
-      filter_algo =
-          search3::Find<T>(args3, false, deterministic, workspace_size, ctx);
+      filter_algo = search3::Find<T>(
+          args3, false, deterministic, workspace_size,
+          ctx.template device_context<platform::CUDADeviceContext>());
 #else
       using search3 = SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t>;
-      filter_algo = search3::Find<T>(args3, false, deterministic, ctx);
+      filter_algo = search3::Find<T>(
+          args3, false, deterministic,
+          ctx.template device_context<platform::CUDADeviceContext>());
       workspace_size = std::max(workspace_size,
                                 search3::GetWorkspaceSize(args3, filter_algo));
 #endif
@@ -1009,11 +1032,14 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel<T> {
       using search4 = SearchAlgorithm<miopenConvFwdAlgorithm_t>;
       workspace_size =
           std::max(workspace_size, search4::GetWorkspaceSize(args4));
-      data_algo =
-          search4::Find<T>(args4, false, deterministic, workspace_size, ctx);
+      data_algo = search4::Find<T>(
+          args4, false, deterministic, workspace_size,
+          ctx.template device_context<platform::CUDADeviceContext>());
 #else
       using search4 = SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>;
-      data_algo = search4::Find<T>(args4, false, deterministic, ctx);
+      data_algo = search4::Find<T>(
+          args4, false, deterministic,
+          ctx.template device_context<platform::CUDADeviceContext>());
       workspace_size =
           std::max(workspace_size, search4::GetWorkspaceSize(args4, data_algo));
 #endif
diff --git a/paddle/fluid/operators/conv_transpose_op.cu b/paddle/fluid/operators/conv_transpose_op.cu
index b2a4910222f1178d23e94eade9580248bb103c88..054cb4b33895b02a816cc2bff82b1c9052bc645d 100644
--- a/paddle/fluid/operators/conv_transpose_op.cu
+++ b/paddle/fluid/operators/conv_transpose_op.cu
@@ -13,10 +13,150 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/conv_transpose_op.h"
+#include "paddle/phi/kernels/gpu/depthwise_conv.h"
 
 namespace ops = paddle::operators;
 using CUDA = paddle::platform::CUDADeviceContext;
 
+namespace paddle {
+namespace operators {
+using Tensor = framework::Tensor;
+using DDim = framework::DDim;
+
+template <typename DeviceContext, typename T>
+class DepthwiseConvTransposeKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    const std::string data_layout_str =
+        context.Attr<std::string>("data_format");
+    const framework::DataLayout data_layout =
+        framework::StringToDataLayout(data_layout_str);
+    const Tensor* input = context.Input<Tensor>("Input");
+    Tensor filter = *context.Input<Tensor>("Filter");
+    Tensor* output = context.Output<Tensor>("Output");
+    output->mutable_data<T>(context.GetPlace());
+
+    int groups = context.Attr<int>("groups");
+    PADDLE_ENFORCE_EQ(
+        groups, filter.dims()[0],
+        platform::errors::InvalidArgument(
+            "groups should be error to the 1st dimension of filter. But "
+            "received groups is %d and filter dimension[0] is %d",
+            groups, filter.dims()[0]));
+
+    std::vector<int> strides = context.Attr<std::vector<int>>("strides");
+    std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
+    std::vector<int> dilations = context.Attr<std::vector<int>>("dilations");
+    std::string padding_algorithm =
+        context.Attr<std::string>("padding_algorithm");
+    for (auto v : dilations) {
+      PADDLE_ENFORCE_EQ(v, 1, platform::errors::InvalidArgument(
+                                  "dilations should be 1 in depthwise conv. "
+                                  "But received dilations is %d",
+                                  v));
+    }
+
+    auto in_dims = input->dims();
+    auto filter_dims = filter.dims();
+
+    framework::DDim in_data_dims;
+    if (data_layout != framework::DataLayout::kNHWC) {
+      in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size());
+    } else {
+      in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1);
+    }
+    framework::DDim filter_data_dims =
+        phi::slice_ddim(filter_dims, 2, filter_dims.size());
+    std::vector<int> ksize = phi::vectorize<int>(filter_data_dims);
+    UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm,
+                             in_data_dims, strides, ksize);
+
+    output->mutable_data<T>(context.GetPlace());
+    auto& dev_ctx = context.template device_context<DeviceContext>();
+    phi::funcs::SetConstant<DeviceContext, T> set_zero;
+    set_zero(dev_ctx, output, static_cast<T>(0));
+
+    math::DepthwiseConvInputGradFunctor<phi::GPUContext, T>
+        depthwiseConvInputGrad;
+    depthwiseConvInputGrad(
+        static_cast<const typename framework::ConvertToPhiContext<
+            DeviceContext>::TYPE&>(dev_ctx),
+        *output, filter, *input, strides,
+        std::vector<int>{paddings[0], paddings[2], paddings[1], paddings[3]},
+        dilations, output, data_layout);
+  }
+};
+
+template <typename DeviceContext, typename T>
+class DepthwiseConvTransposeGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    const std::string data_layout_str =
+        context.Attr<std::string>("data_format");
+    const framework::DataLayout data_layout =
+        framework::StringToDataLayout(data_layout_str);
+    const Tensor* input = context.Input<Tensor>("Input");
+    const Tensor* output_grad =
+        context.Input<Tensor>(framework::GradVarName("Output"));
+    Tensor* input_grad =
+        context.Output<Tensor>(framework::GradVarName("Input"));
+    Tensor* filter_grad =
+        context.Output<Tensor>(framework::GradVarName("Filter"));
+    Tensor filter = *context.Input<Tensor>("Filter");
+
+    if (!input_grad && !filter_grad) return;
+
+    auto& dev_ctx = context.template device_context<DeviceContext>();
+    std::vector<int> strides = context.Attr<std::vector<int>>("strides");
+    std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
+    std::vector<int> dilations = context.Attr<std::vector<int>>("dilations");
+    std::string padding_algorithm =
+        context.Attr<std::string>("padding_algorithm");
+
+    auto in_dims = input->dims();
+    auto filter_dims = filter.dims();
+
+    framework::DDim in_data_dims;
+    if (data_layout != framework::DataLayout::kNHWC) {
+      in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size());
+    } else {
+      in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1);
+    }
+    framework::DDim filter_data_dims =
+        phi::slice_ddim(filter_dims, 2, filter_dims.size());
+    std::vector<int> ksize = phi::vectorize<int>(filter_data_dims);
+    UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm,
+                             in_data_dims, strides, ksize);
+
+    if (input_grad) {
+      math::DepthwiseConvFunctor<phi::GPUContext, T> depthwiseConv;
+      depthwiseConv(
+          static_cast<const typename framework::ConvertToPhiContext<
+              DeviceContext>::TYPE&>(dev_ctx),
+          *output_grad, filter, strides,
+          std::vector<int>{paddings[0], paddings[2], paddings[1], paddings[3]},
+          dilations, input_grad, data_layout);
+    }
+
+    if (filter_grad) {
+      phi::funcs::SetConstant<DeviceContext, T> set_zero;
+      filter_grad->mutable_data<T>(context.GetPlace());
+      set_zero(dev_ctx, filter_grad, static_cast<T>(0));
+
+      math::DepthwiseConvFilterGradFunctor<phi::GPUContext, T>
+          depthwiseConvFilterGrad;
+      depthwiseConvFilterGrad(
+          static_cast<const typename framework::ConvertToPhiContext<
+              DeviceContext>::TYPE&>(dev_ctx),
+          *output_grad, *input, strides,
+          std::vector<int>{paddings[0], paddings[2], paddings[1], paddings[3]},
+          dilations, filter_grad, data_layout);
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
 // conv2d
 REGISTER_OP_CUDA_KERNEL(conv2d_transpose,
                         ops::GemmConvTransposeKernel<CUDA, float>,
diff --git a/paddle/fluid/operators/conv_transpose_op.h b/paddle/fluid/operators/conv_transpose_op.h
index 76d6ad6bf2ff7361a90fb6f013f989db5a2b8845..ee0fb7ab3683364f6db3cffd7ddef67c61f19433 100644
--- a/paddle/fluid/operators/conv_transpose_op.h
+++ b/paddle/fluid/operators/conv_transpose_op.h
@@ -21,7 +21,6 @@ limitations under the License. */
 #include "paddle/fluid/operators/conv_op.h"
 #include "paddle/fluid/operators/eigen/eigen_function.h"
 #include "paddle/fluid/operators/math/concat_and_split.h"
-#include "paddle/fluid/operators/math/depthwise_conv.h"
 #include "paddle/fluid/operators/math/im2col.h"
 #include "paddle/fluid/operators/math/vol2col.h"
 #include "paddle/phi/kernels/funcs/blas/blas.h"
@@ -578,130 +577,5 @@ class GemmConvTransposeGradKernel : public framework::OpKernel<T> {
   }
 };
 
-template <typename DeviceContext, typename T>
-class DepthwiseConvTransposeKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    const std::string data_layout_str =
-        context.Attr<std::string>("data_format");
-    const framework::DataLayout data_layout =
-        framework::StringToDataLayout(data_layout_str);
-    const Tensor* input = context.Input<Tensor>("Input");
-    Tensor filter = *context.Input<Tensor>("Filter");
-    Tensor* output = context.Output<Tensor>("Output");
-    output->mutable_data<T>(context.GetPlace());
-
-    int groups = context.Attr<int>("groups");
-    PADDLE_ENFORCE_EQ(
-        groups, filter.dims()[0],
-        platform::errors::InvalidArgument(
-            "groups should be error to the 1st dimension of filter. But "
-            "received groups is %d and filter dimension[0] is %d",
-            groups, filter.dims()[0]));
-
-    std::vector<int> strides = context.Attr<std::vector<int>>("strides");
-    std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
-    std::vector<int> dilations = context.Attr<std::vector<int>>("dilations");
-    std::string padding_algorithm =
-        context.Attr<std::string>("padding_algorithm");
-    for (auto v : dilations) {
-      PADDLE_ENFORCE_EQ(v, 1, platform::errors::InvalidArgument(
-                                  "dilations should be 1 in depthwise conv. "
-                                  "But received dilations is %d",
-                                  v));
-    }
-
-    auto in_dims = input->dims();
-    auto filter_dims = filter.dims();
-
-    framework::DDim in_data_dims;
-    if (data_layout != framework::DataLayout::kNHWC) {
-      in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size());
-    } else {
-      in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1);
-    }
-    framework::DDim filter_data_dims =
-        phi::slice_ddim(filter_dims, 2, filter_dims.size());
-    std::vector<int> ksize = phi::vectorize<int>(filter_data_dims);
-    UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm,
-                             in_data_dims, strides, ksize);
-
-    output->mutable_data<T>(context.GetPlace());
-    auto& dev_ctx = context.template device_context<DeviceContext>();
-    phi::funcs::SetConstant<DeviceContext, T> set_zero;
-    set_zero(dev_ctx, output, static_cast<T>(0));
-
-    math::DepthwiseConvInputGradFunctor<DeviceContext, T>
-        depthwiseConvInputGrad;
-    depthwiseConvInputGrad(
-        dev_ctx, *output, filter, *input, strides,
-        std::vector<int>{paddings[0], paddings[2], paddings[1], paddings[3]},
-        dilations, output, data_layout);
-  }
-};
-
-template <typename DeviceContext, typename T>
-class DepthwiseConvTransposeGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    const std::string data_layout_str =
-        context.Attr<std::string>("data_format");
-    const framework::DataLayout data_layout =
-        framework::StringToDataLayout(data_layout_str);
-    const Tensor* input = context.Input<Tensor>("Input");
-    const Tensor* output_grad =
-        context.Input<Tensor>(framework::GradVarName("Output"));
-    Tensor* input_grad =
-        context.Output<Tensor>(framework::GradVarName("Input"));
-    Tensor* filter_grad =
-        context.Output<Tensor>(framework::GradVarName("Filter"));
-    Tensor filter = *context.Input<Tensor>("Filter");
-
-    if (!input_grad && !filter_grad) return;
-
-    auto& dev_ctx = context.template device_context<DeviceContext>();
-    std::vector<int> strides = context.Attr<std::vector<int>>("strides");
-    std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
-    std::vector<int> dilations = context.Attr<std::vector<int>>("dilations");
-    std::string padding_algorithm =
-        context.Attr<std::string>("padding_algorithm");
-
-    auto in_dims = input->dims();
-    auto filter_dims = filter.dims();
-
-    framework::DDim in_data_dims;
-    if (data_layout != framework::DataLayout::kNHWC) {
-      in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size());
-    } else {
-      in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1);
-    }
-    framework::DDim filter_data_dims =
-        phi::slice_ddim(filter_dims, 2, filter_dims.size());
-    std::vector<int> ksize = phi::vectorize<int>(filter_data_dims);
-    UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm,
-                             in_data_dims, strides, ksize);
-
-    if (input_grad) {
-      math::DepthwiseConvFunctor<DeviceContext, T> depthwiseConv;
-      depthwiseConv(
-          dev_ctx, *output_grad, filter, strides,
-          std::vector<int>{paddings[0], paddings[2], paddings[1], paddings[3]},
-          dilations, input_grad, data_layout);
-    }
-
-    if (filter_grad) {
-      phi::funcs::SetConstant<DeviceContext, T> set_zero;
-      filter_grad->mutable_data<T>(context.GetPlace());
-      set_zero(dev_ctx, filter_grad, static_cast<T>(0));
-
-      math::DepthwiseConvFilterGradFunctor<DeviceContext, T>
-          depthwiseConvFilterGrad;
-      depthwiseConvFilterGrad(
-          dev_ctx, *output_grad, *input, strides,
-          std::vector<int>{paddings[0], paddings[2], paddings[1], paddings[3]},
-          dilations, filter_grad, data_layout);
-    }
-  }
-};
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/cross_op.cc b/paddle/fluid/operators/cross_op.cc
index fe00ee06603f0ecf2e3fa6ac367303a70702508f..674b75625d1983ba97f3d47ee154beff79c42dad 100644
--- a/paddle/fluid/operators/cross_op.cc
+++ b/paddle/fluid/operators/cross_op.cc
@@ -109,8 +109,8 @@ class CrossGradMaker : public framework::SingleGradOpMaker<T> {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-DELCARE_INFER_SHAPE_FUNCTOR(cross, CrossInferShapeFunctor,
-                            PT_INFER_META(phi::CrossInferMeta));
+DECLARE_INFER_SHAPE_FUNCTOR(cross, CrossInferShapeFunctor,
+                            PD_INFER_META(phi::CrossInferMeta));
 REGISTER_OPERATOR(cross, ops::CrossOp, ops::CrossOpMaker,
                   ops::CrossGradMaker<paddle::framework::OpDesc>,
                   ops::CrossGradMaker<paddle::imperative::OpBase>,
diff --git a/paddle/fluid/operators/cum_op.h b/paddle/fluid/operators/cum_op.h
deleted file mode 100644
index ab3860ecafc3569c13b0b9e5c882df9ddc03e190..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/cum_op.h
+++ /dev/null
@@ -1,115 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <array>
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/operator.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename DeviceContext, typename Functor>
-class CumKernel : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
- public:
-  using T = typename Functor::ELEMENT_TYPE;
-
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto& X = GET_DATA_SAFELY(context.Input<framework::Tensor>("X"), "Input",
-                              "X", "Cum");
-
-    auto& Out = GET_DATA_SAFELY(context.Output<framework::Tensor>("Out"),
-                                "Output", "Out", "Cum");
-    int axis = context.Attr<int>("axis");
-    bool exclusive = context.Attr<bool>("exclusive");
-    bool reverse = context.Attr<bool>("reverse");
-    auto out_dims = Out.dims();
-
-    PADDLE_ENFORCE_EQ(
-        axis < out_dims.size() && axis >= (0 - out_dims.size()), true,
-        platform::errors::OutOfRange(
-            "Attr(axis) is out of range, It's expected "
-            "to be in range of [-%d, %d]. But received Attr(axis) = %d.",
-            out_dims.size(), out_dims.size() - 1, axis));
-    if (axis < 0) {
-      axis += out_dims.size();
-    }
-
-    Out.template mutable_data<T>(context.GetPlace());
-
-    int pre = 1;
-    int post = 1;
-    int mid = out_dims[axis];
-    for (int i = 0; i < axis; ++i) {
-      pre *= out_dims[i];
-    }
-    for (int i = axis + 1; i < out_dims.size(); ++i) {
-      post *= out_dims[i];
-    }
-
-    auto x = framework::EigenVector<T>::Flatten(X);
-    auto out = framework::EigenVector<T>::Flatten(Out);
-    auto* place =
-        context.template device_context<DeviceContext>().eigen_device();
-
-    using IndexT = Eigen::DenseIndex;
-    if (pre == 1) {
-      if (post == 1) {
-        ComputeImp(*place, Eigen::DSizes<IndexT, 1>(mid), x, out,
-                   /* axis= */ 0, reverse, exclusive);
-      } else {
-        ComputeImp(*place, Eigen::DSizes<IndexT, 2>(mid, post), x, out,
-                   /* axis= */ 0, reverse, exclusive);
-      }
-    } else {
-      if (post == 1) {
-        ComputeImp(*place, Eigen::DSizes<IndexT, 2>(pre, mid), x, out,
-                   /* axis= */ 1, reverse, exclusive);
-      } else {
-        ComputeImp(*place, Eigen::DSizes<IndexT, 3>(pre, mid, post), x, out,
-                   /* axis= */ 1, reverse, exclusive);
-      }
-    }
-  }
-
- private:
-  template <typename Device, typename Dim, typename X, typename Out>
-  void ComputeImp(Device d, const Dim& dims, X x, Out out, int axis,
-                  bool reverse, bool exclusive) const {
-    if (!reverse) {
-      out.reshape(dims).device(d) = Functor()(x.reshape(dims), axis, exclusive);
-    } else {
-      std::array<bool, Dim::count> rev;
-      rev.fill(false);
-      rev[axis] = reverse;
-      out.reshape(dims).device(d) =
-          Functor()(x.reshape(dims).reverse(rev), axis, exclusive).reverse(rev);
-    }
-  }
-};
-
-template <typename T>
-struct CumsumFunctor {
-  using ELEMENT_TYPE = T;
-  template <typename X>
-  const typename X::TensorScanSumOp operator()(X x, int axis,
-                                               bool exclusive) const {
-    return x.cumsum(axis, exclusive);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/cumprod_op.cc b/paddle/fluid/operators/cumprod_op.cc
index bff6673429d9a4088c65f9dc02c1546f23d96878..90910bbbb2050bad85d10e0467a099c42030c084 100644
--- a/paddle/fluid/operators/cumprod_op.cc
+++ b/paddle/fluid/operators/cumprod_op.cc
@@ -12,7 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/operators/cumprod_op.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/operator.h"
 
 namespace paddle {
 namespace operators {
@@ -87,16 +88,3 @@ REGISTER_OPERATOR(cumprod, ops::CumprodOp, ops::CumprodOpMaker,
                   ops::CumprodGradOpMaker<paddle::imperative::OpBase>);
 
 REGISTER_OPERATOR(cumprod_grad, ops::CumprodGradOp);
-
-REGISTER_OP_CPU_KERNEL(
-    cumprod, ops::CumprodOpCPUKernel<float>, ops::CumprodOpCPUKernel<double>,
-    ops::CumprodOpCPUKernel<int>, ops::CumprodOpCPUKernel<int64_t>,
-    ops::CumprodOpCPUKernel<paddle::platform::complex<float>>,
-    ops::CumprodOpCPUKernel<paddle::platform::complex<double>>);
-
-REGISTER_OP_CPU_KERNEL(
-    cumprod_grad, ops::CumprodGradOpCPUKernel<float>,
-    ops::CumprodGradOpCPUKernel<double>, ops::CumprodGradOpCPUKernel<int>,
-    ops::CumprodGradOpCPUKernel<int64_t>,
-    ops::CumprodGradOpCPUKernel<paddle::platform::complex<float>>,
-    ops::CumprodGradOpCPUKernel<paddle::platform::complex<double>>);
diff --git a/paddle/fluid/operators/cumprod_op.cu b/paddle/fluid/operators/cumprod_op.cu
deleted file mode 100644
index f792d6832917f52573dce7ee3e449c2f4be63584..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/cumprod_op.cu
+++ /dev/null
@@ -1,369 +0,0 @@
-// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <thrust/transform.h>
-#include "paddle/fluid/operators/cumprod_op.h"
-#include "paddle/fluid/operators/math/inclusive_scan.h"
-#include "paddle/fluid/platform/for_range.h"
-#include "paddle/phi/kernels/funcs/complex_functors.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T>
-struct MultiplyFunctor {
-  HOSTDEVICE T operator()(T a, T b) const { return a * b; }
-};
-
-template <typename T>
-class CumprodOpCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    const auto *x = ctx.Input<framework::Tensor>("X");
-    auto *y = ctx.Output<framework::Tensor>("Out");
-    auto dim = ctx.Attr<int>("dim");
-    size_t outer_dim, mid_dim, inner_dim;
-    GetCumprodDimInfo(x->dims(), dim, &outer_dim, &mid_dim, &inner_dim);
-
-    const auto *x_data = x->data<T>();
-    auto *y_data = y->mutable_data<T>(ctx.GetPlace());
-    const auto &dev_ctx =
-        ctx.template device_context<platform::CUDADeviceContext>();
-    math::InclusiveScan<T, MultiplyFunctor<T>>(
-        x_data, y_data, outer_dim, mid_dim, inner_dim, static_cast<T>(1),
-        MultiplyFunctor<T>(), /*reverse=*/false, dev_ctx);
-  }
-};
-
-template <typename T>
-struct IsZeroFunctor {
-  HOSTDEVICE bool operator()(T x) const { return x == static_cast<T>(0); }
-};
-
-template <typename T>
-struct CumprodGradFunctorExceptFirstZero {
-  HOSTDEVICE CumprodGradFunctorExceptFirstZero(
-      const T *x, const T *y, const T *dy_mul_y_reversed_cumsum,
-      const uint8_t *zero_mask, size_t mid_dim, size_t inner_dim, T *dx,
-      int64_t *first_zero_idx, T *x_filled_one)
-      : x_(x),
-        y_(y),
-        dy_mul_y_reversed_cumsum_(dy_mul_y_reversed_cumsum),
-        zero_mask_(zero_mask),
-        mid_dim_(mid_dim),
-        inner_dim_(inner_dim),
-        dx_(dx),
-        first_zero_idx_(first_zero_idx),
-        x_filled_one_(x_filled_one) {}
-
-  HOSTDEVICE void operator()(size_t idx) const {
-    auto inner_idx = idx % inner_dim_;
-    auto outer_idx = idx / (mid_dim_ * inner_dim_);
-    auto mid_idx = (idx - inner_idx) / inner_dim_ % mid_dim_;
-    auto mask = zero_mask_[idx];
-    bool should_fill_one = true;
-
-    if (mask == 0) {
-      dx_[idx] = dy_mul_y_reversed_cumsum_[idx] / x_[idx];
-      if (mid_idx == mid_dim_ - 1) {
-        // record first zero position as -1, i.e., no zero
-        first_zero_idx_[outer_idx * inner_dim_ + inner_idx] = -1;
-      }
-    } else if (mid_idx > 0) {                  // mask > 0
-      if (zero_mask_[idx - inner_dim_] > 0) {  // not first zero
-        dx_[idx] = 0;
-        should_fill_one = false;
-      } else {
-        // idx is the first zero position, it should be recorded
-        dx_[idx] = y_[idx - inner_dim_];
-        first_zero_idx_[outer_idx * inner_dim_ + inner_idx] = mid_idx;
-      }
-    } else {  // the first zero position is index 0
-      dx_[idx] = 1;
-      first_zero_idx_[outer_idx * inner_dim_ + inner_idx] = 0;
-    }
-
-    x_filled_one_[idx] = should_fill_one ? 1 : x_[idx];
-  }
-
- private:
-  const T *x_;
-  const T *y_;
-  const T *dy_mul_y_reversed_cumsum_;
-  const uint8_t *zero_mask_;
-  size_t mid_dim_;
-  size_t inner_dim_;
-  T *dx_;
-  int64_t *first_zero_idx_;
-  T *x_filled_one_;
-};
-
-template <typename T>
-struct FillFirstZeroPositionGradFunctor {
-  HOSTDEVICE FillFirstZeroPositionGradFunctor(const int64_t *first_zero_idx,
-                                              const T *grad_value,
-                                              size_t mid_dim, size_t inner_dim,
-                                              T *dx)
-      : first_zero_idx_(first_zero_idx),
-        grad_value_(grad_value),
-        mid_dim_(mid_dim),
-        inner_dim_(inner_dim),
-        dx_(dx) {}
-
-  HOSTDEVICE void operator()(size_t idx) const {
-    auto outer_idx = idx / inner_dim_;
-    auto inner_idx = idx % inner_dim_;
-    auto mid_idx = first_zero_idx_[idx];
-    if (mid_idx >= 0) {
-      auto full_idx =
-          outer_idx * mid_dim_ * inner_dim_ + mid_idx * inner_dim_ + inner_idx;
-      dx_[full_idx] *= grad_value_[full_idx];
-    }
-  }
-
- private:
-  const int64_t *first_zero_idx_;
-  const T *grad_value_;
-  size_t mid_dim_;
-  size_t inner_dim_;
-  T *dx_;
-};
-
-/*
-Reference to
-https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/ReduceOps.cpp
-input: x, y, dL/dy
-output: dL/dx
-dL/dx[i] = sum{0<=j<n} (dL/dy[j])*(dy[j]/dx[i]) (1)
-         = sum(0<=j<n} (dL/dy[j])*(d(x[0]*x[1]*...*x[j])/dx[i])
-if x[i] != 0, dL/dx[i] = sum{i<=j<n} (dL/dy[j])*(y[j]/x[i]) (2)
-if x[i] == 0, the formula(2) can not be applied directly.
-Suppose k is the first index of zero element, the formula will be:
-i > k, dL/dx[i] = 0;
-i < k, dL/dx[i] = 1/x[i]*sum{i<=j<n} (dL/dy[j]*y[j])
-i = k, dL/dx[i] = y[i-1]*sum{i<=j<n} (dL/dy[j])*(x[i+1]*...*x[j])
-
-First, we will show the main resolution.
-We need to judge the relationship between i (current index) and k (index
-which corresponds to the first element of 0).
-To mark the relationship, we now introduce zero_mask and we also need to
-mark the index of the first zero element.
-zero_mask = cummax(x[i] == 0);      //label whether x[i]==0 until the index.
-zero_index = -1;                    //store the first zero element's index.
-e.g. x = [1, 4, 5, 0, 2, 3, 0];
-     zero_mask = [0, 0, 0, 1, 1, 1, 1];
-     zero_index = 3;
-When i < k, we need to calculate the result of sum{i<=j<n}(d_y[j]*y[j]), we can
-use reversed cumsum to calculate it.
-R = reversed_cumsum(dy[j]*y[j]);     //store the calculation result of the
-sum{i<=j<n}(d_y[j]*y[j]) and x[k+1],x[k+2],...,x[j] along the index k+1 ~ j.
-When i = k, we need to calculate the result of prod{i<w<j}(x[w]).
-To calculate it, we introduce x_filled_one, which fill 1 before x[k+1] along
-the index 0 ~ k.
-e.g. x = [1, 4, 5, 0, 2, 3, 0];
-     x_filled_one = [1, 1, 1, 1, 2, 3, 0];
-Thus, we can use cumprod(x_filled_one[j]) to calculate the result of
-prod{k<=w<j}(x[w]).
-
-Then, we will show more detailed implementation.
-for (int i = 0; i < numel; i++) {
-    if (zero_mask[i] == 0) {       //case i < k
-        dx[i] = R[i] / x[i];
-        x_filled_one[i] = 1;
-    } else {
-        if (i == 0) {              //case i = k
-            dx[i] = 1;
-            zero_index = i;
-            x_filled_one[i] = 1;
-        } else {
-            if (zero_mask[i-1] == 0) {    //case i = k
-                dx[i] = y[i-1];
-                zero_index = i;
-                x_filled_one[i] = 1;
-            } else {                  //case i > k
-                dx[i] = 0;
-                x_filled_one[i] = x[i];
-            }
-        }
-    }
-}
-T = reversed_cumsum(dy[j]*cumprod(x_filled_one[j]));
-if (zero_index != -1) {
-    dx[zero_index] *= T[zero_index];
-}
-*/
-
-template <typename T>
-class CumprodGradOpCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    const auto *x = ctx.Input<framework::Tensor>("X");
-    const auto *y = ctx.Input<framework::Tensor>("Out");
-    const auto *dy =
-        ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
-    auto *dx = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
-    auto dim = ctx.Attr<int>("dim");
-
-    size_t outer_dim, mid_dim, inner_dim;
-    GetCumprodDimInfo(x->dims(), dim, &outer_dim, &mid_dim, &inner_dim);
-    if (outer_dim == 0 || mid_dim == 0 || inner_dim == 0) return;
-
-    size_t numel = outer_dim * mid_dim * inner_dim;
-
-    const auto *x_data = x->data<T>();
-    const auto *y_data = y->data<T>();
-    const auto *dy_data = dy->data<T>();
-
-    auto place = ctx.GetPlace();
-    const auto &dev_ctx =
-        ctx.template device_context<platform::CUDADeviceContext>();
-    auto *dx_data = dx->mutable_data<T>(place);
-
-    // deal with complex
-    const T *x_data_deal;
-    const T *y_data_deal;
-    memory::AllocationPtr x_conj;
-    memory::AllocationPtr y_conj;
-    if (framework::IsComplex<T>::value) {
-      x_conj = memory::Alloc(place, numel * sizeof(T));
-      auto *x_data_conj = reinterpret_cast<T *>(x_conj->ptr());
-      y_conj = memory::Alloc(place, numel * sizeof(T));
-      auto *y_data_conj = reinterpret_cast<T *>(y_conj->ptr());
-
-      platform::ForRange<platform::CUDADeviceContext> for_range_x(dev_ctx,
-                                                                  numel);
-      phi::funcs::ConjFunctor<T> functor_x(x_data, numel, x_data_conj);
-      for_range_x(functor_x);
-
-      platform::ForRange<platform::CUDADeviceContext> for_range_y(dev_ctx,
-                                                                  numel);
-      phi::funcs::ConjFunctor<T> functor_y(y_data, numel, y_data_conj);
-      for_range_y(functor_y);
-      x_data_deal = x_data_conj;
-      y_data_deal = y_data_conj;
-    } else {
-      x_data_deal = x_data;
-      y_data_deal = y_data;
-    }
-
-// Step 1: find cummax-ed zero mask of x
-#ifdef PADDLE_WITH_CUDA
-    const auto &exec_policy = thrust::cuda::par.on(dev_ctx.stream());
-#else
-    const auto &exec_policy = thrust::hip::par.on(dev_ctx.stream());
-#endif
-    auto zero_mask_without_cummax =
-        memory::Alloc(place, numel * sizeof(uint8_t));
-    auto *zero_mask_without_cummax_data =
-        reinterpret_cast<uint8_t *>(zero_mask_without_cummax->ptr());
-    thrust::transform(
-        exec_policy, thrust::device_pointer_cast(x_data_deal),
-        thrust::device_pointer_cast(x_data_deal) + numel,
-        thrust::device_pointer_cast(zero_mask_without_cummax_data),
-        IsZeroFunctor<T>());
-
-    auto zero_mask = memory::Alloc(place, numel * sizeof(uint8_t));
-    auto *zero_mask_data = reinterpret_cast<uint8_t *>(zero_mask->ptr());
-    math::InclusiveScan<uint8_t, cub::Max>(
-        zero_mask_without_cummax_data, zero_mask_data, outer_dim, mid_dim,
-        inner_dim, static_cast<uint8_t>(0), cub::Max(), /*reverse=*/false,
-        dev_ctx);
-    zero_mask_without_cummax = nullptr;
-
-    // Step 2: calculate reversed cumsum(dy * y)
-    auto dy_mul_y = memory::Alloc(place, numel * sizeof(T));
-    auto *dy_mul_y_data = reinterpret_cast<T *>(dy_mul_y->ptr());
-    thrust::transform(exec_policy, thrust::device_pointer_cast(dy_data),
-                      thrust::device_pointer_cast(dy_data) + numel,
-                      thrust::device_pointer_cast(y_data_deal),
-                      thrust::device_pointer_cast(dy_mul_y_data),
-                      MultiplyFunctor<T>());
-
-    auto dy_mul_y_reversed_cumsum = memory::Alloc(place, numel * sizeof(T));
-    auto *dy_mul_y_reversed_cumsum_data =
-        reinterpret_cast<T *>(dy_mul_y_reversed_cumsum->ptr());
-    math::InclusiveScan<T, cub::Sum>(
-        dy_mul_y_data, dy_mul_y_reversed_cumsum_data, outer_dim, mid_dim,
-        inner_dim, static_cast<T>(0), cub::Sum(), /*reverse=*/true, dev_ctx);
-
-    // Step 3: calculate the gradient value except the first zero position.
-    // The gradient value of the first zero position is filled with out[idx-1],
-    // while the gradient value of the other positions are calculated out
-    // completely. This functor also:
-    //  (1) find the first zero index, i.e., first_zero_idx_data.
-    //  (2) fill x_filled_one, which satifies
-    //      x_filled_one[i] = x[i], i > pos
-    //      x_filled_one[i] = 1, i <= pos
-    auto first_zero_idx =
-        memory::Alloc(place, outer_dim * inner_dim * sizeof(int64_t));
-    auto *first_zero_idx_data =
-        reinterpret_cast<int64_t *>(first_zero_idx->ptr());
-    auto *x_filled_one_data = dy_mul_y_data;  // reuse former allocated memory
-    platform::ForRange<platform::CUDADeviceContext> for_range(dev_ctx, numel);
-    CumprodGradFunctorExceptFirstZero<T> functor_except_first_zero(
-        x_data_deal, y_data_deal, dy_mul_y_reversed_cumsum_data, zero_mask_data,
-        mid_dim, inner_dim, dx_data, first_zero_idx_data, x_filled_one_data);
-    for_range(functor_except_first_zero);
-
-    // Step 4: calculate cumprod of x_filled_one
-    auto *x_filled_one_cumprod_data =
-        dy_mul_y_reversed_cumsum_data;  // reuse former allocated memory
-    math::InclusiveScan<T, MultiplyFunctor<T>>(
-        x_filled_one_data, x_filled_one_cumprod_data, outer_dim, mid_dim,
-        inner_dim, static_cast<T>(1), MultiplyFunctor<T>(), /*reverse=*/false,
-        dev_ctx);
-
-    // Step 5: calculate reversed cumsum(dy * x_filled_one_cumprod)
-    auto *dy_mul_x_filled_one_cumprod =
-        dy_mul_y_data;  // reuse former allocated memory
-    thrust::transform(exec_policy, thrust::device_pointer_cast(dy_data),
-                      thrust::device_pointer_cast(dy_data) + numel,
-                      thrust::device_pointer_cast(x_filled_one_cumprod_data),
-                      thrust::device_pointer_cast(dy_mul_x_filled_one_cumprod),
-                      MultiplyFunctor<T>());
-    auto *dy_mul_x_filled_one_cumprod_reversed_cumsum =
-        dy_mul_y_reversed_cumsum_data;  // reuse former allocated memory
-    math::InclusiveScan<T, cub::Sum>(
-        dy_mul_x_filled_one_cumprod,
-        dy_mul_x_filled_one_cumprod_reversed_cumsum, outer_dim, mid_dim,
-        inner_dim, static_cast<T>(0), cub::Sum(),
-        /*reverse=*/true, dev_ctx);
-
-    // Step 6: fill zero pos gradient value
-    platform::ForRange<platform::CUDADeviceContext>
-        for_range_fill_zero_pos_grad(dev_ctx, outer_dim * inner_dim);
-    FillFirstZeroPositionGradFunctor<T> fill_first_zero_pos_grad_functor(
-        first_zero_idx_data, dy_mul_x_filled_one_cumprod_reversed_cumsum,
-        mid_dim, inner_dim, dx_data);
-    for_range_fill_zero_pos_grad(fill_first_zero_pos_grad_functor);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-REGISTER_OP_CUDA_KERNEL(
-    cumprod, ops::CumprodOpCUDAKernel<float>, ops::CumprodOpCUDAKernel<double>,
-    ops::CumprodOpCUDAKernel<int>, ops::CumprodOpCUDAKernel<int64_t>,
-    ops::CumprodOpCUDAKernel<paddle::platform::complex<float>>,
-    ops::CumprodOpCUDAKernel<paddle::platform::complex<double>>);
-
-REGISTER_OP_CUDA_KERNEL(
-    cumprod_grad, ops::CumprodGradOpCUDAKernel<float>,
-    ops::CumprodGradOpCUDAKernel<double>, ops::CumprodGradOpCUDAKernel<int>,
-    ops::CumprodGradOpCUDAKernel<int64_t>,
-    ops::CumprodGradOpCUDAKernel<paddle::platform::complex<float>>,
-    ops::CumprodGradOpCUDAKernel<paddle::platform::complex<double>>);
diff --git a/paddle/fluid/operators/cumprod_op.h b/paddle/fluid/operators/cumprod_op.h
deleted file mode 100644
index 74ed2008ae98380388d874529264c6b6c0b5a49a..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/cumprod_op.h
+++ /dev/null
@@ -1,170 +0,0 @@
-// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <cstdint>
-#include <type_traits>
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/platform/for_range.h"
-#include "paddle/phi/kernels/funcs/complex_functors.h"
-
-namespace paddle {
-namespace operators {
-using Tensor = framework::Tensor;
-
-static void GetCumprodDimInfo(const framework::DDim& dim, int cumprod_dim,
-                              size_t* outer_dim, size_t* mid_dim,
-                              size_t* inner_dim) {
-  PADDLE_ENFORCE_GE(
-      cumprod_dim, -dim.size(),
-      platform::errors::InvalidArgument(
-          "The input dim of CumprodOp should be larger than the opposite "
-          "rank of input x which is %d.But received dim=%d",
-          -dim.size(), cumprod_dim));
-  PADDLE_ENFORCE_LT(cumprod_dim, dim.size(),
-                    platform::errors::InvalidArgument(
-                        "The input dim of CumprodOp should be smaller than the "
-                        "rank of input x which is %d.But received dim=%d",
-                        dim.size(), cumprod_dim));
-  if (cumprod_dim < 0) cumprod_dim += dim.size();
-
-  *outer_dim = 1;
-  for (int i = 0; i < cumprod_dim; ++i) {
-    *outer_dim *= dim[i];
-  }
-  *mid_dim = dim[cumprod_dim];
-  *inner_dim = 1;
-  for (int i = cumprod_dim + 1; i < dim.size(); ++i) {
-    *inner_dim *= dim[i];
-  }
-}
-
-template <typename T>
-class CumprodOpCPUKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    const Tensor* x = context.Input<Tensor>("X");
-    Tensor* out = context.Output<Tensor>("Out");
-    int dim = context.Attr<int>("dim");
-
-    auto* x_data = x->data<T>();
-    auto* out_data = out->mutable_data<T>(context.GetPlace());
-    framework::DDim shape = x->dims();
-
-    size_t outer_dim = 1;
-    size_t mid_dim = 1;
-    size_t inner_dim = 1;
-    GetCumprodDimInfo(shape, dim, &outer_dim, &mid_dim, &inner_dim);
-
-    for (size_t i = 0; i < outer_dim; i++) {
-      for (size_t j = 0; j < mid_dim; j++) {
-        for (size_t k = 0; k < inner_dim; k++) {
-          size_t pos = i * mid_dim * inner_dim + j * inner_dim + k;
-          if (j == 0) {
-            out_data[pos] = x_data[pos];
-          } else {
-            out_data[pos] = out_data[pos - inner_dim] * x_data[pos];
-          }
-        }
-      }
-    }
-  }
-};
-
-template <typename T>
-class CumprodGradOpCPUKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const {
-    const Tensor* d_out = context.Input<Tensor>(framework::GradVarName("Out"));
-    const Tensor* x = context.Input<Tensor>("X");
-    const Tensor* out = context.Input<Tensor>("Out");
-
-    int dim = context.Attr<int>("dim");
-    framework::DDim shape = x->dims();
-    Tensor* d_x = context.Output<Tensor>(framework::GradVarName("X"));
-
-    auto* d_out_data = d_out->data<T>();
-    auto* x_data = x->data<T>();
-    auto* out_data = out->data<T>();
-    auto* d_x_data = d_x->mutable_data<T>(context.GetPlace());
-
-    auto place = context.GetPlace();
-    const auto& dev_ctx =
-        context.template device_context<platform::CPUDeviceContext>();
-
-    size_t outer_dim = 1;
-    size_t mid_dim = 1;
-    size_t inner_dim = 1;
-    GetCumprodDimInfo(shape, dim, &outer_dim, &mid_dim, &inner_dim);
-    size_t numel = outer_dim * mid_dim * inner_dim;
-
-    // deal with complex
-    const T* x_data_deal;
-    const T* out_data_deal;
-    memory::AllocationPtr x_conj;
-    memory::AllocationPtr out_conj;
-    if (framework::IsComplex<T>::value) {
-      x_conj = memory::Alloc(place, numel * sizeof(T));
-      auto* x_data_conj = reinterpret_cast<T*>(x_conj->ptr());
-      out_conj = memory::Alloc(place, numel * sizeof(T));
-      auto* out_data_conj = reinterpret_cast<T*>(out_conj->ptr());
-
-      platform::ForRange<platform::CPUDeviceContext> for_range_x(dev_ctx,
-                                                                 numel);
-      phi::funcs::ConjFunctor<T> functor_x(x_data, numel, x_data_conj);
-      for_range_x(functor_x);
-
-      platform::ForRange<platform::CPUDeviceContext> for_range_out(dev_ctx,
-                                                                   numel);
-      phi::funcs::ConjFunctor<T> functor_out(out_data, numel, out_data_conj);
-      for_range_out(functor_out);
-
-      x_data_deal = x_data_conj;
-      out_data_deal = out_data_conj;
-    } else {
-      x_data_deal = x_data;
-      out_data_deal = out_data;
-    }
-
-    for (size_t i = 0; i < outer_dim; i++) {
-      for (size_t k = 0; k < inner_dim; k++) {
-        for (size_t j = 0; j < mid_dim; j++) {
-          size_t index = i * mid_dim * inner_dim + j * inner_dim + k;
-          d_x_data[index] = 0;
-          for (size_t n = 0; n < mid_dim; n++) {
-            size_t pos = i * mid_dim * inner_dim + n * inner_dim + k;
-            T elem;
-            if (j == 0) {
-              elem = d_out_data[pos];
-            } else {
-              elem = d_out_data[pos] * out_data_deal[index - inner_dim];
-            }
-            if (pos > index) {
-              for (size_t m = index + inner_dim; m <= pos; m += inner_dim) {
-                elem *= x_data_deal[m];
-              }
-            } else if (pos < index) {
-              elem = static_cast<T>(0);
-            }
-            d_x_data[index] += elem;
-          }
-        }
-      }
-    }
-  }
-};
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/cumsum_op.cc b/paddle/fluid/operators/cumsum_op.cc
index 9fa355a924612651556f2a79711cae4ce17379f8..11633fb0b870327f14e4454b3f94a43940a9df53 100644
--- a/paddle/fluid/operators/cumsum_op.cc
+++ b/paddle/fluid/operators/cumsum_op.cc
@@ -12,8 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include "paddle/fluid/framework/infershape_utils.h"
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/op_version_registry.h"
-#include "paddle/fluid/operators/cum_op.h"
+#include "paddle/phi/core/infermeta_utils.h"
+#include "paddle/phi/infermeta/unary.h"
 
 namespace paddle {
 namespace operators {
@@ -21,17 +24,6 @@ namespace operators {
 class CumOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    if (ctx->Attrs().Get<bool>("flatten")) {
-      ctx->SetOutputDim("Out",
-                        phi::make_ddim({phi::product(ctx->GetInputDim("X"))}));
-    } else {
-      ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
-    }
-
-    ctx->ShareLoD("X", /*->*/ "Out");
-  }
 };
 
 class CumsumOpMaker : public framework::OpProtoAndCheckerMaker {
@@ -87,15 +79,12 @@ class CumsumGradMaker : public framework::SingleGradOpMaker<T> {
 
 namespace ops = paddle::operators;
 using CPU = paddle::platform::CPUDeviceContext;
-
+DECLARE_INFER_SHAPE_FUNCTOR(cumsum, CumsumInferShapeFunctor,
+                            PD_INFER_META(phi::CumsumInferMeta));
 REGISTER_OPERATOR(cumsum, ops::CumOp, ops::CumsumOpMaker,
                   ops::CumsumGradMaker<paddle::framework::OpDesc>,
-                  ops::CumsumGradMaker<paddle::imperative::OpBase>);
-REGISTER_OP_CPU_KERNEL(cumsum, ops::CumKernel<CPU, ops::CumsumFunctor<float>>,
-                       ops::CumKernel<CPU, ops::CumsumFunctor<double>>,
-                       ops::CumKernel<CPU, ops::CumsumFunctor<int16_t>>,
-                       ops::CumKernel<CPU, ops::CumsumFunctor<int>>,
-                       ops::CumKernel<CPU, ops::CumsumFunctor<int64_t>>);
+                  ops::CumsumGradMaker<paddle::imperative::OpBase>,
+                  CumsumInferShapeFunctor);
 
 REGISTER_OP_VERSION(cumsum)
     .AddCheckpoint(
diff --git a/paddle/fluid/operators/cumsum_op.cu b/paddle/fluid/operators/cumsum_op.cu
deleted file mode 100644
index 3402f42521f54f315390fe2162309fb204fd9b00..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/cumsum_op.cu
+++ /dev/null
@@ -1,325 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <thrust/device_ptr.h>
-#include <thrust/device_vector.h>
-#include <thrust/reverse.h>
-#include <thrust/scan.h>
-#ifdef __NVCC__
-#include <cub/cub.cuh>
-#endif
-#ifdef __HIPCC__
-#include <hipcub/hipcub.hpp>
-namespace cub = hipcub;
-#endif
-#include "paddle/fluid/operators/cum_op.h"
-#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
-
-using Tensor = paddle::framework::Tensor;
-using LoDTensor = paddle::framework::LoDTensor;
-
-namespace paddle {
-namespace operators {
-
-template <typename T, int BLOCK_SIZE>
-__device__ void BlockReverse(const T* idata, T* odata, int src_base,
-                             int dst_base, int valid_item) {
-  __shared__ T sh_mem[BLOCK_SIZE];
-  int tx = threadIdx.x;
-
-  int offset = tx;
-  int in_index = src_base + offset;
-  if (offset >= valid_item) {
-    sh_mem[offset] = 0;
-  } else {
-    int sh_mem_index = BLOCK_SIZE - offset - 1;
-    T data = idata[in_index];
-    sh_mem[sh_mem_index] = data;
-  }
-
-  __syncthreads();
-  int out_index = dst_base - offset;
-  if (offset < valid_item) {
-    int sh_mem_index = BLOCK_SIZE - offset - 1;
-    odata[out_index] = sh_mem[sh_mem_index];
-  }
-}
-
-template <typename T>
-__global__ void MatrixRowReverse(const T* matrix_data, T* reverse_data,
-                                 int reverse_size, int outer_size,
-                                 int inner_size) {
-  int bx = blockIdx.x;
-  int by = blockIdx.y;
-  int item_per_block = 1024;
-
-  for (int block_offset = 0; block_offset < reverse_size;
-       block_offset += item_per_block) {
-    int valid_item = (reverse_size - block_offset > item_per_block)
-                         ? item_per_block
-                         : reverse_size - block_offset;
-    int src_offset =
-        bx * reverse_size + block_offset + by * (inner_size * reverse_size);
-    int dst_offset = bx * reverse_size + by * (inner_size * reverse_size) +
-                     reverse_size - 1 - block_offset;
-    if (reverse_size < item_per_block) {
-      valid_item = reverse_size;
-    }
-
-    BlockReverse<T, 1024>(matrix_data, reverse_data, src_offset, dst_offset,
-                          valid_item);
-  }
-}
-
-template <typename T>
-struct BlockPrefixCallbackOp {
-  // Running prefix
-  T running_total;
-  // Constructor
-  __device__ BlockPrefixCallbackOp(T running_total)
-      : running_total(running_total) {}
-  // Callback operator to be entered by the first warp of threads in the block.
-  // Thread-0 is responsible for returning a value for seeding the block-wide
-  // scan.
-  __device__ T operator()(T block_aggregate) {
-    T old_prefix = running_total;
-    running_total = old_prefix + block_aggregate;
-    return old_prefix;
-  }
-};
-
-// No bank-conflict transpose
-template <typename T, int TILE_DIM, int BLOCK_ROWS>
-__global__ void MatrixTranspose(T* odata, const T* idata, size_t height,
-                                size_t width) {
-  __shared__ T tile[TILE_DIM][TILE_DIM + 1];
-
-  int x = blockIdx.x * TILE_DIM + threadIdx.x;
-  int y = blockIdx.y * TILE_DIM + threadIdx.y;
-  for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS) {
-    if (x < width && (y + j) < height) {
-      tile[threadIdx.y + j][threadIdx.x] = idata[(y + j) * width + x];
-    } else {
-      tile[threadIdx.y + j][threadIdx.x] = 0;
-    }
-  }
-
-  __syncthreads();
-
-  x = blockIdx.y * TILE_DIM + threadIdx.x;  // transpose block offset
-  y = blockIdx.x * TILE_DIM + threadIdx.y;
-
-  for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS) {
-    if (x < height && (y + j) < width) {
-      odata[(y + j) * height + x] = tile[threadIdx.x][threadIdx.y + j];
-    }
-  }
-}
-
-template <typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
-__global__ void BlockScanKernel(T* d_out, const T* d_in, int inner_size,
-                                int outer_size, int scan_size, bool exclusive) {
-  // Specialize BlockLoad, BlockStore, and BlockRadixSort collective types
-  typedef cub::BlockLoad<T, BLOCK_THREADS, ITEMS_PER_THREAD,
-                         cub::BLOCK_LOAD_TRANSPOSE>
-      BlockLoadT;
-  typedef cub::BlockStore<T, BLOCK_THREADS, ITEMS_PER_THREAD,
-                          cub::BLOCK_STORE_TRANSPOSE>
-      BlockStoreT;
-  typedef cub::BlockScan<T, BLOCK_THREADS> BlockScanT;
-  // Allocate type-safe, repurposable shared memory for collectives
-  __shared__ union {
-    typename BlockLoadT::TempStorage load;
-    typename BlockStoreT::TempStorage store;
-    typename BlockScanT::TempStorage scan;
-  } temp_storage;
-
-  int bx = blockIdx.x;
-  int by = blockIdx.y;
-
-  BlockPrefixCallbackOp<T> prefix_op(0);
-  T block_aggregate = static_cast<T>(0);
-
-  // Obtain this block's segment of consecutive keys (blocked across threads)
-  int item_per_block = BLOCK_THREADS * ITEMS_PER_THREAD;
-  for (int block_offset = 0; block_offset < scan_size;
-       block_offset += BLOCK_THREADS * ITEMS_PER_THREAD) {
-    int valid_item = (scan_size - block_offset > item_per_block)
-                         ? item_per_block
-                         : (scan_size - block_offset);
-    if (scan_size < item_per_block) {
-      valid_item = scan_size;
-    }
-
-    int offset = bx * scan_size + block_offset + by * (inner_size * scan_size);
-
-    T thread_keys[ITEMS_PER_THREAD];
-    BlockLoadT(temp_storage.load)
-        .Load(d_in + offset, thread_keys, valid_item, 0);
-
-    __syncthreads();
-    if (exclusive) {
-      T init_value = static_cast<T>(0);
-      BlockScanT(temp_storage.scan)
-          .ExclusiveScan(thread_keys, thread_keys, cub::Sum(), prefix_op);
-    } else {
-      BlockScanT(temp_storage.scan)
-          .InclusiveScan(thread_keys, thread_keys, cub::Sum(), prefix_op);
-    }
-    __syncthreads();
-
-    BlockStoreT(temp_storage.store)
-        .Store(d_out + offset, thread_keys, valid_item);
-  }
-}
-
-template <typename DeviceContext, typename T>
-class CumCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* in = context.Input<framework::Tensor>("X");
-    auto* out = context.Output<framework::Tensor>("Out");
-
-    int axis = context.Attr<int>("axis");
-    bool exclusive = context.Attr<bool>("exclusive");
-    bool reverse = context.Attr<bool>("reverse");
-    auto out_dims = out->dims();
-    auto size = in->numel();
-
-    PADDLE_ENFORCE_EQ(
-        axis < out_dims.size() && axis >= (0 - out_dims.size()), true,
-        platform::errors::OutOfRange(
-            "Attr(axis) is out of range, It's expected "
-            "to be in range of [-%d, %d]. But received Attr(axis) = %d.",
-            out_dims.size(), out_dims.size() - 1, axis));
-    if (axis < 0) {
-      axis += out_dims.size();
-    }
-
-    T* out_data = out->mutable_data<T>(context.GetPlace());
-    const T* in_data = in->data<T>();
-
-    // Use thrust for parallel acceleration when the input size is equal to the
-    // length of the ‘axis’ dimension.
-    if (size == out_dims[axis]) {
-      if (reverse) {
-        thrust::device_ptr<const T> dev_ptr =
-            thrust::device_pointer_cast(in_data);
-        thrust::device_vector<T> vec(dev_ptr, dev_ptr + size);
-        if (exclusive) {
-          thrust::exclusive_scan(thrust::device, vec.rbegin(), vec.rend(),
-                                 out_data);
-        } else {
-          thrust::inclusive_scan(thrust::device, vec.rbegin(), vec.rend(),
-                                 out_data);
-        }
-        thrust::reverse(thrust::device, out_data, out_data + size);
-      } else {
-        if (exclusive) {
-          thrust::exclusive_scan(thrust::device, in_data, in_data + size,
-                                 out_data);
-        } else {
-          thrust::inclusive_scan(thrust::device, in_data, in_data + size,
-                                 out_data);
-        }
-      }
-      return;
-    }
-
-    size_t height = 1;
-    size_t width = 1;
-    for (size_t i = 0; i <= axis; i++) {
-      height *= out_dims[i];
-    }
-
-    for (size_t i = axis + 1; i < out_dims.size(); i++) {
-      width *= out_dims[i];
-    }
-    int scan_size = out_dims[axis];
-    bool transpose = (axis != out_dims.size() - 1);
-
-    int tile_size = 32;
-    dim3 blocks(32, 8);
-    dim3 transpose_grids((width + tile_size - 1) / tile_size,
-                         (height + tile_size - 1) / tile_size);
-    auto& dev_ctx = context.template device_context<DeviceContext>();
-    framework::Tensor tmp;
-    tmp.Resize(out_dims);
-    auto* tmp_data = tmp.mutable_data<T>(context.GetPlace());
-    T* next_in_data = out_data;
-    T* next_out_data = tmp_data;
-    if (transpose) {
-      MatrixTranspose<T, 32,
-                      8><<<transpose_grids, blocks, 0, dev_ctx.stream()>>>(
-          out_data, in_data, height, width);
-      next_in_data = out_data;
-      next_out_data = tmp_data;
-    }
-    auto swap_ptr = [](T*& ptr1, T*& ptr2) {
-      T* tmp = ptr2;
-      ptr2 = ptr1;
-      ptr1 = tmp;
-    };
-    int outer_size = height / scan_size;
-    int inner_size = width;
-    // Consider the size of shared memory, here block size is 128
-    dim3 scan_grid(outer_size, inner_size);
-    dim3 reverse_grid = scan_grid;
-    if (reverse) {
-      if (transpose) {
-        reverse_grid.x = scan_grid.y;
-        reverse_grid.y = scan_grid.x;
-        MatrixRowReverse<T><<<reverse_grid, 1024, 0, dev_ctx.stream()>>>(
-            next_in_data, next_out_data, scan_size, outer_size, inner_size);
-        if (!transpose) next_in_data = tmp_data;
-        swap_ptr(next_in_data, next_out_data);
-      } else {
-        MatrixRowReverse<T><<<reverse_grid, 1024, 0, dev_ctx.stream()>>>(
-            in_data, out_data, scan_size, outer_size, inner_size);
-      }
-    }
-    if (!transpose && !reverse) {
-      BlockScanKernel<T, 128, 4><<<scan_grid, 128, 0, dev_ctx.stream()>>>(
-          out_data, in_data, outer_size, inner_size, scan_size, exclusive);
-
-    } else {
-      BlockScanKernel<T, 128, 4><<<scan_grid, 128, 0, dev_ctx.stream()>>>(
-          next_out_data, next_in_data, outer_size, inner_size, scan_size,
-          exclusive);
-    }
-    swap_ptr(next_in_data, next_out_data);
-    if (reverse) {
-      MatrixRowReverse<T><<<reverse_grid, 1024, 0, dev_ctx.stream()>>>(
-          next_in_data, next_out_data, scan_size, outer_size, inner_size);
-      swap_ptr(next_in_data, next_out_data);
-    }
-    if (transpose) {
-      transpose_grids.x = (height + tile_size - 1) / tile_size;
-      transpose_grids.y = (width + tile_size - 1) / tile_size;
-      MatrixTranspose<T, 32,
-                      8><<<transpose_grids, blocks, 0, dev_ctx.stream()>>>(
-          next_out_data, next_in_data, width, height);
-    }
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    cumsum, ops::CumCUDAKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::CumCUDAKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::CumCUDAKernel<paddle::platform::CUDADeviceContext, int16_t>,
-    ops::CumCUDAKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::CumCUDAKernel<paddle::platform::CUDADeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/cumsum_op_npu.cc b/paddle/fluid/operators/cumsum_op_npu.cc
index 38bf53ca0aa1a2dddca4ac2d2043de10fcdb7830..d197e4362e96976661ab891929b4503977f52ff0 100644
--- a/paddle/fluid/operators/cumsum_op_npu.cc
+++ b/paddle/fluid/operators/cumsum_op_npu.cc
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/operators/cum_op.h"
 #include "paddle/fluid/platform/device/npu/npu_op_runner.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/detection/CMakeLists.txt b/paddle/fluid/operators/detection/CMakeLists.txt
index 1ebafa54598574ae9027a4887639a2a1d27448ea..568c7982cfc7c07b9c7f840ccaa32e4025225122 100644
--- a/paddle/fluid/operators/detection/CMakeLists.txt
+++ b/paddle/fluid/operators/detection/CMakeLists.txt
@@ -62,7 +62,7 @@ detection_library(locality_aware_nms_op SRCS locality_aware_nms_op.cc DEPS gpc)
 detection_library(matrix_nms_op SRCS matrix_nms_op.cc DEPS gpc)
 detection_library(box_clip_op SRCS box_clip_op.cc box_clip_op.cu)
 detection_library(yolov3_loss_op SRCS yolov3_loss_op.cc)
-detection_library(yolo_box_op SRCS yolo_box_op.cc yolo_box_op.cu)
+detection_library(yolo_box_op SRCS yolo_box_op.cc)
 detection_library(box_decoder_and_assign_op SRCS box_decoder_and_assign_op.cc box_decoder_and_assign_op.cu)
 detection_library(sigmoid_focal_loss_op SRCS sigmoid_focal_loss_op.cc sigmoid_focal_loss_op.cu)
 detection_library(retinanet_detection_output_op SRCS retinanet_detection_output_op.cc)
diff --git a/paddle/fluid/operators/detection/multiclass_nms_op.cc b/paddle/fluid/operators/detection/multiclass_nms_op.cc
index 7927410ef37862499aadf61d6e04c45af157f347..83cf6e5fd30f6bcad4870d1ebd18a50e21518dfe 100644
--- a/paddle/fluid/operators/detection/multiclass_nms_op.cc
+++ b/paddle/fluid/operators/detection/multiclass_nms_op.cc
@@ -93,7 +93,7 @@ class MultiClassNMSOp : public framework::OperatorWithKernel {
     // Here the box_dims[0] is not the real dimension of output.
     // It will be rewritten in the computing kernel.
     if (score_size == 3) {
-      ctx->SetOutputDim("Out", {box_dims[1], box_dims[2] + 2});
+      ctx->SetOutputDim("Out", {-1, box_dims[2] + 2});
     } else {
       ctx->SetOutputDim("Out", {-1, box_dims[2] + 2});
     }
@@ -545,11 +545,10 @@ class MultiClassNMS2Op : public MultiClassNMSOp {
   void InferShape(framework::InferShapeContext* ctx) const override {
     MultiClassNMSOp::InferShape(ctx);
 
-    auto box_dims = ctx->GetInputDim("BBoxes");
     auto score_dims = ctx->GetInputDim("Scores");
     auto score_size = score_dims.size();
     if (score_size == 3) {
-      ctx->SetOutputDim("Index", {box_dims[1], 1});
+      ctx->SetOutputDim("Index", {-1, 1});
     } else {
       ctx->SetOutputDim("Index", {-1, 1});
     }
diff --git a/paddle/fluid/operators/detection/yolo_box_op.cc b/paddle/fluid/operators/detection/yolo_box_op.cc
index 511d8e0eed1065ae0cd2cec3a7bcf534cd3043ab..0d9fbf612f73c428fb8050fcfcc319ddafabe482 100644
--- a/paddle/fluid/operators/detection/yolo_box_op.cc
+++ b/paddle/fluid/operators/detection/yolo_box_op.cc
@@ -9,7 +9,6 @@
    See the License for the specific language governing permissions and
    limitations under the License. */
 
-#include "paddle/fluid/operators/detection/yolo_box_op.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/op_version_registry.h"
 
@@ -240,8 +239,6 @@ REGISTER_OPERATOR(
     yolo_box, ops::YoloBoxOp, ops::YoloBoxOpMaker,
     paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
     paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
-REGISTER_OP_CPU_KERNEL(yolo_box, ops::YoloBoxKernel<float>,
-                       ops::YoloBoxKernel<double>);
 
 REGISTER_OP_VERSION(yolo_box)
     .AddCheckpoint(
diff --git a/paddle/fluid/operators/detection/yolo_box_op.cu b/paddle/fluid/operators/detection/yolo_box_op.cu
deleted file mode 100644
index fb5c214a59e1274ffc30226bf49a068df960f414..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/detection/yolo_box_op.cu
+++ /dev/null
@@ -1,143 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/memory/malloc.h"
-#include "paddle/fluid/operators/detection/yolo_box_op.h"
-#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
-#include "paddle/phi/kernels/funcs/math_function.h"
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-
-template <typename T>
-__global__ void KeYoloBoxFw(const T* input, const int* imgsize, T* boxes,
-                            T* scores, const float conf_thresh,
-                            const int* anchors, const int n, const int h,
-                            const int w, const int an_num, const int class_num,
-                            const int box_num, int input_size_h,
-                            int input_size_w, bool clip_bbox, const float scale,
-                            const float bias, bool iou_aware,
-                            const float iou_aware_factor) {
-  int tid = blockIdx.x * blockDim.x + threadIdx.x;
-  int stride = blockDim.x * gridDim.x;
-  T box[4];
-  for (; tid < n * box_num; tid += stride) {
-    int grid_num = h * w;
-    int i = tid / box_num;
-    int j = (tid % box_num) / grid_num;
-    int k = (tid % grid_num) / w;
-    int l = tid % w;
-
-    int an_stride = (5 + class_num) * grid_num;
-    int img_height = imgsize[2 * i];
-    int img_width = imgsize[2 * i + 1];
-
-    int obj_idx = GetEntryIndex(i, j, k * w + l, an_num, an_stride, grid_num, 4,
-                                iou_aware);
-    T conf = sigmoid<T>(input[obj_idx]);
-    if (iou_aware) {
-      int iou_idx = GetIoUIndex(i, j, k * w + l, an_num, an_stride, grid_num);
-      T iou = sigmoid<T>(input[iou_idx]);
-      conf = pow(conf, static_cast<T>(1. - iou_aware_factor)) *
-             pow(iou, static_cast<T>(iou_aware_factor));
-    }
-    if (conf < conf_thresh) {
-      continue;
-    }
-
-    int box_idx = GetEntryIndex(i, j, k * w + l, an_num, an_stride, grid_num, 0,
-                                iou_aware);
-    GetYoloBox<T>(box, input, anchors, l, k, j, h, w, input_size_h,
-                  input_size_w, box_idx, grid_num, img_height, img_width, scale,
-                  bias);
-    box_idx = (i * box_num + j * grid_num + k * w + l) * 4;
-    CalcDetectionBox<T>(boxes, box, box_idx, img_height, img_width, clip_bbox);
-
-    int label_idx = GetEntryIndex(i, j, k * w + l, an_num, an_stride, grid_num,
-                                  5, iou_aware);
-    int score_idx = (i * box_num + j * grid_num + k * w + l) * class_num;
-    CalcLabelScore<T>(scores, input, label_idx, score_idx, class_num, conf,
-                      grid_num);
-  }
-}
-
-template <typename T>
-class YoloBoxOpCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* input = ctx.Input<Tensor>("X");
-    auto* img_size = ctx.Input<Tensor>("ImgSize");
-    auto* boxes = ctx.Output<Tensor>("Boxes");
-    auto* scores = ctx.Output<Tensor>("Scores");
-
-    auto anchors = ctx.Attr<std::vector<int>>("anchors");
-    int class_num = ctx.Attr<int>("class_num");
-    float conf_thresh = ctx.Attr<float>("conf_thresh");
-    int downsample_ratio = ctx.Attr<int>("downsample_ratio");
-    bool clip_bbox = ctx.Attr<bool>("clip_bbox");
-    bool iou_aware = ctx.Attr<bool>("iou_aware");
-    float iou_aware_factor = ctx.Attr<float>("iou_aware_factor");
-    float scale = ctx.Attr<float>("scale_x_y");
-    float bias = -0.5 * (scale - 1.);
-
-    const int n = input->dims()[0];
-    const int h = input->dims()[2];
-    const int w = input->dims()[3];
-    const int box_num = boxes->dims()[1];
-    const int an_num = anchors.size() / 2;
-    int input_size_h = downsample_ratio * h;
-    int input_size_w = downsample_ratio * w;
-
-    auto& dev_ctx = ctx.cuda_device_context();
-    int bytes = sizeof(int) * anchors.size();
-    auto anchors_ptr = memory::Alloc(dev_ctx, sizeof(int) * anchors.size());
-    int* anchors_data = reinterpret_cast<int*>(anchors_ptr->ptr());
-    const auto gplace = ctx.GetPlace();
-    const auto cplace = platform::CPUPlace();
-    memory::Copy(gplace, anchors_data, cplace, anchors.data(), bytes,
-                 dev_ctx.stream());
-
-    const T* input_data = input->data<T>();
-    const int* imgsize_data = img_size->data<int>();
-    T* boxes_data = boxes->mutable_data<T>({n, box_num, 4}, ctx.GetPlace());
-    T* scores_data =
-        scores->mutable_data<T>({n, box_num, class_num}, ctx.GetPlace());
-    phi::funcs::SetConstant<platform::CUDADeviceContext, T> set_zero;
-    set_zero(dev_ctx, boxes, static_cast<T>(0));
-    set_zero(dev_ctx, scores, static_cast<T>(0));
-    platform::GpuLaunchConfig config =
-        platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), n * box_num);
-
-    dim3 thread_num = config.thread_per_block;
-#ifdef WITH_NV_JETSON
-    if (config.compute_capability == 53 || config.compute_capability == 62) {
-      thread_num = 512;
-    }
-#endif
-
-    KeYoloBoxFw<T><<<config.block_per_grid, thread_num, 0,
-                     ctx.cuda_device_context().stream()>>>(
-        input_data, imgsize_data, boxes_data, scores_data, conf_thresh,
-        anchors_data, n, h, w, an_num, class_num, box_num, input_size_h,
-        input_size_w, clip_bbox, scale, bias, iou_aware, iou_aware_factor);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(yolo_box, ops::YoloBoxOpCUDAKernel<float>,
-                        ops::YoloBoxOpCUDAKernel<double>);
diff --git a/paddle/fluid/operators/detection/yolo_box_op.h b/paddle/fluid/operators/detection/yolo_box_op.h
deleted file mode 100644
index 2cd69c60b7c44d0557c23b8d1bd933650e8402c3..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/detection/yolo_box_op.h
+++ /dev/null
@@ -1,180 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-   http://www.apache.org/licenses/LICENSE-2.0
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
-
-#pragma once
-#include <algorithm>
-#include <vector>
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/phi/core/hostdevice.h"
-#include "paddle/phi/kernels/funcs/math_function.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-
-template <typename T>
-HOSTDEVICE inline T sigmoid(T x) {
-  return 1.0 / (1.0 + std::exp(-x));
-}
-
-template <typename T>
-HOSTDEVICE inline void GetYoloBox(T* box, const T* x, const int* anchors, int i,
-                                  int j, int an_idx, int grid_size_h,
-                                  int grid_size_w, int input_size_h,
-                                  int input_size_w, int index, int stride,
-                                  int img_height, int img_width, float scale,
-                                  float bias) {
-  box[0] = (i + sigmoid<T>(x[index]) * scale + bias) * img_width / grid_size_w;
-  box[1] = (j + sigmoid<T>(x[index + stride]) * scale + bias) * img_height /
-           grid_size_h;
-  box[2] = std::exp(x[index + 2 * stride]) * anchors[2 * an_idx] * img_width /
-           input_size_w;
-  box[3] = std::exp(x[index + 3 * stride]) * anchors[2 * an_idx + 1] *
-           img_height / input_size_h;
-}
-
-HOSTDEVICE inline int GetEntryIndex(int batch, int an_idx, int hw_idx,
-                                    int an_num, int an_stride, int stride,
-                                    int entry, bool iou_aware) {
-  if (iou_aware) {
-    return (batch * an_num + an_idx) * an_stride +
-           (batch * an_num + an_num + entry) * stride + hw_idx;
-  } else {
-    return (batch * an_num + an_idx) * an_stride + entry * stride + hw_idx;
-  }
-}
-
-HOSTDEVICE inline int GetIoUIndex(int batch, int an_idx, int hw_idx, int an_num,
-                                  int an_stride, int stride) {
-  return batch * an_num * an_stride + (batch * an_num + an_idx) * stride +
-         hw_idx;
-}
-
-template <typename T>
-HOSTDEVICE inline void CalcDetectionBox(T* boxes, T* box, const int box_idx,
-                                        const int img_height,
-                                        const int img_width, bool clip_bbox) {
-  boxes[box_idx] = box[0] - box[2] / 2;
-  boxes[box_idx + 1] = box[1] - box[3] / 2;
-  boxes[box_idx + 2] = box[0] + box[2] / 2;
-  boxes[box_idx + 3] = box[1] + box[3] / 2;
-
-  if (clip_bbox) {
-    boxes[box_idx] = boxes[box_idx] > 0 ? boxes[box_idx] : static_cast<T>(0);
-    boxes[box_idx + 1] =
-        boxes[box_idx + 1] > 0 ? boxes[box_idx + 1] : static_cast<T>(0);
-    boxes[box_idx + 2] = boxes[box_idx + 2] < img_width - 1
-                             ? boxes[box_idx + 2]
-                             : static_cast<T>(img_width - 1);
-    boxes[box_idx + 3] = boxes[box_idx + 3] < img_height - 1
-                             ? boxes[box_idx + 3]
-                             : static_cast<T>(img_height - 1);
-  }
-}
-
-template <typename T>
-HOSTDEVICE inline void CalcLabelScore(T* scores, const T* input,
-                                      const int label_idx, const int score_idx,
-                                      const int class_num, const T conf,
-                                      const int stride) {
-  for (int i = 0; i < class_num; i++) {
-    scores[score_idx + i] = conf * sigmoid<T>(input[label_idx + i * stride]);
-  }
-}
-
-template <typename T>
-class YoloBoxKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* input = ctx.Input<Tensor>("X");
-    auto* imgsize = ctx.Input<Tensor>("ImgSize");
-    auto* boxes = ctx.Output<Tensor>("Boxes");
-    auto* scores = ctx.Output<Tensor>("Scores");
-    auto anchors = ctx.Attr<std::vector<int>>("anchors");
-    int class_num = ctx.Attr<int>("class_num");
-    float conf_thresh = ctx.Attr<float>("conf_thresh");
-    int downsample_ratio = ctx.Attr<int>("downsample_ratio");
-    bool clip_bbox = ctx.Attr<bool>("clip_bbox");
-    bool iou_aware = ctx.Attr<bool>("iou_aware");
-    float iou_aware_factor = ctx.Attr<float>("iou_aware_factor");
-    float scale = ctx.Attr<float>("scale_x_y");
-    float bias = -0.5 * (scale - 1.);
-
-    const int n = input->dims()[0];
-    const int h = input->dims()[2];
-    const int w = input->dims()[3];
-    const int box_num = boxes->dims()[1];
-    const int an_num = anchors.size() / 2;
-    int input_size_h = downsample_ratio * h;
-    int input_size_w = downsample_ratio * w;
-
-    const int stride = h * w;
-    const int an_stride = (class_num + 5) * stride;
-
-    Tensor anchors_;
-    auto anchors_data =
-        anchors_.mutable_data<int>({an_num * 2}, ctx.GetPlace());
-    std::copy(anchors.begin(), anchors.end(), anchors_data);
-
-    const T* input_data = input->data<T>();
-    const int* imgsize_data = imgsize->data<int>();
-    T* boxes_data = boxes->mutable_data<T>({n, box_num, 4}, ctx.GetPlace());
-    memset(boxes_data, 0, boxes->numel() * sizeof(T));
-    T* scores_data =
-        scores->mutable_data<T>({n, box_num, class_num}, ctx.GetPlace());
-    memset(scores_data, 0, scores->numel() * sizeof(T));
-
-    T box[4];
-    for (int i = 0; i < n; i++) {
-      int img_height = imgsize_data[2 * i];
-      int img_width = imgsize_data[2 * i + 1];
-
-      for (int j = 0; j < an_num; j++) {
-        for (int k = 0; k < h; k++) {
-          for (int l = 0; l < w; l++) {
-            int obj_idx = GetEntryIndex(i, j, k * w + l, an_num, an_stride,
-                                        stride, 4, iou_aware);
-            T conf = sigmoid<T>(input_data[obj_idx]);
-            if (iou_aware) {
-              int iou_idx =
-                  GetIoUIndex(i, j, k * w + l, an_num, an_stride, stride);
-              T iou = sigmoid<T>(input_data[iou_idx]);
-              conf = pow(conf, static_cast<T>(1. - iou_aware_factor)) *
-                     pow(iou, static_cast<T>(iou_aware_factor));
-            }
-            if (conf < conf_thresh) {
-              continue;
-            }
-
-            int box_idx = GetEntryIndex(i, j, k * w + l, an_num, an_stride,
-                                        stride, 0, iou_aware);
-            GetYoloBox<T>(box, input_data, anchors_data, l, k, j, h, w,
-                          input_size_h, input_size_w, box_idx, stride,
-                          img_height, img_width, scale, bias);
-            box_idx = (i * box_num + j * stride + k * w + l) * 4;
-            CalcDetectionBox<T>(boxes_data, box, box_idx, img_height, img_width,
-                                clip_bbox);
-
-            int label_idx = GetEntryIndex(i, j, k * w + l, an_num, an_stride,
-                                          stride, 5, iou_aware);
-            int score_idx = (i * box_num + j * stride + k * w + l) * class_num;
-            CalcLabelScore<T>(scores_data, input_data, label_idx, score_idx,
-                              class_num, conf, stride);
-          }
-        }
-      }
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/determinant_op.h b/paddle/fluid/operators/determinant_op.h
index 375ef4344f4741c947ef3134696d64cdae696780..f89ecd37222870f73d00870c9454bf5590d504e3 100644
--- a/paddle/fluid/operators/determinant_op.h
+++ b/paddle/fluid/operators/determinant_op.h
@@ -19,11 +19,17 @@
 #include <cmath>
 #include <vector>
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/matrix_inverse.h"
-#include "paddle/fluid/operators/svd_helper.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/for_range.h"
-#include "paddle/phi/kernels/funcs/complex_functors.h"
+#include "paddle/phi/kernels/complex_kernel.h"
+#include "paddle/phi/kernels/full_kernel.h"
+#include "paddle/phi/kernels/funcs/diag_functor.h"
+#include "paddle/phi/kernels/funcs/math_function.h"
+#include "paddle/phi/kernels/funcs/matrix_inverse.h"
+#include "paddle/phi/kernels/funcs/unsqueeze.h"
+#include "paddle/phi/kernels/math_kernel.h"
+#include "paddle/phi/kernels/matmul_kernel.h"
+#include "paddle/phi/kernels/transpose_kernel.h"
 
 namespace paddle {
 namespace operators {
@@ -172,7 +178,7 @@ template <typename DeviceContext, typename T>
 class DeterminantGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
-    auto& dev_ctx = context.template device_context<DeviceContext>();
+    auto& orig_dev_ctx = context.template device_context<DeviceContext>();
     const auto* input = context.Input<framework::Tensor>("Input");
     const auto* det = context.Input<framework::Tensor>("Out");
     const auto* grad =
@@ -200,15 +206,18 @@ class DeterminantGradKernel : public framework::OpKernel<T> {
       // checked in forward, pass
     }
 
+    auto& dev_ctx = static_cast<
+        const typename framework::ConvertToPhiContext<DeviceContext>::TYPE&>(
+        orig_dev_ctx);
+
     // Check Whether the matrix is invertible
     // (matrix A not invertible) == (det(A)=0)
     if (!CheckMatrixInvertible<DeviceContext, T>(context, det)) {
       // The matrix is not invertible
       VLOG(3) << "The input matrix not invertible!";
       ddet->Resize(input->dims());
-      ddet->mutable_data<T>(context.GetPlace());
-      phi::funcs::SetConstant<DeviceContext, T> zero;
-      zero(dev_ctx, ddet, static_cast<T>(0.0f));
+      phi::Full<T>(dev_ctx, phi::vectorize(input->dims()), static_cast<T>(0.0f),
+                   ddet);
       return;
     }
 
@@ -218,35 +227,35 @@ class DeterminantGradKernel : public framework::OpKernel<T> {
     // we set d|A| = unsqueeze(dA * |A|, [-1, -2]) * inverse(A).transpose(-2,
     // -1)
 
-    math::DeviceIndependenceTensorOperations<DeviceContext, T> helper(context);
-
     // First: inverse(A)
     framework::Tensor inverse_A;
     // A must be square matrices!
     inverse_A.Resize(input->dims());
     inverse_A.mutable_data<T>(context.GetPlace());
 
-    math::MatrixInverseFunctor<DeviceContext, T> mat_inv;
-    mat_inv(dev_ctx, *input, &inverse_A);
+    phi::funcs::MatrixInverseFunctor<DeviceContext, T> mat_inv;
+    mat_inv(orig_dev_ctx, *input, &inverse_A);
 
     VLOG(3) << "inverse(A) dims: " << inverse_A.dims();
 
     // Second: inverse(A).transpose(-2, -1)
-    framework::Tensor transpose_inverse_A = helper.Transpose(inverse_A);
+    framework::Tensor transpose_inverse_A =
+        phi::TransposeLast2Dim<T>(dev_ctx, inverse_A);
+
     VLOG(3) << "(dA * |A|).transpose(-2, -1) dims: "
             << transpose_inverse_A.dims();
 
     // Third: dA * |A|
-    auto mul_dA_detA = helper.Mul(*grad, *det);
+    auto mul_dA_detA = phi::Multiply<T>(dev_ctx, *grad, *det);
     VLOG(3) << "dA * |A| dims: " << mul_dA_detA.dims();
 
     // Fourth: unsqueeze(dA * |A|, [-1, -2])
-    auto unsqueeze1 = helper.Unsqueeze(mul_dA_detA, -1);
-    auto unsqueeze2 = helper.Unsqueeze(unsqueeze1, -2);
+    auto unsqueeze1 = phi::funcs::Unsqueeze(mul_dA_detA, -1);
+    auto unsqueeze2 = phi::funcs::Unsqueeze(unsqueeze1, -2);
     VLOG(3) << "unsqueezed(dA * |A|) dims: " << unsqueeze2.dims();
 
     // Finally: unsqueeze(dA * |A|) * inverse(A)
-    auto res = helper.Mul(unsqueeze2, transpose_inverse_A);
+    auto res = phi::Multiply<T>(dev_ctx, unsqueeze2, transpose_inverse_A);
 
     VLOG(3) << "unsqueeze(dA * |A|) * inverse(A) dims: " << res.dims();
 
@@ -331,7 +340,7 @@ template <typename DeviceContext, typename T>
 class SlogDeterminantGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
-    auto& dev_ctx = context.template device_context<DeviceContext>();
+    auto& orig_dev_ctx = context.template device_context<DeviceContext>();
     const auto* input = context.Input<framework::Tensor>("Input");
     const auto* slogdet = context.Input<framework::Tensor>("Out");
     const auto* grad =
@@ -353,6 +362,10 @@ class SlogDeterminantGradKernel : public framework::OpKernel<T> {
               input->dims().size() - grad->dims().size()));
     }
 
+    auto& dev_ctx = static_cast<
+        const typename framework::ConvertToPhiContext<DeviceContext>::TYPE&>(
+        orig_dev_ctx);
+
     // Check Whether the matrix is invertible
     // (matrix A not invertible) == (absslogdet(A)=0)
     auto slogdet_vec = slogdet->Split(1, 0);
@@ -361,9 +374,8 @@ class SlogDeterminantGradKernel : public framework::OpKernel<T> {
       // The matrix is not invertible
       VLOG(3) << "The input matrix not invertible!";
       dslogdet->Resize(input->dims());
-      dslogdet->mutable_data<T>(context.GetPlace());
-      phi::funcs::SetConstant<DeviceContext, T> zero;
-      zero(dev_ctx, dslogdet, std::numeric_limits<T>::quiet_NaN());
+      phi::Full<T>(dev_ctx, phi::vectorize(input->dims()),
+                   std::numeric_limits<T>::quiet_NaN(), dslogdet);
       return;
     }
 
@@ -373,34 +385,25 @@ class SlogDeterminantGradKernel : public framework::OpKernel<T> {
     // we set dsl|A| = unsqueeze(dslA, [-1, -2]) *
     // inverse(A).conj().transpose(-2, -1)
 
-    math::DeviceIndependenceTensorOperations<DeviceContext, T> helper(context);
-
     // First: inverse(A)
     framework::Tensor inverse_A;
     // A must be square matrices!
     inverse_A.Resize(input->dims());
     inverse_A.mutable_data<T>(context.GetPlace());
 
-    math::MatrixInverseFunctor<DeviceContext, T> mat_inv;
-    mat_inv(dev_ctx, *input, &inverse_A);
+    phi::funcs::MatrixInverseFunctor<DeviceContext, T> mat_inv;
+    mat_inv(orig_dev_ctx, *input, &inverse_A);
 
     VLOG(3) << "inverse(A) dims: " << inverse_A.dims();
 
     // Second: inverse(A).conj()
-    framework::Tensor conj_inverse_A;
-    conj_inverse_A.Resize(inverse_A.dims());
-    auto numel = input->numel();
-    auto* conj_data = conj_inverse_A.mutable_data<T>(context.GetPlace(),
-                                                     size_t(numel * sizeof(T)));
-
-    platform::ForRange<DeviceContext> for_range(dev_ctx, numel);
-    phi::funcs::ConjFunctor<T> functor(inverse_A.data<T>(), numel, conj_data);
-    for_range(functor);
+    auto conj_inverse_A = phi::Conj<T>(dev_ctx, inverse_A);
 
     VLOG(3) << "inverse(A).conj() dims: " << conj_inverse_A.dims();
 
     // Third: inverse(A).conj().transpose(-2, -1)
-    framework::Tensor transpose_inverse_A = helper.Transpose(conj_inverse_A);
+    framework::Tensor transpose_inverse_A =
+        phi::TransposeLast2Dim<T>(dev_ctx, conj_inverse_A);
     VLOG(3) << "inverse(A).conj().transpose(-2, -1) dims: "
             << transpose_inverse_A.dims();
 
@@ -417,12 +420,12 @@ class SlogDeterminantGradKernel : public framework::OpKernel<T> {
     det_grad.Resize(det_grad.dims().reshape(det_grad_vec));
 
     // Fifth: unsqueeze(dslA, [-1, -2])
-    auto unsqueeze1 = helper.Unsqueeze(det_grad, -1);
-    auto unsqueeze2 = helper.Unsqueeze(unsqueeze1, -2);
+    auto unsqueeze1 = phi::funcs::Unsqueeze(det_grad, -1);
+    auto unsqueeze2 = phi::funcs::Unsqueeze(unsqueeze1, -2);
     VLOG(3) << "unsqueezed(dslA, [-1, -2]) dims: " << unsqueeze2.dims();
 
     // Finally: unsqueeze(dslA) * inverse(A)
-    auto res = helper.Mul(unsqueeze2, transpose_inverse_A);
+    auto res = phi::Multiply<T>(dev_ctx, unsqueeze2, transpose_inverse_A);
     VLOG(3) << "unsqueeze(dslA) * inverse(A) dims: " << res.dims();
 
     framework::TensorCopy(res, context.GetPlace(), dslogdet);
diff --git a/paddle/fluid/operators/diag_v2_op.cc b/paddle/fluid/operators/diag_v2_op.cc
index 0160277dc79af50c555b1257e6ffa216b7b56b62..ac8c12bcd7ebaa6f47e8d3582887ac327a9f8957 100644
--- a/paddle/fluid/operators/diag_v2_op.cc
+++ b/paddle/fluid/operators/diag_v2_op.cc
@@ -12,8 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <algorithm>
-
 #include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/phi/infermeta/unary.h"
@@ -58,15 +56,56 @@ class DiagV2OpMaker : public framework::OpProtoAndCheckerMaker {
   }
 };
 
+class DiagV2GradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    OP_INOUT_CHECK(ctx->HasInput("X"), "X", "X", "DiagV2Grad");
+    OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("X")), "Output",
+                   framework::GradVarName("X"), "DiagV2Grad");
+
+    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    return framework::OpKernelType(OperatorWithKernel::IndicateVarDataType(
+                                       ctx, framework::GradVarName("Out")),
+                                   ctx.GetPlace());
+  }
+};
+
+template <typename T>
+class DiagV2GradOpMaker : public framework::SingleGradOpMaker<T> {
+ public:
+  using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
+
+ protected:
+  void Apply(GradOpPtr<T> grad_op) const override {
+    grad_op->SetType("diag_v2_grad");
+    grad_op->SetInput("X", this->Input("X"));
+    grad_op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out"));
+    grad_op->SetOutput(framework::GradVarName("X"), this->InputGrad("X"));
+    grad_op->SetAttrMap(this->Attrs());
+  }
+};
+
+DECLARE_NO_NEED_BUFFER_VARS_INFERER(DiagGradV2NoNeedBufferVarsInferer, "X");
+
 }  // namespace operators
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-DELCARE_INFER_SHAPE_FUNCTOR(diag_v2, DiagInferShapeFunctor,
-                            PT_INFER_META(phi::DiagInferMeta));
-
-REGISTER_OPERATOR(
-    diag_v2, ops::DiagV2Op, ops::DiagV2OpMaker,
-    paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
-    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
-    DiagInferShapeFunctor);
+
+DECLARE_INFER_SHAPE_FUNCTOR(diag_v2, DiagInferShapeFunctor,
+                            PD_INFER_META(phi::DiagInferMeta));
+
+REGISTER_OPERATOR(diag_v2, ops::DiagV2Op, ops::DiagV2OpMaker,
+                  ops::DiagV2GradOpMaker<paddle::framework::OpDesc>,
+                  ops::DiagV2GradOpMaker<paddle::imperative::OpBase>,
+                  DiagInferShapeFunctor);
+
+REGISTER_OPERATOR(diag_v2_grad, ops::DiagV2GradOp,
+                  ops::DiagGradV2NoNeedBufferVarsInferer);
diff --git a/paddle/fluid/operators/diagonal_op.cc b/paddle/fluid/operators/diagonal_op.cc
index b419f629a1e635c5a463b732af3003e93a5674d6..bf3cc941539eaeb2e03f53eb2465532469be5697 100644
--- a/paddle/fluid/operators/diagonal_op.cc
+++ b/paddle/fluid/operators/diagonal_op.cc
@@ -12,7 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/phi/core/infermeta_utils.h"
+#include "paddle/phi/infermeta/unary.h"
 
 namespace paddle {
 namespace operators {
@@ -20,74 +23,6 @@ namespace operators {
 class DiagonalOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "diagonal");
-    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "diagonal");
-
-    int offset_ = ctx->Attrs().Get<int>("offset");
-    int axis1 = ctx->Attrs().Get<int>("axis1");
-    int axis2 = ctx->Attrs().Get<int>("axis2");
-
-    auto x_dims = ctx->GetInputDim("Input");
-    int axis1_ = axis1 < 0 ? x_dims.size() + axis1 : axis1;
-    int axis2_ = axis2 < 0 ? x_dims.size() + axis2 : axis2;
-
-    PADDLE_ENFORCE_GE(
-        x_dims.size(), 2,
-        platform::errors::OutOfRange("Input's dim is out of range (expected at "
-                                     "least 2 dimensions, but got %ld).",
-                                     x_dims.size()));
-    PADDLE_ENFORCE_LT(
-        axis1_, x_dims.size(),
-        platform::errors::OutOfRange(
-            "Attr(axis1) is out of range (expected to be in range of [%ld, "
-            "%ld], but got %ld).",
-            -(x_dims.size()), (x_dims.size() - 1), axis1));
-    PADDLE_ENFORCE_LT(
-        axis2_, x_dims.size(),
-        platform::errors::OutOfRange(
-            "Attr(axis2) is out of range (expected to be in range of [%ld, "
-            "%ld], but got %ld).",
-            -(x_dims.size()), (x_dims.size() - 1), axis2));
-    PADDLE_ENFORCE_NE(axis1_, axis2_,
-                      platform::errors::InvalidArgument(
-                          "The dimensions should not be identical "
-                          "%d vs %d.",
-                          axis1, axis2));
-
-    auto out_dims = vectorize(x_dims);
-    // from out_dims get the dim size of axis1_.
-    auto axis1_size = out_dims[axis1_];
-    auto axis2_size = out_dims[axis2_];
-    // delete two dims by attr axis1 and axis2 from out_dims.
-    /* example:
-       out_dim = [2, 3, 4];
-       axis1 = 0;
-       axis2 = 1;
-       according to the attr of axis1 and axis2, we get:
-       out_dim = [4].
-    */
-    out_dims.erase(out_dims.begin() + std::max(axis1_, axis2_));
-    out_dims.erase(out_dims.begin() + std::min(axis1_, axis2_));
-
-    if (offset_ == 0) {
-      out_dims.push_back(std::min(axis1_size, axis2_size));
-    } else if (offset_ > 0) {
-      if ((axis2_size - offset_) > 0) {
-        out_dims.push_back(std::min(axis1_size, axis2_size - offset_));
-      } else {
-        out_dims.push_back(0);
-      }
-    } else {
-      if ((axis1_size + offset_) > 0) {
-        out_dims.push_back(std::min(axis1_size + offset_, axis2_size));
-      } else {
-        out_dims.push_back(0);
-      }
-    }
-    ctx->SetOutputDim("Out", phi::make_ddim(out_dims));
-  }
 };
 
 class DiagonalOpMaker : public framework::OpProtoAndCheckerMaker {
@@ -170,9 +105,13 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(DiagonalGradNoNeedBufferVarsInferer,
 
 namespace ops = paddle::operators;
 
+DECLARE_INFER_SHAPE_FUNCTOR(diagonal, DiagonalInferShapeFunctor,
+                            PD_INFER_META(phi::DiagonalInferMeta));
+
 REGISTER_OPERATOR(diagonal, ops::DiagonalOp, ops::DiagonalOpMaker,
                   ops::DiagonalGradOpMaker<paddle::framework::OpDesc>,
-                  ops::DiagonalGradOpMaker<paddle::imperative::OpBase>);
+                  ops::DiagonalGradOpMaker<paddle::imperative::OpBase>,
+                  DiagonalInferShapeFunctor);
 
 REGISTER_OPERATOR(diagonal_grad, ops::DiagonalGradOp,
                   ops::DiagonalGradNoNeedBufferVarsInferer)
diff --git a/paddle/fluid/operators/dist_op.cc b/paddle/fluid/operators/dist_op.cc
index 3a53f1365567f99c9446077f7939d87c156c9a08..55b2484941293c8db47ef847bea959ebe82ff3ae 100644
--- a/paddle/fluid/operators/dist_op.cc
+++ b/paddle/fluid/operators/dist_op.cc
@@ -12,10 +12,13 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/operators/dist_op.h"
 #include <string>
 #include <vector>
+#include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/phi/core/infermeta_utils.h"
+#include "paddle/phi/infermeta/binary.h"
+
 namespace paddle {
 namespace operators {
 
@@ -121,13 +124,11 @@ class DistGradOpMaker : public framework::SingleGradOpMaker<T> {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
+DECLARE_INFER_SHAPE_FUNCTOR(dist, DistInferShapeFunctor,
+                            PD_INFER_META(phi::DistInferMeta));
+
 REGISTER_OPERATOR(dist, ops::DistOp, ops::DistOpMaker,
                   ops::DistGradOpMaker<paddle::framework::OpDesc>,
-                  ops::DistGradOpMaker<paddle::imperative::OpBase>);
+                  ops::DistGradOpMaker<paddle::imperative::OpBase>,
+                  DistInferShapeFunctor);
 REGISTER_OPERATOR(dist_grad, ops::DistOpGrad);
-REGISTER_OP_CPU_KERNEL(
-    dist, ops::DistKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::DistKernel<paddle::platform::CPUDeviceContext, double>);
-REGISTER_OP_CPU_KERNEL(
-    dist_grad, ops::DistGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::DistGradKernel<paddle::platform::CPUDeviceContext, double>)
diff --git a/paddle/fluid/operators/dist_op.h b/paddle/fluid/operators/dist_op.h
deleted file mode 100644
index dfd7e29a8d0102261746ab47d3e1e805a674d7b1..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/dist_op.h
+++ /dev/null
@@ -1,304 +0,0 @@
-// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <math.h>
-#include <algorithm>
-#include <vector>
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/phi/kernels/funcs/math_function.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T, size_t D, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;
-using framework::Tensor;
-
-template <int Rank>
-static void GetBraodcastDims(const framework::DDim& x_dims,
-                             const framework::DDim& y_dims,
-                             Eigen::DSizes<int, Rank>* x_bcast_dims,
-                             Eigen::DSizes<int, Rank>* y_bcast_dims) {
-  int bcast_dims_remainder = 0;
-  for (int i = 0; i < x_dims.size(); ++i) {
-    if (x_dims[i] >= y_dims[i]) {
-      (*x_bcast_dims)[i] = 1;
-      (*y_bcast_dims)[i] = x_dims[i] / y_dims[i];
-      bcast_dims_remainder += x_dims[i] % y_dims[i];
-    } else {
-      (*y_bcast_dims)[i] = 1;
-      (*x_bcast_dims)[i] = y_dims[i] / x_dims[i];
-      bcast_dims_remainder += y_dims[i] % x_dims[i];
-    }
-  }
-  PADDLE_ENFORCE_EQ(bcast_dims_remainder, 0,
-                    platform::errors::PreconditionNotMet(
-                        "The input tensor of Op(dist) could not be broadcast, "
-                        "X's shape is [%s], Y's shape is [%s].",
-                        x_dims, y_dims));
-}
-
-static framework::DDim GetNewDims(const framework::DDim& in_dims, int rank) {
-  std::vector<int64_t> new_dims_vec(rank);
-  if (in_dims.size() < rank) {
-    for (int i = 0; i < rank - in_dims.size(); ++i) {
-      new_dims_vec[i] = 1;
-    }
-    for (int i = 0; i < in_dims.size(); ++i) {
-      new_dims_vec[i + rank - in_dims.size()] = in_dims[i];
-    }
-  } else {
-    new_dims_vec = vectorize(in_dims);
-  }
-  return phi::make_ddim(new_dims_vec);
-}
-
-template <typename DeviceContext, typename T, int Rank>
-static void DistFunction(const framework::ExecutionContext& context) {
-  auto* x = context.Input<Tensor>("X");
-  auto* y = context.Input<Tensor>("Y");
-  auto* out = context.Output<Tensor>("Out");
-  auto p = context.Attr<float>("p");
-  out->mutable_data<T>(context.GetPlace());
-
-  auto x_dims = context.Input<Tensor>("X")->dims();
-  auto y_dims = context.Input<Tensor>("Y")->dims();
-
-  // new dims with same size as rank, e.g. (rank=3, (4, 3) => (1, 4, 3))
-  framework::DDim x_new_dims = GetNewDims(x_dims, Rank);
-  framework::DDim y_new_dims = GetNewDims(y_dims, Rank);
-
-  auto x_t = EigenTensor<T, Rank>::From(*x, x_new_dims);
-  auto y_t = EigenTensor<T, Rank>::From(*y, y_new_dims);
-  auto out_t = EigenTensor<T, 1>::From(*out);
-  auto& place =
-      *context.template device_context<DeviceContext>().eigen_device();
-
-  Eigen::DSizes<int, Rank> x_bcast_dims;
-  Eigen::DSizes<int, Rank> y_bcast_dims;
-  GetBraodcastDims<Rank>(x_new_dims, y_new_dims, &x_bcast_dims, &y_bcast_dims);
-  // p=0 means number of non-zero elements of (x-y)
-  // p=inf means the maximum of |x-y|
-  // p=-inf means the minimum of |x-y|
-  // otherwise, Lp-norm = pow(sum(pow(|x-y|, p)), 1/p)
-  if (p == 0) {
-    out_t.device(place) =
-        (x_t.broadcast(x_bcast_dims) != y_t.broadcast(y_bcast_dims))
-            .template cast<T>()
-            .sum();
-  } else if (p == INFINITY) {
-    out_t.device(place) =
-        (x_t.broadcast(x_bcast_dims) - y_t.broadcast(y_bcast_dims))
-            .abs()
-            .maximum();
-  } else if (p == -INFINITY) {
-    out_t.device(place) =
-        (x_t.broadcast(x_bcast_dims) - y_t.broadcast(y_bcast_dims))
-            .abs()
-            .minimum();
-  } else {
-    out_t.device(place) =
-        (x_t.broadcast(x_bcast_dims) - y_t.broadcast(y_bcast_dims))
-            .abs()
-            .pow(p)
-            .sum()
-            .pow(1.0 / p);
-  }
-}
-
-template <typename DeviceContext, typename T, int Rank>
-static void DistGradFunction(const framework::ExecutionContext& context) {
-  auto* x = context.Input<Tensor>("X");
-  auto* y = context.Input<Tensor>("Y");
-  auto* out = context.Input<Tensor>("Out");
-  auto p = context.Attr<float>("p");
-
-  auto x_grad = context.Output<Tensor>(framework::GradVarName("X"));
-  auto y_grad = context.Output<Tensor>(framework::GradVarName("Y"));
-  auto out_grad = context.Input<Tensor>(framework::GradVarName("Out"));
-
-  auto x_dims = context.Input<Tensor>("X")->dims();
-  auto y_dims = context.Input<Tensor>("Y")->dims();
-  auto out_dims = context.Input<Tensor>("Out")->dims();
-
-  framework::DDim x_new_dims = GetNewDims(x_dims, Rank);
-  framework::DDim y_new_dims = GetNewDims(y_dims, Rank);
-  framework::DDim out_new_dims = GetNewDims(out_dims, Rank);
-  auto x_t = EigenTensor<T, Rank>::From(*x, x_new_dims);
-  auto y_t = EigenTensor<T, Rank>::From(*y, y_new_dims);
-  auto out_t = EigenTensor<T, Rank>::From(*out, out_new_dims);
-
-  Eigen::DSizes<int, Rank> x_bcast_dims;
-  Eigen::DSizes<int, Rank> y_bcast_dims;
-  Eigen::DSizes<int, Rank> out_bcast_dims;
-
-  GetBraodcastDims<Rank>(x_new_dims, y_new_dims, &x_bcast_dims, &y_bcast_dims);
-  std::vector<int64_t> new_dims_vec(Rank);
-  for (int i = 0; i < Rank; ++i) {
-    new_dims_vec[i] = std::max(x_new_dims[i], y_new_dims[i]);
-    out_bcast_dims[i] = new_dims_vec[i];
-  }
-  framework::DDim new_dims = phi::make_ddim(new_dims_vec);
-
-  auto& place =
-      *context.template device_context<DeviceContext>().eigen_device();
-  auto out_grad_t = EigenTensor<T, Rank>::From(*out_grad, out_new_dims);
-  framework::Tensor grad;
-  grad.mutable_data<T>(new_dims, context.GetPlace());
-  auto grad_t = EigenTensor<T, Rank>::From(grad);
-
-  auto x_minux_y = x_t.broadcast(x_bcast_dims) - y_t.broadcast(y_bcast_dims);
-  auto x_minux_y_abs = x_minux_y.abs();
-  auto sign =
-      (x_minux_y > static_cast<T>(0)).template cast<T>() * static_cast<T>(1.0) +
-      (x_minux_y < static_cast<T>(0)).template cast<T>() * static_cast<T>(-1.0);
-  T epsilon = static_cast<T>(1.0e-10f);
-
-  // 1: Lp-norm(z), z = x-y, compute dz
-  if (p == 0) {
-    phi::funcs::SetConstant<DeviceContext, T> set_zero;
-    auto& dev_ctx = context.template device_context<DeviceContext>();
-    set_zero(dev_ctx, &grad, static_cast<T>(0));
-  } else if (p == INFINITY || p == -INFINITY) {
-    // p=inf or -inf, Lp-norm = |z_i|, the j-th element of dz tends to 0 if
-    // j!=i, or equals to sign(z_i) * dout if j=i.
-    if (platform::is_cpu_place(context.GetPlace())) {
-      grad_t.device(place) = (x_minux_y_abs == out_t.broadcast(out_bcast_dims))
-                                 .template cast<T>() *
-                             sign.eval() * out_grad_t.broadcast(out_bcast_dims);
-    } else {
-      grad_t.device(place) = (x_minux_y_abs == out_t.broadcast(out_bcast_dims))
-                                 .template cast<T>() *
-                             sign * out_grad_t.broadcast(out_bcast_dims);
-    }
-  } else {
-    // dz = pow(abs(x-y)/out, p-1) * sign(x-y) * dout
-    if (platform::is_cpu_place(context.GetPlace())) {
-      grad_t.device(place) =
-          (x_minux_y_abs / (out_t + epsilon).broadcast(out_bcast_dims))
-              .pow(p - 1) *
-          sign.eval() * out_grad_t.broadcast(out_bcast_dims);
-    } else {
-      grad_t.device(place) =
-          (x_minux_y_abs / (out_t + epsilon).broadcast(out_bcast_dims))
-              .pow(p - 1) *
-          sign * out_grad_t.broadcast(out_bcast_dims);
-    }
-  }
-
-  Eigen::DSizes<int, Rank * 2> x_reshape_dims;
-  Eigen::DSizes<int, Rank * 2> y_reshape_dims;
-  Eigen::DSizes<int, Rank> reduce_dims;
-  for (int i = 0; i < x_new_dims.size(); ++i) {
-    x_reshape_dims[2 * i] = x_bcast_dims[i];
-    x_reshape_dims[2 * i + 1] = x_new_dims[i];
-    y_reshape_dims[2 * i] = y_bcast_dims[i];
-    y_reshape_dims[2 * i + 1] = y_new_dims[i];
-    reduce_dims[i] = 2 * i;
-  }
-
-  // 2: if x or y is broadcasted in forward function,
-  // the grad need to be sum along the broadcasted dimensions
-  if (x_grad) {
-    x_grad->mutable_data<T>(context.GetPlace());
-    auto x_grad_t = EigenTensor<T, Rank>::From(*x_grad, x_new_dims);
-    x_grad_t.device(place) = grad_t.reshape(x_reshape_dims)
-                                 .sum(reduce_dims)
-                                 .reshape(x_grad_t.dimensions());
-  }
-  if (y_grad) {
-    y_grad->mutable_data<T>(context.GetPlace());
-    auto y_grad_t = EigenTensor<T, Rank>::From(*y_grad, y_new_dims);
-    y_grad_t.device(place) = -grad_t.reshape(y_reshape_dims)
-                                  .sum(reduce_dims)
-                                  .reshape(y_grad_t.dimensions());
-  }
-}
-
-template <typename DeviceContext, typename T>
-class DistKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto x_rank = context.Input<Tensor>("X")->dims().size();
-    auto y_rank = context.Input<Tensor>("Y")->dims().size();
-    auto rank = std::max(x_rank, y_rank);
-    PADDLE_ENFORCE_LE(rank, 6,
-                      platform::errors::Unimplemented(
-                          "Op(dist) only support tensors with no more than 6 "
-                          "dimensions, but X's rank is %d, Y's rank is %d.",
-                          x_rank, y_rank));
-    switch (rank) {
-      case 1:
-        DistFunction<DeviceContext, T, 1>(context);
-        break;
-      case 2:
-        DistFunction<DeviceContext, T, 2>(context);
-        break;
-      case 3:
-        DistFunction<DeviceContext, T, 3>(context);
-        break;
-      case 4:
-        DistFunction<DeviceContext, T, 4>(context);
-        break;
-      case 5:
-        DistFunction<DeviceContext, T, 5>(context);
-        break;
-      case 6:
-        DistFunction<DeviceContext, T, 6>(context);
-        break;
-    }
-  }
-};
-
-template <typename DeviceContext, typename T>
-class DistGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto x_rank = context.Input<Tensor>("X")->dims().size();
-    auto y_rank = context.Input<Tensor>("Y")->dims().size();
-    auto rank = std::max(x_rank, y_rank);
-    PADDLE_ENFORCE_LE(rank, 6,
-                      platform::errors::Unimplemented(
-                          "Op(dist) only support tensors with no more than 6 "
-                          "dimensions, but X's rank is %d, Y's rank is %d.",
-                          x_rank, y_rank));
-    switch (rank) {
-      case 1:
-        DistGradFunction<DeviceContext, T, 1>(context);
-        break;
-      case 2:
-        DistGradFunction<DeviceContext, T, 2>(context);
-        break;
-      case 3:
-        DistGradFunction<DeviceContext, T, 3>(context);
-        break;
-      case 4:
-        DistGradFunction<DeviceContext, T, 4>(context);
-        break;
-      case 5:
-        DistGradFunction<DeviceContext, T, 5>(context);
-        break;
-      case 6:
-        DistGradFunction<DeviceContext, T, 6>(context);
-        break;
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/distribution_helper.h b/paddle/fluid/operators/distribution_helper.h
deleted file mode 100644
index c13bf687af23470d4595def6fb6fabf7385c999f..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/distribution_helper.h
+++ /dev/null
@@ -1,244 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#ifdef __NVCC__
-#include <curand_kernel.h>
-#endif
-#ifdef __HIPCC__
-#include <hiprand_kernel.h>
-#endif
-
-#include "paddle/fluid/framework/generator.h"
-#include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/platform/device/gpu/gpu_info.h"
-#include "paddle/fluid/platform/device_context.h"
-#include "paddle/fluid/platform/for_range.h"
-#include "paddle/phi/core/hostdevice.h"
-
-#if defined(__NVCC__) || defined(__HIPCC__)
-#include "paddle/phi/kernels/primitive/kernel_primitives.h"
-#endif
-
-#if !defined(_WIN32)
-#define UNLIKELY(condition) __builtin_expect(static_cast<bool>(condition), 0)
-#else
-// there is no equivalent intrinsics in msvc.
-#define UNLIKELY(condition) (condition)
-#endif
-
-namespace paddle {
-namespace distribution {
-
-using Tensor = framework::Tensor;
-
-/********************* Transformation Function **********************/
-template <typename T>
-struct exponential_transform {
-  explicit exponential_transform(T lambda) : lambda_(lambda) {}
-
-  HOSTDEVICE inline T operator()(T val) const {
-#if defined(__NVCC__) || defined(__HIPCC__)
-    if (std::is_same<T, double>::value) {
-      return static_cast<T>(-1.0) / lambda_ * log(val);
-    } else {
-      return static_cast<T>(-1.0) / lambda_ * __logf(val);
-    }
-#else
-    return static_cast<T>(-1.0) / lambda_ * std::log(static_cast<T>(1.0) - val);
-#endif
-  }
-
- private:
-  T lambda_;
-};
-
-template <typename T>
-struct uniform_transform {
-  explicit uniform_transform(T min, T max) : range_(max - min), min_(min) {}
-
-  HOSTDEVICE inline T operator()(T val) const {
-    if (UNLIKELY(val == static_cast<T>(1.0))) {
-      return min_;
-    } else {
-      return val * range_ + min_;
-    }
-  }
-
- private:
-  T range_;
-  T min_;
-};
-
-template <typename T>
-struct normal_transform {
-  explicit normal_transform(T mean, T std) : mean_(mean), std_(std) {}
-
-  HOSTDEVICE inline T operator()(T val) const { return val * std_ + mean_; }
-
- private:
-  T mean_;
-  T std_;
-};
-
-#if defined(__NVCC__) || defined(__HIPCC__)
-
-namespace kps = phi::kps;
-
-/*********************** Distribution Function *************************/
-template <typename T>
-struct uniform_distribution;
-
-template <typename T>
-struct normal_distribution;
-
-#if defined(__NVCC__)
-template <>
-struct uniform_distribution<float> {
-  __device__ inline float4 operator()(curandStatePhilox4_32_10_t *state) const {
-    return curand_uniform4(state);
-  }
-  static constexpr int kReturnsCount = 4;
-};
-
-template <>
-struct uniform_distribution<double> {
-  __device__ inline double2 operator()(
-      curandStatePhilox4_32_10_t *state) const {
-    return curand_uniform2_double(state);
-  }
-  static constexpr int kReturnsCount = 2;
-};
-
-template <>
-struct normal_distribution<float> {
-  __device__ inline float4 operator()(curandStatePhilox4_32_10_t *state) const {
-    return curand_normal4(state);
-  }
-  static constexpr int kReturnsCount = 4;
-};
-
-template <>
-struct normal_distribution<double> {
-  __device__ inline double2 operator()(
-      curandStatePhilox4_32_10_t *state) const {
-    return curand_normal2_double(state);
-  }
-  static constexpr int kReturnsCount = 2;
-};
-
-#else
-template <>
-struct uniform_distribution<float> {
-  __device__ inline float4 operator()(
-      hiprandStatePhilox4_32_10_t *state) const {
-    return hiprand_uniform4(state);
-  }
-  static constexpr int kReturnsCount = 4;
-};
-
-template <>
-struct uniform_distribution<double> {
-  __device__ inline double2 operator()(
-      hiprandStatePhilox4_32_10_t *state) const {
-    return hiprand_uniform2_double(state);
-  }
-  static constexpr int kReturnsCount = 2;
-};
-
-template <>
-struct normal_distribution<float> {
-  __device__ inline float4 operator()(
-      hiprandStatePhilox4_32_10_t *state) const {
-    return hiprand_normal4(state);
-  }
-  static constexpr int kReturnsCount = 4;
-};
-
-template <>
-struct normal_distribution<double> {
-  __device__ inline double2 operator()(
-      hiprandStatePhilox4_32_10_t *state) const {
-    return hiprand_normal2_double(state);
-  }
-  static constexpr int kReturnsCount = 2;
-};
-#endif
-
-/******** Launch GPU function of distribution and transformation *********/
-template <typename T, typename DistOp, typename TransformOp>
-__global__ void DistributionKernel(size_t size, uint64_t seed, uint64_t offset,
-                                   DistOp dist, TransformOp trans, T *out_data,
-                                   size_t stride) {
-  size_t idx = static_cast<size_t>(BLOCK_ID_X * BLOCK_NUM_X);
-  static constexpr int kCount = DistOp::kReturnsCount;
-#if defined(__NVCC__)
-  curandStatePhilox4_32_10_t state;
-  curand_init(seed, idx + THREAD_ID_X, offset, &state);
-  using SType = curandStatePhilox4_32_10_t;
-#else
-  hiprandStatePhilox4_32_10_t state;
-  hiprand_init(seed, idx + THREAD_ID_X, offset, &state);
-  using SType = hiprandStatePhilox4_32_10_t;
-#endif
-  size_t total_thread = GRID_NUM_X * BLOCK_NUM_X;
-  T args[kCount];
-  T result[kCount];
-  for (size_t i = idx; i < size; i += total_thread * kCount) {
-    kps::ElementwiseRandom<SType, T, kCount, 1, DistOp>(&args[0], dist, &state);
-    kps::ElementwiseUnary<T, T, kCount, 1, 1, TransformOp>(&result[0], &args[0],
-                                                           trans);
-    kps::WriteData<T, T, kCount, 1, 1, true>(out_data + i, &result[0], size - i,
-                                             1, stride, 1);
-    __syncthreads();
-  }
-}
-
-template <typename T, typename DistOp, typename TransformOp>
-void distribution_and_transform(const platform::CUDADeviceContext &dev_ctx,
-                                Tensor *out, DistOp dist, TransformOp trans) {
-  T *out_data = out->mutable_data<T>(dev_ctx.GetPlace());
-  auto size = out->numel();
-
-  int64_t device_id = dev_ctx.GetPlace().GetDeviceId();
-  auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id);
-
-  size_t block_size = 256;
-  size_t expect_grid_size = (size + block_size - 1) / block_size;
-  const auto &prop = platform::GetDeviceProperties(device_id);
-  size_t max_grid_size = (prop.maxThreadsPerMultiProcessor / block_size) *
-                         prop.multiProcessorCount;
-  size_t grid_size =
-      expect_grid_size > max_grid_size ? max_grid_size : expect_grid_size;
-
-  size_t total_thread = block_size * grid_size;
-  size_t curand4_loop_times =
-      (size + 4 * total_thread - 1) / (4 * total_thread);
-  // 'increment' shoulde be multiple of 4
-  uint64_t increment = curand4_loop_times * 4;
-
-  auto seed_offset = gen_cuda->IncrementOffset(increment);
-  uint64_t seed = seed_offset.first;
-  uint64_t offset = seed_offset.second;
-
-  DistributionKernel<
-      T, DistOp, TransformOp><<<grid_size, block_size, 0, dev_ctx.stream()>>>(
-      size, seed, offset, dist, trans, out_data, total_thread);
-}
-
-#endif
-
-}  // namespace distribution
-}  // namespace paddle
diff --git a/paddle/fluid/operators/dot_op.cc b/paddle/fluid/operators/dot_op.cc
index a86a3bb35927d53d20bef91a0bf36695a268c348..8efdd15781a6f2dab48c0680ba87c7b427dc60ec 100644
--- a/paddle/fluid/operators/dot_op.cc
+++ b/paddle/fluid/operators/dot_op.cc
@@ -101,8 +101,8 @@ class DotOpGradMaker : public framework::SingleGradOpMaker<T> {
 
 namespace ops = paddle::operators;
 
-DELCARE_INFER_SHAPE_FUNCTOR(dot, DotInferShapeFunctor,
-                            PT_INFER_META(phi::DotInferMeta));
+DECLARE_INFER_SHAPE_FUNCTOR(dot, DotInferShapeFunctor,
+                            PD_INFER_META(phi::DotInferMeta));
 
 REGISTER_OPERATOR(dot, ops::DotOp, ops::DotOpMaker,
                   ops::DotOpGradMaker<paddle::framework::OpDesc>,
diff --git a/paddle/fluid/operators/dropout_impl.cu.h b/paddle/fluid/operators/dropout_impl.cu.h
index 2fa956a2e6515e8b6a8e1c463c8ab8d1476f8d90..144198367d538e178a745c22902bb77a65f45fe4 100644
--- a/paddle/fluid/operators/dropout_impl.cu.h
+++ b/paddle/fluid/operators/dropout_impl.cu.h
@@ -32,10 +32,9 @@ limitations under the License. */
 #include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/operators/amp/fp16_type_traits.h"
 #include "paddle/fluid/operators/dropout_impl_util.h"
-#include "paddle/fluid/operators/dropout_op.h"
 #include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h"
 #include "paddle/fluid/platform/aligned_vector.h"
-#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
+#include "paddle/phi/backends/gpu/gpu_launch_config.h"
 #include "paddle/phi/kernels/funcs/functors.h"
 
 namespace paddle {
@@ -86,8 +85,8 @@ __global__ void VectorizedRandomGenerator(const size_t n, uint64_t seed,
                                           bool is_upscale_in_train,
                                           uint64_t increment) {
   using MT = typename details::MPTypeTrait<T>::Type;
-  using LoadT = platform::AlignedVector<T, VecSize>;
-  using MaskLoadT = platform::AlignedVector<MaskType, VecSize>;
+  using LoadT = phi::AlignedVector<T, VecSize>;
+  using MaskLoadT = phi::AlignedVector<MaskType, VecSize>;
 
 #ifdef PADDLE_WITH_HIP
   int64_t idx = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
@@ -102,7 +101,7 @@ __global__ void VectorizedRandomGenerator(const size_t n, uint64_t seed,
   MT factor = static_cast<MT>(1.0f / (1.0f - dropout_prob));
   for (int i = idx * VecSize; i < n; i += blockDim.x * gridDim.x * VecSize) {
     LoadT src_val;
-    platform::Load<T, VecSize>(&src[i], &src_val);
+    phi::Load<T, VecSize>(&src[i], &src_val);
 
 #ifdef PADDLE_WITH_HIP
     float4 rand = hiprand_uniform4(&state);
@@ -126,8 +125,8 @@ __global__ void VectorizedRandomGenerator(const size_t n, uint64_t seed,
       }
     }
 
-    platform::Store<T, VecSize>(dst_val, &dst[i]);
-    platform::Store<MaskType, VecSize>(mask_val, &mask[i]);
+    phi::Store<T, VecSize>(dst_val, &dst[i]);
+    phi::Store<MaskType, VecSize>(mask_val, &mask[i]);
   }
 }
 
@@ -153,16 +152,16 @@ __global__ void DropoutGradCUDAKernel(
     const typename details::MPTypeTrait<T>::Type factor, const int64_t size,
     T* dx) {
   using MT = typename details::MPTypeTrait<T>::Type;
-  using LoadT = platform::AlignedVector<T, VecSize>;
-  using MaskLoadT = platform::AlignedVector<MaskType, VecSize>;
+  using LoadT = phi::AlignedVector<T, VecSize>;
+  using MaskLoadT = phi::AlignedVector<MaskType, VecSize>;
 
   int64_t idx = blockDim.x * blockIdx.x + threadIdx.x;
   for (int i = idx * VecSize; i < size; i += blockDim.x * gridDim.x * VecSize) {
     LoadT dout_val;
-    platform::Load<T, VecSize>(&dout[i], &dout_val);
+    phi::Load<T, VecSize>(&dout[i], &dout_val);
 
     MaskLoadT mask_val;
-    platform::Load<MaskType, VecSize>(&mask[i], &mask_val);
+    phi::Load<MaskType, VecSize>(&mask[i], &mask_val);
 
     LoadT dx_val;
 
@@ -172,27 +171,28 @@ __global__ void DropoutGradCUDAKernel(
                                  static_cast<MT>(mask_val[j]) * factor);
     }
 
-    platform::Store<T, VecSize>(dx_val, &dx[i]);
+    phi::Store<T, VecSize>(dx_val, &dx[i]);
   }
 }
 
 template <typename T>
-void DropoutFwGPUKernelDriver(const platform::CUDADeviceContext& dev_ctx,
-                              bool is_test,
+void DropoutFwGPUKernelDriver(const phi::GPUContext& dev_ctx, bool is_test,
                               const std::string dropout_implementation,
                               float dropout_prob, bool upscale_in_train,
-                              bool is_fix_seed, int seed_val, const Tensor& x,
-                              const Tensor* seed, Tensor* mask, Tensor* y) {
+                              bool is_fix_seed, int seed_val,
+                              const framework::Tensor& x,
+                              const framework::Tensor* seed,
+                              framework::Tensor* mask, framework::Tensor* y) {
   auto& place = *dev_ctx.eigen_device();
+  int64_t x_numel = x.numel();
+  auto stream = dev_ctx.stream();
+  auto* x_data = x.data<T>();
+  auto* y_data = y->data<T>();
 
   if (!is_test) {
-    int64_t x_numel = x.numel();
-    auto stream = dev_ctx.stream();
     auto* mask_data = mask->data<uint8_t>();
     size_t size = phi::product(mask->dims());
 
-    auto* x_data = x.data<T>();
-    auto* y_data = y->data<T>();
     if (dropout_prob == 1.0f) {
 #ifdef PADDLE_WITH_HIP
       PADDLE_ENFORCE_GPU_SUCCESS(
@@ -219,8 +219,9 @@ void DropoutFwGPUKernelDriver(const platform::CUDADeviceContext& dev_ctx,
     uint64_t increment;
     // VectorizedRandomGenerator use curand_uniform4, so we only support
     // vec_size is 4;
-    int vec_size = (platform::GetVectorizedSize<T>(x_data) == 4) ? 4 : 1;
-    auto gpu_config = GetGpuLaunchConfig1D(dev_ctx, x_numel, vec_size);
+    int vec_size = (phi::GetVectorizedSize<T>(x_data) == 4) ? 4 : 1;
+    auto gpu_config =
+        phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, x_numel, vec_size);
     auto offset =
         ((x_numel - 1) / (gpu_config.GetThreadNum() * vec_size) + 1) * vec_size;
 
@@ -254,22 +255,37 @@ void DropoutFwGPUKernelDriver(const platform::CUDADeviceContext& dev_ctx,
     }
 #endif
   } else {
-    auto X = EigenMatrix<T>::Reshape(x, 1);
-    auto Y = EigenMatrix<T>::Reshape(*y, 1);
     if (upscale_in_train) {
-      Y.device(place) = X;
+// todo: can y share with data with x directly?
+#ifdef PADDLE_WITH_HIP
+      PADDLE_ENFORCE_GPU_SUCCESS(
+          hipMemcpyAsync(y_data, x_data, sizeof(T) * x_numel,
+                         hipMemcpyDeviceToDevice, stream));
+#else
+      PADDLE_ENFORCE_GPU_SUCCESS(
+          cudaMemcpyAsync(y_data, x_data, sizeof(T) * x_numel,
+                          cudaMemcpyDeviceToDevice, stream));
+#endif
     } else {
-      Y.device(place) = X * static_cast<T>(1.0f - dropout_prob);
+      using MT = typename details::MPTypeTrait<T>::Type;
+      MT factor = static_cast<MT>(1.0f - dropout_prob);
+      std::vector<const framework::Tensor*> ins = {&x};
+      std::vector<framework::Tensor*> outs = {y};
+      auto functor = phi::funcs::ScaleFunctor<T>(factor);
+      paddle::operators::LaunchSameDimsElementwiseCudaKernel<T>(dev_ctx, ins,
+                                                                &outs, functor);
     }
   }
 }
 
 template <typename T>
-void DropoutGradGPUKernelDriver(const platform::CUDADeviceContext& dev_ctx,
+void DropoutGradGPUKernelDriver(const phi::GPUContext& dev_ctx,
                                 const std::string dropout_implementation,
-                                float dropout_prob, const Tensor& grad_y,
-                                const Tensor& mask, int64_t size,
-                                Tensor* grad_x, bool is_test = false) {
+                                float dropout_prob,
+                                const framework::Tensor& grad_y,
+                                const framework::Tensor& mask, int64_t size,
+                                framework::Tensor* grad_x,
+                                bool is_test = false) {
   using MT = typename details::MPTypeTrait<T>::Type;
   auto stream = dev_ctx.stream();
   MT factor;
diff --git a/paddle/fluid/operators/dropout_impl_util.h b/paddle/fluid/operators/dropout_impl_util.h
index d7db7dddce3887ca25ea1df34048f15663b2e987..c62d45570ba291dc60120c393d21842cc6548c61 100644
--- a/paddle/fluid/operators/dropout_impl_util.h
+++ b/paddle/fluid/operators/dropout_impl_util.h
@@ -20,7 +20,7 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-inline void GetSeedDataAndIncrement(const platform::CUDADeviceContext& dev_ctx,
+inline void GetSeedDataAndIncrement(const phi::GPUContext& dev_ctx,
                                     const framework::Tensor* seed,
                                     const bool is_fix_seed, const int seed_val,
                                     const int offset, uint64_t* seed_data,
diff --git a/paddle/fluid/operators/dropout_op.cc b/paddle/fluid/operators/dropout_op.cc
index 7613b04bccfdc2084decc0b383eec199f7e10991..6d52ce45c4c10099dbeb4d4fadbf91f8c390ef46 100644
--- a/paddle/fluid/operators/dropout_op.cc
+++ b/paddle/fluid/operators/dropout_op.cc
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/dropout_op.h"
 #include <memory>
 #include <string>
+#include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
 namespace operators {
@@ -177,14 +177,3 @@ REGISTER_OPERATOR(dropout, ops::DropoutOp, ops::DropoutOpMaker,
                   ops::DropoutGradOpMaker<paddle::framework::OpDesc>,
                   ops::DropoutGradOpMaker<paddle::imperative::OpBase>);
 REGISTER_OPERATOR(dropout_grad, ops::DropoutOpGrad);
-REGISTER_OP_CPU_KERNEL(
-    dropout, ops::CPUDropoutKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::CPUDropoutKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::CPUDropoutKernel<paddle::platform::CPUDeviceContext,
-                          paddle::platform::bfloat16>);
-REGISTER_OP_CPU_KERNEL(
-    dropout_grad,
-    ops::DropoutGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::DropoutGradKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::DropoutGradKernel<paddle::platform::CPUDeviceContext,
-                           paddle::platform::bfloat16>);
diff --git a/paddle/fluid/operators/dropout_op.cu b/paddle/fluid/operators/dropout_op.cu
deleted file mode 100644
index f6ddff1d0327d3c7961781f875da69f89df1edec..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/dropout_op.cu
+++ /dev/null
@@ -1,94 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <string>
-
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/dropout_impl.cu.h"
-#include "paddle/fluid/operators/dropout_op.h"
-#include "paddle/fluid/platform/bfloat16.h"
-#include "paddle/fluid/platform/float16.h"
-
-namespace paddle {
-namespace operators {
-
-// It seems that Eigen::Tensor::setRandom in GPU will SEGFAULT.
-// Use std::random and thrust::random(thrust is a std library in CUDA) to
-// implement uniform random.
-template <typename Place, typename T>
-class GPUDropoutKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* x = context.Input<Tensor>("X");
-    auto* seed =
-        context.HasInput("Seed") ? context.Input<Tensor>("Seed") : nullptr;
-    auto* y = context.Output<Tensor>("Out");
-    y->mutable_data<T>(context.GetPlace());
-    float dropout_prob = context.Attr<float>("dropout_prob");
-
-    auto& dropout_implementation =
-        context.Attr<std::string>("dropout_implementation");
-    bool upscale_in_train = (dropout_implementation == "upscale_in_train");
-
-    bool is_test = context.Attr<bool>("is_test");
-
-    auto& dev_ctx = context.cuda_device_context();
-    auto* mask = context.Output<Tensor>("Mask");
-    mask->mutable_data<uint8_t>(context.GetPlace());
-
-    bool is_fix_seed = context.Attr<bool>("fix_seed");
-    int seed_val = context.Attr<int>("seed");
-    DropoutFwGPUKernelDriver<T>(dev_ctx, is_test, dropout_implementation,
-                                dropout_prob, upscale_in_train, is_fix_seed,
-                                seed_val, *x, seed, mask, y);
-  }
-};
-
-template <typename DeviceContext, typename T>
-class GPUDropoutGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* grad_x = context.Output<Tensor>(framework::GradVarName("X"));
-    auto* grad_y = context.Input<Tensor>(framework::GradVarName("Out"));
-    auto* mask = context.Input<Tensor>("Mask");
-    grad_x->mutable_data<T>(context.GetPlace());
-    auto size = grad_x->numel();
-    auto& dropout_implementation =
-        context.Attr<std::string>("dropout_implementation");
-    float dropout_prob = context.Attr<float>("dropout_prob");
-
-    bool is_test = context.Attr<bool>("is_test");
-
-    auto& dev_ctx =
-        context.template device_context<platform::CUDADeviceContext>();
-    DropoutGradGPUKernelDriver<T>(dev_ctx, dropout_implementation, dropout_prob,
-                                  *grad_y, *mask, size, grad_x, is_test);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-REGISTER_OP_CUDA_KERNEL(
-    dropout, ops::GPUDropoutKernel<plat::CUDADeviceContext, float>,
-    ops::GPUDropoutKernel<plat::CUDADeviceContext, plat::float16>,
-    ops::GPUDropoutKernel<plat::CUDADeviceContext, plat::bfloat16>,
-    ops::GPUDropoutKernel<plat::CUDADeviceContext, double>);
-REGISTER_OP_CUDA_KERNEL(
-    dropout_grad, ops::GPUDropoutGradKernel<plat::CUDADeviceContext, float>,
-    ops::GPUDropoutGradKernel<plat::CUDADeviceContext, plat::float16>,
-    ops::GPUDropoutGradKernel<plat::CUDADeviceContext, plat::bfloat16>,
-    ops::GPUDropoutGradKernel<plat::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/dropout_op.h b/paddle/fluid/operators/dropout_op.h
deleted file mode 100644
index ea6ed0e61947470c22f18e47acce2fca4cb9c41f..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/dropout_op.h
+++ /dev/null
@@ -1,151 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#pragma once
-
-#include <cstring>
-#include <random>
-#include <string>
-
-#include <algorithm>
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/generator.h"
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
-
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
-
-template <typename DeviceContext, typename T>
-class CPUDropoutKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* x = context.Input<Tensor>("X");
-    auto* seed =
-        context.HasInput("Seed") ? context.Input<Tensor>("Seed") : nullptr;
-    auto* y = context.Output<Tensor>("Out");
-    const auto* x_data = x->data<T>();
-    auto* y_data = y->mutable_data<T>(context.GetPlace());
-    float dropout_prob = context.Attr<float>("dropout_prob");
-
-    auto& dropout_implementation =
-        context.Attr<std::string>("dropout_implementation");
-    bool upscale_in_train = (dropout_implementation == "upscale_in_train");
-    if (!context.Attr<bool>("is_test")) {
-      auto* mask = context.Output<Tensor>("Mask");
-      auto* mask_data = mask->mutable_data<uint8_t>(context.GetPlace());
-      size_t size = phi::product(mask->dims());
-
-      // Special case when dropout_prob is 1.0
-      if (dropout_prob == 1.0f) {
-        std::memset(y_data, 0, size * sizeof(*y_data));        // NOLINT
-        std::memset(mask_data, 0, size * sizeof(*mask_data));  // NOLINT
-        return;
-      }
-      // std::minstd_rand engine;
-      // NOTE: fixed seed should only be used in unittest or for debug.
-      // Guarantee to use random seed in training.
-      int seed_data = 0;
-      if (seed) {
-        seed_data = *(seed->data<int>());
-      } else {
-        seed_data =
-            context.Attr<bool>("fix_seed") ? context.Attr<int>("seed") : 0;
-      }
-      auto engine = framework::GetCPURandomEngine(seed_data);
-
-      std::uniform_real_distribution<float> dist(0, 1);
-
-      for (size_t i = 0; i < size; ++i) {
-        if (dist(*engine) < dropout_prob) {
-          mask_data[i] = 0;
-          y_data[i] = 0;
-        } else {
-          mask_data[i] = 1;
-          if (upscale_in_train) {
-            y_data[i] = x_data[i] / static_cast<T>(1.0f - dropout_prob);
-          } else {
-            y_data[i] = x_data[i];
-          }
-        }
-      }
-    } else {
-      if (upscale_in_train) {
-        const auto* X_data = x->data<T>();
-        auto* Y_data = y->mutable_data<T>(context.GetPlace());
-#ifdef PADDLE_WITH_MKLML
-#pragma omp parallel for
-#endif
-        for (int i = 0; i < x->numel(); i++) {
-          Y_data[i] = X_data[i];
-        }
-      } else {
-        auto X = EigenMatrix<T>::Reshape(*x, 1);
-        auto Y = EigenMatrix<T>::Reshape(*y, 1);
-        auto& place =
-            *context.template device_context<DeviceContext>().eigen_device();
-        Y.device(place) = X * static_cast<T>(1.0f - dropout_prob);
-      }
-    }
-  }
-};
-template <typename DeviceContext, typename T>
-class DropoutGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* grad_x = context.Output<Tensor>(framework::GradVarName("X"));
-    auto* grad_y = context.Input<Tensor>(framework::GradVarName("Out"));
-    auto* mask = context.Input<Tensor>("Mask");
-    grad_x->mutable_data<T>(context.GetPlace());
-
-    auto dX = EigenVector<T>::Flatten(*grad_x);
-    auto dY = EigenVector<T>::Flatten(*grad_y);
-
-    auto& place =
-        *context.template device_context<DeviceContext>().eigen_device();
-    auto& dropout_implementation =
-        context.Attr<std::string>("dropout_implementation");
-    if (context.Attr<bool>("is_test") == true) {
-      if (dropout_implementation == "upscale_in_train") {
-        dX.device(place) = static_cast<T>(1) * dY;
-      } else {
-        float dropout_prob = context.Attr<float>("dropout_prob");
-        dX.device(place) = dY * static_cast<T>(1.0f - dropout_prob);
-      }
-    } else {
-      auto M = EigenVector<uint8_t>::Flatten(*mask);
-      if (dropout_implementation == "upscale_in_train") {
-        float dropout_prob = context.Attr<float>("dropout_prob");
-        if (dropout_prob == 1.0f) {
-          dX.device(place) = static_cast<T>(0) * dY;
-        } else {
-          dX.device(place) =
-              dY * M.cast<T>() / static_cast<T>(1.0f - dropout_prob);
-        }
-      } else {
-        dX.device(place) = dY * M.cast<T>();
-      }
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/dropout_op_npu.cc b/paddle/fluid/operators/dropout_op_npu.cc
index 6aae566760623c666f3ce82a890a119e3e173390..07b3b5381162575cbfc03dd8cc10d0c88a2d21e8 100644
--- a/paddle/fluid/operators/dropout_op_npu.cc
+++ b/paddle/fluid/operators/dropout_op_npu.cc
@@ -15,8 +15,8 @@ limitations under the License. */
 #include <memory>
 #include <string>
 
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/tensor_util.h"
-#include "paddle/fluid/operators/dropout_op.h"
 #include "paddle/fluid/platform/device/npu/npu_op_runner.h"
 #include "paddle/phi/core/ddim.h"
 
diff --git a/paddle/fluid/operators/dropout_op_test.cc b/paddle/fluid/operators/dropout_op_test.cc
index 206d9a6c5e9c9869216f0a6c137accc931aa2a77..bdf08646f1d8b94d6d8d141d8a9fa9864cdc937b 100644
--- a/paddle/fluid/operators/dropout_op_test.cc
+++ b/paddle/fluid/operators/dropout_op_test.cc
@@ -24,14 +24,13 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/operators/dropout_op.h"
 #include "paddle/fluid/string/printf.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
 namespace f = paddle::framework;
 namespace p = paddle::platform;
 
-USE_OP(dropout);
+USE_OP_ITSELF(dropout);
 
 void Compare(f::Scope* scope, const p::DeviceContext& ctx) {
   // init
diff --git a/paddle/fluid/operators/dropout_op_xpu.cc b/paddle/fluid/operators/dropout_op_xpu.cc
index 07b7e2cc7c09b09d6640f49fce438d58d0cc9cf2..7d8660f238abc8446b2988aad24a64c565e01ef9 100644
--- a/paddle/fluid/operators/dropout_op_xpu.cc
+++ b/paddle/fluid/operators/dropout_op_xpu.cc
@@ -8,15 +8,17 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include "paddle/fluid/operators/dropout_op.h"
+
 #include <memory>
 #include <string>
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/device/device_wrapper.h"
 namespace paddle {
 namespace operators {
 
 #ifdef PADDLE_WITH_XPU
 
+using Tensor = framework::Tensor;
 template <typename DeviceContext, typename T>
 class DropoutXPUKernel : public framework::OpKernel<T> {
   using XPUTyp = typename XPUTypeTrait<T>::Type;
diff --git a/paddle/fluid/operators/eig_op.h b/paddle/fluid/operators/eig_op.h
index e9c6c1eb7eced7b74294b679ce88a7eb76e90440..5e4c83e1a45ebdb96a0e764cfa2d3997442ae1ea 100644
--- a/paddle/fluid/operators/eig_op.h
+++ b/paddle/fluid/operators/eig_op.h
@@ -18,12 +18,19 @@
 #include <algorithm>
 #include <complex>
 #include "paddle/fluid/operators/math/matrix_solve.h"
-#include "paddle/fluid/operators/svd_helper.h"
 #include "paddle/fluid/operators/transpose_op.h"
 #include "paddle/fluid/platform/for_range.h"
+#include "paddle/phi/kernels/complex_kernel.h"
 #include "paddle/phi/kernels/funcs/complex_functors.h"
+#include "paddle/phi/kernels/funcs/diag_functor.h"
 #include "paddle/phi/kernels/funcs/lapack/lapack_function.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
+#include "paddle/phi/kernels/funcs/slice.h"
+#include "paddle/phi/kernels/funcs/unsqueeze.h"
+#include "paddle/phi/kernels/math_kernel.h"
+#include "paddle/phi/kernels/matmul_kernel.h"
+#include "paddle/phi/kernels/transpose_kernel.h"
+
 #define EPSILON 1e-6
 
 namespace paddle {
@@ -214,12 +221,17 @@ class EigKernel : public framework::OpKernel<T> {
 
       ApplyEigKernel<DeviceContext, phi::dtype::Real<T>>(
           *x, &real_values, &real_vectors, context);
-      auto dito = math::DeviceIndependenceTensorOperations<
-          DeviceContext, phi::dtype::Real<T>, Tout>(context);
+
+      auto& orig_dev_ctx = context.template device_context<DeviceContext>();
+      auto& dev_ctx = static_cast<
+          const typename framework::ConvertToPhiContext<DeviceContext>::TYPE&>(
+          orig_dev_ctx);
 
       // 1. extract real part & imag part from real_values
-      Tensor real_part = dito.Slice(real_values, {-1}, {0}, {order});
-      Tensor imag_part = dito.Slice(real_values, {-1}, {order}, {order * 2});
+      Tensor real_part =
+          phi::funcs::Slice<T>(dev_ctx, real_values, {-1}, {0}, {order});
+      Tensor imag_part = phi::funcs::Slice<T>(dev_ctx, real_values, {-1},
+                                              {order}, {order * 2});
 
       // 2. construct complex values
       auto* real_part_data = real_part.data<phi::dtype::Real<T>>();
@@ -233,7 +245,8 @@ class EigKernel : public framework::OpKernel<T> {
       for_range(functor);
 
       // 3. construct complex vectors
-      Tensor real_vector_trans = dito.Transpose(real_vectors);
+      Tensor real_vector_trans =
+          phi::TransposeLast2Dim<T>(dev_ctx, real_vectors);
       Tensor out_vectors_trans;
       out_vectors_trans.mutable_data<Tout>(x->dims(), context.GetPlace());
       ConstructComplexVectors<phi::dtype::Real<T>, Tout>(
@@ -251,45 +264,48 @@ class EigKernel : public framework::OpKernel<T> {
   }
 };
 
-template <typename DeviceContext, typename Tout>
+template <typename DeviceContext, typename T>
 void ComputeBackwardForComplexInput(
     const Tensor& V, const Tensor& L, const Tensor& gL, const Tensor& gV,
-    Tout* x_grad_data, int batch_count, int order,
+    T* x_grad_data, int batch_count, int order,
     const framework::ExecutionContext& context) {
-  auto dito =
-      math::DeviceIndependenceTensorOperations<DeviceContext, Tout, Tout>(
-          context);
-
-  Tensor trans_v = dito.Transpose(V);
-  Tensor Vh = dito.Conj(trans_v);
-  Tensor Lconj = dito.Conj(L);
-  Tensor Econj = dito.Sub(dito.Unsqueeze(Lconj, -2), dito.Unsqueeze(Lconj, -1));
-  Tensor VhgV = dito.Matmul(Vh, gV);
-  Tensor diag_real = dito.Real(VhgV);
-  Tensor diag_res = dito.BatchDiag(diag_real, batch_count);
-  Tensor diag_unsqueezed = dito.Unsqueeze(diag_res, -2);
+  auto& orig_dev_ctx = context.template device_context<DeviceContext>();
+  auto& dev_ctx = static_cast<
+      const typename framework::ConvertToPhiContext<DeviceContext>::TYPE&>(
+      orig_dev_ctx);
+
+  Tensor trans_v = phi::TransposeLast2Dim<T>(dev_ctx, V);
+  Tensor Vh = phi::Conj<T>(dev_ctx, trans_v);
+  Tensor Lconj = phi::Conj<T>(dev_ctx, L);
+  Tensor Econj = phi::Subtract<T>(dev_ctx, phi::funcs::Unsqueeze(Lconj, -2),
+                                  phi::funcs::Unsqueeze(Lconj, -1));
+  Tensor VhgV = phi::Matmul<T>(dev_ctx, Vh, gV);
+  Tensor diag_real = phi::Real<T>(dev_ctx, VhgV);
+  Tensor diag_res = phi::funcs::BatchDiag<T>(dev_ctx, diag_real, batch_count);
+  Tensor diag_unsqueezed = phi::funcs::Unsqueeze(diag_res, -2);
 
   // turn diag_unsqueezed into complex
   auto numel = diag_unsqueezed.numel();
   Tensor diag_unsqueezed_complex;
-  auto* data_diag_un = diag_unsqueezed.data<phi::dtype::Real<Tout>>();
-  auto* data_diag_un_com = diag_unsqueezed_complex.mutable_data<Tout>(
+  auto* data_diag_un = diag_unsqueezed.data<phi::dtype::Real<T>>();
+  auto* data_diag_un_com = diag_unsqueezed_complex.mutable_data<T>(
       diag_unsqueezed.dims(), context.GetPlace(),
-      static_cast<size_t>(numel * sizeof(Tout)));
-  auto& dev_ctx = context.template device_context<DeviceContext>();
-  platform::ForRange<DeviceContext> for_range(dev_ctx, numel);
-  phi::funcs::RealToComplexFunctor<Tout> functor(data_diag_un, data_diag_un_com,
-                                                 numel);
+      static_cast<size_t>(numel * sizeof(T)));
+
+  platform::ForRange<DeviceContext> for_range(orig_dev_ctx, numel);
+  phi::funcs::RealToComplexFunctor<T> functor(data_diag_un, data_diag_un_com,
+                                              numel);
   for_range(functor);
   // real tensor multiply complex tensor in broadcast manner
-  Tensor res1 = dito.RealMulComplex(V, diag_unsqueezed_complex);
-  Tensor res2 = dito.Matmul(Vh, res1);
-  Tensor result = dito.Sub(VhgV, res2);
+  Tensor res1 = phi::Multiply<T>(dev_ctx, V, diag_unsqueezed_complex);
+  Tensor res2 = phi::Matmul<T>(dev_ctx, Vh, res1);
+  Tensor result = phi::Subtract<T>(dev_ctx, VhgV, res2);
 
-  result.mutable_data<Tout>(V.dims(), context.GetPlace());
-  result = dito.Div(result, Econj);
-  result = dito.DiagFill(order, order, order, 0, gL, result);
-  Tensor rhs = dito.Matmul(result, Vh);
+  result.mutable_data<T>(V.dims(), context.GetPlace());
+  result = phi::Divide<T>(dev_ctx, result, Econj);
+  result =
+      phi::funcs::DiagFill<T, T>(dev_ctx, order, order, order, 0, gL, result);
+  Tensor rhs = phi::Matmul<T>(dev_ctx, result, Vh);
 
   // solve linear system
   // solve(Vh, rhs, out, m, k)
@@ -298,10 +314,10 @@ void ComputeBackwardForComplexInput(
   // x_grad: out
   int m = Vh.dims()[Vh.dims().size() - 1];
   int k = rhs.dims()[rhs.dims().size() - 1];
-  auto* matrix_data = Vh.data<Tout>();
-  auto* rhs_data = rhs.data<Tout>();
-  math::SolveLinearSystem<Tout>(matrix_data, rhs_data, x_grad_data, m, k,
-                                batch_count);
+  auto* matrix_data = Vh.data<T>();
+  auto* rhs_data = rhs.data<T>();
+  math::SolveLinearSystem<T>(matrix_data, rhs_data, x_grad_data, m, k,
+                             batch_count);
 }
 
 template <typename DeviceContext, typename T, typename Tout>
diff --git a/paddle/fluid/operators/eigh_op.cc b/paddle/fluid/operators/eigh_op.cc
index 553d0e679cc6ddebd68c3edbc2de70209364bb53..4e33c567eb6d12fc504bfd76bc83072836feda21 100644
--- a/paddle/fluid/operators/eigh_op.cc
+++ b/paddle/fluid/operators/eigh_op.cc
@@ -12,7 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/eigh_op.h"
+#include "paddle/fluid/framework/infershape_utils.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/phi/core/infermeta_utils.h"
+#include "paddle/phi/infermeta/unary.h"
 
 namespace paddle {
 namespace operators {
@@ -22,42 +25,9 @@ using framework::Tensor;
 class EighOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Eigh");
-    OP_INOUT_CHECK(ctx->HasOutput("Eigenvalues"), "Output", "Eigenvalues",
-                   "Eigh");
-    OP_INOUT_CHECK(ctx->HasOutput("Eigenvectors"), "Output", "Eigenvectors",
-                   "Eigh");
-
-    auto input_dim = ctx->GetInputDim("X");
-    auto rank = input_dim.size();
-
-    PADDLE_ENFORCE_GE(rank, 2,
-                      platform::errors::InvalidArgument(
-                          "The Input(X) should have at least 2 dimensions."
-                          "But received a %d dimension tensor.",
-                          rank));
-    PADDLE_ENFORCE_EQ(
-        input_dim[rank - 2], input_dim[rank - 1],
-        platform::errors::InvalidArgument(
-            "Eigh op is designed for square matrix, consequently"
-            "inner-most 2 dimensions of Input(X) should be symmetric."
-            "But received X's shape[-2] = %d and shape[-1] = %d.",
-            input_dim[rank - 2], input_dim[rank - 1]));
-
-    std::vector<int64_t> values_dim;
-
-    for (auto i = 0; i < rank - 1; i++) {
-      values_dim.emplace_back(input_dim[i]);
-    }
-
-    ctx->SetOutputDim("Eigenvalues", phi::make_ddim(values_dim));
-    ctx->SetOutputDim("Eigenvectors", input_dim);
-  }
 };
 
-class EignOpMaker : public framework::OpProtoAndCheckerMaker {
+class EighOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() override {
     AddInput("X",
@@ -140,24 +110,11 @@ class EighGradOpMaker : public framework::SingleGradOpMaker<T> {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
+DECLARE_INFER_SHAPE_FUNCTOR(eigh, EighInferShapeFunctor,
+                            PD_INFER_META(phi::EighInferMeta));
 
-REGISTER_OPERATOR(eigh, ops::EighOp, ops::EignOpMaker,
+REGISTER_OPERATOR(eigh, ops::EighOp, ops::EighOpMaker,
                   ops::EighGradOpMaker<paddle::framework::OpDesc>,
-                  ops::EighGradOpMaker<paddle::imperative::OpBase>);
+                  ops::EighGradOpMaker<paddle::imperative::OpBase>,
+                  EighInferShapeFunctor);
 REGISTER_OPERATOR(eigh_grad, ops::EighGradOp);
-
-REGISTER_OP_CPU_KERNEL(
-    eigh, ops::EighKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::EighKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::EighKernel<paddle::platform::CPUDeviceContext,
-                    paddle::platform::complex<float>>,
-    ops::EighKernel<paddle::platform::CPUDeviceContext,
-                    paddle::platform::complex<double>>);
-
-REGISTER_OP_CPU_KERNEL(
-    eigh_grad, ops::EighGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::EighGradKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::EighGradKernel<paddle::platform::CPUDeviceContext,
-                        paddle::platform::complex<float>>,
-    ops::EighGradKernel<paddle::platform::CPUDeviceContext,
-                        paddle::platform::complex<double>>);
diff --git a/paddle/fluid/operators/eigh_op.cu b/paddle/fluid/operators/eigh_op.cu
deleted file mode 100644
index 827c551637d4df24529508ff37e6a92f157658a0..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/eigh_op.cu
+++ /dev/null
@@ -1,32 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/eigh_op.h"
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    eigh, ops::EighKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::EighKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::EighKernel<paddle::platform::CUDADeviceContext,
-                    paddle::platform::complex<float>>,
-    ops::EighKernel<paddle::platform::CUDADeviceContext,
-                    paddle::platform::complex<double>>);
-
-REGISTER_OP_CUDA_KERNEL(
-    eigh_grad, ops::EighGradKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::EighGradKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::EighGradKernel<paddle::platform::CUDADeviceContext,
-                        paddle::platform::complex<float>>,
-    ops::EighGradKernel<paddle::platform::CUDADeviceContext,
-                        paddle::platform::complex<double>>);
diff --git a/paddle/fluid/operators/eigh_op.h b/paddle/fluid/operators/eigh_op.h
deleted file mode 100644
index 5279ec750935c9b1b01584e893cc5e5f85d4a75c..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/eigh_op.h
+++ /dev/null
@@ -1,74 +0,0 @@
-// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/eigen_values_vectors.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-
-template <typename DeviceContext, typename T>
-class EighKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto input = ctx.Input<Tensor>("X");
-    auto output_w = ctx.Output<Tensor>("Eigenvalues");
-    auto output_v = ctx.Output<Tensor>("Eigenvectors");
-    std::string lower = ctx.Attr<std::string>("UPLO");
-    bool is_lower = (lower == "L");
-    math::MatrixEighFunctor<DeviceContext, T> functor;
-    functor(ctx, *input, output_w, output_v, is_lower, true);
-  }
-};
-
-template <typename DeviceContext, typename T>
-class EighGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    using ValueType = phi::dtype::Real<T>;
-    auto& x_grad = *ctx.Output<framework::Tensor>(framework::GradVarName("X"));
-    x_grad.mutable_data<T>(ctx.GetPlace());
-    auto& output_w = *ctx.Input<Tensor>("Eigenvalues");
-    auto& output_v = *ctx.Input<Tensor>("Eigenvectors");
-    auto& output_w_grad =
-        *ctx.Input<Tensor>(framework::GradVarName("Eigenvalues"));
-    auto& output_v_grad =
-        *ctx.Input<Tensor>(framework::GradVarName("Eigenvectors"));
-
-    auto& dims = output_v.dims();
-    const int m = dims[dims.size() - 1];
-    auto dito =
-        math::DeviceIndependenceTensorOperations<DeviceContext, T, ValueType>(
-            ctx);
-    auto tV = dito.Transpose(dito.Conj(output_v));
-    auto W = dito.template Sub<ValueType>(dito.Unsqueeze(output_w, -2),
-                                          dito.Unsqueeze(output_w, -1));
-    Tensor result = dito.Matmul(tV, output_v_grad);
-    result.mutable_data<T>(dims, ctx.GetPlace());
-    std::vector<int> out_shape = phi::vectorize<int>(dims);
-    auto constant = dito.Fill(out_shape, 0.5);
-    result = dito.Sub(result, dito.Conj(dito.Transpose(result)));
-    result = dito.Mul(result, constant);
-    result = dito.Div(result, W);
-    result = dito.DiagFill(m, m, m, 0, output_w_grad, result);
-    x_grad = dito.Matmul(output_v, dito.Matmul(result, tV));
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/elementwise/elementwise_div_op.cc b/paddle/fluid/operators/elementwise/elementwise_div_op.cc
index 38cd232e4d1d2237cb5da014d11ba69a91cbe917..13fd9b81a8765aea140ad6ca2fc0383151a51dc7 100644
--- a/paddle/fluid/operators/elementwise/elementwise_div_op.cc
+++ b/paddle/fluid/operators/elementwise/elementwise_div_op.cc
@@ -102,42 +102,6 @@ REGISTER_OPERATOR(
 REGISTER_OPERATOR(elementwise_div_grad_grad, ops::ElementwiseDivOpDoubleGrad,
                   ops::ElementwiseDoubleGradOpInplaceInferer);
 
-REGISTER_OP_CPU_KERNEL(
-    elementwise_div,
-    ops::ElementwiseDivKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::ElementwiseDivKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::ElementwiseDivKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::ElementwiseDivKernel<paddle::platform::CPUDeviceContext, int64_t>,
-    ops::ElementwiseDivKernel<paddle::platform::CPUDeviceContext,
-                              paddle::platform::complex<float>>,
-    ops::ElementwiseDivKernel<paddle::platform::CPUDeviceContext,
-                              paddle::platform::complex<double>>);
-REGISTER_OP_CPU_KERNEL(
-    elementwise_div_grad,
-    ops::ElementwiseDivGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::ElementwiseDivGradKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::ElementwiseDivGradKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::ElementwiseDivGradKernel<paddle::platform::CPUDeviceContext, int64_t>,
-    ops::ElementwiseDivGradKernel<paddle::platform::CPUDeviceContext,
-                                  paddle::platform::complex<float>>,
-    ops::ElementwiseDivGradKernel<paddle::platform::CPUDeviceContext,
-                                  paddle::platform::complex<double>>);
-
-REGISTER_OP_CPU_KERNEL(
-    elementwise_div_grad_grad,
-    ops::ElementwiseDivDoubleGradKernel<paddle::platform::CPUDeviceContext,
-                                        float>,
-    ops::ElementwiseDivDoubleGradKernel<paddle::platform::CPUDeviceContext,
-                                        double>,
-    ops::ElementwiseDivDoubleGradKernel<paddle::platform::CPUDeviceContext,
-                                        int>,
-    ops::ElementwiseDivDoubleGradKernel<paddle::platform::CPUDeviceContext,
-                                        int64_t>,
-    ops::ElementwiseDivDoubleGradKernel<paddle::platform::CPUDeviceContext,
-                                        paddle::platform::complex<float>>,
-    ops::ElementwiseDivDoubleGradKernel<paddle::platform::CPUDeviceContext,
-                                        paddle::platform::complex<double>>);
-
 REGISTER_OP_VERSION(elementwise_div)
     .AddCheckpoint(
         R"ROC(Register elementwise_div for adding the attribute of Scale_y)ROC",
diff --git a/paddle/fluid/operators/elementwise/elementwise_div_op.cu b/paddle/fluid/operators/elementwise/elementwise_div_op.cu
deleted file mode 100644
index 9eb4b0352e5337e3fdd758d2e95cfa61d1d62724..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/elementwise/elementwise_div_op.cu
+++ /dev/null
@@ -1,96 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/elementwise/elementwise_div_op.h"
-
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-
-namespace paddle {
-namespace operators {
-
-template <typename DeviceContext, typename T>
-typename std::enable_if<
-    std::is_same<DeviceContext, platform::CUDADeviceContext>::value>::type
-ElementwiseDivGrad(const framework::ExecutionContext& ctx,
-                   const framework::Tensor* x, const framework::Tensor* y,
-                   const framework::Tensor* out, const framework::Tensor* dout,
-                   framework::Tensor* dx, framework::Tensor* dy) {
-  int axis = ctx.Attr<int>("axis");
-  const auto& dev_ctx = ctx.template device_context<DeviceContext>();
-  const auto place = ctx.GetPlace();
-  if (dx != nullptr && dy != nullptr) {
-    std::vector<const framework::Tensor*> ins = {dout, out, y};
-    GetGradXAndYOut<ElementwiseType::kTernary, T>(
-        dev_ctx, place, axis, ins, dout, dx, dy, DivGradXYFunctor<T, T>());
-  } else if (dx != nullptr && dy == nullptr) {
-    std::vector<const framework::Tensor*> ins = {dout, y};
-    GetGradXOrYOut<ElementwiseType::kBinary, T>(dev_ctx, place, axis, ins, dout,
-                                                dx, DivGradXFunctor<T>());
-  } else if (dy != nullptr && dx == nullptr) {
-    std::vector<const framework::Tensor*> ins = {dout, out, y};
-    GetGradXOrYOut<ElementwiseType::kTernary, T>(
-        dev_ctx, place, axis, ins, dout, dy, DivGradYFunctor<T>());
-  }
-}
-
-}  // namespace operators
-}  // namespace paddle
-
-REGISTER_OP_CUDA_KERNEL(
-    elementwise_div,
-    ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext,
-                              paddle::platform::float16>,
-    ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext,
-                              paddle::platform::bfloat16>,
-    ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext, int64_t>,
-    ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext,
-                              paddle::platform::complex<float>>,
-    ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext,
-                              paddle::platform::complex<double>>);
-REGISTER_OP_CUDA_KERNEL(
-    elementwise_div_grad,
-    ops::ElementwiseDivGradKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::ElementwiseDivGradKernel<paddle::platform::CUDADeviceContext,
-                                  paddle::platform::float16>,
-    ops::ElementwiseDivGradKernel<paddle::platform::CUDADeviceContext,
-                                  paddle::platform::bfloat16>,
-    ops::ElementwiseDivGradKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::ElementwiseDivGradKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::ElementwiseDivGradKernel<paddle::platform::CUDADeviceContext, int64_t>,
-    ops::ElementwiseDivGradKernel<paddle::platform::CUDADeviceContext,
-                                  paddle::platform::complex<float>>,
-    ops::ElementwiseDivGradKernel<paddle::platform::CUDADeviceContext,
-                                  paddle::platform::complex<double>>);
-REGISTER_OP_CUDA_KERNEL(
-    elementwise_div_grad_grad,
-    ops::ElementwiseDivDoubleGradKernel<paddle::platform::CUDADeviceContext,
-                                        float>,
-    ops::ElementwiseDivDoubleGradKernel<paddle::platform::CUDADeviceContext,
-                                        paddle::platform::float16>,
-    ops::ElementwiseDivDoubleGradKernel<paddle::platform::CUDADeviceContext,
-                                        paddle::platform::bfloat16>,
-    ops::ElementwiseDivDoubleGradKernel<paddle::platform::CUDADeviceContext,
-                                        double>,
-    ops::ElementwiseDivDoubleGradKernel<paddle::platform::CUDADeviceContext,
-                                        int>,
-    ops::ElementwiseDivDoubleGradKernel<paddle::platform::CUDADeviceContext,
-                                        int64_t>,
-    ops::ElementwiseDivDoubleGradKernel<paddle::platform::CUDADeviceContext,
-                                        paddle::platform::complex<float>>,
-    ops::ElementwiseDivDoubleGradKernel<paddle::platform::CUDADeviceContext,
-                                        paddle::platform::complex<double>>);
diff --git a/paddle/fluid/operators/elementwise/elementwise_div_op.h b/paddle/fluid/operators/elementwise/elementwise_div_op.h
index c58a7f36548a57a1c8e7770fa282470fba4cc140..e9adb9abdb528c187817be641b81ffb6f64833b0 100644
--- a/paddle/fluid/operators/elementwise/elementwise_div_op.h
+++ b/paddle/fluid/operators/elementwise/elementwise_div_op.h
@@ -20,142 +20,6 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-template <typename DeviceContext, typename T>
-void default_elementwise_sub(const framework::ExecutionContext& ctx,
-                             const framework::Tensor* x,
-                             const framework::Tensor* y, framework::Tensor* z) {
-  int axis = ctx.Attr<int>("axis");
-  auto x_dims = x->dims();
-  auto y_dims = y->dims();
-  if (x_dims.size() >= y_dims.size()) {
-    ElementwiseComputeEx<SubFunctor<T>, DeviceContext, T>(ctx, x, y, axis,
-                                                          SubFunctor<T>(), z);
-  } else {
-    ElementwiseComputeEx<InverseSubFunctor<T>, DeviceContext, T>(
-        ctx, x, y, axis, InverseSubFunctor<T>(), z);
-  }
-}
-
-template <typename DeviceContext, typename T>
-void default_elementwise_div(const framework::ExecutionContext& ctx,
-                             const framework::Tensor* x,
-                             const framework::Tensor* y, framework::Tensor* z) {
-  int axis = ctx.Attr<int>("axis");
-  auto x_dims = x->dims();
-  auto y_dims = y->dims();
-  if (x_dims.size() >= y_dims.size()) {
-    ElementwiseComputeEx<DivFunctor<T>, DeviceContext, T>(ctx, x, y, axis,
-                                                          DivFunctor<T>(), z);
-  } else {
-    ElementwiseComputeEx<InverseDivFunctor<T>, DeviceContext, T>(
-        ctx, x, y, axis, InverseDivFunctor<T>(), z);
-  }
-}
-
-template <typename DeviceContext, typename T>
-class ElementwiseDivKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* x = ctx.Input<framework::LoDTensor>("X");
-    auto* y = ctx.Input<framework::LoDTensor>("Y");
-    auto* z = ctx.Output<framework::LoDTensor>("Out");
-    z->mutable_data<T>(ctx.GetPlace());
-
-    auto& dev_ctx = ctx.device_context<DeviceContext>();
-    int axis = ctx.Attr<int>("axis");
-    auto pt_x = paddle::experimental::MakePhiDenseTensor(*x);
-    auto pt_y = paddle::experimental::MakePhiDenseTensor(*y);
-    auto pt_z = paddle::experimental::MakePhiDenseTensor(*z);
-    phi::DivideRawKernel<T>(
-        static_cast<const typename framework::ConvertToPhiContext<
-            DeviceContext>::TYPE&>(dev_ctx),
-        *pt_x.get(), *pt_y.get(), axis, pt_z.get());
-  }
-};
-
-template <typename T>
-struct DivGradDX {
-  HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout / y; }
-};
-
-template <typename T>
-struct DivGradDX<paddle::platform::complex<T>> {
-  HOSTDEVICE paddle::platform::complex<T> operator()(
-      paddle::platform::complex<T> x, paddle::platform::complex<T> y,
-      paddle::platform::complex<T> out,
-      paddle::platform::complex<T> dout) const {
-    paddle::platform::complex<T> y_conj(y.real, -y.imag);
-    return dout / y_conj;
-  }
-};
-
-template <typename T>
-struct DivGradDY {
-  HOSTDEVICE T operator()(T x, T y, T out, T dout) const {
-    return -dout * out / y;
-  }
-};
-
-template <typename T>
-struct DivGradDY<paddle::platform::complex<T>> {
-  HOSTDEVICE paddle::platform::complex<T> operator()(
-      paddle::platform::complex<T> x, paddle::platform::complex<T> y,
-      paddle::platform::complex<T> out,
-      paddle::platform::complex<T> dout) const {
-    paddle::platform::complex<T> out_div_y_conj((out / y).real,
-                                                -(out / y).imag);
-    return -dout * out_div_y_conj;
-  }
-};
-
-template <typename T>
-struct DivDoubleDY {
-  HOSTDEVICE T operator()(T x, T y, T out, T dout) const {
-    return y * out * dout - x * dout;
-  }
-};
-
-template <typename DeviceContext, typename T>
-typename std::enable_if<
-    std::is_same<DeviceContext, platform::CPUDeviceContext>::value>::type
-ElementwiseDivGrad(const framework::ExecutionContext& ctx,
-                   const framework::Tensor* x, const framework::Tensor* y,
-                   const framework::Tensor* out, const framework::Tensor* dout,
-                   framework::Tensor* dx, framework::Tensor* dy) {
-  int axis = ctx.Attr<int>("axis");
-
-  ElemwiseGradCompute<DeviceContext, T, DivGradDX<T>, DivGradDY<T>>(
-      ctx, *x, *y, *out, *dout, axis, dx, dy, DivGradDX<T>(), DivGradDY<T>());
-}
-
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-template <typename DeviceContext, typename T>
-typename std::enable_if<
-    std::is_same<DeviceContext, platform::CUDADeviceContext>::value>::type
-ElementwiseDivGrad(const framework::ExecutionContext& ctx,
-                   const framework::Tensor* x, const framework::Tensor* y,
-                   const framework::Tensor* out, const framework::Tensor* dout,
-                   framework::Tensor* dx, framework::Tensor* dy);
-#endif
-
-template <typename DeviceContext, typename T>
-class ElementwiseDivGradKernel : public ElemwiseGradKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    ElemwiseGradKernel<T>::Compute(ctx);
-    using Tensor = framework::Tensor;
-
-    auto* x = ctx.Input<Tensor>("X");
-    auto* y = ctx.Input<Tensor>("Y");
-    auto* out = ctx.Input<Tensor>("Out");
-    auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
-    auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
-    auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
-
-    ElementwiseDivGrad<DeviceContext, T>(ctx, x, y, out, dout, dx, dy);
-  }
-};
-
 class ElementwiseDivOpDoubleGrad : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
@@ -206,80 +70,5 @@ class ElementwiseDivOpDoubleGrad : public framework::OperatorWithKernel {
   }
 };
 
-template <typename DeviceContext, typename T>
-class ElementwiseDivDoubleGradKernel : public framework::OpKernel<T> {
-  using Tensor = framework::Tensor;
-
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* Y = ctx.Input<Tensor>("Y");
-    auto* Out = ctx.Input<Tensor>("Out");
-    auto* ddX = ctx.Input<Tensor>("DDX");
-    auto* ddY = ctx.Input<Tensor>("DDY");
-    auto* dX = ctx.Input<Tensor>("DX");
-
-    auto* dY = ctx.Output<Tensor>(framework::GradVarName("Y"));
-    auto* dOut = ctx.Output<Tensor>("DOut");
-    auto* ddOut = ctx.Output<Tensor>("DDOut");
-
-    int axis = ctx.Attr<int>("axis");
-
-    if (dY) dY->mutable_data<T>(Y->dims(), ctx.GetPlace());
-    if (dOut) dOut->mutable_data<T>(Out->dims(), ctx.GetPlace());
-    if (ddOut) ddOut->mutable_data<T>(Out->dims(), ctx.GetPlace());
-
-    // ddX_safe == null ? 0 : ddX
-    // ddY_safe == null ? 0 : ddY
-    Tensor ddX_safe, ddY_safe;
-    GetDoubleGradSafeTensor<DeviceContext, T>(ctx, dX, ddX, &ddX_safe);
-    GetDoubleGradSafeTensor<DeviceContext, T>(ctx, Y, ddY, &ddY_safe);
-
-    // ddOut = ddX / Y - Out * ddY / Y = (ddX - Out * ddY) / Y
-    // dY = Out * dX * ddY / Y - dX * ddX / Y
-    // dOut = - dX * ddY
-    // To save memory, (1) dout can be used as 'tmp' tensor, (2) ddout can
-    // inplace ddx
-    Tensor tmp;
-    if (dOut) {
-      tmp = *dOut;
-    } else {
-      auto& dev_ctx = ctx.template device_context<DeviceContext>();
-      tmp = ctx.AllocateTmpTensor<T, DeviceContext>(Out->dims(), dev_ctx);
-    }
-    if (dY) {
-      // dX_div_Y = dX / Y;
-      Tensor dX_div_Y = tmp;
-      default_elementwise_div<DeviceContext, T>(ctx, dX, Y, &dX_div_Y);
-
-      // NOTE(dengkaipeng): in the following ElemwiseGradCompute, for the
-      // first output tensor is nullptr, the branch to calculate first
-      // output tensor will not be activated, DivGradDx function will not
-      // be called and can be ignored, the first branch has little effect
-      // on running speed.
-
-      // dY = Out * dX * ddY / Y - dX * ddX / Y
-      ElemwiseGradCompute<DeviceContext, T, DivGradDX<T>, DivDoubleDY<T>>(
-          ctx, ddX_safe, ddY_safe, *Out, dX_div_Y, axis, nullptr, dY,
-          DivGradDX<T>(), DivDoubleDY<T>());
-    }
-
-    if (ddOut) {
-      // ddOut = ddX / Y - Out * ddY / Y = (ddX - Out * ddY) / Y
-      default_elementwise_mul<DeviceContext, T>(ctx, Out, &ddY_safe, &tmp);
-      default_elementwise_sub<DeviceContext, T>(ctx, &ddX_safe, &tmp, &tmp);
-      default_elementwise_div<DeviceContext, T>(ctx, &tmp, Y, ddOut);
-    }
-
-    if (dOut) {
-      // dOut = - dX * ddY
-      default_elementwise_mul<DeviceContext, T>(ctx, dX, &ddY_safe, dOut);
-      auto& place =
-          *ctx.template device_context<DeviceContext>().eigen_device();
-      auto dout = framework::EigenVector<T>::Flatten(*dOut);
-      dout.device(place) = static_cast<T>(-1) * dout;
-    }
-  }
-};
-
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/elementwise/elementwise_functor.h b/paddle/fluid/operators/elementwise/elementwise_functor.h
index 86f5be3071c2d1a84f13da1cef74787003e633bb..54931d99292f9d1453e2a3deb72e75ed63c9f46f 100644
--- a/paddle/fluid/operators/elementwise/elementwise_functor.h
+++ b/paddle/fluid/operators/elementwise/elementwise_functor.h
@@ -1,11 +1,8 @@
 /* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
-
     http://www.apache.org/licenses/LICENSE-2.0
-
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -90,147 +87,6 @@ struct MinFunctor {
 template <typename T>
 using Complex = paddle::platform::complex<T>;
 
-template <typename InT, typename OutT>
-struct DivGradXYFunctor {
-  inline HOSTDEVICE phi::Array<OutT, 2> operator()(const InT a, const InT b,
-                                                   const InT c) {
-    // dx = dout / y
-    // dy = - dout * out / y
-    phi::Array<OutT, 2> outs;
-    outs[0] = a / c;
-    outs[1] = -a * b / c;
-    return outs;
-  }
-};
-
-template <typename InT, typename OutT>
-struct DivGradXYFunctor<Complex<InT>, Complex<OutT>> {
-  inline HOSTDEVICE phi::Array<Complex<OutT>, 2> operator()(
-      const Complex<InT> a, const Complex<InT> b, const Complex<InT> c) {
-    phi::Array<Complex<OutT>, 2> outs;
-    Complex<InT> c_conj(c.real, -c.imag);
-    Complex<InT> out_div_c_conj((b / c).real, -(b / c).imag);
-    outs[0] = a / c_conj;
-    outs[1] = -a * out_div_c_conj;
-    return outs;
-  }
-};
-
-// Float div grad
-template <typename T>
-struct DivGradXFunctor {
-  inline HOSTDEVICE T operator()(const T a, const T b) const { return a / b; }
-};
-
-// Complex div grad
-template <typename T>
-struct DivGradXFunctor<Complex<T>> {
-  inline HOSTDEVICE Complex<T> operator()(const Complex<T> a,
-                                          const Complex<T> b) const {
-    Complex<T> b_conj(b.real, -b.imag);
-    return a / b_conj;
-  }
-};
-
-// Float mul and div
-template <typename T>
-struct DivGradYFunctor {
-  inline HOSTDEVICE T operator()(const T a, const T b, const T c) const {
-    return -a * b / c;
-  }
-};
-
-// Complex mul and div
-template <typename T>
-struct DivGradYFunctor<Complex<T>> {
-  inline HOSTDEVICE Complex<T> operator()(const Complex<T> a,
-                                          const Complex<T> b,
-                                          const Complex<T> c) const {
-    Complex<T> out_div_c_conj((b / c).real, -(b / c).imag);
-    return -a * out_div_c_conj;
-  }
-};
-
-// Fmax
-template <typename T>
-struct FMaxFunctor {
-  inline HOSTDEVICE T operator()(const T a, const T b) const {
-    return std::fmax(a, b);
-  }
-};
-
-template <>
-struct FMaxFunctor<paddle::platform::float16> {
-  inline HOSTDEVICE paddle::platform::float16 operator()(
-      const paddle::platform::float16 a,
-      const paddle::platform::float16 b) const {
-    float float_a = static_cast<float>(a);
-    float float_b = static_cast<float>(b);
-    auto result = std::fmax(float_a, float_b);
-    return static_cast<paddle::platform::float16>(result);
-  }
-};
-
-template <>
-struct FMaxFunctor<int> {
-  inline HOSTDEVICE int operator()(const int a, const int b) const {
-    float float_a = static_cast<float>(a);
-    float float_b = static_cast<float>(b);
-    auto result = std::fmax(float_a, float_b);
-    return std::lrint(result);
-  }
-};
-
-template <>
-struct FMaxFunctor<int64_t> {
-  inline HOSTDEVICE int64_t operator()(const int64_t a, const int64_t b) const {
-    double double_a = static_cast<double>(a);
-    double double_b = static_cast<double>(b);
-    auto result = std::fmax(double_a, double_b);
-    return std::llrint(result);
-  }
-};
-
-// Fmin
-template <typename T>
-struct FMinFunctor {
-  inline HOSTDEVICE T operator()(const T a, const T b) const {
-    return std::fmin(a, b);
-  }
-};
-
-template <>
-struct FMinFunctor<paddle::platform::float16> {
-  inline HOSTDEVICE paddle::platform::float16 operator()(
-      const paddle::platform::float16 a,
-      const paddle::platform::float16 b) const {
-    float float_a = static_cast<float>(a);
-    float float_b = static_cast<float>(b);
-    auto result = std::fmin(float_a, float_b);
-    return static_cast<paddle::platform::float16>(result);
-  }
-};
-
-template <>
-struct FMinFunctor<int> {
-  inline HOSTDEVICE int operator()(const int a, const int b) const {
-    float float_a = static_cast<float>(a);
-    float float_b = static_cast<float>(b);
-    auto result = std::fmin(float_a, float_b);
-    return std::lrint(result);
-  }
-};
-
-template <>
-struct FMinFunctor<int64_t> {
-  inline HOSTDEVICE int64_t operator()(const int64_t a, const int64_t b) const {
-    double double_a = static_cast<double>(a);
-    double double_b = static_cast<double>(b);
-    auto result = std::fmin(double_a, double_b);
-    return std::llrint(result);
-  }
-};
-
 template <typename T>
 struct MinGradXFunctor {
   inline HOSTDEVICE T operator()(const T x, const T y, const T dout) const {
@@ -257,47 +113,6 @@ struct MinGradXYFunctor {
   }
 };
 
-template <typename T>
-struct MulGradFunctor {
-  inline HOSTDEVICE T operator()(const T a, const T b) const { return a * b; }
-};
-template <typename T>
-struct MulGradFunctor<Complex<T>> {
-  inline HOSTDEVICE Complex<T> operator()(const Complex<T> a,
-                                          const Complex<T> b) const {
-    Complex<T> b_conj(b.real, -b.imag);
-    return a * b_conj;
-  }
-};
-
-template <typename InT, typename OutT>
-struct MulGradXYFunctor {
-  inline HOSTDEVICE phi::Array<OutT, 2> operator()(const InT a, const InT b,
-                                                   const InT c) {
-    phi::Array<OutT, 2> outs;
-    // dx = dout * y
-    outs[0] = a * b;
-    // dy = dout * x
-    outs[1] = a * c;
-    return outs;
-  }
-};
-
-template <typename InT, typename OutT>
-struct MulGradXYFunctor<Complex<InT>, Complex<OutT>> {
-  inline HOSTDEVICE phi::Array<Complex<OutT>, 2> operator()(
-      const Complex<InT> a, const Complex<InT> b, const Complex<InT> c) {
-    phi::Array<Complex<OutT>, 2> outs;
-    // dx = dout * y
-    Complex<InT> b_conj(b.real, -b.imag);
-    outs[0] = a * b_conj;
-    // dy = dout * x
-    Complex<InT> c_conj(c.real, -c.imag);
-    outs[1] = a * c_conj;
-    return outs;
-  }
-};
-
 // Ternary compare
 template <typename T>
 struct MaxGradXFunctor {
diff --git a/paddle/fluid/operators/elementwise/elementwise_max_op.cc b/paddle/fluid/operators/elementwise/elementwise_max_op.cc
index 91da732ef0d3dfbda5d9b7734071ec5831bcfa3f..d91315cc511aa80c0e9c44ccc688b2746eac764e 100644
--- a/paddle/fluid/operators/elementwise/elementwise_max_op.cc
+++ b/paddle/fluid/operators/elementwise/elementwise_max_op.cc
@@ -151,21 +151,3 @@ REGISTER_OPERATOR(elementwise_fmax, ops::ElementwiseOp,
                   ops::ElementwiseFMaxGradOpMaker<paddle::imperative::OpBase>);
 
 REGISTER_OPERATOR(elementwise_fmax_grad, ops::ElementwiseOpGrad);
-
-REGISTER_OP_CPU_KERNEL(
-    elementwise_fmax,
-    ops::ElementwiseFMaxKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::ElementwiseFMaxKernel<paddle::platform::CPUDeviceContext,
-                               paddle::platform::float16>,
-    ops::ElementwiseFMaxKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::ElementwiseFMaxKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::ElementwiseFMaxKernel<paddle::platform::CPUDeviceContext, int64_t>);
-REGISTER_OP_CPU_KERNEL(
-    elementwise_fmax_grad,
-    ops::ElementwiseFMaxGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::ElementwiseFMaxGradKernel<paddle::platform::CPUDeviceContext,
-                                   paddle::platform::float16>,
-    ops::ElementwiseFMaxGradKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::ElementwiseFMaxGradKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::ElementwiseFMaxGradKernel<paddle::platform::CPUDeviceContext,
-                                   int64_t>);
diff --git a/paddle/fluid/operators/elementwise/elementwise_max_op.cu b/paddle/fluid/operators/elementwise/elementwise_max_op.cu
index 123332a4a23de5c9534c8523993b87d8738f9869..0d5f56fda17322d86ef13990e9fc2432816dc9cb 100644
--- a/paddle/fluid/operators/elementwise/elementwise_max_op.cu
+++ b/paddle/fluid/operators/elementwise/elementwise_max_op.cu
@@ -86,21 +86,3 @@ REGISTER_OP_CUDA_KERNEL(
     ops::ElementwiseMaxGradKernel<paddle::platform::CUDADeviceContext, int>,
     ops::ElementwiseMaxGradKernel<paddle::platform::CUDADeviceContext,
                                   int64_t>);
-
-REGISTER_OP_CUDA_KERNEL(
-    elementwise_fmax,
-    ops::ElementwiseFMaxKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::ElementwiseFMaxKernel<paddle::platform::CUDADeviceContext,
-                               paddle::platform::float16>,
-    ops::ElementwiseFMaxKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::ElementwiseFMaxKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::ElementwiseFMaxKernel<paddle::platform::CUDADeviceContext, int64_t>);
-REGISTER_OP_CUDA_KERNEL(
-    elementwise_fmax_grad,
-    ops::ElementwiseFMaxGradKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::ElementwiseFMaxGradKernel<paddle::platform::CUDADeviceContext,
-                                   paddle::platform::float16>,
-    ops::ElementwiseFMaxGradKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::ElementwiseFMaxGradKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::ElementwiseFMaxGradKernel<paddle::platform::CUDADeviceContext,
-                                   int64_t>);
diff --git a/paddle/fluid/operators/elementwise/elementwise_max_op.h b/paddle/fluid/operators/elementwise/elementwise_max_op.h
index cff30be50a3d14c646cb7d13d6d8aeeb3de250f4..afe1073d89a06618af95490ac6d264073bd930d4 100644
--- a/paddle/fluid/operators/elementwise/elementwise_max_op.h
+++ b/paddle/fluid/operators/elementwise/elementwise_max_op.h
@@ -35,21 +35,6 @@ class ElementwiseMaxKernel : public framework::OpKernel<T> {
   }
 };
 
-template <typename DeviceContext, typename T>
-class ElementwiseFMaxKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* x = ctx.Input<framework::LoDTensor>("X");
-    auto* y = ctx.Input<framework::LoDTensor>("Y");
-    auto* z = ctx.Output<framework::LoDTensor>("Out");
-
-    z->mutable_data<T>(ctx.GetPlace());
-    int axis = ctx.Attr<int>("axis");
-    ElementwiseComputeEx<FMaxFunctor<T>, DeviceContext, T>(ctx, x, y, axis,
-                                                           FMaxFunctor<T>(), z);
-  }
-};
-
 template <typename T>
 struct MaxGradDx {
   HOSTDEVICE T operator()(T x, T y, T out, T dout) const {
@@ -104,88 +89,5 @@ class ElementwiseMaxGradKernel : public ElemwiseGradKernel<T> {
   }
 };
 
-template <typename T>
-struct FMaxGradDx {
-  HOSTDEVICE T operator()(T x, T y, T out, T dout) const {
-    return dout * static_cast<T>((x >= y) || isnan(y));
-  }
-};
-
-template <>
-struct FMaxGradDx<paddle::platform::float16> {
-  HOSTDEVICE paddle::platform::float16 operator()(
-      paddle::platform::float16 x, paddle::platform::float16 y,
-      paddle::platform::float16 out, paddle::platform::float16 dout) const {
-    return dout * static_cast<paddle::platform::float16>(
-                      (x >= y) || paddle::platform::isnan(y));
-  }
-};
-
-template <>
-struct FMaxGradDx<int> {
-  HOSTDEVICE int operator()(int x, int y, int out, int dout) const {
-    return dout * static_cast<int>((x >= y));
-  }
-};
-
-template <>
-struct FMaxGradDx<int64_t> {
-  HOSTDEVICE int64_t operator()(int64_t x, int64_t y, int64_t out,
-                                int64_t dout) const {
-    return dout * static_cast<int64_t>((x >= y));
-  }
-};
-
-template <typename T>
-struct FMaxGradDy {
-  HOSTDEVICE T operator()(T x, T y, T out, T dout) const {
-    return dout * static_cast<T>(!((x >= y) || isnan(y)));
-  }
-};
-
-template <>
-struct FMaxGradDy<paddle::platform::float16> {
-  HOSTDEVICE paddle::platform::float16 operator()(
-      paddle::platform::float16 x, paddle::platform::float16 y,
-      paddle::platform::float16 out, paddle::platform::float16 dout) const {
-    return dout * static_cast<paddle::platform::float16>(
-                      !((x >= y) || paddle::platform::isnan(y)));
-  }
-};
-
-template <>
-struct FMaxGradDy<int64_t> {
-  HOSTDEVICE int64_t operator()(int64_t x, int64_t y, int64_t out,
-                                int64_t dout) const {
-    return dout * static_cast<int64_t>(!((x >= y)));
-  }
-};
-
-template <>
-struct FMaxGradDy<int> {
-  HOSTDEVICE int operator()(int x, int y, int out, int dout) const {
-    return dout * static_cast<int>(!((x >= y)));
-  }
-};
-
-template <typename DeviceContext, typename T>
-class ElementwiseFMaxGradKernel : public ElemwiseGradKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    ElemwiseGradKernel<T>::Compute(ctx);
-    using Tensor = framework::Tensor;
-
-    auto* x = ctx.Input<Tensor>("X");
-    auto* y = ctx.Input<Tensor>("Y");
-    auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
-    auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
-    auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
-    auto* out = dout;  // Fake out, not used
-    int axis = ctx.Attr<int>("axis");
-    ElemwiseGradCompute<DeviceContext, T, FMaxGradDx<T>, FMaxGradDy<T>>(
-        ctx, *x, *y, *out, *dout, axis, dx, dy, FMaxGradDx<T>(),
-        FMaxGradDy<T>());
-  }
-};
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/elementwise/elementwise_min_op.cc b/paddle/fluid/operators/elementwise/elementwise_min_op.cc
index 3a1951999546eb859f6299b0bf7b064ff1b90a1a..dad80a2c33f3abfde457a6d750f89e47374fae13 100644
--- a/paddle/fluid/operators/elementwise/elementwise_min_op.cc
+++ b/paddle/fluid/operators/elementwise/elementwise_min_op.cc
@@ -147,21 +147,3 @@ REGISTER_OPERATOR(elementwise_fmin, ops::ElementwiseOp,
                   ops::ElementwiseFMinGradOpMaker<paddle::imperative::OpBase>);
 
 REGISTER_OPERATOR(elementwise_fmin_grad, ops::ElementwiseOpGrad);
-
-REGISTER_OP_CPU_KERNEL(
-    elementwise_fmin,
-    ops::ElementwiseFMinKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::ElementwiseFMinKernel<paddle::platform::CPUDeviceContext,
-                               paddle::platform::float16>,
-    ops::ElementwiseFMinKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::ElementwiseFMinKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::ElementwiseFMinKernel<paddle::platform::CPUDeviceContext, int64_t>);
-REGISTER_OP_CPU_KERNEL(
-    elementwise_fmin_grad,
-    ops::ElementwiseFMinGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::ElementwiseFMinGradKernel<paddle::platform::CPUDeviceContext,
-                                   paddle::platform::float16>,
-    ops::ElementwiseFMinGradKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::ElementwiseFMinGradKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::ElementwiseFMinGradKernel<paddle::platform::CPUDeviceContext,
-                                   int64_t>);
diff --git a/paddle/fluid/operators/elementwise/elementwise_min_op.cu b/paddle/fluid/operators/elementwise/elementwise_min_op.cu
index 5af985567d898d500b59e10d6032be57871c7e98..fb8bc9ac7f83c8dd99e40685acc68eec4c77b3ce 100644
--- a/paddle/fluid/operators/elementwise/elementwise_min_op.cu
+++ b/paddle/fluid/operators/elementwise/elementwise_min_op.cu
@@ -82,21 +82,3 @@ REGISTER_OP_CUDA_KERNEL(
     ops::ElementwiseMinGradKernel<paddle::platform::CUDADeviceContext, int>,
     ops::ElementwiseMinGradKernel<paddle::platform::CUDADeviceContext,
                                   int64_t>);
-
-REGISTER_OP_CUDA_KERNEL(
-    elementwise_fmin,
-    ops::ElementwiseFMinKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::ElementwiseFMinKernel<paddle::platform::CUDADeviceContext,
-                               paddle::platform::float16>,
-    ops::ElementwiseFMinKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::ElementwiseFMinKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::ElementwiseFMinKernel<paddle::platform::CUDADeviceContext, int64_t>);
-REGISTER_OP_CUDA_KERNEL(
-    elementwise_fmin_grad,
-    ops::ElementwiseFMinGradKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::ElementwiseFMinGradKernel<paddle::platform::CUDADeviceContext,
-                                   paddle::platform::float16>,
-    ops::ElementwiseFMinGradKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::ElementwiseFMinGradKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::ElementwiseFMinGradKernel<paddle::platform::CUDADeviceContext,
-                                   int64_t>);
diff --git a/paddle/fluid/operators/elementwise/elementwise_min_op.h b/paddle/fluid/operators/elementwise/elementwise_min_op.h
index 88fb044d42206eb0f89ac84df166e2e7ff33c5b3..283ad2adde978680d4d0c3a579d55e588368a28e 100644
--- a/paddle/fluid/operators/elementwise/elementwise_min_op.h
+++ b/paddle/fluid/operators/elementwise/elementwise_min_op.h
@@ -35,21 +35,6 @@ class ElementwiseMinKernel : public framework::OpKernel<T> {
   }
 };
 
-template <typename DeviceContext, typename T>
-class ElementwiseFMinKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* x = ctx.Input<framework::LoDTensor>("X");
-    auto* y = ctx.Input<framework::LoDTensor>("Y");
-    auto* z = ctx.Output<framework::LoDTensor>("Out");
-
-    z->mutable_data<T>(ctx.GetPlace());
-    int axis = ctx.Attr<int>("axis");
-    ElementwiseComputeEx<FMinFunctor<T>, DeviceContext, T>(ctx, x, y, axis,
-                                                           FMinFunctor<T>(), z);
-  }
-};
-
 template <typename T>
 struct MinGradDx {
   HOSTDEVICE T operator()(T x, T y, T out, T dout) const {
@@ -124,89 +109,5 @@ class ElementwiseMinGradKernel : public ElemwiseGradKernel<T> {
     ElementwiseMinGrad<DeviceContext, T>(ctx, x, y, out, dout, dx, dy);
   }
 };
-
-template <typename T>
-struct FMinGradDx {
-  HOSTDEVICE T operator()(T x, T y, T out, T dout) const {
-    return dout * static_cast<T>((x <= y) || isnan(y));
-  }
-};
-
-template <>
-struct FMinGradDx<paddle::platform::float16> {
-  HOSTDEVICE paddle::platform::float16 operator()(
-      paddle::platform::float16 x, paddle::platform::float16 y,
-      paddle::platform::float16 out, paddle::platform::float16 dout) const {
-    return dout * static_cast<paddle::platform::float16>(
-                      (x <= y) || paddle::platform::isnan(y));
-  }
-};
-
-template <>
-struct FMinGradDx<int> {
-  HOSTDEVICE int operator()(int x, int y, int out, int dout) const {
-    return dout * static_cast<int>((x <= y));
-  }
-};
-
-template <>
-struct FMinGradDx<int64_t> {
-  HOSTDEVICE int64_t operator()(int64_t x, int64_t y, int64_t out,
-                                int64_t dout) const {
-    return dout * static_cast<int64_t>((x <= y));
-  }
-};
-
-template <typename T>
-struct FMinGradDy {
-  HOSTDEVICE T operator()(T x, T y, T out, T dout) const {
-    return dout * static_cast<T>(!((x <= y) || isnan(y)));
-  }
-};
-
-template <>
-struct FMinGradDy<paddle::platform::float16> {
-  HOSTDEVICE paddle::platform::float16 operator()(
-      paddle::platform::float16 x, paddle::platform::float16 y,
-      paddle::platform::float16 out, paddle::platform::float16 dout) const {
-    return dout * static_cast<paddle::platform::float16>(
-                      !((x <= y) || paddle::platform::isnan(y)));
-  }
-};
-
-template <>
-struct FMinGradDy<int> {
-  HOSTDEVICE int operator()(int x, int y, int out, int dout) const {
-    return dout * static_cast<int>(!((x <= y)));
-  }
-};
-
-template <>
-struct FMinGradDy<int64_t> {
-  HOSTDEVICE int64_t operator()(int64_t x, int64_t y, int64_t out,
-                                int64_t dout) const {
-    return dout * static_cast<int64_t>(!((x <= y)));
-  }
-};
-
-template <typename DeviceContext, typename T>
-class ElementwiseFMinGradKernel : public ElemwiseGradKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    ElemwiseGradKernel<T>::Compute(ctx);
-    using Tensor = framework::Tensor;
-
-    auto* x = ctx.Input<Tensor>("X");
-    auto* y = ctx.Input<Tensor>("Y");
-    auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
-    auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
-    auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
-    auto* out = dout;  // Fake out, not used
-    int axis = ctx.Attr<int>("axis");
-    ElemwiseGradCompute<DeviceContext, T, FMinGradDx<T>, FMinGradDy<T>>(
-        ctx, *x, *y, *out, *dout, axis, dx, dy, FMinGradDx<T>(),
-        FMinGradDy<T>());
-  }
-};
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.cc b/paddle/fluid/operators/elementwise/elementwise_mul_op.cc
index e172279145e28c0731ed0d8d91769d0b293662fe..830e09eeae4811eb44bd4e21e17fe83ee44c592d 100644
--- a/paddle/fluid/operators/elementwise/elementwise_mul_op.cc
+++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.cc
@@ -173,55 +173,6 @@ REGISTER_OP_CPU_KERNEL(
                               paddle::platform::complex<float>>,
     ops::ElementwiseMulKernel<paddle::platform::CPUDeviceContext,
                               paddle::platform::complex<double>>);
-REGISTER_OP_CPU_KERNEL(
-    elementwise_mul_grad,
-    ops::ElementwiseMulGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::ElementwiseMulGradKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::ElementwiseMulGradKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::ElementwiseMulGradKernel<paddle::platform::CPUDeviceContext, int64_t>,
-    ops::ElementwiseMulGradKernel<paddle::platform::CPUDeviceContext, bool>,
-    ops::ElementwiseMulGradKernel<paddle::platform::CPUDeviceContext,
-                                  paddle::platform::bfloat16>,
-    ops::ElementwiseMulGradKernel<paddle::platform::CPUDeviceContext,
-                                  paddle::platform::complex<float>>,
-    ops::ElementwiseMulGradKernel<paddle::platform::CPUDeviceContext,
-                                  paddle::platform::complex<double>>);
-REGISTER_OP_CPU_KERNEL(
-    elementwise_mul_grad_grad,
-    ops::ElementwiseMulDoubleGradKernel<paddle::platform::CPUDeviceContext,
-                                        float>,
-    ops::ElementwiseMulDoubleGradKernel<paddle::platform::CPUDeviceContext,
-                                        double>,
-    ops::ElementwiseMulDoubleGradKernel<paddle::platform::CPUDeviceContext,
-                                        int>,
-    ops::ElementwiseMulDoubleGradKernel<paddle::platform::CPUDeviceContext,
-                                        int64_t>,
-    ops::ElementwiseMulDoubleGradKernel<paddle::platform::CPUDeviceContext,
-                                        bool>,
-    ops::ElementwiseMulDoubleGradKernel<paddle::platform::CPUDeviceContext,
-                                        paddle::platform::bfloat16>,
-    ops::ElementwiseMulDoubleGradKernel<paddle::platform::CPUDeviceContext,
-                                        paddle::platform::complex<float>>,
-    ops::ElementwiseMulDoubleGradKernel<paddle::platform::CPUDeviceContext,
-                                        paddle::platform::complex<double>>);
-REGISTER_OP_CPU_KERNEL(
-    elementwise_mul_triple_grad,
-    ops::ElementwiseMulTripleGradKernel<paddle::platform::CPUDeviceContext,
-                                        float>,
-    ops::ElementwiseMulTripleGradKernel<paddle::platform::CPUDeviceContext,
-                                        double>,
-    ops::ElementwiseMulTripleGradKernel<paddle::platform::CPUDeviceContext,
-                                        int>,
-    ops::ElementwiseMulTripleGradKernel<paddle::platform::CPUDeviceContext,
-                                        int64_t>,
-    ops::ElementwiseMulTripleGradKernel<paddle::platform::CPUDeviceContext,
-                                        bool>,
-    ops::ElementwiseMulTripleGradKernel<paddle::platform::CPUDeviceContext,
-                                        paddle::platform::bfloat16>,
-    ops::ElementwiseMulTripleGradKernel<paddle::platform::CPUDeviceContext,
-                                        paddle::platform::complex<float>>,
-    ops::ElementwiseMulTripleGradKernel<paddle::platform::CPUDeviceContext,
-                                        paddle::platform::complex<double>>);
 
 REGISTER_OP_VERSION(elementwise_mul)
     .AddCheckpoint(
diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.cu b/paddle/fluid/operators/elementwise/elementwise_mul_op.cu
index 45c87a27a180af4798a9f8b31e2edfd0cacb583d..f7b9fd1e265f5d3f107e734f9ffdcc90e7f6cc77 100644
--- a/paddle/fluid/operators/elementwise/elementwise_mul_op.cu
+++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.cu
@@ -63,33 +63,6 @@ class ElementwiseMulKernel<platform::CUDADeviceContext, T>
   }
 };
 
-template <typename DeviceContext, typename T>
-typename std::enable_if<
-    std::is_same<DeviceContext, platform::CUDADeviceContext>::value>::type
-ElementwiseMulGrad(const framework::ExecutionContext& ctx,
-                   const framework::Tensor* x, const framework::Tensor* y,
-                   const framework::Tensor* out, const framework::Tensor* dout,
-                   framework::Tensor* dx, framework::Tensor* dy) {
-  int axis = ctx.Attr<int>("axis");
-  const auto& dev_ctx =
-      ctx.template device_context<platform::CUDADeviceContext>();
-  const auto place = ctx.GetPlace();
-
-  if (dx != nullptr && dy != nullptr) {
-    std::vector<const framework::Tensor*> ins = {dout, y, x};
-    GetGradXAndYOut<ElementwiseType::kTernary, T>(
-        dev_ctx, place, axis, ins, dout, dx, dy, MulGradXYFunctor<T, T>());
-  } else if (dx != nullptr && dy == nullptr) {
-    std::vector<const framework::Tensor*> ins = {dout, y};
-    GetGradXOrYOut<ElementwiseType::kBinary, T>(dev_ctx, place, axis, ins, dout,
-                                                dx, MulGradFunctor<T>());
-  } else if (dx == nullptr && dy != nullptr) {
-    std::vector<const framework::Tensor*> ins = {dout, x};
-    GetGradXOrYOut<ElementwiseType::kBinary, T>(dev_ctx, place, axis, ins, dout,
-                                                dy, MulGradFunctor<T>());
-  }
-}
-
 }  // namespace operators
 }  // namespace paddle
 
@@ -103,44 +76,3 @@ REGISTER_OP_CUDA_KERNEL(
     ops::ElementwiseMulKernel<plat::CUDADeviceContext, plat::bfloat16>,
     ops::ElementwiseMulKernel<plat::CUDADeviceContext, plat::complex<float>>,
     ops::ElementwiseMulKernel<plat::CUDADeviceContext, plat::complex<double>>);
-REGISTER_OP_CUDA_KERNEL(
-    elementwise_mul_grad,
-    ops::ElementwiseMulGradKernel<plat::CUDADeviceContext, float>,
-    ops::ElementwiseMulGradKernel<plat::CUDADeviceContext, double>,
-    ops::ElementwiseMulGradKernel<plat::CUDADeviceContext, int>,
-    ops::ElementwiseMulGradKernel<plat::CUDADeviceContext, int64_t>,
-    ops::ElementwiseMulGradKernel<plat::CUDADeviceContext, bool>,
-    ops::ElementwiseMulGradKernel<plat::CUDADeviceContext, plat::float16>,
-    ops::ElementwiseMulGradKernel<plat::CUDADeviceContext, plat::bfloat16>,
-    ops::ElementwiseMulGradKernel<plat::CUDADeviceContext,
-                                  plat::complex<float>>,
-    ops::ElementwiseMulGradKernel<plat::CUDADeviceContext,
-                                  plat::complex<double>>);
-REGISTER_OP_CUDA_KERNEL(
-    elementwise_mul_grad_grad,
-    ops::ElementwiseMulDoubleGradKernel<plat::CUDADeviceContext, float>,
-    ops::ElementwiseMulDoubleGradKernel<plat::CUDADeviceContext, double>,
-    ops::ElementwiseMulDoubleGradKernel<plat::CUDADeviceContext, int>,
-    ops::ElementwiseMulDoubleGradKernel<plat::CUDADeviceContext, int64_t>,
-    ops::ElementwiseMulDoubleGradKernel<plat::CUDADeviceContext, bool>,
-    ops::ElementwiseMulDoubleGradKernel<plat::CUDADeviceContext, plat::float16>,
-    ops::ElementwiseMulDoubleGradKernel<plat::CUDADeviceContext,
-                                        plat::bfloat16>,
-    ops::ElementwiseMulDoubleGradKernel<plat::CUDADeviceContext,
-                                        plat::complex<float>>,
-    ops::ElementwiseMulDoubleGradKernel<plat::CUDADeviceContext,
-                                        plat::complex<double>>);
-REGISTER_OP_CUDA_KERNEL(
-    elementwise_mul_triple_grad,
-    ops::ElementwiseMulTripleGradKernel<plat::CUDADeviceContext, float>,
-    ops::ElementwiseMulTripleGradKernel<plat::CUDADeviceContext, double>,
-    ops::ElementwiseMulTripleGradKernel<plat::CUDADeviceContext, int>,
-    ops::ElementwiseMulTripleGradKernel<plat::CUDADeviceContext, int64_t>,
-    ops::ElementwiseMulTripleGradKernel<plat::CUDADeviceContext, bool>,
-    ops::ElementwiseMulTripleGradKernel<plat::CUDADeviceContext, plat::float16>,
-    ops::ElementwiseMulTripleGradKernel<plat::CUDADeviceContext,
-                                        plat::bfloat16>,
-    ops::ElementwiseMulTripleGradKernel<plat::CUDADeviceContext,
-                                        plat::complex<float>>,
-    ops::ElementwiseMulTripleGradKernel<plat::CUDADeviceContext,
-                                        plat::complex<double>>);
diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.h b/paddle/fluid/operators/elementwise/elementwise_mul_op.h
index c81266d584468f51030026e1423a649252001f58..58a3123c7e332f50b0830577436528f1e8df1cdf 100644
--- a/paddle/fluid/operators/elementwise/elementwise_mul_op.h
+++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.h
@@ -137,244 +137,6 @@ class ElementwiseMulKernel : public framework::OpKernel<T> {
     }
   }
 };
-template <typename T>
-struct MulGradDX {
-  HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout * y; }
-};
-
-template <typename T>
-struct MulGradDX<paddle::platform::complex<T>> {
-  HOSTDEVICE paddle::platform::complex<T> operator()(
-      paddle::platform::complex<T> x, paddle::platform::complex<T> y,
-      paddle::platform::complex<T> out,
-      paddle::platform::complex<T> dout) const {
-    paddle::platform::complex<T> y_conj(y.real, -y.imag);
-    return dout * y_conj;
-  }
-};
-
-template <typename T>
-struct MulGradDY {
-  HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout * x; }
-};
-
-template <typename T>
-struct MulGradDY<paddle::platform::complex<T>> {
-  HOSTDEVICE paddle::platform::complex<T> operator()(
-      paddle::platform::complex<T> x, paddle::platform::complex<T> y,
-      paddle::platform::complex<T> out,
-      paddle::platform::complex<T> dout) const {
-    paddle::platform::complex<T> x_conj(x.real, -x.imag);
-    return dout * x_conj;
-  }
-};
 
-template <typename DeviceContext, typename T>
-typename std::enable_if<
-    std::is_same<DeviceContext, platform::CPUDeviceContext>::value>::type
-ElementwiseMulGrad(const framework::ExecutionContext& ctx,
-                   const framework::Tensor* x, const framework::Tensor* y,
-                   const framework::Tensor* out, const framework::Tensor* dout,
-                   framework::Tensor* dx, framework::Tensor* dy) {
-  int axis = ctx.Attr<int>("axis");
-  ElemwiseGradCompute<DeviceContext, T, MulGradDX<T>, MulGradDY<T>>(
-      ctx, *x, *y, *out, *dout, axis, dx, dy, MulGradDX<T>(), MulGradDY<T>());
-}
-
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-template <typename DeviceContext, typename T>
-typename std::enable_if<
-    std::is_same<DeviceContext, platform::CUDADeviceContext>::value>::type
-ElementwiseMulGrad(const framework::ExecutionContext& ctx,
-                   const framework::Tensor* x, const framework::Tensor* y,
-                   const framework::Tensor* out, const framework::Tensor* dout,
-                   framework::Tensor* dx, framework::Tensor* dy);
-#endif
-
-template <typename DeviceContext, typename T>
-class ElementwiseMulGradKernel : public ElemwiseGradKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    ElemwiseGradKernel<T>::Compute(ctx);
-    using Tensor = framework::Tensor;
-
-    auto* x = ctx.Input<Tensor>("X");
-    auto* y = ctx.Input<Tensor>("Y");
-    auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
-    auto* out = dout;  // out is not necessary
-    auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
-    auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
-
-    ElementwiseMulGrad<DeviceContext, T>(ctx, x, y, out, dout, dx, dy);
-  }
-};
-
-template <typename DeviceContext, typename T>
-class ElementwiseMulDoubleGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    using Tensor = framework::Tensor;
-
-    auto* x = ctx.Input<Tensor>("X");
-    auto* y = ctx.Input<Tensor>("Y");
-    auto* dout = ctx.Input<Tensor>("DOut");
-    auto* ddx = ctx.Input<Tensor>("DDX");
-    auto* ddy = ctx.Input<Tensor>("DDY");
-
-    auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
-    auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
-    auto* ddout = ctx.Output<Tensor>("DDOut");
-
-    if (ddout) ddout->mutable_data<T>(ctx.GetPlace());
-
-    Tensor ddx_safe, ddy_safe;
-    GetDoubleGradSafeTensor<DeviceContext, T>(ctx, x, ddx, &ddx_safe);
-    GetDoubleGradSafeTensor<DeviceContext, T>(ctx, y, ddy, &ddy_safe);
-
-    // dx = dout * ddy
-    // dy = dout * ddx
-    // ddout = ddx * y + x * ddy
-    // change computation sequence to save memory, so ddout can inplace ddx and
-    // dx can be used as 'tmp' tensor
-    // (1) dx = x * ddy
-    // (2) dy = dout * ddx
-    // (3) ddout = ddx * y
-    // (4) ddout = ddout + dx
-    // (5) dx = dout * ddy
-    if (ddout) {
-      int axis = ctx.Attr<int>("axis");
-      auto& place =
-          *ctx.template device_context<DeviceContext>().eigen_device();
-      // size(ddout) > size(ddx), ddout can't use memory of ddx using inplace
-      if (ddout->numel() > ddx->numel()) {
-        ElemwiseGradCompute<DeviceContext, T, MulGradDX<T>, MulGradDY<T>>(
-            ctx, ddx_safe, ddy_safe, *dout, *dout, axis, dx, dy, MulGradDX<T>(),
-            MulGradDY<T>());
-
-        Tensor ddout_tmp;
-        ddout_tmp.mutable_data<T>(ddout->dims(), ctx.GetPlace());
-
-        default_elementwise_mul<DeviceContext, T>(ctx, y, &ddx_safe, ddout);
-        default_elementwise_mul<DeviceContext, T>(ctx, &ddy_safe, x,
-                                                  &ddout_tmp);
-
-        auto ddout_t = framework::EigenVector<T>::Flatten(*ddout);
-        auto ddout_tmp_t = framework::EigenVector<T>::Flatten(ddout_tmp);
-        ddout_t.device(place) = ddout_t + ddout_tmp_t;
-      } else {
-        // use dx to save memory, other than alloc tmp tensor
-        Tensor* ddout_tmp = dx;
-
-        default_elementwise_mul<DeviceContext, T>(ctx, x, &ddy_safe, ddout_tmp);
-        // NOTE: in the following ElemwiseGradCompute, for the
-        // first output tensor is nullptr, the branch to calculate first
-        // output tensor will not be activated, DivGradDx function will not
-        // be called and can be ignored, the first branch has little effect
-        // on running speed.
-        ElemwiseGradCompute<DeviceContext, T, MulGradDX<T>, MulGradDY<T>>(
-            ctx, ddx_safe, ddy_safe, *dout, *dout, axis, nullptr, dy,
-            MulGradDX<T>(), MulGradDY<T>());
-        default_elementwise_mul<DeviceContext, T>(ctx, &ddx_safe, y, ddout);
-
-        auto ddout_t = framework::EigenVector<T>::Flatten(*ddout);
-        auto ddout_tmp_t = framework::EigenVector<T>::Flatten(*ddout_tmp);
-        ddout_t.device(place) = ddout_t + ddout_tmp_t;
-        default_elementwise_mul<DeviceContext, T>(ctx, dout, &ddy_safe, dx);
-      }
-    }
-  }
-};
-
-template <typename DeviceContext, typename T>
-class ElementwiseMulTripleGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    using Tensor = framework::Tensor;
-    // get input
-    auto* x = ctx.Input<framework::Tensor>("X");
-    auto* y = ctx.Input<framework::Tensor>("Y");
-    auto* dout = ctx.Input<framework::Tensor>("DOut");
-    auto* ddx = ctx.Input<framework::Tensor>("DDX");
-    auto* ddy = ctx.Input<framework::Tensor>("DDY");
-
-    auto* d_dx = ctx.Input<framework::Tensor>("D_DX");
-    auto* d_dy = ctx.Input<framework::Tensor>("D_DY");
-    auto* d_ddout = ctx.Input<framework::Tensor>("D_DDOut");
-
-    // get output
-    auto* out_d_x = ctx.Output<framework::Tensor>("D_X");
-    auto* out_d_y = ctx.Output<framework::Tensor>("D_Y");
-    auto* out_d_dout = ctx.Output<framework::Tensor>("D_DOut");
-
-    auto* out_d_ddx = ctx.Output<framework::Tensor>("D_DDX");
-    auto* out_d_ddy = ctx.Output<framework::Tensor>("D_DDY");
-
-    if (out_d_x) out_d_x->mutable_data<T>(x->dims(), ctx.GetPlace());
-    if (out_d_y) out_d_y->mutable_data<T>(y->dims(), ctx.GetPlace());
-    if (out_d_dout) out_d_dout->mutable_data<T>(dout->dims(), ctx.GetPlace());
-    if (out_d_ddx) out_d_ddx->mutable_data<T>(x->dims(), ctx.GetPlace());
-    if (out_d_ddy) out_d_ddy->mutable_data<T>(y->dims(), ctx.GetPlace());
-
-    auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
-
-    Tensor ddx_safe, ddy_safe;
-    GetDoubleGradSafeTensor<DeviceContext, T>(ctx, x, ddx, &ddx_safe);
-    GetDoubleGradSafeTensor<DeviceContext, T>(ctx, y, ddy, &ddy_safe);
-
-    if (d_ddout) {
-      if (out_d_x) {
-        // out_d_x = ddy * d_ddout
-        default_elementwise_mul<DeviceContext, T>(ctx, &ddy_safe, d_ddout,
-                                                  out_d_x);
-      }
-      if (out_d_y) {
-        // out_d_y = ddx * d_ddout
-        default_elementwise_mul<DeviceContext, T>(ctx, &ddx_safe, d_ddout,
-                                                  out_d_y);
-      }
-    }
-
-    if (out_d_dout) {
-      // get out_d_dout
-      // out_d_dout = ddy * d_dx + d_dy * ddx
-      Tensor out_d_dout_tmp;
-      out_d_dout_tmp.mutable_data<T>(dout->dims(), ctx.GetPlace());
-      default_elementwise_mul<DeviceContext, T>(ctx, d_dy, &ddx_safe,
-                                                out_d_dout);
-      default_elementwise_mul<DeviceContext, T>(ctx, &ddy_safe, d_dx,
-                                                &out_d_dout_tmp);
-      auto out_d_dout_t = framework::EigenVector<T>::Flatten(*out_d_dout);
-      auto out_d_dout_tmp_t =
-          framework::EigenVector<T>::Flatten(out_d_dout_tmp);
-      out_d_dout_t.device(place) = out_d_dout_t + out_d_dout_tmp_t;
-    }
-
-    if (out_d_ddx) {
-      // get out_d_ddx
-      // out_d_ddx = dout * d_dy + y * d_ddout
-      Tensor out_d_ddx_tmp;
-      out_d_ddx_tmp.mutable_data<T>(ddx->dims(), ctx.GetPlace());
-      default_elementwise_mul<DeviceContext, T>(ctx, dout, d_dy, out_d_ddx);
-      default_elementwise_mul<DeviceContext, T>(ctx, y, d_ddout,
-                                                &out_d_ddx_tmp);
-      auto out_d_ddx_t = framework::EigenVector<T>::Flatten(*out_d_ddx);
-      auto out_d_ddx_tmp_t = framework::EigenVector<T>::Flatten(out_d_ddx_tmp);
-      out_d_ddx_t.device(place) = out_d_ddx_t + out_d_ddx_tmp_t;
-    }
-
-    if (out_d_ddy) {
-      // get out_d_ddy
-      // out_d_ddy = dout * d_dx + x * d_ddout
-      Tensor out_d_ddy_tmp;
-      out_d_ddy_tmp.mutable_data<T>(ddy->dims(), ctx.GetPlace());
-      default_elementwise_mul<DeviceContext, T>(ctx, dout, d_dx, out_d_ddy);
-      default_elementwise_mul<DeviceContext, T>(ctx, x, d_ddout,
-                                                &out_d_ddy_tmp);
-      auto out_d_ddy_t = framework::EigenVector<T>::Flatten(*out_d_ddy);
-      auto out_d_ddy_tmp_t = framework::EigenVector<T>::Flatten(out_d_ddy_tmp);
-      out_d_ddy_t.device(place) = out_d_ddy_t + out_d_ddy_tmp_t;
-    }
-  }
-};
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/elementwise/elementwise_op_function.h b/paddle/fluid/operators/elementwise/elementwise_op_function.h
index 61862aa9f87408048c5d31a13c0be8a013046902..80b07721f0b4d1feb669bfce91127b0887d79391 100644
--- a/paddle/fluid/operators/elementwise/elementwise_op_function.h
+++ b/paddle/fluid/operators/elementwise/elementwise_op_function.h
@@ -45,6 +45,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h"
 #include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/phi/kernels/gpu/elementwise_grad.h"
 
 #endif
 
@@ -145,17 +146,9 @@ void ElemwiseGradCompute(const framework::ExecutionContext &ctx,
                          const framework::Tensor &dout, int axis,
                          framework::Tensor *dx, framework::Tensor *dy,
                          DX_OP dx_op, DY_OP dy_op) {
-  const framework::DDim &x_dim = x.dims();
-  const framework::DDim &y_dim = y.dims();
   const auto &dev_ctx = ctx.template device_context<DeviceContext>();
-  if (x.dims() == y.dims()) {
-    phi::funcs::ElemwiseGradComputeNoBroadcast<DeviceContext, T, DX_OP, DY_OP,
-                                               Tout>(
-        dev_ctx, x_dim, y_dim, x, y, out, dout, axis, dx, dy, dx_op, dy_op);
-  } else {
-    phi::funcs::ElemwiseGradComputeWithBroadcast<T, DX_OP, DY_OP, Tout>(
-        dev_ctx, x_dim, y_dim, x, y, out, dout, axis, dx, dy, dx_op, dy_op);
-  }
+  phi::funcs::ElemwiseGradCompute<DeviceContext, T, DX_OP, DY_OP, Tout>(
+      dev_ctx, x, y, out, dout, axis, dx, dy, dx_op, dy_op);
 }
 
 // It is a common implementation to compute binary calculation with the support
@@ -1174,14 +1167,6 @@ static inline std::vector<int> GetReduceDim(const framework::DDim &in,
 }
 
 #if defined(__NVCC__) || defined(__HIPCC__)
-template <typename T>
-void ReduceWrapper(const platform::CUDADeviceContext &dev_ctx, int axis,
-                   framework::Tensor *src, framework::Tensor *dst) {
-  std::vector<int> reduce_dims = GetReduceDim(dst->dims(), src->dims(), axis);
-  TensorReduceImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
-      dev_ctx, *src, dst, kps::IdentityFunctor<T>(), reduce_dims,
-      dev_ctx.stream());
-}
 
 template <ElementwiseType ET, typename T, typename Functor>
 void GetGradXAndYOut(const platform::CUDADeviceContext &dev_ctx,
@@ -1189,36 +1174,8 @@ void GetGradXAndYOut(const platform::CUDADeviceContext &dev_ctx,
                      std::vector<const framework::Tensor *> ins,
                      const framework::Tensor *dout, framework::Tensor *dx,
                      framework::Tensor *dy, Functor func) {
-  framework::Tensor tmp_dx;
-  framework::Tensor tmp_dy;
-  dx->mutable_data<T>(place);
-  dy->mutable_data<T>(place);
-  std::vector<framework::Tensor *> outs;
-  if (dx->dims() == dout->dims() && dy->dims() == dout->dims()) {
-    outs = {dx, dy};
-  } else if (dx->dims() != dout->dims() && dy->dims() == dout->dims()) {
-    tmp_dx.mutable_data<T>(dout->dims(), place);
-    outs = {&tmp_dx, dy};
-  } else if (dx->dims() == dout->dims() && dy->dims() != dout->dims()) {
-    tmp_dy.mutable_data<T>(dout->dims(), place);
-    outs = {dx, &tmp_dy};
-  } else if (dx->dims() != dout->dims() && dy->dims() != dout->dims()) {
-    tmp_dy.mutable_data<T>(dout->dims(), place);
-    tmp_dx.mutable_data<T>(dout->dims(), place);
-    outs = {&tmp_dx, &tmp_dy};
-  }
-
-  paddle::operators::LaunchElementwiseCudaKernel<ET, T, T, decltype(func), 2>(
-      dev_ctx, ins, &outs, axis, func);
-
-  if (dx->dims() != dout->dims() && dy->dims() == dout->dims()) {
-    ReduceWrapper<T>(dev_ctx, axis, &tmp_dx, dx);
-  } else if (dx->dims() == dout->dims() && dy->dims() != dout->dims()) {
-    ReduceWrapper<T>(dev_ctx, axis, &tmp_dy, dy);
-  } else if (dx->dims() != dout->dims() && dy->dims() != dout->dims()) {
-    ReduceWrapper<T>(dev_ctx, axis, &tmp_dx, dx);
-    ReduceWrapper<T>(dev_ctx, axis, &tmp_dy, dy);
-  }
+  phi::GetGradXAndYOut<ET, T, Functor>(dev_ctx, place, axis, ins, *dout, dx, dy,
+                                       func);
 }
 
 template <ElementwiseType ET, typename T, typename Functor>
@@ -1227,22 +1184,8 @@ void GetGradXOrYOut(const platform::CUDADeviceContext &dev_ctx,
                     std::vector<const framework::Tensor *> ins,
                     const framework::Tensor *dout, framework::Tensor *dxy,
                     Functor func) {
-  framework::Tensor tmp_dxy;
-  dxy->mutable_data<T>(place);
-
-  std::vector<framework::Tensor *> outs;
-  if (dxy->dims() != dout->dims()) {
-    tmp_dxy.mutable_data<T>(dout->dims(), place);
-    outs = {&tmp_dxy};
-  } else {
-    outs = {dxy};
-  }
-
-  paddle::operators::LaunchElementwiseCudaKernel<ET, T, T>(dev_ctx, ins, &outs,
-                                                           axis, func);
-  if (dxy->dims() != dout->dims()) {
-    ReduceWrapper<T>(dev_ctx, axis, &tmp_dxy, dxy);
-  }
+  phi::GetGradXOrYOut<ET, T, Functor>(dev_ctx, place, axis, ins, *dout, dxy,
+                                      func);
 }
 
 #endif
diff --git a/paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc b/paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc
index 1f8a95f0286bd3bb228bcda59e1198bf0763eb9a..3e9263fe93acd93638ff9e496203b7ea432cea86 100644
--- a/paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc
+++ b/paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc
@@ -24,7 +24,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/operators/dropout_op.h"
 #include "paddle/fluid/string/printf.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
@@ -33,7 +32,7 @@ namespace p = paddle::platform;
 
 USE_OP_ITSELF(elementwise_add);
 USE_OP_DEVICE_KERNEL(elementwise_add, NPU);
-USE_OP(elementwise_sub);
+USE_OP_ITSELF(elementwise_sub);
 USE_OP_DEVICE_KERNEL(elementwise_sub, NPU);
 
 template <typename T>
diff --git a/paddle/fluid/operators/elementwise/elementwise_pow_op_xpu.cc b/paddle/fluid/operators/elementwise/elementwise_pow_op_xpu.cc
index 14b20baae1b0398a40ee74a3e16c2c992a4b557e..78855dd39572539e531bcd8ad3786ae95269ca8f 100644
--- a/paddle/fluid/operators/elementwise/elementwise_pow_op_xpu.cc
+++ b/paddle/fluid/operators/elementwise/elementwise_pow_op_xpu.cc
@@ -14,7 +14,6 @@ limitations under the License. */
 
 #ifdef PADDLE_WITH_XPU
 #include "paddle/fluid/operators/elementwise/elementwise_op.h"
-#include "paddle/fluid/operators/elementwise/elementwise_sub_op.h"
 #include "paddle/fluid/operators/elementwise/elementwise_xpu.h"
 #include "xpu/refactor/math.h"
 
diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op.cc b/paddle/fluid/operators/elementwise/elementwise_sub_op.cc
index b2cef95d1a349d66161db1c3edf7c14bc8a6d058..d15a7c272757fa683f835215e3db9ccec956af38 100644
--- a/paddle/fluid/operators/elementwise/elementwise_sub_op.cc
+++ b/paddle/fluid/operators/elementwise/elementwise_sub_op.cc
@@ -12,8 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/elementwise/elementwise_sub_op.h"
-
 #include <string>
 
 #include "paddle/fluid/operators/elementwise/elementwise_op.h"
@@ -78,10 +76,16 @@ class ElementwiseSubDoubleGradMaker : public framework::SingleGradOpMaker<T> {
 }  // namespace paddle
 
 REGISTER_ELEMWISE_GRAD_MAKER(elementwise_sub, Sub);
-REGISTER_ELEMWISE_EXPLICIT_OP_WITHOUT_GRAD(elementwise_sub, Sub);
 
 namespace ops = paddle::operators;
 
+REGISTER_OPERATOR(elementwise_sub, ::paddle::operators::ElementwiseOp,
+                  ::paddle::operators::ElementwiseSubOpMaker,
+                  ::paddle::operators::ElementwiseOpInferVarType,
+                  elementwise_subGradMaker<::paddle::framework::OpDesc>,
+                  elementwise_subGradMaker<::paddle::imperative::OpBase>,
+                  ::paddle::operators::ElementwiseOpInplaceInferer);
+
 REGISTER_OPERATOR(
     elementwise_sub_grad, ops::ElementwiseOpGrad,
     ops::ElementwiseGradOpInplaceInferer, ops::ElementwiseGradNoBufVarsInferer,
@@ -92,51 +96,6 @@ REGISTER_OPERATOR(elementwise_sub_grad_grad,
                   ops::ElementwiseDoubleGradOpInplaceInferer,
                   ops::ElementwiseDoubleGradNoBufVarsInferer);
 
-REGISTER_OP_CPU_KERNEL(
-    elementwise_sub,
-    ops::ElementwiseSubKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::ElementwiseSubKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::ElementwiseSubKernel<paddle::platform::CPUDeviceContext, int16_t>,
-    ops::ElementwiseSubKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::ElementwiseSubKernel<paddle::platform::CPUDeviceContext, int64_t>,
-    ops::ElementwiseSubKernel<paddle::platform::CPUDeviceContext,
-                              paddle::platform::bfloat16>,
-    ops::ElementwiseSubKernel<paddle::platform::CPUDeviceContext,
-                              paddle::platform::complex<float>>,
-    ops::ElementwiseSubKernel<paddle::platform::CPUDeviceContext,
-                              paddle::platform::complex<double>>);
-REGISTER_OP_CPU_KERNEL(
-    elementwise_sub_grad,
-    ops::ElementwiseSubGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::ElementwiseSubGradKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::ElementwiseSubGradKernel<paddle::platform::CPUDeviceContext, int16_t>,
-    ops::ElementwiseSubGradKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::ElementwiseSubGradKernel<paddle::platform::CPUDeviceContext, int64_t>,
-    ops::ElementwiseSubGradKernel<paddle::platform::CPUDeviceContext,
-                                  paddle::platform::bfloat16>,
-    ops::ElementwiseSubGradKernel<paddle::platform::CPUDeviceContext,
-                                  paddle::platform::complex<float>>,
-    ops::ElementwiseSubGradKernel<paddle::platform::CPUDeviceContext,
-                                  paddle::platform::complex<double>>);
-REGISTER_OP_CPU_KERNEL(
-    elementwise_sub_grad_grad,
-    ops::ElementwiseSubDoubleGradKernel<paddle::platform::CPUDeviceContext,
-                                        float>,
-    ops::ElementwiseSubDoubleGradKernel<paddle::platform::CPUDeviceContext,
-                                        double>,
-    ops::ElementwiseSubDoubleGradKernel<paddle::platform::CPUDeviceContext,
-                                        int16_t>,
-    ops::ElementwiseSubDoubleGradKernel<paddle::platform::CPUDeviceContext,
-                                        int>,
-    ops::ElementwiseSubDoubleGradKernel<paddle::platform::CPUDeviceContext,
-                                        int64_t>,
-    ops::ElementwiseSubDoubleGradKernel<paddle::platform::CPUDeviceContext,
-                                        paddle::platform::bfloat16>,
-    ops::ElementwiseSubDoubleGradKernel<paddle::platform::CPUDeviceContext,
-                                        paddle::platform::complex<float>>,
-    ops::ElementwiseSubDoubleGradKernel<paddle::platform::CPUDeviceContext,
-                                        paddle::platform::complex<double>>);
-
 REGISTER_OP_VERSION(elementwise_sub)
     .AddCheckpoint(
         R"ROC(Register elementwise_sub for adding the attribute of Scale_y)ROC",
diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op.cu b/paddle/fluid/operators/elementwise/elementwise_sub_op.cu
deleted file mode 100644
index 2c962af9877b978f7a6af25635f345c0ae5ffd27..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/elementwise/elementwise_sub_op.cu
+++ /dev/null
@@ -1,63 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/elementwise/elementwise_sub_op.h"
-
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-
-REGISTER_OP_CUDA_KERNEL(
-    elementwise_sub,
-    ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext,
-                              paddle::platform::float16>,
-    ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext,
-                              paddle::platform::bfloat16>,
-    ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext, int64_t>,
-    ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext,
-                              paddle::platform::complex<float>>,
-    ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext,
-                              paddle::platform::complex<double>>);
-REGISTER_OP_CUDA_KERNEL(
-    elementwise_sub_grad,
-    ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext,
-                                  paddle::platform::float16>,
-    ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext,
-                                  paddle::platform::bfloat16>,
-    ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext, int64_t>,
-    ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext,
-                                  paddle::platform::complex<float>>,
-    ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext,
-                                  paddle::platform::complex<double>>);
-REGISTER_OP_CUDA_KERNEL(
-    elementwise_sub_grad_grad,
-    ops::ElementwiseSubDoubleGradKernel<paddle::platform::CUDADeviceContext,
-                                        float>,
-    ops::ElementwiseSubDoubleGradKernel<paddle::platform::CUDADeviceContext,
-                                        double>,
-    ops::ElementwiseSubDoubleGradKernel<paddle::platform::CUDADeviceContext,
-                                        int>,
-    ops::ElementwiseSubDoubleGradKernel<paddle::platform::CUDADeviceContext,
-                                        int64_t>,
-    ops::ElementwiseSubDoubleGradKernel<paddle::platform::CUDADeviceContext,
-                                        paddle::platform::bfloat16>,
-    ops::ElementwiseSubDoubleGradKernel<paddle::platform::CUDADeviceContext,
-                                        paddle::platform::complex<float>>,
-    ops::ElementwiseSubDoubleGradKernel<paddle::platform::CUDADeviceContext,
-                                        paddle::platform::complex<double>>);
diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op.h b/paddle/fluid/operators/elementwise/elementwise_sub_op.h
deleted file mode 100644
index 15c547b493ae045c13ab8d6b14a646cb92716a92..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/elementwise/elementwise_sub_op.h
+++ /dev/null
@@ -1,96 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/fluid/operators/elementwise/elementwise_op.h"
-#include "paddle/fluid/platform/place.h"
-
-#include "paddle/phi/kernels/elementwise_grad_kernel.h"
-#include "paddle/phi/kernels/math_kernel.h"
-namespace paddle {
-namespace operators {
-
-template <typename DeviceContext, typename T>
-class ElementwiseSubKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* x = ctx.Input<framework::LoDTensor>("X");
-    auto* y = ctx.Input<framework::LoDTensor>("Y");
-    auto* z = ctx.Output<framework::LoDTensor>("Out");
-    z->mutable_data<T>(ctx.GetPlace());
-
-    auto& dev_ctx = ctx.device_context<DeviceContext>();
-    int axis = ctx.Attr<int>("axis");
-    phi::SubtractRawKernel<T>(
-        static_cast<const typename framework::ConvertToPhiContext<
-            DeviceContext>::TYPE&>(dev_ctx),
-        *x, *y, axis, z);
-  }
-};
-
-template <typename DeviceContext, typename T>
-class ElementwiseSubGradKernel : public ElemwiseGradKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    ElemwiseGradKernel<T>::Compute(ctx);
-    using Tensor = framework::Tensor;
-
-    auto* x = ctx.Input<Tensor>("X");
-    auto* y = ctx.Input<Tensor>("Y");
-    auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
-    auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
-    auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
-    int axis = ctx.Attr<int>("axis");
-    auto& dev_ctx = ctx.device_context<DeviceContext>();
-
-    phi::SubtractGradKernel<T>(
-        static_cast<const typename framework::ConvertToPhiContext<
-            DeviceContext>::TYPE&>(dev_ctx),
-        *x, *y, *dout, axis, dx, dy);
-  }
-};
-
-template <typename DeviceContext, typename T>
-class ElementwiseSubDoubleGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    using Tensor = framework::Tensor;
-
-    auto* y = ctx.Input<Tensor>("Y");
-    auto* dout = ctx.Input<Tensor>("DOut");
-    auto* ddx = ctx.Input<Tensor>("DDX");
-    auto* ddy = ctx.Input<Tensor>("DDY");
-
-    auto* ddout = ctx.Output<Tensor>("DDOut");
-    int axis = ctx.Attr<int>("axis");
-    auto& dev_ctx = ctx.device_context<DeviceContext>();
-
-    paddle::optional<const phi::DenseTensor&> ddx_optional = paddle::none;
-    paddle::optional<const phi::DenseTensor&> ddy_optional = paddle::none;
-    if (ddx != nullptr) {
-      ddx_optional = *ddx;
-    }
-    if (ddy != nullptr) {
-      ddy_optional = *ddy;
-    }
-    phi::SubtractDoubleGradKernel<T>(
-        static_cast<const typename framework::ConvertToPhiContext<
-            DeviceContext>::TYPE&>(dev_ctx),
-        *y, ddx_optional, ddy_optional, *dout, axis, ddout);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op_npu.cc b/paddle/fluid/operators/elementwise/elementwise_sub_op_npu.cc
index b68d38d6df12a5d11f57b1556f8fc7ceec00d3e0..4169a938f2d0bff0cf8b23db35c943c9ff586212 100644
--- a/paddle/fluid/operators/elementwise/elementwise_sub_op_npu.cc
+++ b/paddle/fluid/operators/elementwise/elementwise_sub_op_npu.cc
@@ -15,7 +15,7 @@ limitations under the License. */
 #include <memory>
 #include <string>
 
-#include "paddle/fluid/operators/elementwise/elementwise_sub_op.h"
+#include "paddle/fluid/operators/elementwise/elementwise_op.h"
 #include "paddle/fluid/platform/device/npu/npu_op_runner.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op_xpu.cc b/paddle/fluid/operators/elementwise/elementwise_sub_op_xpu.cc
index d12c6fc30cebaafd27c099ab708e0662477cb017..87c494b0e10bad64566b5248946c9b8b1b778f2f 100644
--- a/paddle/fluid/operators/elementwise/elementwise_sub_op_xpu.cc
+++ b/paddle/fluid/operators/elementwise/elementwise_sub_op_xpu.cc
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #ifdef PADDLE_WITH_XPU
-#include "paddle/fluid/operators/elementwise/elementwise_sub_op.h"
 #include "paddle/fluid/operators/elementwise/elementwise_op.h"
 #include "paddle/fluid/operators/elementwise/elementwise_xpu.h"
 #include "xpu/refactor/math.h"
diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h
index 763fc5f2674104a718e33f5ef5ac7b2a1a7b23f5..ad8fd317013908e8908dff8bea3440e24779454e 100644
--- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h
+++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h
@@ -32,6 +32,45 @@ using dnnl::stream;
 
 template <typename T, dnnl::algorithm BINARY_OP>
 class EltwiseMKLDNNKernel : public framework::OpKernel<T> {
+ private:
+  dnnl::post_ops get_post_ops(const framework::ExecutionContext& ctx) const {
+    dnnl::post_ops post_operations;
+    if (ctx.HasAttr("activation_type")) {
+      const float scale = ctx.HasAttr("activation_scale")
+                              ? ctx.Attr<float>("activation_scale")
+                              : 1.0f;
+      const float alpha = ctx.HasAttr("activation_alpha")
+                              ? ctx.Attr<float>("activation_alpha")
+                              : 0.0f;
+      const float beta = ctx.HasAttr("activation_beta")
+                             ? ctx.Attr<float>("activation_beta")
+                             : 0.0f;
+
+      static std::unordered_map<std::string, dnnl::algorithm> algo_map = {
+          {"relu", dnnl::algorithm::eltwise_relu},
+          {"tanh", dnnl::algorithm::eltwise_tanh},
+          {"leaky_relu", dnnl::algorithm::eltwise_relu},
+          {"swish", dnnl::algorithm::eltwise_swish},
+          {"hardswish", dnnl::algorithm::eltwise_hardswish},
+          {"sqrt", dnnl::algorithm::eltwise_sqrt},
+          {"abs", dnnl::algorithm::eltwise_abs},
+          {"clip", dnnl::algorithm::eltwise_clip},
+          {"gelu", dnnl::algorithm::eltwise_gelu_erf},
+          {"gelu_tanh", dnnl::algorithm::eltwise_gelu_tanh},
+          {"relu6", dnnl::algorithm::eltwise_bounded_relu},
+          {"sigmoid", dnnl::algorithm::eltwise_logistic}};
+
+      const auto& activation_type =
+          algo_map.find(ctx.Attr<std::string>("activation_type"));
+
+      if (activation_type != algo_map.end()) {
+        post_operations.append_eltwise(scale, activation_type->second, alpha,
+                                       beta);
+      }
+    }
+    return post_operations;
+  }
+
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     const auto& dev_ctx =
@@ -47,9 +86,9 @@ class EltwiseMKLDNNKernel : public framework::OpKernel<T> {
     float scale_o = ctx.Attr<float>("Scale_out");
     int axis = ctx.Attr<int>("axis");
 
-    platform::BinaryMKLDNNHandler<T> handler(BINARY_OP, axis, mkldnn_engine,
-                                             ctx.GetPlace(), x, y, z, scale_x,
-                                             scale_y, scale_o);
+    platform::BinaryMKLDNNHandler<T> handler(
+        BINARY_OP, axis, mkldnn_engine, ctx.GetPlace(), x, y, z, scale_x,
+        scale_y, scale_o, get_post_ops(ctx));
 
     const auto src_x_memory = handler.AcquireSrcMemory(x);
     const auto src_y_memory = handler.AcquireSecondSrcMemory(y);
diff --git a/paddle/fluid/operators/elementwise/test_elementwise_add_grad_grad.cc b/paddle/fluid/operators/elementwise/test_elementwise_add_grad_grad.cc
index 5222103256d614a2d6b1fa10662367ecb20d3cb2..ea009a38056f078689bd6dc4c9a41d2b34e8c1fa 100644
--- a/paddle/fluid/operators/elementwise/test_elementwise_add_grad_grad.cc
+++ b/paddle/fluid/operators/elementwise/test_elementwise_add_grad_grad.cc
@@ -17,8 +17,13 @@
 #include "paddle/fluid/operators/elementwise/test_elementwise_op_grad_grad.h"
 #include "paddle/fluid/platform/place.h"
 #include "paddle/phi/core/ddim.h"
+#include "paddle/phi/core/kernel_registry.h"
 
 USE_OP_ITSELF(elementwise_add);
+PD_DECLARE_KERNEL(add_double_grad, CPU, ALL_LAYOUT);
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+PD_DECLARE_KERNEL(add_double_grad, GPU, ALL_LAYOUT);
+#endif
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/elementwise/test_elementwise_add_op_inplace.cc b/paddle/fluid/operators/elementwise/test_elementwise_add_op_inplace.cc
index 9d4d11609ac2047aa8934cb2868f79359a816e12..ce5c6b701d95894db8e3a84215f537352914706a 100644
--- a/paddle/fluid/operators/elementwise/test_elementwise_add_op_inplace.cc
+++ b/paddle/fluid/operators/elementwise/test_elementwise_add_op_inplace.cc
@@ -21,9 +21,12 @@
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/place.h"
+#include "paddle/phi/core/kernel_registry.h"
 
 USE_OP_ITSELF(elementwise_add);
 
+PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT);
+
 namespace paddle {
 namespace operators {
 
diff --git a/paddle/fluid/operators/elementwise/test_elementwise_div_grad_grad.cc b/paddle/fluid/operators/elementwise/test_elementwise_div_grad_grad.cc
index 9aa206efed8c0111f56b6651e0228acc316b1bfe..3cecc52a3c481cf9cb4a1e2eba6ded704a8fa8ee 100644
--- a/paddle/fluid/operators/elementwise/test_elementwise_div_grad_grad.cc
+++ b/paddle/fluid/operators/elementwise/test_elementwise_div_grad_grad.cc
@@ -27,8 +27,14 @@
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/place.h"
+#include "paddle/phi/core/kernel_registry.h"
 
-USE_OP(elementwise_div);
+USE_OP_ITSELF(elementwise_div);
+
+PD_DECLARE_KERNEL(divide_double_grad, CPU, ALL_LAYOUT);
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+PD_DECLARE_KERNEL(divide_double_grad, GPU, ALL_LAYOUT);
+#endif
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/empty_op.cc b/paddle/fluid/operators/empty_op.cc
index e23342ebb5dc7639d68500964bfdfbd099d077cd..9e0e4e7fe1c6d26df7c4347d8bc81a985e6c973b 100644
--- a/paddle/fluid/operators/empty_op.cc
+++ b/paddle/fluid/operators/empty_op.cc
@@ -12,9 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/empty_op.h"
 #include "paddle/fluid/framework/op_registry.h"
 
+#include "paddle/fluid/framework/infershape_utils.h"
+#include "paddle/phi/infermeta/nullary.h"
+
 namespace paddle {
 namespace operators {
 
@@ -51,46 +53,6 @@ class EmptyOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
-  void InferShape(framework::InferShapeContext* context) const override {
-    OP_INOUT_CHECK(context->HasOutput("Out"), "Output", "Out", "empty");
-
-    if (context->HasInput("ShapeTensor")) {
-      auto shape_dims = context->GetInputDim("ShapeTensor");
-      int num_ele = 1;
-      for (int i = 0; i < shape_dims.size(); ++i) {
-        num_ele *= shape_dims[i];
-      }
-      auto vec_dims = std::vector<int>(num_ele, -1);
-      context->SetOutputDim("Out", phi::make_ddim(vec_dims));
-    } else if (context->HasInputs("ShapeTensorList")) {
-      std::vector<int> out_dims;
-      auto dims_list = context->GetInputsDim("ShapeTensorList");
-      for (size_t i = 0; i < dims_list.size(); ++i) {
-        auto& dims = dims_list[i];
-        PADDLE_ENFORCE_EQ(dims, phi::make_ddim({1}),
-                          platform::errors::InvalidArgument(
-                              "The shape of Tensor in list must be [1]. "
-                              "But received the shape is [%s]",
-                              dims));
-
-        out_dims.push_back(-1);
-      }
-
-      context->SetOutputDim("Out", phi::make_ddim(out_dims));
-    } else {
-      auto& shape = context->Attrs().Get<std::vector<int64_t>>("shape");
-      for (size_t i = 0; i < shape.size(); ++i) {
-        PADDLE_ENFORCE_GE(
-            shape[i], 0,
-            platform::errors::InvalidArgument(
-                "Each value of attribute 'shape' is expected to be no less "
-                "than 0. But recieved: shape[%u] = %d; shape = [%s].",
-                i, shape[i], phi::make_ddim(shape)));
-      }
-      context->SetOutputDim("Out", phi::make_ddim(shape));
-    }
-  }
-
  protected:
   framework::OpKernelType GetKernelTypeForVar(
       const std::string& var_name, const framework::Tensor& tensor,
@@ -126,14 +88,8 @@ class EmptyOpVarTypeInference : public framework::VarTypeInference {
 namespace ops = paddle::operators;
 namespace plat = paddle::platform;
 
-REGISTER_OPERATOR(
-    empty, ops::EmptyOp, ops::EmptyOpMaker, ops::EmptyOpVarTypeInference,
-    paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
-    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
-
-REGISTER_OP_CPU_KERNEL(empty, ops::EmptyKernel<plat::CPUDeviceContext, bool>,
-                       ops::EmptyKernel<plat::CPUDeviceContext, int>,
-                       ops::EmptyKernel<plat::CPUDeviceContext, int64_t>,
-                       ops::EmptyKernel<plat::CPUDeviceContext, float>,
-                       ops::EmptyKernel<plat::CPUDeviceContext, double>,
-                       ops::EmptyKernel<plat::CPUDeviceContext, plat::float16>);
+DECLARE_INFER_SHAPE_FUNCTOR(empty, EmptyInferShapeFunctor,
+                            PD_INFER_META(phi::CreateInferMeta));
+REGISTER_OP_WITHOUT_GRADIENT(empty, ops::EmptyOp, ops::EmptyOpMaker,
+                             ops::EmptyOpVarTypeInference,
+                             EmptyInferShapeFunctor);
diff --git a/paddle/fluid/operators/empty_op.cu.cc b/paddle/fluid/operators/empty_op.cu.cc
deleted file mode 100644
index 22799e507aeff7940274f729b174f50bfd9132a5..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/empty_op.cu.cc
+++ /dev/null
@@ -1,26 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/empty_op.h"
-
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-
-REGISTER_OP_CUDA_KERNEL(
-    empty, ops::EmptyKernel<plat::CUDADeviceContext, bool>,
-    ops::EmptyKernel<plat::CUDADeviceContext, int>,
-    ops::EmptyKernel<plat::CUDADeviceContext, int64_t>,
-    ops::EmptyKernel<plat::CUDADeviceContext, float>,
-    ops::EmptyKernel<plat::CUDADeviceContext, double>,
-    ops::EmptyKernel<plat::CUDADeviceContext, plat::float16>);
diff --git a/paddle/fluid/operators/empty_op.h b/paddle/fluid/operators/empty_op.h
deleted file mode 100644
index cb466fffcd7c7358b6e84c18b7895a17b2eaa907..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/empty_op.h
+++ /dev/null
@@ -1,47 +0,0 @@
-// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <string>
-#include <vector>
-
-#include "paddle/fluid/framework/convert_utils.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/utils.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-
-template <typename DeviceContext, typename T>
-class EmptyKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &context) const override {
-    auto dtype = static_cast<framework::proto::VarType::Type>(
-        context.Attr<int>("dtype"));
-
-    Tensor *out_tensor = context.Output<Tensor>("Out");
-
-    auto shape = GetShape(context);
-    out_tensor->Resize(shape);
-
-    out_tensor->mutable_data(context.GetPlace(),
-                             framework::TransToPhiDataType(dtype));
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/erf_op.cc b/paddle/fluid/operators/erf_op.cc
index f68f670394871114369f8b05b7f958c03d5508d0..64274d098c0585c28196743c09d5e6c78c3fe37d 100644
--- a/paddle/fluid/operators/erf_op.cc
+++ b/paddle/fluid/operators/erf_op.cc
@@ -16,8 +16,10 @@ limitations under the License. */
 #include <string>
 #include <unordered_map>
 
-#include "paddle/fluid/operators/erf_op.h"
-#include "paddle/fluid/platform/float16.h"
+#include "paddle/fluid/framework/infershape_utils.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/phi/core/infermeta_utils.h"
+#include "paddle/phi/infermeta/unary.h"
 
 namespace paddle {
 namespace operators {
@@ -29,18 +31,6 @@ class ErfOp : public framework::OperatorWithKernel {
         const framework::AttributeMap &attrs)
       : OperatorWithKernel(type, inputs, outputs, attrs) {}
 
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true,
-                      platform::errors::InvalidArgument(
-                          "Input(%s) of ErfOp should not be null.", "X"));
-    PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true,
-                      platform::errors::InvalidArgument(
-                          "Output(%s) of ErfOp should not be null.", "Out"));
-
-    ctx->ShareDim("X", /*->*/ "Out");
-    ctx->ShareLoD("X", /*->*/ "Out");
-  }
-
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
@@ -116,28 +106,10 @@ class ErfGradOpMaker : public framework::SingleGradOpMaker<T> {
 
 namespace ops = paddle::operators;
 
+DECLARE_INFER_SHAPE_FUNCTOR(erf, ErfInferShapeFunctor,
+                            PD_INFER_META(phi::UnchangedInferMeta));
 REGISTER_OPERATOR(erf, ops::ErfOp, ops::ErfOpMaker,
                   ops::ErfGradOpMaker<paddle::framework::OpDesc>,
-                  ops::ErfGradOpMaker<paddle::imperative::OpBase>);
+                  ops::ErfGradOpMaker<paddle::imperative::OpBase>,
+                  ErfInferShapeFunctor);
 REGISTER_OPERATOR(erf_grad, ops::ErfGradOp);
-REGISTER_OP_CPU_KERNEL(
-    erf, ops::ErfKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::ErfKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::ErfKernel<paddle::platform::CPUDeviceContext,
-                   paddle::platform::float16>);
-REGISTER_OP_CPU_KERNEL(
-    erf_grad, ops::ErfGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::ErfGradKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::ErfGradKernel<paddle::platform::CPUDeviceContext,
-                       paddle::platform::float16>);
-
-REGISTER_OP_CUDA_KERNEL(
-    erf, ops::ErfKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::ErfKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::ErfKernel<paddle::platform::CUDADeviceContext,
-                   paddle::platform::float16>);
-REGISTER_OP_CUDA_KERNEL(
-    erf_grad, ops::ErfGradKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::ErfGradKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::ErfGradKernel<paddle::platform::CUDADeviceContext,
-                       paddle::platform::float16>);
diff --git a/paddle/fluid/operators/erf_op.h b/paddle/fluid/operators/erf_op.h
deleted file mode 100644
index 4780b2e7f5b28d4a743f6d35046891b30cbefd00..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/erf_op.h
+++ /dev/null
@@ -1,66 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#ifndef _USE_MATH_DEFINES
-#define _USE_MATH_DEFINES
-#endif
-#include <cmath>
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/eigen/eigen_function.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename DeviceContext, typename T>
-class ErfKernel : public framework::OpKernel<T> {
- public:
-  virtual void Compute(const framework::ExecutionContext& context) const {
-    auto* out = context.Output<framework::Tensor>("Out");
-    auto* in = context.Input<framework::Tensor>("X");
-    out->mutable_data<T>(in->place());
-
-    auto eigen_out = framework::EigenVector<T>::Flatten(*out);
-    auto eigen_in = framework::EigenVector<T>::Flatten(*in);
-    auto& place =
-        *context.template device_context<DeviceContext>().eigen_device();
-    EigenErf<std::decay_t<decltype(place)>, T>::Eval(place, eigen_out,
-                                                     eigen_in);
-  }
-};
-
-template <typename DeviceContext, typename T>
-class ErfGradKernel : public framework::OpKernel<T> {
- public:
-  virtual void Compute(const framework::ExecutionContext& context) const {
-    auto* x = context.Input<framework::Tensor>("X");
-    auto* dout =
-        context.Input<framework::Tensor>(framework::GradVarName("Out"));
-    auto* dx = context.Output<framework::Tensor>(framework::GradVarName("X"));
-
-    dx->mutable_data<T>(dout->place());
-
-    auto eigen_x = framework::EigenVector<T>::Flatten(*x);
-    auto eigen_dout = framework::EigenVector<T>::Flatten(*dout);
-    auto eigen_dx = framework::EigenVector<T>::Flatten(*dx);
-    auto& place =
-        *context.template device_context<DeviceContext>().eigen_device();
-    EigenErfGrad<std::decay_t<decltype(place)>, T>::Eval(place, eigen_dx,
-                                                         eigen_x, eigen_dout);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/erfinv_op.cc b/paddle/fluid/operators/erfinv_op.cc
index 3d409b4c4f6772bc7b234208e78c5088eeb2fc00..374b00792622f91edc0b66cebb278cc79f30dc66 100644
--- a/paddle/fluid/operators/erfinv_op.cc
+++ b/paddle/fluid/operators/erfinv_op.cc
@@ -73,8 +73,8 @@ DECLARE_INPLACE_OP_INFERER(ErfinvInplaceInferer, {"X", "Out"});
 }  // namespace operators
 }  // namespace paddle
 
-DELCARE_INFER_SHAPE_FUNCTOR(erfinv, ErfinvInferShapeFunctor,
-                            PT_INFER_META(phi::UnchangedInferMeta));
+DECLARE_INFER_SHAPE_FUNCTOR(erfinv, ErfinvInferShapeFunctor,
+                            PD_INFER_META(phi::UnchangedInferMeta));
 
 REGISTER_OPERATOR(
     erfinv, paddle::operators::ErfinvOp, paddle::operators::ErfinvOpMaker,
diff --git a/paddle/fluid/operators/expand_as_v2_op.cc b/paddle/fluid/operators/expand_as_v2_op.cc
index 119e514a49e28fb3295e36947664770889bbdd81..97a35a34f23e96707269482e29da13a15538cdca 100755
--- a/paddle/fluid/operators/expand_as_v2_op.cc
+++ b/paddle/fluid/operators/expand_as_v2_op.cc
@@ -121,37 +121,9 @@ REGISTER_OPERATOR(expand_as_v2, ops::ExpandAsV2Op, ops::ExpandAsV2OpMaker,
                   ops::ExpandAsV2GradOpMaker<paddle::imperative::OpBase>);
 REGISTER_OPERATOR(expand_as_v2_grad, ops::ExpandAsV2GradOp,
                   ops::ExpandAsV2GradNoNeedBufVarsInferer);
-REGISTER_OP_CPU_KERNEL(
-    expand_as_v2,
-    ops::ExpandAsV2Kernel<paddle::platform::CPUDeviceContext, float>,
-    ops::ExpandAsV2Kernel<paddle::platform::CPUDeviceContext, double>,
-    ops::ExpandAsV2Kernel<paddle::platform::CPUDeviceContext, int>,
-    ops::ExpandAsV2Kernel<paddle::platform::CPUDeviceContext, int64_t>,
-    ops::ExpandAsV2Kernel<paddle::platform::CPUDeviceContext, bool>);
-REGISTER_OP_CPU_KERNEL(
-    expand_as_v2_grad,
-    ops::ExpandAsV2GradKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::ExpandAsV2GradKernel<paddle::platform::CPUDeviceContext, int64_t>,
-    ops::ExpandAsV2GradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::ExpandAsV2GradKernel<paddle::platform::CPUDeviceContext, double>);
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-REGISTER_OP_CUDA_KERNEL(
-    expand_as_v2,
-    ops::ExpandAsV2Kernel<paddle::platform::CUDADeviceContext, float>,
-    ops::ExpandAsV2Kernel<paddle::platform::CUDADeviceContext, double>,
-    ops::ExpandAsV2Kernel<paddle::platform::CUDADeviceContext, int>,
-    ops::ExpandAsV2Kernel<paddle::platform::CUDADeviceContext, int64_t>,
-    ops::ExpandAsV2Kernel<paddle::platform::CUDADeviceContext, bool>);
-REGISTER_OP_CUDA_KERNEL(
-    expand_as_v2_grad,
-    ops::ExpandAsV2GradKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::ExpandAsV2GradKernel<paddle::platform::CUDADeviceContext, int64_t>,
-    ops::ExpandAsV2GradKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::ExpandAsV2GradKernel<paddle::platform::CUDADeviceContext, double>);
-#endif
 
 REGISTER_OP_VERSION(expand_as_v2)
     .AddCheckpoint(
         R"ROC(fix expand_as_v2 and add new input [Y])ROC",
         paddle::framework::compatible::OpVersionDesc().NewInput(
-            "Y", "Expand X according to the shape of Y"));
\ No newline at end of file
+            "Y", "Expand X according to the shape of Y"));
diff --git a/paddle/fluid/operators/expand_as_v2_op.h b/paddle/fluid/operators/expand_as_v2_op.h
index d7560efc5c1f1244ae4eed4c68c59a38287057ee..f09e7764eed3959c7f0ca700b953dbd0c2891d12 100755
--- a/paddle/fluid/operators/expand_as_v2_op.h
+++ b/paddle/fluid/operators/expand_as_v2_op.h
@@ -32,219 +32,5 @@ template <typename T, size_t D, int MajorType = Eigen::RowMajor,
           typename IndexType = Eigen::DenseIndex>
 using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;
 
-template <typename DeviceContext, typename T>
-class ExpandAsV2Kernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto rank = context.Input<Tensor>("X")->dims().size();
-    auto target_shape = context.Attr<std::vector<int>>("target_shape");
-    auto target_rank = target_shape.size();
-    PADDLE_ENFORCE_GE(target_rank, rank,
-                      platform::errors::InvalidArgument(
-                          "The rank (%d) of the input 'target_tensor' for "
-                          "expand_as_v2 op must be greater than or equal to "
-                          "the rank (%d) of the input 'x'.",
-                          target_rank, rank));
-    PADDLE_ENFORCE_GE(rank, 1, platform::errors::InvalidArgument(
-                                   "The rank (%d) of the input 'x' for "
-                                   "expand_as_v2 op must be positive.",
-                                   rank));
-    PADDLE_ENFORCE_LE(target_rank, MAX_RANK_SUPPORTED,
-                      platform::errors::InvalidArgument(
-                          "The rank (%d) of the input 'target_tensor' for "
-                          "expand_as_v2 op must be less than or equal to %d.",
-                          target_rank, MAX_RANK_SUPPORTED));
-
-    switch (target_rank) {
-      case 1:
-        ExpandAs<1>(context);
-        break;
-      case 2:
-        ExpandAs<2>(context);
-        break;
-      case 3:
-        ExpandAs<3>(context);
-        break;
-      case 4:
-        ExpandAs<4>(context);
-        break;
-      case 5:
-        ExpandAs<5>(context);
-        break;
-      case 6:
-        ExpandAs<6>(context);
-        break;
-    }
-  }
-
- protected:
-  template <int Rank>
-  void ExpandAs(const framework::ExecutionContext& context) const {
-    auto* in0 = context.Input<Tensor>("X");
-    auto in_dims = in0->dims();
-    auto target_shape = context.Attr<std::vector<int>>("target_shape");
-    auto vec_in_dims = phi::vectorize<int>(in_dims);
-    auto diff = target_shape.size() - vec_in_dims.size();
-    vec_in_dims.insert(vec_in_dims.begin(), diff, 1);
-    std::vector<int> repeat_times(vec_in_dims.size());
-    for (size_t i = 0; i < vec_in_dims.size(); ++i) {
-      PADDLE_ENFORCE_NE(target_shape[i], 0,
-                        platform::errors::InvalidArgument(
-                            "The value of target shape cannot be zero."));
-      if (i < diff) {
-        PADDLE_ENFORCE_GT(
-            target_shape[i], 0,
-            platform::errors::InvalidArgument(
-                "The expanded size (%d) for non-existing dimensions must be "
-                "positive for expand_as_v2 op.",
-                target_shape[i]));
-        repeat_times[i] = target_shape[i];
-      } else if (target_shape[i] > 0) {
-        if (vec_in_dims[i] != 1) {
-          PADDLE_ENFORCE_EQ(
-              vec_in_dims[i], target_shape[i],
-              platform::errors::InvalidArgument(
-                  "The value (%d) of the non-singleton dimension does not match"
-                  " the corresponding value (%d) in shape for expand_as_v2 op.",
-                  vec_in_dims[i], target_shape[i]));
-          repeat_times[i] = 1;
-        } else {
-          repeat_times[i] = target_shape[i];
-        }
-      } else {
-        PADDLE_ENFORCE_EQ(
-            target_shape[i], -1,
-            platform::errors::InvalidArgument(
-                "When the value in shape is negative for expand_as_v2 op, "
-                "only -1 is supported, but the value received is %d.",
-                target_shape[i]));
-        repeat_times[i] = 1;
-      }
-    }
-    auto* out0 = context.Output<Tensor>("Out");
-    Eigen::DSizes<Eigen::DenseIndex, Rank> bcast_dims;
-    for (size_t i = 0; i < repeat_times.size(); ++i) {
-      bcast_dims[i] = repeat_times[i];
-    }
-
-    framework::DDim new_in_dims = phi::make_ddim(vec_in_dims);
-    framework::DDim out_dims = phi::make_ddim(target_shape);
-
-    out0->Resize(out_dims);
-    auto x = EigenTensor<T, Rank>::From(*in0, new_in_dims);
-    out0->mutable_data<T>(context.GetPlace());
-    auto y = EigenTensor<T, Rank>::From(*out0, out_dims);
-    auto& place =
-        *context.template device_context<DeviceContext>().eigen_device();
-    EigenBroadcast<std::decay_t<decltype(place)>, T, Rank>::Eval(place, y, x,
-                                                                 bcast_dims);
-  }
-};
-
-template <typename DeviceContext, typename T>
-class ExpandAsV2GradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* in0 = context.Input<Tensor>("X");
-    auto target_shape = context.Attr<std::vector<int>>("target_shape");
-    auto x_dims = in0->dims();
-    auto vec_in_dims = phi::vectorize<int>(x_dims);
-    auto diff = target_shape.size() - vec_in_dims.size();
-    vec_in_dims.insert(vec_in_dims.begin(), diff, 1);
-    std::vector<int> repeat_times(vec_in_dims.size());
-    for (size_t i = 0; i < vec_in_dims.size(); ++i) {
-      repeat_times[i] = target_shape[i] / vec_in_dims[i];
-    }
-    std::vector<int> reshape_dims_vec;
-    std::vector<int> reduce_dims_vec;
-    for (size_t i = 0; i < repeat_times.size(); ++i) {
-      reduce_dims_vec.push_back(reshape_dims_vec.size());
-      reshape_dims_vec.push_back(repeat_times[i]);
-      reshape_dims_vec.push_back(vec_in_dims[i]);
-    }
-
-    int dims = reduce_dims_vec.size();
-    bool just_copy = true;
-    for (size_t i = 0; i < repeat_times.size(); i++) {
-      if (repeat_times[i] != 1) {
-        just_copy = false;
-        break;
-      }
-    }
-    // no need reduce, just copy
-    if (just_copy) {
-      auto* in0 = context.Input<Tensor>(framework::GradVarName("Out"));
-      auto* out0 = context.Output<Tensor>(framework::GradVarName("X"));
-      out0->mutable_data<T>(context.GetPlace());
-      framework::TensorCopy(*in0, context.GetPlace(), context.device_context(),
-                            out0);
-    } else {
-      PADDLE_ENFORCE_GE(dims, 1,
-                        platform::errors::InvalidArgument(
-                            "The rank of the input 'Out@GRAD' for "
-                            "expand_as_v2_grad op must be greater than or "
-                            "equal to 1, but the value received is %d.",
-                            dims));
-      PADDLE_ENFORCE_LE(dims, MAX_RANK_SUPPORTED,
-                        platform::errors::InvalidArgument(
-                            "The rank of the input 'Out@GRAD' for "
-                            "expand_as_v2_grad op must be less than or equal "
-                            "to %d, but the value received is %d.",
-                            MAX_RANK_SUPPORTED, dims));
-      switch (dims) {
-        case 1:
-          ExpandAsBackward<1>(context, reshape_dims_vec, reduce_dims_vec);
-          break;
-        case 2:
-          ExpandAsBackward<2>(context, reshape_dims_vec, reduce_dims_vec);
-          break;
-        case 3:
-          ExpandAsBackward<3>(context, reshape_dims_vec, reduce_dims_vec);
-          break;
-        case 4:
-          ExpandAsBackward<4>(context, reshape_dims_vec, reduce_dims_vec);
-          break;
-        case 5:
-          ExpandAsBackward<5>(context, reshape_dims_vec, reduce_dims_vec);
-          break;
-        case 6:
-          ExpandAsBackward<6>(context, reshape_dims_vec, reduce_dims_vec);
-          break;
-        default:
-          PADDLE_THROW(platform::errors::InvalidArgument(
-              "Only support tensor with rank being between 1 and 6. But "
-              "received tensor's rank = %d.",
-              dims));
-      }
-    }
-  }
-
- protected:
-  template <int Dims>
-  void ExpandAsBackward(const framework::ExecutionContext& context,
-                        const std::vector<int>& reshape_dims_vec,
-                        const std::vector<int>& reduce_dims_vec) const {
-    size_t reshape_size = reshape_dims_vec.size();
-    size_t reduce_size = reduce_dims_vec.size();
-    auto* in0 = context.Input<Tensor>(framework::GradVarName("Out"));
-    auto* out0 = context.Output<Tensor>(framework::GradVarName("X"));
-    out0->mutable_data<T>(context.GetPlace());
-    auto x_grad = EigenVector<T>::Flatten(*out0);
-    Eigen::DSizes<Eigen::DenseIndex, Dims * 2> reshape_dims;
-    for (size_t i = 0; i < reshape_size; ++i) {
-      reshape_dims[i] = reshape_dims_vec[i];
-    }
-    Eigen::DSizes<Eigen::DenseIndex, Dims> reduce_dims;
-    for (size_t i = 0; i < reduce_size; ++i) {
-      reduce_dims[i] = reduce_dims_vec[i];
-    }
-    auto out_grad = EigenVector<T>::Flatten(*in0);
-    auto& place =
-        *context.template device_context<DeviceContext>().eigen_device();
-    EigenBroadcastGrad<std::decay_t<decltype(place)>, T, Dims>::Eval(
-        place, x_grad, out_grad, reduce_dims, reshape_dims);
-  }
-};
-
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/expand_op_npu_test.cc b/paddle/fluid/operators/expand_op_npu_test.cc
index cdd4e1dbaae6a6a74bb11be44589877234021764..df00ae54c1036b1b0f0899eb0a949d58c398aa48 100644
--- a/paddle/fluid/operators/expand_op_npu_test.cc
+++ b/paddle/fluid/operators/expand_op_npu_test.cc
@@ -24,7 +24,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/operators/dropout_op.h"
 #include "paddle/fluid/string/printf.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
diff --git a/paddle/fluid/operators/exponential_op.cc b/paddle/fluid/operators/exponential_op.cc
index ee456dcdafbc51d547e7beacc4e4e79f98738b88..1a48a6767852e138e7725a68ca4ffc56de8234be 100644
--- a/paddle/fluid/operators/exponential_op.cc
+++ b/paddle/fluid/operators/exponential_op.cc
@@ -76,7 +76,7 @@ class ExponentialKernel<platform::CPUDeviceContext, T>
     auto engine = gen->GetCPUEngine();
 
     std::uniform_real_distribution<T> uniform(0.0, 1.0);
-    distribution::exponential_transform<T> trans(lambda);
+    phi::funcs::exponential_transform<T> trans(lambda);
     for (int64_t i = 0; i < size; ++i) {
       out_data[i] = trans(uniform(*engine));
     }
diff --git a/paddle/fluid/operators/exponential_op.cu b/paddle/fluid/operators/exponential_op.cu
index 8b989501e4f4248b0c2e3b23e1e75a4865b08588..d5abbf9a26afe6bcbbd8549f59d632fc4e53fec2 100644
--- a/paddle/fluid/operators/exponential_op.cu
+++ b/paddle/fluid/operators/exponential_op.cu
@@ -26,9 +26,9 @@ class ExponentialKernel<platform::CUDADeviceContext, T>
     auto& dev_cxt = ctx.template device_context<platform::CUDADeviceContext>();
     T lambda = static_cast<T>(ctx.Attr<float>("lambda"));
 
-    distribution::uniform_distribution<T> dist;
-    distribution::exponential_transform<T> trans(lambda);
-    distribution::distribution_and_transform<T>(dev_cxt, out, dist, trans);
+    phi::funcs::uniform_distribution<T> dist;
+    phi::funcs::exponential_transform<T> trans(lambda);
+    phi::funcs::distribution_and_transform<T>(dev_cxt, out, dist, trans);
   }
 };
 
diff --git a/paddle/fluid/operators/exponential_op.h b/paddle/fluid/operators/exponential_op.h
index fbcabc594db0814da1ec50934a0f02514dc208be..7ded174a9f47ede48a49b19b25539867ce344fb0 100644
--- a/paddle/fluid/operators/exponential_op.h
+++ b/paddle/fluid/operators/exponential_op.h
@@ -17,7 +17,7 @@
 #include "paddle/fluid/framework/generator.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/operators/distribution_helper.h"
+#include "paddle/phi/kernels/funcs/distribution_helper.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/eye_op.cc b/paddle/fluid/operators/eye_op.cc
index 8f8a0f174a79f13f0bee7aa7b425f8c645e15687..537c218d357b67980216ab3053707b8adb867c01 100644
--- a/paddle/fluid/operators/eye_op.cc
+++ b/paddle/fluid/operators/eye_op.cc
@@ -12,7 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/phi/core/infermeta_utils.h"
+#include "paddle/phi/infermeta/nullary.h"
 
 namespace paddle {
 namespace operators {
@@ -21,24 +24,6 @@ class EyeOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true,
-                      platform::errors::InvalidArgument(
-                          "Output(Out) of EyeOP should not be null."));
-    auto num_rows = ctx->Attrs().Get<int64_t>("num_rows");
-    PADDLE_ENFORCE_EQ(
-        num_rows >= 0, true,
-        platform::errors::InvalidArgument(
-            "The value of Input(num_rows) should be non-negative int."));
-    auto num_columns = ctx->Attrs().Get<int64_t>("num_columns");
-    if (num_columns == -1) num_columns = num_rows;
-    PADDLE_ENFORCE_EQ(
-        num_columns >= 0, true,
-        platform::errors::InvalidArgument(
-            "The value of Input(num_columns) should be non-negative int."));
-    ctx->SetOutputDim("Out", {num_rows, num_columns});
-  }
-
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
@@ -82,8 +67,11 @@ Return an identity tensor whose shape is [num_rows, num_columns].
 }  // namespace paddle
 
 namespace ops = paddle::operators;
+DECLARE_INFER_SHAPE_FUNCTOR(eye, EyeInferShapeFunctor,
+                            PD_INFER_META(phi::EyeInferMeta));
 
 REGISTER_OPERATOR(
     eye, ops::EyeOp, ops::EyeOpMaker, ops::EyeOpVarTypeInference,
     paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
-    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
+    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
+    EyeInferShapeFunctor);
diff --git a/paddle/fluid/operators/feed_forward_test.cu b/paddle/fluid/operators/feed_forward_test.cu
index 0eb84f18f25f03b1fd0310c5815ee342ff835a6f..27a235765227f15dd412dcd6ad55f2a24471c6da 100644
--- a/paddle/fluid/operators/feed_forward_test.cu
+++ b/paddle/fluid/operators/feed_forward_test.cu
@@ -21,6 +21,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/operators/fused/attn_feed_forward.h"
 #include "paddle/fluid/platform/float16.h"
+#include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
 namespace framework = paddle::framework;
@@ -29,6 +30,11 @@ namespace platform = paddle::platform;
 USE_OP(matmul);
 USE_OP_ITSELF(elementwise_add);
 
+PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT);
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+PD_DECLARE_KERNEL(add_grad, GPU, ALL_LAYOUT);
+#endif
+
 // get paddle matmul op results as baseline
 template <typename T>
 void GetLinearOp(const std::vector<T> &x, const std::vector<T> &y,
diff --git a/paddle/fluid/operators/fill_constant_op_npu.cc b/paddle/fluid/operators/fill_constant_op_npu.cc
index 79018f2a97448a8c6265a969dad37bce77d1b7ee..cb03add3143278260d41c3893e7adad976908d4e 100644
--- a/paddle/fluid/operators/fill_constant_op_npu.cc
+++ b/paddle/fluid/operators/fill_constant_op_npu.cc
@@ -65,7 +65,7 @@ class FillConstantNPUKernel : public framework::OpKernel<T> {
       tensor_value.mutable_data<T>({1}, ctx.GetPlace());
       FillNpuTensorWithConstant<T>(&tensor_value, value);
       NpuOpRunner runner;
-#if (CANN_VERSION_CODE >= 503003)
+#if (CANN_VERSION_CODE >= 503003 && CANN_VERSION_CODE < 504001)
       runner.SetType("FillD")
           .AddInput(tensor_value)
           .AddOutput(*out_var)
diff --git a/paddle/fluid/operators/filter_by_instag_op.cu b/paddle/fluid/operators/filter_by_instag_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..7870efba4e7a1a285bbd4b28b04c2b15f263c347
--- /dev/null
+++ b/paddle/fluid/operators/filter_by_instag_op.cu
@@ -0,0 +1,597 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// #if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 11000
+
+#if defined(PADDLE_WITH_CUDA)
+#include <cooperative_groups.h>
+#endif
+
+#include <thrust/copy.h>
+#include <thrust/device_vector.h>
+#include <cstring>
+#include <random>
+#include <string>
+#include <vector>
+
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/mixed_vector.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/memory/memcpy.h"
+#include "paddle/fluid/platform/device/gpu/gpu_info.h"
+#include "paddle/fluid/platform/enforce.h"
+
+#include "paddle/fluid/operators/filter_by_instag_op.h"
+
+#if defined(PADDLE_WITH_CUDA)
+namespace cg = cooperative_groups;
+#endif
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+using SelectedRows = phi::SelectedRows;
+using LoDTensor = framework::LoDTensor;
+
+template <typename T>
+using Vector = framework::Vector<T>;
+
+#define WARP_SIZE 32
+#define MAX_WARP_NUM 32
+
+#if defined(PADDLE_WITH_CUDA)
+
+template <typename T>
+__global__ void filter_copy_fuse_kernel(
+    const size_t N, const int ins_per_thread, size_t* x1_lods_data,
+    size_t* x2_lods_data, const int64_t* x2_data, const int64_t* x3_data,
+    int64_t filter_tag_size, T* out_data, int64_t* map_data,
+    size_t* map_lods_data, size_t* out_lods_data, size_t* out_idx_data,
+    const T* x1_data, int x1_embed_size, float* loss_weight_data,
+    float fill_value) {
+  // N is instance num
+  // one threads for ins_per_thread instances
+  int idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+  cg::thread_block b = cg::this_thread_block();
+  cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
+
+  int gid = idx / WARP_SIZE;
+
+  // general use
+  int thread_num =
+      (N + (ins_per_thread - 1)) / ins_per_thread;  // real thread num
+  int total_warp_num = thread_num / WARP_SIZE;      // 30
+  int remain_thread_num = thread_num % WARP_SIZE;   // 16
+
+  int warp_thread_num = -1;
+  if (gid < total_warp_num) {
+    warp_thread_num = WARP_SIZE;
+  } else {
+    warp_thread_num = remain_thread_num;
+  }
+
+  int group_num = total_warp_num;
+  if (remain_thread_num > 0) {
+    group_num = total_warp_num + 1;
+  }
+
+  if (gid >= group_num) return;
+
+  int ins_start = idx * ins_per_thread;
+  int ins_end = (idx + 1) * ins_per_thread;
+
+  if (N < ins_end) ins_end = N;
+
+  int flag_data[5];
+  int prefix_sum_data[5];
+  int prefix_sum_data2[5];
+
+  __shared__ int shr[MAX_WARP_NUM];
+  __shared__ int shr2[MAX_WARP_NUM];
+  __shared__ int shr3[MAX_WARP_NUM];
+
+  for (int p = ins_start; p < ins_end; p++) {
+    int ins_tag_start = x2_lods_data[p];
+    int ins_tag_end = x2_lods_data[p + 1];
+    flag_data[p - ins_start] = 0;
+    // filter logic
+    int i = ins_tag_start;
+    for (; i < ins_tag_end; i++) {
+      int64_t ins_tag = x2_data[i];
+      int j = 0;
+      for (; j < filter_tag_size; j++) {
+        if (x3_data[j] == ins_tag) break;
+      }
+      // if ins_tag in filter tag
+      if (j < filter_tag_size) {
+        flag_data[p - ins_start] = 1;
+        break;
+      }
+    }
+  }
+
+  int sum_addr = 0;
+  int sum_flag = 0;
+  int sum_out_lods = 0;
+
+  int local_addr = 0;
+  int local_flag = 0;
+  int local_out_lods = 0;
+
+  if (ins_start < ins_end) {
+    for (int p = ins_start; p < ins_end; p++) {
+      int previous = -1;
+      if (p == ins_start) {
+        previous = 0;
+      } else {
+        previous = prefix_sum_data[p - ins_start - 1];
+      }
+
+      prefix_sum_data[p - ins_start] =
+          previous +
+          flag_data[p - ins_start] * (x1_lods_data[p + 1] - x1_lods_data[p]);
+    }
+
+    local_addr = prefix_sum_data[ins_end - 1 - ins_start];
+    sum_addr = local_addr;
+
+    for (int p = ins_start; p < ins_end; p++) {
+      local_flag += flag_data[p - ins_start];
+    }
+    sum_flag = local_flag;
+
+    for (int p = ins_start; p < ins_end; p++) {
+      local_out_lods +=
+          flag_data[p - ins_start] * (x1_lods_data[p + 1] - x1_lods_data[p]);
+    }
+
+    sum_out_lods = local_out_lods;
+  }
+
+  for (int i = 1; i < warp_thread_num; i *= 2) {
+    int temp_addr = g.shfl_up(sum_addr, i);
+    int temp_flag = g.shfl_up(sum_flag, i);
+    int temp_out_lods = g.shfl_up(sum_out_lods, i);
+
+    if (g.thread_rank() >= i) {
+      sum_addr += temp_addr;
+      sum_flag += temp_flag;
+      sum_out_lods += temp_out_lods;
+    }
+  }
+
+  if (g.thread_rank() == warp_thread_num - 1) {
+    shr[gid] = sum_addr;
+    shr2[gid] = sum_flag;
+    shr3[gid] = sum_out_lods;
+  }
+
+  b.sync();
+
+  int sum_addr2 = 0;
+  int sum_flag2 = 0;
+  int sum_out_lods2 = 0;
+
+  // communicate between warp
+  if (g.thread_rank() < group_num) {
+    sum_addr2 = shr[g.thread_rank()];
+    sum_flag2 = shr2[g.thread_rank()];
+    sum_out_lods2 = shr3[g.thread_rank()];
+  }
+
+  for (int i = 1; i < group_num; i *= 2) {
+    int temp_addr2 = g.shfl_up(sum_addr2, i);
+    int temp_flag2 = g.shfl_up(sum_flag2, i);
+    int temp_out_lods2 = g.shfl_up(sum_out_lods2, i);
+
+    if (g.thread_rank() >= i) {
+      sum_addr2 += temp_addr2;
+      sum_flag2 += temp_flag2;
+      sum_out_lods2 += temp_out_lods2;
+    }
+  }
+
+  int sum_addr3 = g.shfl(sum_addr2, gid);
+  int sum_flag3 = g.shfl(sum_flag2, gid);
+  int sum_out_lods3 = g.shfl(sum_out_lods2, gid);
+
+  int p_flag;
+  int p_addr;
+  int p_out_lods;
+
+  if (ins_start < ins_end) {
+    p_addr = sum_addr3 - shr[gid] + sum_addr - local_addr;
+    p_flag = sum_flag3 - shr2[gid] + sum_flag - local_flag;
+    p_out_lods = sum_out_lods3 - shr3[gid] + sum_out_lods - local_out_lods;
+
+    for (int p = ins_start; p < ins_end; p++) {
+      if (ins_start == p) {
+        prefix_sum_data2[p - ins_start] = p_addr;
+      } else {
+        prefix_sum_data2[p - ins_start] =
+            prefix_sum_data2[p - ins_start - 1] +
+            flag_data[p - ins_start - 1] *
+                (x1_lods_data[p] - x1_lods_data[p - 1]);
+      }
+    }
+
+    if (gid == 0 && g.thread_rank() == group_num - 1) {
+      *out_idx_data = (sum_flag2 + 1);
+      map_lods_data[sum_flag2] = sum_flag2;
+    }
+  }
+
+  int sum_out_lods4 = g.shfl(sum_out_lods2 + 1, group_num - 1);
+
+  if (ins_start < ins_end) {
+    int out_lods_idx = p_flag + 1;
+    for (int p = ins_start; p < ins_end; p++) {
+      if (flag_data[p - ins_start] == 1) {
+        size_t batch_len = x1_lods_data[p + 1] - x1_lods_data[p];
+        int t = out_lods_idx - 1;
+        int previous;
+        if (out_lods_idx == p_flag + 1) {
+          previous = p_out_lods;
+        } else {
+          previous = out_lods_data[t];
+        }
+        map_data[t * 3] = (int64_t)previous;
+        map_data[t * 3 + 1] = x1_lods_data[p];
+        map_lods_data[t] = t;
+        out_lods_data[out_lods_idx] = previous + batch_len;
+        map_data[t * 3 + 2] = batch_len;
+        out_lods_idx++;
+      }
+    }
+
+    // fill loss_weight_data
+    if (sum_out_lods4 > 1) {
+      int out_data_num = sum_out_lods4 - 1;
+      int out_start = ins_start;
+      if (out_start < out_data_num) {
+        int out_end = ins_end >= out_data_num ? out_data_num : ins_end;
+        for (int p = out_start; p < out_end; p++) {
+          loss_weight_data[p] = fill_value;
+        }
+      }
+    }
+
+    for (int p = ins_start; p < ins_end; p++) {
+      // copy logic
+      if (flag_data[p - ins_start] == 1) {
+        auto output_start_idx = prefix_sum_data2[p - ins_start];
+        T* dst = out_data + output_start_idx * x1_embed_size;
+        const T* src_start = x1_data + x1_lods_data[p] * x1_embed_size;
+        const T* src_end = x1_data + x1_lods_data[p + 1] * x1_embed_size;
+        for (const T *j = src_start; j != src_end; dst++, j++) {
+          *dst = *j;
+        }
+      }
+    }
+  }
+
+  b.sync();
+}
+
+template <typename T>
+__global__ void copy_grad_kernel(const size_t N, const int ins_per_thread,
+                                 const T* out_grad_data, T* x1_grad_data,
+                                 const int64_t* map_data, int x1_embed_size) {
+  // N is instance num
+  // one threads for one instance
+  int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  int ins_start = idx * ins_per_thread;
+  int ins_end = (idx + 1) * ins_per_thread;
+  if (ins_start >= N) {
+    return;
+  }
+  if (ins_end > N) ins_end = N;
+  for (int p = ins_start; p < ins_end; p++) {
+    T* dst = x1_grad_data + map_data[p * 3 + 1] * x1_embed_size;
+    const T* src_start = out_grad_data + map_data[p * 3] * x1_embed_size;
+    const T* src_end =
+        out_grad_data + (map_data[p * 3] + map_data[p * 3 + 2]) * x1_embed_size;
+
+    for (const T *j = src_start; j != src_end; dst++, j++) {
+      *dst = *j;
+    }
+  }
+}
+
+#endif
+
+template <typename T>
+class FilterByInstagGPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+#if defined(PADDLE_WITH_CUDA)
+
+    auto gpu_place = context.GetPlace();
+
+    gpuStream_t current_stream = context.cuda_device_context().stream();
+
+    int max_thread_num_per_block = 1024;
+    //    context.cuda_device_context().GetMaxThreadsPerBlock();
+    // X1 is global FC output
+    // Dim [batch size, embedding size]
+    const LoDTensor* x1 = context.Input<LoDTensor>("Ins");
+    bool is_lod = context.Attr<bool>("is_lod");
+
+    int is_x1_lod = -1;
+    if (is_lod)
+      is_x1_lod = 1;
+    else
+      is_x1_lod = 0;
+
+    int64_t out_val_if_empty = context.Attr<int64_t>("out_val_if_empty");
+    size_t x1_embed_size = x1->dims()[1];
+    // X2 is ins tag list
+    // LoD [[0, Sum(ins1), Sum(ins1, ins2), ... ]]
+    const LoDTensor* x2 = context.Input<LoDTensor>("Ins_tag");
+    // expected auto = const int64_t
+    const int64_t* x2_data = x2->data<int64_t>();
+
+    // X3 is local fc tag list
+    // LoD [[0, Sum(fc1), Sum(fc1, fc2) ...]]
+    const Tensor* x3 = context.Input<Tensor>("Filter_tag");
+    const int64_t* x3_data = x3->data<int64_t>();
+
+    Vector<size_t> x2_lods;
+    if (x2->lod().size() != 0) {  // lod_level = 1
+      x2_lods = x2->lod()[0];
+    } else {  // lod_level = 0
+      const size_t x2_lods_size = x2->dims()[0];
+      const size_t instag_per_num = x2->dims()[1];
+      // x2_lods.resize(x2->dims()[0] + 1);
+      // move to cuda
+      x2_lods.push_back(0);
+      for (size_t i = 0; i < x2_lods_size; i++) {
+        x2_lods.push_back(x2_lods.back() + instag_per_num);
+      }
+    }
+
+    const size_t x2_lods_size = x2_lods.size() - 1;
+    paddle::framework::MixVector<size_t> mixv_x2_lods(&x2_lods);
+
+    size_t* x2_lods_data = mixv_x2_lods.CUDAMutableData(gpu_place);
+
+    Vector<size_t> x1_lods;
+    if (!is_x1_lod) {
+      x1_lods.push_back(0);
+      for (int i = 0; i < x1->dims()[0]; i++) {
+        x1_lods.push_back(i + 1);
+      }
+    } else {
+      // x1_lods = context.Input<LoDTensor>("Ins")->lod()[0];
+      // new: lod_level=0 => lod() return {}
+      if (x1->lod().size() != 0) {  // lod_level = 1
+        x1_lods = x1->lod()[0];
+      } else {  // lod_level = 0
+        // x1_lods.resize(x1->dims()[0] + 1);
+        // move to cuda
+        x1_lods.push_back(0);
+        for (int i = 0; i < x1->dims()[0]; i++) {
+          x1_lods.push_back(i + 1);
+        }
+      }
+    }
+
+    paddle::framework::MixVector<size_t> mixv_x1_lods(&x1_lods);
+
+    size_t* x1_lods_data = mixv_x1_lods.CUDAMutableData(gpu_place);
+    auto* x1_data = x1->data<T>();
+
+    // set output value
+    // for those whose ins been dropout, set 0 for whole lines.
+    // otherwise, copy whole line
+    // Dim [local fc count, batch size, embedding size]
+    LoDTensor* out = context.Output<LoDTensor>("Out");
+    LoDTensor* map = context.Output<LoDTensor>("IndexMap");
+    LoDTensor* loss_weight = context.Output<LoDTensor>("LossWeight");
+
+    int out_first = x1_lods.back();
+
+    out->Resize(phi::make_ddim({(int64_t)out_first, (int64_t)x1_embed_size}));
+    map->Resize(phi::make_ddim({(int64_t)x2_lods_size, 3}));
+    loss_weight->Resize(phi::make_ddim({(int64_t)x2_lods_size, 1}));
+
+    T* out_data = out->mutable_data<T>(gpu_place);
+    int64_t* map_data = map->mutable_data<int64_t>(gpu_place);
+    float* loss_weight_data = loss_weight->mutable_data<float>(gpu_place);
+
+    int block_size = max_thread_num_per_block;
+    int ins_per_thread = (x2_lods_size + block_size - 1) / block_size;
+    dim3 block_dim(block_size);
+    dim3 grid_dim(1);
+
+    Vector<size_t> out_lods(x2_lods_size + 1, 0);
+    Vector<size_t> map_lods(x2_lods_size + 1, 0);
+
+    paddle::framework::MixVector<size_t> mixv_out_lods(&out_lods);
+    paddle::framework::MixVector<size_t> mixv_map_lods(&map_lods);
+
+    // thrust::device_vector<size_t> out_idx(1);
+    Vector<size_t> out_idx(1, 0);
+    paddle::framework::MixVector<size_t> mixv_out_idx(&out_idx);
+
+    size_t* out_idx_data = mixv_out_idx.CUDAMutableData(gpu_place);
+    size_t* out_lods_data = mixv_out_lods.CUDAMutableData(gpu_place);
+    size_t* map_lods_data = mixv_map_lods.CUDAMutableData(gpu_place);
+
+    float fill_value = 1.0;
+
+    filter_copy_fuse_kernel<<<grid_dim, block_dim, 0, current_stream>>>(
+        x2_lods_size, ins_per_thread, x1_lods_data, x2_lods_data, x2_data,
+        x3_data, x3->numel(), out_data, map_data, map_lods_data, out_lods_data,
+        out_idx_data, x1_data, x1_embed_size, loss_weight_data, fill_value);
+
+    platform::GpuStreamSync(current_stream);
+
+    mixv_out_lods.resize(mixv_out_idx[0]);
+
+    if (mixv_out_lods.size() - 1 > 0) {
+      out->Resize(phi::make_ddim(
+          {(int64_t)mixv_out_lods.back(), (int64_t)x1_embed_size}));
+
+      map->Resize(phi::make_ddim({(int64_t)mixv_out_lods.size() - 1, 3}));
+      loss_weight->Resize(
+          phi::make_ddim({(int64_t)mixv_out_lods.size() - 1, 1}));
+
+    } else {
+      out->Resize(phi::make_ddim({1, (int64_t)x1_embed_size}));
+      map->Resize(phi::make_ddim({1, 3}));
+      loss_weight->Resize(phi::make_ddim({1, 1}));
+    }
+
+    if (mixv_out_lods.size() - 1 > 0) {
+      map_lods.resize(mixv_out_lods.size());
+
+      mixv_map_lods.CopyToCPU();
+
+      std::vector<Vector<size_t>> map_lod_info;
+      map_lod_info.emplace_back(map_lods);
+
+      map->set_lod(map_lod_info);
+      loss_weight->set_lod(map_lod_info);
+
+      mixv_out_lods.CopyToCPU();
+      std::vector<Vector<size_t>> out_lod_info;
+      out_lod_info.emplace_back(out_lods);
+      out->set_lod(out_lod_info);
+
+    } else {
+      Vector<size_t> map_lods(2, 0);
+      paddle::framework::MixVector<size_t> mixv_map_lods(&map_lods);
+      thrust::device_ptr<int64_t> map_data_ptr(map_data);
+
+      map_data_ptr[0] = 0;
+      map_data_ptr[1] = 1;
+      map_data_ptr[2] = 1;
+
+      mixv_map_lods[0] = 0;
+      mixv_map_lods[1] = 1;
+      mixv_out_lods.push_back(1);
+
+      mixv_map_lods.CopyToCPU();
+      mixv_out_lods.CopyToCPU();
+
+      std::vector<Vector<size_t>> map_lod_info;
+      map_lod_info.emplace_back(map_lods);
+      map->set_lod(map_lod_info);
+
+      loss_weight->set_lod(map_lod_info);
+
+      std::vector<Vector<size_t>> out_lod_info;
+      out_lod_info.emplace_back(out_lods);
+      out->set_lod(out_lod_info);
+
+      thrust::device_ptr<T> out_data_ptr(out_data);
+
+      // gpu kernel
+      if (std::is_same<T, int32_t>::value) {
+        thrust::fill(out_data_ptr, out_data_ptr + out->numel(),
+                     static_cast<int32_t>(out_val_if_empty));
+      } else if (std::is_same<T, int64_t>::value) {
+        thrust::fill(out_data_ptr, out_data_ptr + out->numel(),
+                     static_cast<int64_t>(out_val_if_empty));
+      } else if (std::is_same<T, float>::value) {
+        thrust::fill(out_data_ptr, out_data_ptr + out->numel(),
+                     static_cast<float>(out_val_if_empty));
+      } else {
+        thrust::fill(out_data_ptr, out_data_ptr + out->numel(),
+                     static_cast<double>(out_val_if_empty));
+      }
+
+      thrust::device_ptr<float> loss_weight_data_ptr(loss_weight_data);
+      loss_weight_data_ptr[0] = 0;
+    }
+
+#endif
+  }
+};
+
+template <typename T>
+class FilterByInstagGradGPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+#if defined(PADDLE_WITH_CUDA)
+
+    auto gpu_place = context.GetPlace();
+    gpuStream_t current_stream = context.cuda_device_context().stream();
+    auto max_thread_num_per_block = 1024;
+    auto* output_grad = context.Input<LoDTensor>(framework::GradVarName("Out"));
+    auto* x1_grad = context.Output<LoDTensor>(framework::GradVarName("Ins"));
+    auto* loss_weight = context.Input<LoDTensor>("LossWeight");
+    auto* mmap = context.Input<LoDTensor>("IndexMap");
+    auto* x1 = context.Input<LoDTensor>("Ins");
+
+    x1_grad->set_lod(context.Input<LoDTensor>("Ins")->lod());
+    x1_grad->Resize(x1->dims());
+
+    auto* mmap_data = mmap->data<int64_t>();
+    // expected auto = T
+    auto* output_grad_data = output_grad->data<T>();
+    auto* loss_weight_data = loss_weight->data<float>();
+
+    // expected auto = T
+    auto* x1_grad_data = x1_grad->mutable_data<T>(gpu_place);
+    thrust::device_ptr<T> x1_grad_data_ptr(x1_grad_data);
+    thrust::device_ptr<const float> loss_weight_data_ptr(loss_weight_data);
+
+    thrust::fill(x1_grad_data_ptr,
+                 x1_grad_data_ptr + x1->dims()[0] * x1->dims()[1], 0);
+
+    if (loss_weight->numel() != 1 || loss_weight_data_ptr[0] != 0) {
+      auto output_dims = output_grad->dims();
+      int x1_embed_size = output_dims[1];
+
+      // one thread for multi-instances
+      int block_size = max_thread_num_per_block;
+
+      size_t N = mmap->dims()[0];
+      dim3 block_dim(block_size);
+
+      dim3 grid_dim((N + block_size - 1) / block_size);
+
+      const int ins_per_thread = 1;
+
+      copy_grad_kernel<<<grid_dim, block_dim, 0, current_stream>>>(
+          N, ins_per_thread, output_grad_data, x1_grad_data, mmap_data,
+          x1_embed_size);
+
+      cudaStreamSynchronize(current_stream);
+    }
+
+#endif
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_CUDA_KERNEL(filter_by_instag, ops::FilterByInstagGPUKernel<float>,
+                        ops::FilterByInstagGPUKernel<double>,
+                        ops::FilterByInstagGPUKernel<int32_t>,
+                        ops::FilterByInstagGPUKernel<int64_t>);
+
+REGISTER_OP_CUDA_KERNEL(filter_by_instag_grad,
+                        ops::FilterByInstagGradGPUKernel<float>,
+                        ops::FilterByInstagGradGPUKernel<double>,
+                        ops::FilterByInstagGradGPUKernel<int32_t>,
+                        ops::FilterByInstagGradGPUKernel<int64_t>);
diff --git a/paddle/fluid/operators/filter_by_instag_op.h b/paddle/fluid/operators/filter_by_instag_op.h
index deb2aa96b539e360cf2edad97b21cb6e9ddba066..3abc980ceaafc3719c13cad51c346282be2c694f 100644
--- a/paddle/fluid/operators/filter_by_instag_op.h
+++ b/paddle/fluid/operators/filter_by_instag_op.h
@@ -61,7 +61,20 @@ class FilterByInstagKernel : public framework::OpKernel<T> {
     // expected auto = const int64_t
     auto* x2_data = x2->data<int64_t>();
     // e.g get [0, 1, 2, 3, ...]
-    size_t x2_lods_size = x2->dims()[0];
+    // size_t x2_lods_size = x2->dims()[0];
+    // size_t instag_num_per_ins = x2->dims()[1];
+
+    Vector<size_t> x2_lods(1, 0);
+    if (x2->lod().size() != 0) {  // lod_level = 1
+      x2_lods = x2->lod()[0];
+    } else {  // lod_level = 0
+      const size_t x2_lods_size = x2->dims()[0];
+      const size_t instag_num_per_ins = x2->dims()[1];
+      for (size_t i = 0; i < x2_lods_size; i++) {
+        x2_lods.push_back(x2_lods.back() + instag_num_per_ins);
+      }
+    }
+
     Vector<size_t> x1_lods(1, 0);
     if (!is_x1_lod) {
       for (int i = 0; i < x1->dims()[0]; i++) {
@@ -79,8 +92,8 @@ class FilterByInstagKernel : public framework::OpKernel<T> {
     }
     std::unordered_map<int64_t, int64_t> mmap_aux;
     Vector<size_t> out_lods(1, 0);
-    for (size_t i = 0; i < x2_lods_size; i++) {
-      for (size_t j = i; j < i + 1; j++) {
+    for (size_t i = 0; i < x2_lods.size() - 1; i++) {
+      for (size_t j = x2_lods[i]; j < x2_lods[i + 1]; j++) {
         if (filter_tag.find(x2_data[j]) != filter_tag.end()) {
           size_t batch_len = x1_lods[i + 1] - x1_lods[i];
           mmap_aux[out_lods.back()] = x1_lods[i];
@@ -165,8 +178,10 @@ class FilterByInstagKernel : public framework::OpKernel<T> {
           out_data[oi] = (int32_t)out_val_if_empty;
         } else if (std::is_same<T, int64_t>::value) {
           out_data[oi] = (int64_t)out_val_if_empty;
-        } else {
+        } else if (std::is_same<T, double>::value) {
           out_data[oi] = static_cast<double>(out_val_if_empty);
+        } else {
+          out_data[oi] = static_cast<float>(out_val_if_empty);
         }
       }
       loss_weight_data[0] = 0;
diff --git a/paddle/fluid/operators/flatten_op.h b/paddle/fluid/operators/flatten_op.h
index 5ef13b38c8a86e16cefdc97be6934b313fdb7bc4..feae954e355b85f5a18f8a48919770fd46a73f70 100644
--- a/paddle/fluid/operators/flatten_op.h
+++ b/paddle/fluid/operators/flatten_op.h
@@ -16,7 +16,6 @@ limitations under the License. */
 #include <vector>
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/phi_utils.h"
-#include "paddle/fluid/operators/math/pooling.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/phi/kernels/empty_kernel.h"
 #include "paddle/phi/kernels/flatten_grad_kernel.h"
diff --git a/paddle/fluid/operators/fold_op.cc b/paddle/fluid/operators/fold_op.cc
index 40ec9aef190ff4bacd52b19a1c0b12300a35b61e..92f59e118c3b7bb66a2c5c76d66109ddf04ee076 100644
--- a/paddle/fluid/operators/fold_op.cc
+++ b/paddle/fluid/operators/fold_op.cc
@@ -95,6 +95,17 @@ class FoldOp : public framework::OperatorWithKernel {
                           "but recieved strides_height: %d strides_width: %d.",
                           strides[0], strides[1]));
     // check dilations
+    PADDLE_ENFORCE_GT(output_height, 1,
+                      platform::errors::InvalidArgument(
+                          "The `output_height` should be greater than one, "
+                          "but recieved output_height: %d .",
+                          output_height));
+    PADDLE_ENFORCE_GT(output_width, 1,
+                      platform::errors::InvalidArgument(
+                          "The `output_width` should be greater than one, "
+                          "but recieved output_width: %d .",
+                          output_width));
+    // check output size
     PADDLE_ENFORCE_GT(
         dilation_height, 0,
         platform::errors::InvalidArgument(
@@ -146,7 +157,7 @@ class FoldOp : public framework::OperatorWithKernel {
             output_width));
 
     PADDLE_ENFORCE_EQ(
-        blocks_height * blocks_width, in_dims[1],
+        blocks_height * blocks_width, in_dims[2],
         platform::errors::InvalidArgument(
             "Given input output_size (%d, %d), "
             "kernel_sizes (%d, %d), strides (%d, %d), dilations (%d, %d), "
@@ -156,6 +167,15 @@ class FoldOp : public framework::OperatorWithKernel {
             strides[0], strides[1], dilations[0], dilations[1], blocks_height,
             blocks_width, blocks_height * blocks_width, in_dims[2]));
 
+    PADDLE_ENFORCE_EQ(
+        in_dims[1] % (kernel_sizes[0] * kernel_sizes[1]), 0,
+        platform::errors::InvalidArgument(
+            "Expected size of input's dimension 1 to be divisible by the"
+            "product of kernel_size, but got input.size(1)=%d and "
+            "kernel_size=( %d"
+            ", %d).",
+            in_dims[1], kernel_sizes[0], kernel_sizes[1]));
+
     out_dims.push_back(output_height);
     out_dims.push_back(output_width);
     ctx->SetOutputDim("Y", phi::make_ddim(out_dims));
diff --git a/paddle/fluid/operators/fused/CMakeLists.txt b/paddle/fluid/operators/fused/CMakeLists.txt
index 67287afa6ae5059f8af3dcdbd6910ca35db7c3c0..80e7f5c001d4b8139b538570c42fcd8d2604961b 100644
--- a/paddle/fluid/operators/fused/CMakeLists.txt
+++ b/paddle/fluid/operators/fused/CMakeLists.txt
@@ -19,7 +19,8 @@ register_operators(EXCLUDES
     fused_attention_op
     fused_transformer_op
     fused_feedforward_op
-    resnet_unit_op)
+    resnet_unit_op
+    fused_gemm_epilogue_op)
 
 # fusion_gru_op does not have CUDA kernel
 op_library(fusion_gru_op)
@@ -79,4 +80,8 @@ if (WITH_GPU OR WITH_ROCM)
         cc_test(test_cudnn_norm_conv SRCS cudnn_norm_conv_test.cc DEPS conv_op blas im2col vol2col depthwise_conv eigen_function tensor op_registry device_context generator memory)
         cc_test(test_cudnn_bn_add_relu SRCS cudnn_bn_add_relu_test.cc DEPS batch_norm_op fused_bn_add_activation_op tensor op_registry device_context generator memory)
     endif()
+
+    if (CUDA_VERSION GREATER_EQUAL 11.6)
+        op_library(fused_gemm_epilogue_op)
+    endif()
 endif()
diff --git a/paddle/fluid/operators/fused/attn_bias_add.cu.h b/paddle/fluid/operators/fused/attn_bias_add.cu.h
index 20801d2243fb395b250f8416f1e2f5ba6a1423a4..3a2de0c4a093514a1c40321ab7dad61011709204 100644
--- a/paddle/fluid/operators/fused/attn_bias_add.cu.h
+++ b/paddle/fluid/operators/fused/attn_bias_add.cu.h
@@ -89,9 +89,9 @@ __global__ void BroadcastKernelBinary(
 template <typename T>
 void LaunchBiasAddFwKernel(const platform::CUDADeviceContext& ctx, int m, int n,
                            const T* in0, const T* in1, T* out) {
-  int in_vec_size = std::min(platform::GetVectorizedSize<T>(in0),
-                             platform::GetVectorizedSize<T>(in1));
-  int out_vec_size = std::min(4, platform::GetVectorizedSize<T>(out));
+  int in_vec_size =
+      std::min(phi::GetVectorizedSize<T>(in0), phi::GetVectorizedSize<T>(in1));
+  int out_vec_size = std::min(4, phi::GetVectorizedSize<T>(out));
   int vec_size = std::min(out_vec_size, in_vec_size);
 
   int numel = m * n;
@@ -191,9 +191,9 @@ void SetConfigForColumnReduce(const int max_threads, const int reduce_num,
 
   int num_block = (max_threads / left_num);
   if (num_block > 1 && reduce_num >= REDUCE_SPLIT_BOUNDARY) {
-    *blocking_size = phi::kernels::details::GetLastPow2(reduce_num / num_block);
+    *blocking_size = phi::funcs::details::GetLastPow2(reduce_num / num_block);
     if (*blocking_size <= 1) {
-      *blocking_size = phi::kernels::details::GetLastPow2(sqrt(reduce_num));
+      *blocking_size = phi::funcs::details::GetLastPow2(sqrt(reduce_num));
     } else if (*blocking_size * 2 < reduce_num) {
       *blocking_size *= 2;
     }
diff --git a/paddle/fluid/operators/fused/conv_fusion_op.cu b/paddle/fluid/operators/fused/conv_fusion_op.cu
index bb5b363fe83995faf69f61b0a1a1693ff758fa37..5dbf4fb88b2a78838ce0fe95be653f68f4805416 100644
--- a/paddle/fluid/operators/fused/conv_fusion_op.cu
+++ b/paddle/fluid/operators/fused/conv_fusion_op.cu
@@ -17,8 +17,8 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/conv_cudnn_op_cache.h"
 #include "paddle/fluid/operators/conv_op.h"
-#include "paddle/fluid/operators/math/padding.h"
 #include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
+#include "paddle/phi/kernels/funcs/padding.h"
 
 DECLARE_int64(cudnn_exhaustive_search_times);
 
@@ -86,7 +86,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
                              in_data_dims, strides, ksize);
 
     int data_dim = strides.size();  // 2d or 3d
-    bool is_sys_pad = math::IsSymmetricPadding(paddings, data_dim);
+    bool is_sys_pad = phi::funcs::IsSymmetricPadding(paddings, data_dim);
 
     Tensor transformed_input;
     std::vector<int> padding_common(data_dim, 0);
@@ -118,13 +118,13 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
       T pad_value(0.0);
       switch (rank) {
         case 4: {
-          math::PadFunction<paddle::platform::CUDADeviceContext, T, 4>(
-              ctx, input_pad, transformed_input_channel, pad_value,
+          phi::funcs::PadFunction<paddle::platform::CUDADeviceContext, T, 4>(
+              dev_ctx, input_pad, transformed_input_channel, pad_value,
               &transformed_input);
         } break;
         case 5: {
-          math::PadFunction<paddle::platform::CUDADeviceContext, T, 5>(
-              ctx, input_pad, transformed_input_channel, pad_value,
+          phi::funcs::PadFunction<paddle::platform::CUDADeviceContext, T, 5>(
+              dev_ctx, input_pad, transformed_input_channel, pad_value,
               &transformed_input);
         } break;
         default:
diff --git a/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc b/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc
index 6119af18ce153ac2bcd5d45a69ab7b5d86a3cc10..b3ac3606eaf8ee843a2be98b7a237037afaf524f 100644
--- a/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc
+++ b/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc
@@ -32,7 +32,7 @@ namespace platform = paddle::platform;
 namespace op = paddle::operators;
 using Tensor = paddle::framework::Tensor;
 
-USE_OP(batch_norm);
+USE_OP_ITSELF(batch_norm);
 USE_CUDA_ONLY_OP(fused_bn_add_activation);
 USE_CUDA_ONLY_OP(fused_bn_add_activation_grad);
 
diff --git a/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc b/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc
index 1864bdbb86667290474d297cc481f5d6352c8022..a80f590aa495db8090a30118ed4128843c0f8860 100644
--- a/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc
+++ b/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc
@@ -22,6 +22,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/operators/fused/cudnn_norm_conv.cu.h"
 #include "paddle/fluid/platform/float16.h"
+#include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
 namespace framework = paddle::framework;
@@ -29,10 +30,10 @@ namespace platform = paddle::platform;
 namespace op = paddle::operators;
 using Tensor = paddle::framework::Tensor;
 
-USE_OP(conv2d);
-USE_OP(conv2d_grad);
-USE_OP_DEVICE_KERNEL(conv2d, CUDNN);
-USE_OP_DEVICE_KERNEL(conv2d_grad, CUDNN);
+USE_OP_ITSELF(conv2d);
+USE_OP_ITSELF(conv2d_grad);
+PD_DECLARE_KERNEL(conv2d, GPUDNN, ALL_LAYOUT);
+PD_DECLARE_KERNEL(conv2d_grad, GPUDNN, ALL_LAYOUT);
 
 template <typename T>
 void InitRandomTensor(const std::vector<int64_t> &dims,
@@ -404,8 +405,18 @@ TEST(CudnnNormConvFp16, K1S1) {
   CudnnNormConvolutionTester<paddle::platform::float16> test(
       batch_size, height, width, input_channels, output_channels, kernel_size,
       stride);
-  test.CheckForward(1e-3, true);
-  test.CheckBackward(1e-3, true);
+  platform::CUDADeviceContext *ctx = static_cast<platform::CUDADeviceContext *>(
+      platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0)));
+
+  if (ctx->GetComputeCapability() <= 70) {
+    ASSERT_THROW(test.CheckForward(1e-3, true),
+                 paddle::platform::EnforceNotMet);
+    ASSERT_THROW(test.CheckBackward(1e-3, true),
+                 paddle::platform::EnforceNotMet);
+  } else {
+    ASSERT_NO_THROW(test.CheckForward(1e-3, true));
+    ASSERT_NO_THROW(test.CheckBackward(1e-3, true));
+  }
 }
 
 // test for fp16, kernel = 3, output_channels = input_channels
@@ -420,8 +431,18 @@ TEST(CudnnNormConvFp16, K3S1) {
   CudnnNormConvolutionTester<paddle::platform::float16> test(
       batch_size, height, width, input_channels, output_channels, kernel_size,
       stride);
-  test.CheckForward(1e-3, true);
-  test.CheckBackward(1e-3, true);
+  platform::CUDADeviceContext *ctx = static_cast<platform::CUDADeviceContext *>(
+      platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0)));
+
+  if (ctx->GetComputeCapability() <= 70) {
+    ASSERT_THROW(test.CheckForward(1e-3, true),
+                 paddle::platform::EnforceNotMet);
+    ASSERT_THROW(test.CheckBackward(1e-3, true),
+                 paddle::platform::EnforceNotMet);
+  } else {
+    ASSERT_NO_THROW(test.CheckForward(1e-3, true));
+    ASSERT_NO_THROW(test.CheckBackward(1e-3, true));
+  }
 }
 
 // test for fp16, kernel = 1, output_channels = input_channels * 4
@@ -436,8 +457,18 @@ TEST(CudnnNormConvFp16, K1S1O4) {
   CudnnNormConvolutionTester<paddle::platform::float16> test(
       batch_size, height, width, input_channels, output_channels, kernel_size,
       stride);
-  test.CheckForward(1e-3, true);
-  test.CheckBackward(1e-3, true);
+  platform::CUDADeviceContext *ctx = static_cast<platform::CUDADeviceContext *>(
+      platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0)));
+
+  if (ctx->GetComputeCapability() <= 70) {
+    ASSERT_THROW(test.CheckForward(1e-3, true),
+                 paddle::platform::EnforceNotMet);
+    ASSERT_THROW(test.CheckBackward(1e-3, true),
+                 paddle::platform::EnforceNotMet);
+  } else {
+    ASSERT_NO_THROW(test.CheckForward(1e-3, true));
+    ASSERT_NO_THROW(test.CheckBackward(1e-3, true));
+  }
 }
 
 // test for fp16, kernel = 1, stride = 2, output_channels = input_channels * 4
diff --git a/paddle/fluid/operators/fused/fmha_ref.h b/paddle/fluid/operators/fused/fmha_ref.h
index 020277675797358bf87a58ac108e6eaaddb26ccc..54e4cbdc1624921e6946210a6a192d10fcbdb7dd 100644
--- a/paddle/fluid/operators/fused/fmha_ref.h
+++ b/paddle/fluid/operators/fused/fmha_ref.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/elementwise/elementwise_add_op.h"
 #include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h"
 #include "paddle/fluid/operators/transpose_op.cu.h"
+#include "paddle/phi/kernels/funcs/concat_and_split_functor.h"
 #include "paddle/phi/kernels/gpudnn/softmax_gpudnn.h"
 
 namespace paddle {
@@ -69,20 +70,21 @@ class FMHARef {
   ~FMHARef() {}
 
   void ComputeForward(const Tensor& qkv_input_tensor,
+                      const Tensor* cache_kv_tensor,
                       const Tensor* src_mask_tensor,
-                      Tensor* transpose_2_out_tensor, Tensor* qk_out_tensor,
+                      Tensor* transpose_2_out_tensor,
+                      Tensor* cache_kv_out_tensor, Tensor* qk_out_tensor,
                       Tensor* src_mask_out_tensor, Tensor* softmax_out_tensor,
                       Tensor* dropout_mask_out_tensor,
                       Tensor* dropout_out_tensor, Tensor* qktv_out_tensor,
                       Tensor* fmha_out_tensor) {
     // input shape: [bs, seq_len, 3, num_head, head_dim]
-    // transpose with perm [2, 0, 1, 3, 4],
+    // transpose with perm [2, 0, 3, 1, 4],
     // output_shape: [3, bs, num_head, seq_len, head_dim]
     int ndims = 5;
     std::vector<int> perm_1 = {2, 0, 3, 1, 4};
     TransposeGPUKernelDriver<T>(dev_ctx_, ndims, qkv_input_tensor, perm_1,
                                 transpose_2_out_tensor);
-
     T* qkv_data = transpose_2_out_tensor->data<T>();
     T* qk_out_data = qk_out_tensor->data<T>();
     T* qktv_out_data = qktv_out_tensor->data<T>();
@@ -90,11 +92,30 @@ class FMHARef {
     T* dropout_out_data = dropout_out_tensor->data<T>();
     T* fmha_out_data = fmha_out_tensor->data<T>();
 
-    int q_size = batch_size_ * seq_len_ * num_head_ * head_dim_;
-    int k_size = q_size;
+    auto out_seq_len = seq_len_;
+    if (cache_kv_tensor) {
+      // kv [2, bs, num_head, seq_len, head_dim]
+      auto kv_tensor = transpose_2_out_tensor->Slice(1, 3);
+      phi::funcs::ConcatFunctor<phi::GPUContext, T> concat;
+      // out [2, bs, num_head, cache_seq_len + seq_len, head_dim]
+      concat(dev_ctx_, {*cache_kv_tensor, kv_tensor}, 3, cache_kv_out_tensor);
+      out_seq_len = cache_kv_out_tensor->dims()[3];
+    }
+
+    int64_t q_size = batch_size_ * seq_len_ * num_head_ * head_dim_;
     T* q_ptr = qkv_data;
-    T* k_ptr = q_ptr + q_size;
-    T* v_ptr = k_ptr + k_size;
+    T* k_ptr = nullptr;
+    T* v_ptr = nullptr;
+
+    if (cache_kv_tensor) {
+      int64_t k_size = cache_kv_out_tensor->numel() / 2;
+      k_ptr = cache_kv_out_tensor->data<T>();
+      v_ptr = k_ptr + k_size;
+    } else {
+      int64_t k_size = q_size;
+      k_ptr = q_ptr + q_size;
+      v_ptr = k_ptr + k_size;
+    }
 
     // q*k^t, batched_gemm
     CBLAS_TRANSPOSE transA = CblasNoTrans;
@@ -102,7 +123,7 @@ class FMHARef {
     auto blas = phi::funcs::GetBlas<platform::CUDADeviceContext, T>(dev_ctx_);
     int gemm_batch_size = batch_size_ * num_head_;
     int gemm_m = seq_len_;
-    int gemm_n = seq_len_;
+    int gemm_n = out_seq_len;
     int gemm_k = head_dim_;
     T alpha = static_cast<T>(1.0 / sqrt(head_dim_));
     T beta = static_cast<T>(0.0);
@@ -133,16 +154,16 @@ class FMHARef {
     transB = CblasNoTrans;
     gemm_m = seq_len_;
     gemm_n = head_dim_;
-    gemm_k = seq_len_;
+    gemm_k = out_seq_len;
     alpha = static_cast<T>(1.0);
     stride_a = gemm_m * gemm_k;
     stride_b = gemm_k * gemm_n;
 
     if (dropout_param_.dropout_prob_) {
       DropoutFwGPUKernelDriver<T>(
-          dev_ctx_, dropout_param_.is_test_,
-          static_cast<const std::string>(
-              dropout_param_.dropout_implementation_),
+          static_cast<const phi::GPUContext&>(dev_ctx_),
+          dropout_param_.is_test_, static_cast<const std::string>(
+                                       dropout_param_.dropout_implementation_),
           dropout_param_.dropout_prob_, dropout_param_.is_upscale_in_train_,
           dropout_param_.is_fix_seed_, dropout_param_.seed_val_,
           static_cast<const Tensor&>(*softmax_out_tensor), dropout_param_.seed_,
@@ -242,8 +263,9 @@ class FMHARef {
     // dropout bw
     if (dropout_param_.dropout_prob_) {
       DropoutGradGPUKernelDriver<T>(
-          dev_ctx_, static_cast<const std::string>(
-                        dropout_param_.dropout_implementation_),
+          static_cast<const phi::GPUContext&>(dev_ctx_),
+          static_cast<const std::string>(
+              dropout_param_.dropout_implementation_),
           dropout_param_.dropout_prob_,
           static_cast<const Tensor&>(*dropout_out_grad_tensor),
           dropout_mask_out_tensor, softmax_out_grad_tensor->numel(),
diff --git a/paddle/fluid/operators/fused/fused_attention_op.cc b/paddle/fluid/operators/fused/fused_attention_op.cc
index d141800d61c0ec0b73fe2cc3c8d00dbf1de44cf2..e473f8ff0662cfc3fd7bdc5010bfa1dc08fba85f 100644
--- a/paddle/fluid/operators/fused/fused_attention_op.cc
+++ b/paddle/fluid/operators/fused/fused_attention_op.cc
@@ -61,6 +61,10 @@ class FusedAttentionOp : public framework::OperatorWithKernel {
     OP_INOUT_CHECK(ctx->HasOutput("QKTVOut"), "Output", "QKTVOut",
                    "FusedAttentionOp");
 
+    if (ctx->HasInput("CacheKV")) {
+      OP_INOUT_CHECK(ctx->HasOutput("CacheKVOut"), "Output", "CacheKVOut",
+                     "FusedAttentionOp");
+    }
     if (ctx->HasInput("SrcMask")) {
       OP_INOUT_CHECK(ctx->HasOutput("SrcMaskOut"), "Output", "SrcMaskOut",
                      "FusedAttentionOp");
@@ -105,12 +109,14 @@ class FusedAttentionOp : public framework::OperatorWithKernel {
                           "input qkv_weight = [%s]",
                           x_dim, y_dim));
 
-    PADDLE_ENFORCE_EQ(y_dim[1] * y_dim[2], y_dim[3],
-                      platform::errors::InvalidArgument(
-                          "The dimensions of qkv_weight must be 4"
-                          "(3, num_head, dim_head, dim_embed),"
-                          "and must satisfy the limitations: "
-                          "(num_head * dim_head == dim_embed)"));
+    if (ctx->Attrs().Get<int>("ring_id") == -1) {
+      PADDLE_ENFORCE_EQ(y_dim[1] * y_dim[2], y_dim[3],
+                        platform::errors::InvalidArgument(
+                            "The dimensions of qkv_weight must be 4"
+                            "(3, num_head, dim_head, dim_embed),"
+                            "and must satisfy the limitations: "
+                            "(num_head * dim_head == dim_embed)"));
+    }
 
     if (ctx->Attrs().Get<bool>("pre_layer_norm") == true) {
       ctx->SetOutputDim("LnMean", {x_dim[0] * x_dim[1]});
@@ -132,20 +138,64 @@ class FusedAttentionOp : public framework::OperatorWithKernel {
     // [3, batch_size, num_head, seq_len, head_size]
     ctx->SetOutputDim("TransposeOut2",
                       {y_dim[0], x_dim[0], y_dim[1], x_dim[1], y_dim[2]});
-    // [batch, num_head, seq_len, seq_len]
-    ctx->SetOutputDim("QKOut", {x_dim[0], y_dim[1], x_dim[1], x_dim[1]});
+
+    // cache_seq_len + seq_len if cache else seq_len
+    auto out_seq_len = x_dim[1];
+    if (ctx->HasInput("CacheKV")) {
+      // [2, batch_size, num_head, cache_seq_len, head_size]
+      auto c_dim = ctx->GetInputDim("CacheKV");
+
+      PADDLE_ENFORCE_EQ(
+          c_dim.size(), 5,
+          paddle::platform::errors::InvalidArgument(
+              "The CacheKV must be 5 dims, but got %d", c_dim.size()));
+      PADDLE_ENFORCE_EQ(c_dim[0], 2,
+                        paddle::platform::errors::InvalidArgument(
+                            "The first dim of CacheKV must be 2, but got %d",
+                            c_dim[0]));  // 2
+      PADDLE_ENFORCE_EQ(c_dim[1], x_dim[0],
+                        paddle::platform::errors::InvalidArgument(
+                            "The second dim of CacheKV must be equal with "
+                            "batch size %d, but got %d",
+                            x_dim[0], c_dim[1]));  // batch_size
+      PADDLE_ENFORCE_EQ(c_dim[2], y_dim[1],
+                        paddle::platform::errors::InvalidArgument(
+                            "The third dim of CacheKV must be equal with num "
+                            "head %d, but got %d",
+                            y_dim[1], c_dim[2]));  // num_head
+      PADDLE_ENFORCE_GE(
+          c_dim[3], 0,
+          paddle::platform::errors::InvalidArgument(
+              "The forth dim of CacheKV must be greater than 0, but got %d",
+              c_dim[3]));  // cache_seq_len
+      PADDLE_ENFORCE_EQ(c_dim[4], y_dim[2],
+                        paddle::platform::errors::InvalidArgument(
+                            "The fifth dim of CacheKV must be equal with head "
+                            "size %d, but got %d",
+                            y_dim[2], c_dim[4]));  // head_size
+
+      out_seq_len += c_dim[3];
+      // [3, batch_size, num_head, cache_seq_len + seq_len, head_size]
+      ctx->SetOutputDim("CacheKVOut",
+                        {c_dim[0], c_dim[1], c_dim[2], out_seq_len, c_dim[4]});
+    }
+
+    // [batch, num_head, seq_len, out_seq_len]
+    ctx->SetOutputDim("QKOut", {x_dim[0], y_dim[1], x_dim[1], out_seq_len});
 
     if (ctx->HasInput("SrcMask")) {
-      ctx->SetOutputDim("SrcMaskOut", {x_dim[0], y_dim[1], x_dim[1], x_dim[1]});
+      ctx->SetOutputDim("SrcMaskOut",
+                        {x_dim[0], y_dim[1], x_dim[1], out_seq_len});
     }
     // the same as QKOut's shape.
     ctx->SetOutputDim("AttnDropoutOut",
-                      {x_dim[0], y_dim[1], x_dim[1], x_dim[1]});
+                      {x_dim[0], y_dim[1], x_dim[1], out_seq_len});
     if (ctx->Attrs().Get<bool>("attn_dropout_is_test") == false) {
       ctx->SetOutputDim("AttnDropoutMaskOut",
-                        {x_dim[0], y_dim[1], x_dim[1], x_dim[1]});
+                        {x_dim[0], y_dim[1], x_dim[1], out_seq_len});
     }
-    ctx->SetOutputDim("SoftmaxOut", {x_dim[0], y_dim[1], x_dim[1], x_dim[1]});
+    ctx->SetOutputDim("SoftmaxOut",
+                      {x_dim[0], y_dim[1], x_dim[1], out_seq_len});
     // [batch_size, num_heads, seq_len, head_dim]
     ctx->SetOutputDim("QKTVOut", {x_dim[0], y_dim[1], x_dim[1], y_dim[2]});
     // [batch_size, seq_len, number of heads*head size]
@@ -182,6 +232,8 @@ class FusedAttentionOpMaker : public framework::OpProtoAndCheckerMaker {
         .AsDispensable();
     AddInput("QKVW", "The qkv weight tensor.");
     AddInput("QKVBias", "The qkv bias tensor.").AsDispensable();
+    AddInput("CacheKV", "(optional) The cached KV for generation inference.")
+        .AsDispensable();
     AddInput("SrcMask", "(optional) The attention mask tensor in fmha.")
         .AsDispensable();
     AddInput("OutLinearW", "The out_linear weight tensor.");
@@ -217,6 +269,7 @@ class FusedAttentionOpMaker : public framework::OpProtoAndCheckerMaker {
     AddOutput("BiasDropoutResidualOut",
               "Result of residual + dropout(src + bias).")
         .AsIntermediate();
+    AddOutput("CacheKVOut", "The udpated cache KV.");
     AddOutput("Y", "Result after attention.");
 
     AddAttr<bool>("pre_layer_norm",
@@ -324,6 +377,10 @@ class FusedAttentionOpMaker : public framework::OpProtoAndCheckerMaker {
                                 "0.0 and 0.001, But received [%s].",
                                 ln_epsilon));
         });
+    AddAttr<int>(
+        "ring_id",
+        "ring id for tensor model parallel. distributed training and inference")
+        .SetDefault(-1);
 
     AddComment(R"DOC(
   Add fused attention op whose logic is as follows:
diff --git a/paddle/fluid/operators/fused/fused_attention_op.cu b/paddle/fluid/operators/fused/fused_attention_op.cu
index 03f51fc5857985902c21ad12fefbdc9cdec6ef04..d26577f06fe683fb1528c61b4401b9e578c90c9f 100644
--- a/paddle/fluid/operators/fused/fused_attention_op.cu
+++ b/paddle/fluid/operators/fused/fused_attention_op.cu
@@ -27,11 +27,39 @@ limitations under the License. */
 #include "paddle/fluid/operators/fused/fmha_ref.h"
 #include "paddle/fluid/operators/fused/fused_dropout_helper.h"
 
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
+#include "paddle/fluid/platform/collective_helper.h"
+#include "paddle/fluid/platform/device/gpu/nccl_helper.h"
+#endif
+
 namespace paddle {
 namespace operators {
 
 using Tensor = framework::Tensor;
 
+template <typename T>
+static void AllReduce(framework::Tensor &tensor,  // NOLINT
+                      const int ring_id,
+                      const platform::CUDADeviceContext &ctx) {
+  if (ring_id == -1) return;
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
+  auto dtype =
+      platform::ToNCCLDataType(framework::TransToProtoVarType(tensor.dtype()));
+  int64_t numel = tensor.numel();
+  const void *sendbuff = tensor.data<T>();
+  auto place = ctx.GetPlace();
+  void *recvbuff = tensor.mutable_data<T>(place);
+  auto comm = platform::NCCLCommContext::Instance().Get(ring_id, place);
+  auto stream = ctx.stream();
+  PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce(
+      sendbuff, recvbuff, numel, dtype, ncclSum, comm->comm(), stream));
+#else
+  PADDLE_THROW(platform::errors::Unimplemented(
+      "PaddlePaddle should compile with NCCL or RCCL when used tensor model "
+      "parallel op."));
+#endif
+}
+
 template <typename T>
 class FusedAttentionOpKernel : public framework::OpKernel<T> {
  public:
@@ -56,6 +84,8 @@ class FusedAttentionOpKernel : public framework::OpKernel<T> {
 
     auto *src_mask = ctx.Input<Tensor>("SrcMask");
     auto *transpose_out_2 = ctx.Output<Tensor>("TransposeOut2");
+    auto *cache_kv = ctx.Input<Tensor>("CacheKV");
+    auto *cache_kv_out = ctx.Output<Tensor>("CacheKVOut");
     auto *qk_out = ctx.Output<Tensor>("QKOut");
     auto *qktv_out = ctx.Output<Tensor>("QKTVOut");
     auto *softmax_out = ctx.Output<Tensor>("SoftmaxOut");
@@ -86,6 +116,7 @@ class FusedAttentionOpKernel : public framework::OpKernel<T> {
     auto *seed_1 = ctx.HasInput("Seed1") ? ctx.Input<Tensor>("Seed1") : nullptr;
     bool is_fix_seed_1 = ctx.Attr<bool>("attn_dropout_fix_seed");
     int seed_val_1 = ctx.Attr<int>("attn_dropout_seed");
+    int ring_id = ctx.Attr<int>("ring_id");
 
     // final output.
     auto *out = ctx.Output<Tensor>("Y");
@@ -105,6 +136,10 @@ class FusedAttentionOpKernel : public framework::OpKernel<T> {
     // get data ptr for FMHA.
     auto *transpose_out_2_data =
         transpose_out_2->mutable_data<T>(ctx.GetPlace());
+    auto *cache_kv_out_data =
+        (cache_kv_out == nullptr)
+            ? nullptr
+            : cache_kv_out->mutable_data<T>(ctx.GetPlace());
     auto *qk_out_data = qk_out->mutable_data<T>(ctx.GetPlace());
     auto *qktv_out_data = qktv_out->mutable_data<T>(ctx.GetPlace());
     auto *src_mask_out_data =
@@ -161,9 +196,14 @@ class FusedAttentionOpKernel : public framework::OpKernel<T> {
 
     output_size = hidden_size;
     // (transA, transB, compute_bias) = (false, false, false)
+    // NOTE(Yuang Liu): For general input size == output size, change the
+    // position won't have effects. For mp, the output size is mp_head * dkey
+    // which is actually the input size. While the input size is hidden size,
+    // which is actually the output size. So for out linear, switch the
+    // input size and output size.
     auto out_linear_compute =
         AttnMatMul<T>(ctx.cuda_device_context(), false, false, bsz_seq,
-                      output_size, input_size, false);
+                      input_size, output_size, false);
     DropoutParam dropout_param2(ctx, 0);
     FusedDropoutLayerNormHelper<T, uint8_t> fused_dropout_layernorm_helper(
         ctx.cuda_device_context(), bsz_seq, dim_embed, dropout_param2,
@@ -186,15 +226,15 @@ class FusedAttentionOpKernel : public framework::OpKernel<T> {
                                  qkv_bias_out);
     }
     if (qkv_bias == nullptr) {
-      fmha_ref_compute.ComputeForward(*qkv_out, src_mask, transpose_out_2,
-                                      qk_out, src_mask_out, softmax_out,
-                                      attn_dropout_mask_out, attn_dropout_out,
-                                      qktv_out, fmha_out);
+      fmha_ref_compute.ComputeForward(
+          *qkv_out, cache_kv, src_mask, transpose_out_2, cache_kv_out, qk_out,
+          src_mask_out, softmax_out, attn_dropout_mask_out, attn_dropout_out,
+          qktv_out, fmha_out);
     } else {
-      fmha_ref_compute.ComputeForward(*qkv_bias_out, src_mask, transpose_out_2,
-                                      qk_out, src_mask_out, softmax_out,
-                                      attn_dropout_mask_out, attn_dropout_out,
-                                      qktv_out, fmha_out);
+      fmha_ref_compute.ComputeForward(
+          *qkv_bias_out, cache_kv, src_mask, transpose_out_2, cache_kv_out,
+          qk_out, src_mask_out, softmax_out, attn_dropout_mask_out,
+          attn_dropout_out, qktv_out, fmha_out);
     }
 
     // fmha_out: [batch_size, seq_len, num_head, head_dim]
@@ -202,6 +242,9 @@ class FusedAttentionOpKernel : public framework::OpKernel<T> {
     // out_linear_out: [batch_size, seq_len, embed_dim]
     out_linear_compute.ComputeForward(out_linear_weight, fmha_out, nullptr,
                                       out_linear_out, nullptr);
+    // tensor model parallel
+    AllReduce<T>(*out_linear_out, ring_id, ctx.cuda_device_context());
+
     if (pre_layer_norm) {
       // output = (residual + dropout(input + bias))
       fused_dropout_layernorm_helper.ResidualDropoutBias(
@@ -244,6 +287,7 @@ class FusedAttentionGradKernel : public framework::OpKernel<T> {
     auto *seed_1 = ctx.HasInput("Seed1") ? ctx.Input<Tensor>("Seed1") : nullptr;
     bool is_fix_seed_1 = ctx.Attr<bool>("attn_dropout_fix_seed");
     int seed_val_1 = ctx.Attr<int>("attn_dropout_seed");
+    int ring_id = ctx.Attr<int>("ring_id");
 
     // get inputs.
     auto *d_y = ctx.Input<Tensor>(framework::GradVarName("Y"));
@@ -399,9 +443,10 @@ class FusedAttentionGradKernel : public framework::OpKernel<T> {
     transA = false;
     transB = false;
     bool compute_bias = false;
+    // (b*s, num_head * dim_head) * (num_head * dim_head, dim_embed)
     auto out_linear_compute =
         AttnMatMul<T>(ctx.cuda_device_context(), transA, transB, bsz_seq,
-                      output_size, input_size, compute_bias);
+                      input_size, output_size, compute_bias);
     DropoutParam dropout_param2(ctx, 0);
     FusedDropoutLayerNormHelper<T, uint8_t> fused_dropout_layernorm_helper(
         ctx.cuda_device_context(), bsz_seq, dim_embed, dropout_param2,
@@ -475,6 +520,8 @@ class FusedAttentionGradKernel : public framework::OpKernel<T> {
         qkv_compute.ComputeBackward(ln_out, qkv_weight, d_qkv_out, d_ln_out,
                                     d_qkv_weight, d_qkv_bias);
       }
+      // tensor model parallel
+      AllReduce<T>(*d_ln_out, ring_id, ctx.cuda_device_context());
       layer_norm_compute.ComputeBackward(x_data, d_ln_out_data, ln_scale_data,
                                          ln_mean_data, ln_var_data, d_x_data,
                                          d_ln_scale_data, d_ln_bias_data);
@@ -486,6 +533,8 @@ class FusedAttentionGradKernel : public framework::OpKernel<T> {
         qkv_compute.ComputeBackward(input_x, qkv_weight, d_qkv_out, d_x,
                                     d_qkv_weight, d_qkv_bias);
       }
+      // tensor model parallel
+      AllReduce<T>(*d_x, ring_id, ctx.cuda_device_context());
     }
     // gradient accumulation
     std::vector<const Tensor *> ins;
diff --git a/paddle/fluid/operators/fused/fused_dropout_act_bias.h b/paddle/fluid/operators/fused/fused_dropout_act_bias.h
index 994601a2f0608b4fc04966c7549c421f395f3ec7..9f5a1bad047b44b715e11e74d92fdca1982c96f8 100755
--- a/paddle/fluid/operators/fused/fused_dropout_act_bias.h
+++ b/paddle/fluid/operators/fused/fused_dropout_act_bias.h
@@ -130,17 +130,17 @@ __global__ void FusedDropoutActGrad(Functor act_grad, const T *dout,
                                     const T factor, const int64_t size, T *dx) {
   int64_t idx = blockDim.x * blockIdx.x + threadIdx.x;
 
-  using LoadT = platform::AlignedVector<T, VecSize>;
-  using StoreT = platform::AlignedVector<T, VecSize>;
-  using MaskLoadT = platform::AlignedVector<MaskType, VecSize>;
+  using LoadT = phi::AlignedVector<T, VecSize>;
+  using StoreT = phi::AlignedVector<T, VecSize>;
+  using MaskLoadT = phi::AlignedVector<MaskType, VecSize>;
   for (int i = idx * VecSize; i < size; i += blockDim.x * gridDim.x * VecSize) {
     LoadT dout_vec;
     LoadT src_vec;
     MaskLoadT mask_vec;
 
-    platform::Load<T, VecSize>(&dout[i], &dout_vec);
-    platform::Load<MaskType, VecSize>(&mask[i], &mask_vec);
-    platform::Load<T, VecSize>(&src[i], &src_vec);
+    phi::Load<T, VecSize>(&dout[i], &dout_vec);
+    phi::Load<MaskType, VecSize>(&mask[i], &mask_vec);
+    phi::Load<T, VecSize>(&src[i], &src_vec);
 
     StoreT dx_vec;
 #pragma unroll
@@ -148,7 +148,7 @@ __global__ void FusedDropoutActGrad(Functor act_grad, const T *dout,
       T tmp = dout_vec[ii] * static_cast<T>(mask_vec[ii]) * factor;
       dx_vec[ii] = tmp * act_grad.UseOut(src_vec[ii]);
     }
-    platform::Store<T, VecSize>(dx_vec, &dx[i]);
+    phi::Store<T, VecSize>(dx_vec, &dx[i]);
   }
 }
 
@@ -167,9 +167,9 @@ __global__ void FusedDropoutActBiasGrad(Functor act_grad, const T *dout,
                                         T *dx, T *dbias) {
   int64_t col_id = blockIdx.x * blockDim.x + threadIdx.x;
 
-  using LoadT = platform::AlignedVector<T, VecSize>;
-  using StoreT = platform::AlignedVector<T, VecSize>;
-  using MaskLoadT = platform::AlignedVector<MaskType, VecSize>;
+  using LoadT = phi::AlignedVector<T, VecSize>;
+  using StoreT = phi::AlignedVector<T, VecSize>;
+  using MaskLoadT = phi::AlignedVector<MaskType, VecSize>;
   T tmp_sum[VecSize] = {static_cast<T>(0)};
   // calculate the dx and temporary sum
   if (col_id * VecSize < cols) {
@@ -180,10 +180,10 @@ __global__ void FusedDropoutActBiasGrad(Functor act_grad, const T *dout,
       LoadT bias_vec;
       MaskLoadT mask_vec;
 
-      platform::Load<T, VecSize>(&dout[index], &dout_vec);
-      platform::Load<T, VecSize>(&src[index], &src_vec);
-      platform::Load<MaskType, VecSize>(&mask[index], &mask_vec);
-      platform::Load<T, VecSize>(&bias[col_id * VecSize], &bias_vec);
+      phi::Load<T, VecSize>(&dout[index], &dout_vec);
+      phi::Load<T, VecSize>(&src[index], &src_vec);
+      phi::Load<MaskType, VecSize>(&mask[index], &mask_vec);
+      phi::Load<T, VecSize>(&bias[col_id * VecSize], &bias_vec);
 
       StoreT dx_vec;
 #pragma unroll
@@ -194,7 +194,7 @@ __global__ void FusedDropoutActBiasGrad(Functor act_grad, const T *dout,
         dx_vec[i] = val;
         tmp_sum[i] += val;
       }
-      platform::Store<T, VecSize>(dx_vec, &dx[index]);
+      phi::Store<T, VecSize>(dx_vec, &dx[index]);
     }
   }
 
diff --git a/paddle/fluid/operators/fused/fused_dropout_act_bias_test.cu b/paddle/fluid/operators/fused/fused_dropout_act_bias_test.cu
index 2381b5b7fdfb85cbaa3fd66a10c5b630bb515f15..717c1732b7b3acf8528887aae43471c0dc0716e3 100644
--- a/paddle/fluid/operators/fused/fused_dropout_act_bias_test.cu
+++ b/paddle/fluid/operators/fused/fused_dropout_act_bias_test.cu
@@ -20,8 +20,14 @@ limitations under the License. */
 #include "paddle/fluid/operators/amp/fp16_type_traits.h"
 #include "paddle/fluid/operators/fused/fused_dropout_act_bias.h"
 #include "paddle/fluid/operators/fused/fused_dropout_test.h"
+#include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/functors.h"
 
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+PD_DECLARE_KERNEL(dropout, GPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(dropout_grad, GPU, ALL_LAYOUT);
+#endif
+
 namespace framework = paddle::framework;
 namespace platform = paddle::platform;
 namespace details = paddle::operators::details;
diff --git a/paddle/fluid/operators/fused/fused_dropout_common.h b/paddle/fluid/operators/fused/fused_dropout_common.h
index f79277e4e8f0d22cedafc9f7b40b56ecd2d6a817..6bf3a7114f4ced3c7c6ecd1f1afeca60ff66528f 100644
--- a/paddle/fluid/operators/fused/fused_dropout_common.h
+++ b/paddle/fluid/operators/fused/fused_dropout_common.h
@@ -21,11 +21,11 @@ limitations under the License. */
 #include "paddle/fluid/memory/memory.h"
 #include "paddle/fluid/operators/amp/fp16_type_traits.h"
 #include "paddle/fluid/operators/layer_norm_kernel.cu.h"
-#include "paddle/fluid/platform/aligned_vector.h"
 #include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
 #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/float16.h"
+#include "paddle/phi/kernels/funcs/aligned_vector.h"
 #include "paddle/phi/kernels/funcs/functors.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/fused/fused_dropout_test.h b/paddle/fluid/operators/fused/fused_dropout_test.h
index d7952df470d81566c3833e79e8cfa31a7d2dc68c..18c7187fc8e64c9fed8a86a984954b5420c1e5b5 100644
--- a/paddle/fluid/operators/fused/fused_dropout_test.h
+++ b/paddle/fluid/operators/fused/fused_dropout_test.h
@@ -31,7 +31,7 @@ namespace framework = paddle::framework;
 namespace platform = paddle::platform;
 namespace memory = paddle::memory;
 
-USE_OP(dropout);
+USE_OP_ITSELF(dropout);
 USE_OP(layer_norm);
 
 template <typename T>
diff --git a/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc b/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc
index 0c83c36b47583d49f022cb103894592d3a1b4ae7..7308f30779248e64f55e10b0661d2c98d263416c 100644
--- a/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc
+++ b/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc
@@ -14,9 +14,9 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.h"
 #include <string>
-#include "paddle/fluid/operators/math/cpu_vec.h"
 #include "paddle/fluid/platform/cpu_info.h"
 #include "paddle/phi/kernels/funcs/blas/blas.h"
+#include "paddle/phi/kernels/funcs/cpu_vec.h"
 #include "paddle/phi/kernels/funcs/sequence2batch.h"
 
 namespace paddle {
@@ -243,12 +243,12 @@ class FusedEmbeddingFCLSTMKernel : public framework::OpKernel<T> {
   auto& act_cell_str = ctx.Attr<std::string>("cell_activation");               \
   auto& act_cand_str = ctx.Attr<std::string>("candidate_activation");          \
   if (platform::MayIUse(platform::avx)) {                                      \
-    math::VecActivations<T, platform::avx> act_functor;                        \
+    phi::funcs::VecActivations<T, platform::avx> act_functor;                  \
     act_gate = act_functor(act_gate_str);                                      \
     act_cell = act_functor(act_cell_str);                                      \
     act_cand = act_functor(act_cand_str);                                      \
   } else {                                                                     \
-    math::VecActivations<T, platform::isa_any> act_functor;                    \
+    phi::funcs::VecActivations<T, platform::isa_any> act_functor;              \
     act_gate = act_functor(act_gate_str);                                      \
     act_cell = act_functor(act_cell_str);                                      \
     act_cand = act_functor(act_cand_str);                                      \
diff --git a/paddle/fluid/operators/fused/fused_feedforward_op.cc b/paddle/fluid/operators/fused/fused_feedforward_op.cc
index 0c8eae4260441f6c873b48735a01b967b70ef4bb..f3f8f1742757783a082437638f67407700963eb1 100644
--- a/paddle/fluid/operators/fused/fused_feedforward_op.cc
+++ b/paddle/fluid/operators/fused/fused_feedforward_op.cc
@@ -195,6 +195,8 @@ class FusedFeedForwardOpMaker : public framework::OpProtoAndCheckerMaker {
         .SetDefault(false);
     AddAttr<int>("dropout1_seed", "Dropout1 random seed.").SetDefault(0);
     AddAttr<int>("dropout2_seed", "Dropout2 random seed.").SetDefault(0);
+    AddAttr<int>("ring_id", "ring id for tensor model parallel.")
+        .SetDefault(-1);
     AddComment(R"DOC(
         the function of fused_feedforward operator is the same as the following pseudo code:
         residual = src;
diff --git a/paddle/fluid/operators/fused/fused_feedforward_op.cu b/paddle/fluid/operators/fused/fused_feedforward_op.cu
index 3131269955bdd17a0552836121589d8edeb4a38e..c38d9f7d4bcbd25b3111b35a918de0f4ebdabefb 100644
--- a/paddle/fluid/operators/fused/fused_feedforward_op.cu
+++ b/paddle/fluid/operators/fused/fused_feedforward_op.cu
@@ -21,11 +21,39 @@ limitations under the License. */
 #include "paddle/fluid/operators/fused/fused_dropout_helper.h"
 #include "paddle/fluid/operators/layer_norm_kernel.cu.h"
 
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
+#include "paddle/fluid/platform/collective_helper.h"
+#include "paddle/fluid/platform/device/gpu/nccl_helper.h"
+#endif
+
 namespace paddle {
 namespace operators {
 
 using Tensor = framework::Tensor;
 
+template <typename T>
+static void AllReduce(framework::Tensor& tensor,  // NOLINT
+                      const int ring_id,
+                      const platform::CUDADeviceContext& ctx) {
+  if (ring_id == -1) return;
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
+  auto dtype =
+      platform::ToNCCLDataType(framework::TransToProtoVarType(tensor.dtype()));
+  int64_t numel = tensor.numel();
+  const void* sendbuff = tensor.data<T>();
+  auto place = ctx.GetPlace();
+  void* recvbuff = tensor.mutable_data<T>(place);
+  auto comm = platform::NCCLCommContext::Instance().Get(ring_id, place);
+  auto stream = ctx.stream();
+  PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce(
+      sendbuff, recvbuff, numel, dtype, ncclSum, comm->comm(), stream));
+#else
+  PADDLE_THROW(platform::errors::Unimplemented(
+      "PaddlePaddle should compile with NCCL or RCCL when used tensor model "
+      "parallel op."));
+#endif
+}
+
 template <typename DeviceContext, typename T>
 class FusedFeedForwardKernel : public framework::OpKernel<T> {
  public:
@@ -56,7 +84,7 @@ class FusedFeedForwardKernel : public framework::OpKernel<T> {
            framework::Tensor* dropout1_out, framework::Tensor* dropout2_out,
            const int bsz_seq, const int d_model, const int dim_feedforward,
            const std::string& act_method, const bool pre_layer_norm,
-           const float epsilon1, const float epsilon2,
+           const float epsilon1, const float epsilon2, const int ring_id,
            const DropoutParam& dropout_param1,
            const DropoutParam& dropout_param2,
            const platform::CUDADeviceContext& ctx) const {
@@ -95,6 +123,10 @@ class FusedFeedForwardKernel : public framework::OpKernel<T> {
     framework::Tensor linear2_out;
     linear2_out.mutable_data<T>({bsz_seq, d_model}, place);
     MatMul(ctx, *dropout1_out, linear2_weight, &linear2_out);
+
+    // tensor model parallel
+    AllReduce<T>(linear2_out, ring_id, ctx);
+
     if (!pre_layer_norm) {
       fused_dropout_layernorm_helper.LayernormResidualDropoutBias(
           ctx, linear2_out.data<T>(), x.data<T>(), linear2_bias_ptr,
@@ -150,6 +182,7 @@ class FusedFeedForwardKernel : public framework::OpKernel<T> {
 
     const float epsilon1 = context.Attr<float>("ln1_epsilon");
     const float epsilon2 = context.Attr<float>("ln2_epsilon");
+    const int ring_id = context.Attr<int>("ring_id");
 
     DropoutParam dropout_param1(context, 1);
     DropoutParam dropout_param2(context, 2);
@@ -186,7 +219,7 @@ class FusedFeedForwardKernel : public framework::OpKernel<T> {
         dropout2_mask, ln1_mean, ln1_variance, ln2_mean, ln2_variance,
         linear1_out, ln1_out, dropout1_out, dropout2_out, bsz_seq, d_model,
         dim_feedforward, act_method, pre_layer_norm, epsilon1, epsilon2,
-        dropout_param1, dropout_param2, context.cuda_device_context());
+        ring_id, dropout_param1, dropout_param2, context.cuda_device_context());
   }
 };
 
@@ -231,7 +264,7 @@ class FusedFeedForwardGradKernel : public framework::OpKernel<T> {
       const int dim_feedforward, const DropoutParam& dropout_param1,
       const DropoutParam& dropout_param2, const std::string& act_method,
       const bool pre_layer_norm, const float epsilon1, const float epsilon2,
-      const platform::CUDADeviceContext& ctx) const {
+      const int ring_id, const platform::CUDADeviceContext& ctx) const {
     FusedDropoutLayerNormHelper<T, uint8_t> pre_layernorm_helper(
         bsz_seq, d_model, epsilon1);
     FusedDropoutHelper<T, uint8_t> fused_act_dropout_helper(
@@ -295,13 +328,16 @@ class FusedFeedForwardGradKernel : public framework::OpKernel<T> {
       d_ln1_out.mutable_data<T>({bsz_seq, d_model}, place);
       MatMulGrad(ctx, d_linear1_out, *ln1_out, linear1_weight, &d_ln1_out,
                  d_linear1_weight);
-
+      // tensor model parallel
+      AllReduce<T>(d_ln1_out, ring_id, ctx);
       pre_layernorm_helper.LayerNormGrad(
           ctx, d_ln1_out.data<T>(), x.data<T>(), ln1_gamma_ptr,
           ln1_mean->data<U>(), ln1_variance->data<U>(), d_x->data<T>(),
           d_ln1_gamma_ptr, d_ln1_beta_ptr);
     } else {
       MatMulGrad(ctx, d_linear1_out, x, linear1_weight, d_x, d_linear1_weight);
+      // tensor model parallel
+      AllReduce<T>(*d_x, ring_id, ctx);
     }
     std::vector<const Tensor*> ins(2);
     std::vector<Tensor*> outs(1);
@@ -376,6 +412,7 @@ class FusedFeedForwardGradKernel : public framework::OpKernel<T> {
 
     const float epsilon1 = context.Attr<float>("ln1_epsilon");
     const float epsilon2 = context.Attr<float>("ln2_epsilon");
+    const int ring_id = context.Attr<int>("ring_id");
     const std::string act_method = context.Attr<std::string>("act_method");
     DropoutParam dropout_param1(context, 1);
     DropoutParam dropout_param2(context, 2);
@@ -419,7 +456,8 @@ class FusedFeedForwardGradKernel : public framework::OpKernel<T> {
             d_linear1_bias, d_linear2_weight, d_linear2_bias, d_ln1_scale,
             d_ln1_bias, d_ln2_scale, d_ln2_bias, bsz_seq, d_model,
             dim_feedforward, dropout_param1, dropout_param2, act_method,
-            pre_layer_norm, epsilon1, epsilon2, context.cuda_device_context());
+            pre_layer_norm, epsilon1, epsilon2, ring_id,
+            context.cuda_device_context());
   }
 };
 }  // namespace operators
diff --git a/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cc b/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4c4e3661e6d6edc5ea95b77cd283cc99afcca8ed
--- /dev/null
+++ b/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cc
@@ -0,0 +1,353 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+Copyright (c) 2022 NVIDIA Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/op_version_registry.h"
+
+namespace paddle {
+namespace operators {
+using Tensor = framework::Tensor;
+
+class FusedGemmEpilogueOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "FusedGemmEpilogueOp");
+    OP_INOUT_CHECK(ctx->HasInput("Y"), "Input", "Y", "FusedGemmEpilogueOp");
+    OP_INOUT_CHECK(ctx->HasInput("Bias"), "Output", "Bias",
+                   "FusedGemmEpilogueOp");
+    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out",
+                   "FusedGemmEpilogueOp");
+
+    auto x_dims = ctx->GetInputDim("X");
+    auto y_dims = ctx->GetInputDim("Y");
+    auto bias_dims = ctx->GetInputDim("Bias");
+
+    auto trans_x = ctx->Attrs().Get<bool>("trans_x");
+    auto trans_y = ctx->Attrs().Get<bool>("trans_y");
+
+    PADDLE_ENFORCE_EQ(
+        y_dims.size(), 2,
+        platform::errors::InvalidArgument(
+            "The Input tensor Y's dimension of FusedGemmEpilogueOp "
+            " should be 2, but got %d.",
+            y_dims.size()));
+
+    PADDLE_ENFORCE_GE(
+        x_dims.size(), 2,
+        platform::errors::InvalidArgument(
+            "The Input tensor X's dimension of FusedGemmEpilogueOp "
+            " should be >= 2, but got %d.",
+            x_dims.size()));
+
+    PADDLE_ENFORCE_EQ(
+        bias_dims.size(), 1,
+        platform::errors::InvalidArgument(
+            "The Input tensor bias's dimension of FusedGemmEpilogueOp "
+            " should be == 1, but got %d.",
+            bias_dims.size()));
+
+    PADDLE_ENFORCE_EQ(bias_dims[0], trans_y ? y_dims[0] : y_dims[1],
+                      platform::errors::InvalidArgument(
+                          "The Input tensor bias's dimension 0"
+                          " should be == Y[-1], but got bias's shape = [%s] "
+                          "and Y's shape = [%s]",
+                          bias_dims, y_dims));
+
+    auto x_mat_dims =
+        phi::flatten_to_2d(x_dims, trans_x ? 1 : x_dims.size() - 1);
+
+    int K_from_x = trans_x ? x_mat_dims[0] : x_mat_dims[1];
+    int K_from_y = trans_y ? y_dims[1] : y_dims[0];
+
+    PADDLE_ENFORCE_EQ(
+        K_from_x, K_from_y,
+        platform::errors::InvalidArgument(
+            "The last dimension of X should be equal with Y's first dimension."
+            "But received X[-1] = [%d], Y[0] = [%d].",
+            K_from_x, K_from_y));
+
+    auto activation = ctx->Attrs().Get<std::string>("activation");
+
+    if ((activation != "relu") && (activation != "gelu") &&
+        (activation != "none")) {
+      PADDLE_ENFORCE_EQ(
+          true, false,
+          platform::errors::InvalidArgument(
+              "The activation attribute of fused_gemm_epilogue op should be"
+              " one of {\"none\", \"relu\", \"gelu\"}. But received %s."
+              "But received activation=%s.",
+              activation));
+    }
+
+    if (activation == "none" && ctx->HasOutput("ReserveSpace")) {
+      PADDLE_THROW(platform::errors::InvalidArgument(
+          "The ReserveSpace would not be used when activation = \"none\""));
+    }
+
+    // cublasLt's restriction for auxiliary.
+    if (ctx->HasOutput("ReserveSpace") && activation != "none") {
+      int min_size_of_n = activation == "relu" ? 128 : 8;
+      int N_size = trans_y ? y_dims[0] : y_dims[1];
+      PADDLE_ENFORCE_EQ(N_size % min_size_of_n, 0,
+                        platform::errors::InvalidArgument(
+                            "The output dimension N (X(MxK) * Y(KxN) = C(MxN)) "
+                            "should be multiple of %d when auxiliary_key given "
+                            "and activation=%s, but got N = %d.",
+                            min_size_of_n, activation, N_size));
+    }
+
+    std::vector<int64_t> out_dims;
+    out_dims.reserve(static_cast<size_t>(x_dims.size()));
+    if (trans_x) {
+      for (int i = 1; i < x_dims.size(); ++i) out_dims.push_back(x_dims[i]);
+    } else {
+      for (int i = 0; i < x_dims.size() - 1; ++i) out_dims.push_back(x_dims[i]);
+    }
+
+    if (trans_y) {
+      out_dims.push_back(y_dims[0]);
+    } else {
+      out_dims.push_back(y_dims[1]);
+    }
+
+    ctx->SetOutputDim("Out", phi::make_ddim(out_dims));
+    // Note (Ming Huang): Reserve space of relu is a bit-mask,
+    // which cannot pass nan_and_inf checking if shape is set.
+    if (activation == "gelu" && ctx->HasOutput("ReserveSpace")) {
+      ctx->SetOutputDim("ReserveSpace", phi::make_ddim(out_dims));
+    }
+  }
+
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const {
+    framework::LibraryType library = framework::LibraryType::kPlain;
+    framework::DataLayout layout = framework::DataLayout::kAnyLayout;
+    auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X");
+    return framework::OpKernelType(data_type, ctx.GetPlace(), layout, library);
+  }
+};
+
+class FusedGemmEpilogueOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X", "The input tensor X of Out = Act((X * Y) + Bias).");
+    AddInput("Y", "The input tensor Y of Out = Act((X * Y) + Bias).");
+    AddInput("Bias", "The input tensor bias of Out = Act((X * Y) + Bias).");
+
+    AddOutput("Out", "The output tensor Out of Out = Act((X * Y) + Bias).");
+    AddOutput("ReserveSpace",
+              R"DOC(Reserve GPU space to place 
+        auxiliary data pointer. It is used to pass auxiliary data pointer 
+        for fused_gemm_epilogue op. If not given (empty string), the 
+        auxiliary mode would not be enable.)DOC")
+        .AsDispensable()
+        .AsExtra();
+
+    AddAttr<bool>(
+        "trans_x",
+        R"DOC((bool, default false), Whether to transpose input tensor X 
+    or not. The input tensor X coulbe be more than two dimension. When 
+    set trans_x=true, it would fully reverse X. For instant: X with shpae 
+    [d0, d1, d2, d3] -> [d3, d2, d1, d0].)DOC")
+        .SetDefault(false);
+    AddAttr<bool>(
+        "trans_y",
+        R"DOC((bool, default false), Whether to transpose input tensor Y 
+    or not. The input tensor Y should be two dimension. When 
+    set trans_y=true, it would transpose Y. For instant: Y with shpae 
+    [d0, d1] -> [d1, d0].)DOC")
+        .SetDefault(false);
+
+    AddAttr<std::string>(
+        "activation",
+        R"DOC((string, default none), The activation function. It could be 
+    one of {none, relu, gelu}. When none is given, Act would be null 
+    operations)DOC")
+        .SetDefault("none");
+
+    AddComment(R"DOC(
+FusedGemmEpilogue Operator
+This operator is used to perform Activeation(Elementwise_add(Matmul(X, Y), bias)).
+It is equal to paddle.nn.Linear + Activation (None, ReLU or GeLU).
+
+Note:
+X could be more than two dimension and would be flatten to 2D for computing.
+X with shape [d0, d1, d2, d3] -> X_2D with shape [d0*d1*d2, d3]
+)DOC");
+  }
+};
+
+class FusedGemmEpilogueGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    OP_INOUT_CHECK(ctx->HasInput("DOut"), "Input", "DOut",
+                   "FusedGemmEpilogueGradOp");
+    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "FusedGemmEpilogueGradOp");
+    OP_INOUT_CHECK(ctx->HasInput("Y"), "Input", "Y", "FusedGemmEpilogueGradOp");
+    OP_INOUT_CHECK(ctx->HasOutput("DY"), "Output", "DY", "FusedGemmEpilogueOp");
+
+    auto dout_dims = ctx->GetInputDim("DOut");
+    auto x_dims = ctx->GetInputDim("X");
+    auto y_dims = ctx->GetInputDim("Y");
+
+    PADDLE_ENFORCE_GE(
+        dout_dims.size(), 2,
+        platform::errors::InvalidArgument(
+            "The Input tensor DOut's dimension of FusedGemmEpilogueGradOp "
+            " should be >= 2, but got %d.",
+            dout_dims.size()));
+
+    PADDLE_ENFORCE_EQ(
+        y_dims.size(), 2,
+        platform::errors::InvalidArgument(
+            "The Input tensor Y's dimension of FusedGemmEpilogueGradOp "
+            " should be 2, but got %d.",
+            y_dims.size()));
+
+    PADDLE_ENFORCE_GE(
+        x_dims.size(), 2,
+        platform::errors::InvalidArgument(
+            "The Input tensor X's dimension of FusedGemmEpilogueGradOp "
+            " should be >= 2, but got %d.",
+            x_dims.size()));
+
+    PADDLE_ENFORCE_EQ(
+        dout_dims.size(), x_dims.size(),
+        platform::errors::InvalidArgument(
+            "The Input tensor DOut's and X's dimension of "
+            "FusedGemmEpilogueGradOp "
+            " should be the same, but got DOut's dim = %d and X's = %d.",
+            dout_dims.size(), x_dims.size()));
+
+    auto dout_mat_dims = phi::flatten_to_2d(dout_dims, dout_dims.size() - 1);
+
+    auto x_mat_dims = phi::flatten_to_2d(x_dims, x_dims.size() - 1);
+
+    PADDLE_ENFORCE_EQ(
+        dout_mat_dims[1], y_dims[1],
+        platform::errors::InvalidArgument(
+            "The last dimension of DOut should be equal with Y's last"
+            "dimension. But received DOut[-1] = [%d], Y[1] = [%d].",
+            dout_mat_dims[1], y_dims[1]));
+
+    PADDLE_ENFORCE_EQ(
+        dout_mat_dims[0], x_mat_dims[0],
+        platform::errors::InvalidArgument(
+            "The first dimension of DOut should be equal with X's first"
+            "dimension. But received DOut[0] = [%d], Y[0] = [%d].",
+            dout_mat_dims[0], x_mat_dims[0]));
+
+    auto activation_grad = ctx->Attrs().Get<std::string>("activation_grad");
+    if ((activation_grad != "relu_grad") && (activation_grad != "gelu_grad") &&
+        (activation_grad != "none")) {
+      PADDLE_ENFORCE_EQ(
+          true, false,
+          platform::errors::InvalidArgument(
+              "The activation attribute of fused_gemm_epilogue op should be"
+              " one of {\"none\", \"relu\", \"gelu\"}. But received %s."
+              "But received activation=%s.",
+              activation_grad));
+    }
+
+    if (activation_grad != "none" && !ctx->HasInput("ReserveSpace")) {
+      PADDLE_ENFORCE_EQ(true, false,
+                        platform::errors::InvalidArgument(
+                            "The ReserveSpace should not be empty. "
+                            "when activation_grad == {relu_grad, gelu_grad}."));
+    }
+
+    if (ctx->HasOutput("DX")) {
+      std::vector<int64_t> dx_dims;
+      dx_dims.reserve(static_cast<size_t>(x_dims.size()));
+      for (int i = 0; i < x_dims.size(); ++i) {
+        dx_dims.push_back(x_dims[i]);
+      }
+      ctx->SetOutputDim("DX", phi::make_ddim(dx_dims));
+    }
+
+    std::vector<int64_t> dy_dims(y_dims.Get(), y_dims.Get() + y_dims.size());
+    ctx->SetOutputDim("DY", phi::make_ddim(dy_dims));
+
+    if (ctx->HasOutput("DBias")) {
+      std::vector<int64_t> dbias_dims;
+      dbias_dims.push_back(y_dims[1]);
+      ctx->SetOutputDim("DBias", phi::make_ddim(dbias_dims));
+    }
+  }
+
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const {
+    framework::LibraryType library = framework::LibraryType::kPlain;
+    framework::DataLayout layout = framework::DataLayout::kAnyLayout;
+    auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "DOut");
+    return framework::OpKernelType(data_type, ctx.GetPlace(), layout, library);
+  }
+};
+
+class FusedGemmEpilogueGradOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("DOut",
+             "The input grad tensor to Out of Out = (Act(X) * Y) + bias");
+    AddInput("X", "The input tensor X of Out = (Act(X) * Y) + bias");
+    AddInput("Y", "The input tensor Y of Out = (Act(X) * Y) + bias");
+    AddInput("ReserveSpace",
+             R"DOC(A GPU space to fetch 
+        auxiliary data pointer. It is used to pass auxiliary data pointer 
+        for fused_gemm_epilogue_grad op. If not given (empty string), the 
+        auxiliary mode would not be enable.)DOC")
+        .AsDispensable();
+
+    AddOutput("DX", "The output grad tensor to X of Out = (Act(X) * Y) + bias.")
+        .AsDispensable();
+    AddOutput("DY",
+              "The output grad tensor to Y of Out = (Act(X) * Y) + bias.");
+    AddOutput("DBias",
+              "The output grad tensor to bias of Out = (Act(X) * Y) + bias.")
+        .AsDispensable();
+
+    AddAttr<std::string>(
+        "activation_grad",
+        R"DOC((string, default none), The backward activation function. It could be 
+    one of {none, relu_grad, gelu_grad}. When none is given, The backward Act would 
+    be null operations)DOC")
+        .SetDefault("none");
+
+    AddComment(R"DOC(
+FusedGemmEpilogueGrad Operator
+This operator is used to perform backward of Elementwise_add(Matmul(Activeation(X), Y), bias).
+It is equal to Activation (None, ReLU or GeLU) + paddle.nn.Linear.
+
+Note:
+X could be more than two dimension and would be flatten to 2D for computing.
+X with shape [d0, d1, d2, d3] -> X_2D with shape [d0*d1*d2, d3]
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(fused_gemm_epilogue, ops::FusedGemmEpilogueOp,
+                  ops::FusedGemmEpilogueOpMaker)
+REGISTER_OPERATOR(fused_gemm_epilogue_grad, ops::FusedGemmEpilogueGradOp,
+                  ops::FusedGemmEpilogueGradOpMaker)
diff --git a/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cu b/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..e16c9e8f483ccc2cbf1d7006159cccfe906dd06b
--- /dev/null
+++ b/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cu
@@ -0,0 +1,376 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+Copyright (c) 2022 NVIDIA Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/op_version_registry.h"
+#include "paddle/fluid/platform/dynload/cublasLt.h"
+#include "paddle/fluid/platform/float16.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+template <typename DeviceContext, typename T>
+class FusedGemmEpilogueKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
+
+    const Tensor* x = ctx.Input<Tensor>("X");
+    const Tensor* y = ctx.Input<Tensor>("Y");
+    const Tensor* bias = ctx.Input<Tensor>("Bias");
+
+    Tensor* out = ctx.Output<Tensor>("Out");
+    Tensor* reserve_space = ctx.Output<Tensor>("ReserveSpace");
+
+    bool trans_x = ctx.Attr<bool>("trans_x");
+    bool trans_y = ctx.Attr<bool>("trans_y");
+
+    std::string activation = ctx.Attr<std::string>("activation");
+    bool enable_auxiliary = reserve_space == nullptr ? false : true;
+
+    out->mutable_data<T>(ctx.GetPlace());
+    auto* out_data = out->data<T>();
+
+    auto x_mat_dims =
+        phi::flatten_to_2d(x->dims(), trans_x ? 1 : x->dims().size() - 1);
+    int64_t M = trans_x ? x_mat_dims[1] : x_mat_dims[0];
+    int64_t K = trans_y ? y->dims()[1] : y->dims()[0];
+    int64_t N = trans_y ? y->dims()[0] : y->dims()[1];
+
+    cudaDataType_t mat_type = CUDA_R_32F;
+    cudaDataType_t scale_type = CUDA_R_32F;
+    cublasComputeType_t compute_type = CUBLAS_COMPUTE_32F;
+    if (std::is_same<T, paddle::platform::float16>::value) {
+      mat_type = CUDA_R_16F;
+      scale_type = CUDA_R_16F;
+    }
+    if (std::is_same<T, double>::value) {
+      mat_type = CUDA_R_64F;
+      scale_type = CUDA_R_64F;
+      compute_type = CUBLAS_COMPUTE_64F;
+    }
+
+    cublasLtMatmulDesc_t operation_desc = NULL;
+    PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatmulDescCreate(
+        &operation_desc, compute_type, scale_type));
+    cublasOperation_t transx = trans_x ? CUBLAS_OP_T : CUBLAS_OP_N;
+    cublasOperation_t transy = trans_y ? CUBLAS_OP_T : CUBLAS_OP_N;
+    PADDLE_ENFORCE_GPU_SUCCESS(
+        platform::dynload::cublasLtMatmulDescSetAttribute(
+            operation_desc, CUBLASLT_MATMUL_DESC_TRANSB, &transx,
+            sizeof(transx)));
+    PADDLE_ENFORCE_GPU_SUCCESS(
+        platform::dynload::cublasLtMatmulDescSetAttribute(
+            operation_desc, CUBLASLT_MATMUL_DESC_TRANSA, &transy,
+            sizeof(transy)));
+
+    cublasLtEpilogue_t epiloque_func =
+        get_epilogue_type_(activation, enable_auxiliary);
+    PADDLE_ENFORCE_GPU_SUCCESS(
+        platform::dynload::cublasLtMatmulDescSetAttribute(
+            operation_desc, CUBLASLT_MATMUL_DESC_EPILOGUE, &epiloque_func,
+            sizeof(epiloque_func)));
+    const T* bias_data = bias->data<T>();
+    PADDLE_ENFORCE_GPU_SUCCESS(
+        platform::dynload::cublasLtMatmulDescSetAttribute(
+            operation_desc, CUBLASLT_MATMUL_DESC_BIAS_POINTER, &bias_data,
+            sizeof(bias_data)));
+
+    if (enable_auxiliary && activation != "none") {
+      size_t reserve_space_size = 0;
+      if (activation == "relu") {
+        // Count in bits.
+        reserve_space_size = phi::product(out->dims()) / 8;
+      } else {
+        reserve_space_size = phi::product(out->dims()) * sizeof(T);
+      }
+      reserve_space->mutable_data(ctx.GetPlace(), out->type(),
+                                  reserve_space_size);
+      void* aux_data = reinterpret_cast<void*>(reserve_space->data<T>());
+
+      PADDLE_ENFORCE_GPU_SUCCESS(
+          platform::dynload::cublasLtMatmulDescSetAttribute(
+              operation_desc, CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_POINTER,
+              &aux_data, sizeof(aux_data)));
+      PADDLE_ENFORCE_GPU_SUCCESS(
+          platform::dynload::cublasLtMatmulDescSetAttribute(
+              operation_desc, CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_LD, &N,
+              sizeof(N)));
+    }
+
+    cublasLtMatrixLayout_t x_desc = NULL, y_desc = NULL, out_desc = NULL;
+    if (trans_x)
+      PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatrixLayoutCreate(
+          &x_desc, mat_type, M, K, M));
+    else
+      PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatrixLayoutCreate(
+          &x_desc, mat_type, K, M, K));
+    if (trans_y)
+      PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatrixLayoutCreate(
+          &y_desc, mat_type, K, N, K));
+    else
+      PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatrixLayoutCreate(
+          &y_desc, mat_type, N, K, N));
+    PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatrixLayoutCreate(
+        &out_desc, mat_type, N, M, N));
+
+    cublasLtHandle_t lt_handle = dev_ctx.cublaslt_handle();
+    size_t workspace_size = 4 * 1024 * 1024;
+    const cublasLtMatmulAlgo_t* algo = nullptr;
+    cudaStream_t stream = dev_ctx.stream();
+    memory::allocation::AllocationPtr workspace =
+        memory::Alloc(dev_ctx, workspace_size);
+
+    double alpha64 = 1.0, beta64 = 0.0;
+    float alpha32 = 1.0f, beta32 = 0.0f;
+    void *alpha = nullptr, *beta = nullptr;
+    if (std::is_same<T, double>::value) {
+      alpha = &alpha64;
+      beta = &beta64;
+    } else {
+      alpha = &alpha32;
+      beta = &beta32;
+    }
+
+    PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatmul(
+        lt_handle, operation_desc, alpha, y->data<T>(), y_desc, x->data<T>(),
+        x_desc, beta, out_data, out_desc, out_data, out_desc, algo,
+        workspace->ptr(), workspace_size, stream));
+  }
+
+ private:
+  static cublasLtEpilogue_t get_epilogue_type_(const std::string& activation,
+                                               bool enable_auxiliary) {
+    if (activation == "relu") {
+      return enable_auxiliary ? CUBLASLT_EPILOGUE_RELU_AUX_BIAS
+                              : CUBLASLT_EPILOGUE_RELU_BIAS;
+    } else if (activation == "gelu") {
+      return enable_auxiliary ? CUBLASLT_EPILOGUE_GELU_AUX_BIAS
+                              : CUBLASLT_EPILOGUE_GELU_BIAS;
+    } else if (activation == "none") {
+      return CUBLASLT_EPILOGUE_BIAS;
+    } else {
+      PADDLE_ENFORCE_EQ(
+          true, false,
+          platform::errors::InvalidArgument(
+              "The activation attribute of fused_gemm_epilogue op should be"
+              " one of {\"none\", \"relu\", \"gelu\"}. But received %s."
+              "But received activation=%s.",
+              activation));
+    }
+  }
+};
+
+template <typename DeviceContext, typename T>
+class FusedGemmEpilogueGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
+
+    const Tensor* dout = ctx.Input<Tensor>("DOut");
+    const Tensor* x = ctx.Input<Tensor>("X");
+    const Tensor* y = ctx.Input<Tensor>("Y");
+    const Tensor* reserve_space = ctx.Input<Tensor>("ReserveSpace");
+
+    Tensor* dx = ctx.Output<Tensor>("DX");
+    Tensor* dy = ctx.Output<Tensor>("DY");
+    Tensor* dbias = ctx.Output<Tensor>("DBias");
+
+    std::string activation_grad = ctx.Attr<std::string>("activation_grad");
+
+    auto dout_mat_dims =
+        phi::flatten_to_2d(dout->dims(), dout->dims().size() - 1);
+    auto x_mat_dims = phi::flatten_to_2d(x->dims(), x->dims().size() - 1);
+
+    int64_t M = x_mat_dims[0];
+    int64_t K = y->dims()[0];
+    int64_t N = y->dims()[1];
+
+    cudaDataType_t mat_type = CUDA_R_32F;
+    cudaDataType_t scale_type = CUDA_R_32F;
+    cublasComputeType_t compute_type = CUBLAS_COMPUTE_32F;
+    if (std::is_same<T, paddle::platform::float16>::value) {
+      mat_type = CUDA_R_16F;
+      scale_type = CUDA_R_16F;
+    }
+    if (std::is_same<T, double>::value) {
+      mat_type = CUDA_R_64F;
+      scale_type = CUDA_R_64F;
+      compute_type = CUBLAS_COMPUTE_64F;
+    }
+
+    cublasLtHandle_t lt_handle = dev_ctx.cublaslt_handle();
+    size_t workspace_size = 4 * 1024 * 1024;
+    const cublasLtMatmulAlgo_t* algo = nullptr;
+    cudaStream_t stream = dev_ctx.stream();
+
+    double alpha64 = 1.0, beta64 = 0.0;
+    float alpha32 = 1.0f, beta32 = 0.0f;
+    void *alpha = nullptr, *beta = nullptr;
+    if (std::is_same<T, double>::value) {
+      alpha = &alpha64;
+      beta = &beta64;
+    } else {
+      alpha = &alpha32;
+      beta = &beta32;
+    }
+
+    cublasOperation_t trans_dout = CUBLAS_OP_N;
+    cublasLtMatrixLayout_t dout_desc = NULL;
+    PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatrixLayoutCreate(
+        &dout_desc, mat_type, N, M, N));
+
+    if (dx) {
+      cublasLtMatmulDesc_t dx_operation_desc = NULL;
+      PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatmulDescCreate(
+          &dx_operation_desc, compute_type, scale_type));
+      cublasOperation_t trans_y = CUBLAS_OP_T;
+      PADDLE_ENFORCE_GPU_SUCCESS(
+          platform::dynload::cublasLtMatmulDescSetAttribute(
+              dx_operation_desc, CUBLASLT_MATMUL_DESC_TRANSB, &trans_dout,
+              sizeof(trans_dout)));
+      PADDLE_ENFORCE_GPU_SUCCESS(
+          platform::dynload::cublasLtMatmulDescSetAttribute(
+              dx_operation_desc, CUBLASLT_MATMUL_DESC_TRANSA, &trans_y,
+              sizeof(trans_y)));
+      cublasLtEpilogue_t epiloque_func_for_dx =
+          get_epilogue_type_(activation_grad);
+      PADDLE_ENFORCE_GPU_SUCCESS(
+          platform::dynload::cublasLtMatmulDescSetAttribute(
+              dx_operation_desc, CUBLASLT_MATMUL_DESC_EPILOGUE,
+              &epiloque_func_for_dx, sizeof(epiloque_func_for_dx)));
+
+      if (activation_grad != "none") {
+        auto* aux_data = reserve_space->data<T>();
+        PADDLE_ENFORCE_GPU_SUCCESS(
+            platform::dynload::cublasLtMatmulDescSetAttribute(
+                dx_operation_desc, CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_POINTER,
+                &aux_data, sizeof(aux_data)));
+        PADDLE_ENFORCE_GPU_SUCCESS(
+            platform::dynload::cublasLtMatmulDescSetAttribute(
+                dx_operation_desc, CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_LD, &N,
+                sizeof(N)));
+      }
+
+      cublasLtMatrixLayout_t y_desc = NULL, dx_desc = NULL;
+      PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatrixLayoutCreate(
+          &y_desc, mat_type, N, K, N));
+      PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatrixLayoutCreate(
+          &dx_desc, mat_type, K, M, K));
+
+      memory::allocation::AllocationPtr dx_workspace =
+          memory::Alloc(dev_ctx, workspace_size);
+
+      dx->mutable_data<T>(ctx.GetPlace());
+      auto* dx_data = dx->data<T>();
+      PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatmul(
+          lt_handle, dx_operation_desc, alpha, y->data<T>(), y_desc,
+          dout->data<T>(), dout_desc, beta, dx_data, dx_desc, dx_data, dx_desc,
+          algo, dx_workspace->ptr(), workspace_size, stream));
+    }
+
+    if (dy) {
+      cublasLtMatmulDesc_t dy_operation_desc = NULL;
+      PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatmulDescCreate(
+          &dy_operation_desc, compute_type, scale_type));
+      cublasOperation_t trans_x = CUBLAS_OP_T;
+      PADDLE_ENFORCE_GPU_SUCCESS(
+          platform::dynload::cublasLtMatmulDescSetAttribute(
+              dy_operation_desc, CUBLASLT_MATMUL_DESC_TRANSA, &trans_dout,
+              sizeof(trans_dout)));
+      PADDLE_ENFORCE_GPU_SUCCESS(
+          platform::dynload::cublasLtMatmulDescSetAttribute(
+              dy_operation_desc, CUBLASLT_MATMUL_DESC_TRANSB, &trans_x,
+              sizeof(trans_x)));
+      cublasLtEpilogue_t epiloque_func_for_dy = dbias == nullptr
+                                                    ? CUBLASLT_EPILOGUE_DEFAULT
+                                                    : CUBLASLT_EPILOGUE_BGRADA;
+      PADDLE_ENFORCE_GPU_SUCCESS(
+          platform::dynload::cublasLtMatmulDescSetAttribute(
+              dy_operation_desc, CUBLASLT_MATMUL_DESC_EPILOGUE,
+              &epiloque_func_for_dy, sizeof(epiloque_func_for_dy)));
+
+      if (dbias) {
+        dbias->mutable_data<T>(ctx.GetPlace());
+        auto* dbias_data = dbias->data<T>();
+        PADDLE_ENFORCE_GPU_SUCCESS(
+            platform::dynload::cublasLtMatmulDescSetAttribute(
+                dy_operation_desc, CUBLASLT_MATMUL_DESC_BIAS_POINTER,
+                &dbias_data, sizeof(dbias_data)));
+      }
+
+      cublasLtMatrixLayout_t x_desc = NULL, dy_desc = NULL;
+      PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatrixLayoutCreate(
+          &x_desc, mat_type, K, M, K));
+      PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatrixLayoutCreate(
+          &dy_desc, mat_type, N, K, N));
+
+      memory::allocation::AllocationPtr dy_workspace =
+          memory::Alloc(dev_ctx, workspace_size);
+
+      dy->mutable_data<T>(ctx.GetPlace());
+      auto* dy_data = dy->data<T>();
+      PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatmul(
+          lt_handle, dy_operation_desc, alpha, dout->data<T>(), dout_desc,
+          x->data<T>(), x_desc, beta, dy_data, dy_desc, dy_data, dy_desc, algo,
+          dy_workspace->ptr(), workspace_size, stream));
+    }
+  }
+
+ private:
+  static cublasLtEpilogue_t get_epilogue_type_(
+      const std::string& activation_grad) {
+    if (activation_grad == "relu_grad") {
+      return CUBLASLT_EPILOGUE_DRELU;
+    } else if (activation_grad == "gelu_grad") {
+      return CUBLASLT_EPILOGUE_DGELU;
+    } else if (activation_grad == "none") {
+      return CUBLASLT_EPILOGUE_DEFAULT;
+    } else {
+      PADDLE_ENFORCE_EQ(
+          true, false,
+          platform::errors::InvalidArgument(
+              "The activation_grad attribute of fused_gemm_epilogue op should "
+              "be"
+              " one of {\"none\", \"relu\", \"gelu\"}. But received %s."
+              "But received activation_grad=%s.",
+              activation_grad));
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+#if CUDA_VERSION >= 11060
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    fused_gemm_epilogue,
+    ops::FusedGemmEpilogueKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::FusedGemmEpilogueKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::FusedGemmEpilogueKernel<paddle::platform::CUDADeviceContext,
+                                 paddle::platform::float16>);
+
+REGISTER_OP_CUDA_KERNEL(
+    fused_gemm_epilogue_grad,
+    ops::FusedGemmEpilogueGradKernel<paddle::platform::CUDADeviceContext,
+                                     float>,
+    ops::FusedGemmEpilogueGradKernel<paddle::platform::CUDADeviceContext,
+                                     double>,
+    ops::FusedGemmEpilogueGradKernel<paddle::platform::CUDADeviceContext,
+                                     paddle::platform::float16>);
+#endif
diff --git a/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h b/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h
index ceba3accca7727b5e4f22951d87f9e91034e3403..d53a24a57e3cc1ede127f497a9be9e3b5fa1ab0b 100644
--- a/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h
+++ b/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h
@@ -42,12 +42,12 @@ __device__ void CalcLayernormY(
     const LayerNormScaleBiasT<T, U, ScaleBiasWithSameTypeX> *bias, const T *x,
     T *y, const int row_id, const int col_id, const int cols,
     const LayerNormParamType<T> mean_val, const LayerNormParamType<T> invvar) {
-  using LoadT = platform::AlignedVector<T, VecSize>;
-  using StoreT = platform::AlignedVector<T, VecSize>;
-  using LoadU = platform::AlignedVector<U, VecSize>;
+  using LoadT = phi::AlignedVector<T, VecSize>;
+  using StoreT = phi::AlignedVector<T, VecSize>;
+  using LoadU = phi::AlignedVector<U, VecSize>;
   using LoadScaleOrBias =
-      platform::AlignedVector<LayerNormScaleBiasT<T, U, ScaleBiasWithSameTypeX>,
-                              VecSize>;
+      phi::AlignedVector<LayerNormScaleBiasT<T, U, ScaleBiasWithSameTypeX>,
+                         VecSize>;
   for (int i = col_id * VecSize; i < cols; i += blockDim.x * VecSize) {
     LoadScaleOrBias scale_vec;
     LoadScaleOrBias bias_vec;
@@ -60,15 +60,15 @@ __device__ void CalcLayernormY(
           static_cast<LayerNormScaleBiasT<T, U, ScaleBiasWithSameTypeX>>(0);
     }
     // vectorize load data from global
-    platform::Load<T, VecSize>(&x[row_id * cols + i], &x_vec);
+    phi::Load<T, VecSize>(&x[row_id * cols + i], &x_vec);
 
     if (scale != nullptr) {
-      platform::Load<LayerNormScaleBiasT<T, U, ScaleBiasWithSameTypeX>,
-                     VecSize>(&scale[i], &scale_vec);
+      phi::Load<LayerNormScaleBiasT<T, U, ScaleBiasWithSameTypeX>, VecSize>(
+          &scale[i], &scale_vec);
     }
     if (bias != nullptr) {
-      platform::Load<LayerNormScaleBiasT<T, U, ScaleBiasWithSameTypeX>,
-                     VecSize>(&bias[i], &bias_vec);
+      phi::Load<LayerNormScaleBiasT<T, U, ScaleBiasWithSameTypeX>, VecSize>(
+          &bias[i], &bias_vec);
     }
 
     StoreT y_vec;
@@ -78,7 +78,7 @@ __device__ void CalcLayernormY(
                              (static_cast<U>(x_vec[ii]) - mean_val) * invvar +
                          static_cast<U>(bias_vec[ii]));
     }
-    platform::Store<T, VecSize>(y_vec, &y[row_id * cols + i]);
+    phi::Store<T, VecSize>(y_vec, &y[row_id * cols + i]);
   }
 }
 
@@ -190,9 +190,9 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_fwd_1024_kernel(
     const ScaleT *__restrict__ beta_ptr, MaskType *__restrict__ mask_out_ptr,
     U *__restrict__ mean_out_ptr, U *__restrict__ var_out_ptr,
     T *__restrict__ residual_out_ptr, T *__restrict__ y_ptr) {
-  using Vec = platform::AlignedVector<T, VecSize>;
-  using Vec_scale = platform::AlignedVector<ScaleT, VecSize>;
-  using MaskStoreT = platform::AlignedVector<MaskType, VecSize>;
+  using Vec = phi::AlignedVector<T, VecSize>;
+  using Vec_scale = phi::AlignedVector<ScaleT, VecSize>;
+  using MaskStoreT = phi::AlignedVector<MaskType, VecSize>;
 
   const int tidx = threadIdx.x;
   const int bidx = blockIdx.x;
@@ -214,8 +214,8 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_fwd_1024_kernel(
   Vec_scale beta[LDGS];
 #pragma unroll
   for (int it = 0, col = c; it < LDGS; it++) {
-    platform::Load<ScaleT, VecSize>(gamma_ptr + col * VecSize, &gamma[it]);
-    platform::Load<ScaleT, VecSize>(beta_ptr + col * VecSize, &beta[it]);
+    phi::Load<ScaleT, VecSize>(gamma_ptr + col * VecSize, &gamma[it]);
+    phi::Load<ScaleT, VecSize>(beta_ptr + col * VecSize, &beta[it]);
     col += THREADS_PER_ROW;
   }
 
@@ -225,10 +225,9 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_fwd_1024_kernel(
     Vec residual[LDGS];
 #pragma unroll
     for (int it = 0, col = c; it < LDGS; it++) {
-      platform::Load<T, VecSize>(x_ptr + row * LN_NUM_COLS + col * VecSize,
-                                 &x[it]);
-      platform::Load<T, VecSize>(
-          residual_ptr + row * LN_NUM_COLS + col * VecSize, &residual[it]);
+      phi::Load<T, VecSize>(x_ptr + row * LN_NUM_COLS + col * VecSize, &x[it]);
+      phi::Load<T, VecSize>(residual_ptr + row * LN_NUM_COLS + col * VecSize,
+                            &residual[it]);
       col += THREADS_PER_ROW;
     }
 
@@ -270,9 +269,9 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_fwd_1024_kernel(
 // store dropout_residual_out and mask_out
 #pragma unroll
     for (int it = 0, col = c; it < LDGS; it++) {
-      platform::Store<T, VecSize>(
+      phi::Store<T, VecSize>(
           x[it], residual_out_ptr + row * LN_NUM_COLS + col * VecSize);
-      platform::Store<MaskType, VecSize>(
+      phi::Store<MaskType, VecSize>(
           mask_vec[it], mask_out_ptr + row * LN_NUM_COLS + col * VecSize);
       col += THREADS_PER_ROW;
     }
@@ -333,8 +332,7 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_fwd_1024_kernel(
 
 #pragma unroll
     for (int it = 0, col = c; it < LDGS; it++) {
-      platform::Store<T, VecSize>(x[it],
-                                  y_ptr + row * LN_NUM_COLS + col * VecSize);
+      phi::Store<T, VecSize>(x[it], y_ptr + row * LN_NUM_COLS + col * VecSize);
       col += THREADS_PER_ROW;
     }
   }
diff --git a/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias_test.cu b/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias_test.cu
index cc14d0680d381ff2bbe73ee712e218c9c4d79185..032440d7f0478dc087e3ba38274f2a31a9a66a23 100644
--- a/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias_test.cu
+++ b/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias_test.cu
@@ -19,6 +19,12 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/fused/fused_dropout_test.h"
 #include "paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+PD_DECLARE_KERNEL(dropout, GPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(dropout_grad, GPU, ALL_LAYOUT);
+#endif
 
 /**
  * @brief The unit test of fused_layernorm_residual_dropout_bias
diff --git a/paddle/fluid/operators/fused/fused_residual_dropout_bias.h b/paddle/fluid/operators/fused/fused_residual_dropout_bias.h
index 1b135ad6098e58f457f5d21e73ac6d1a6a7c4074..1d3085a013f81ee9dca21468476df8f621bb26c2 100644
--- a/paddle/fluid/operators/fused/fused_residual_dropout_bias.h
+++ b/paddle/fluid/operators/fused/fused_residual_dropout_bias.h
@@ -32,9 +32,9 @@ __forceinline__ __device__ void FusedResidualDropoutBiasOneThread(
     const T *__restrict__ bias, T *dst, MaskType *mask, const bool is_test,
     typename details::MPTypeTrait<T>::Type *mean_val,
     typename details::MPTypeTrait<T>::Type *var_val, Functor act_func) {
-  using LoadT = platform::AlignedVector<T, VecSize>;
-  using StoreT = platform::AlignedVector<T, VecSize>;
-  using MaskStoreT = platform::AlignedVector<MaskType, VecSize>;
+  using LoadT = phi::AlignedVector<T, VecSize>;
+  using StoreT = phi::AlignedVector<T, VecSize>;
+  using MaskStoreT = phi::AlignedVector<MaskType, VecSize>;
   using U = typename details::MPTypeTrait<T>::Type;
 
   LoadT src_vec;
@@ -46,14 +46,13 @@ __forceinline__ __device__ void FusedResidualDropoutBiasOneThread(
     residual_vec[ii] = static_cast<T>(0);
   }
   // vectorize load data from global
-  platform::Load<T, VecSize>(&src[row_id * cols + col_id], &src_vec);
+  phi::Load<T, VecSize>(&src[row_id * cols + col_id], &src_vec);
   if (residual) {
-    platform::Load<T, VecSize>(&residual[row_id * cols + col_id],
-                               &residual_vec);
+    phi::Load<T, VecSize>(&residual[row_id * cols + col_id], &residual_vec);
   }
 
   if (bias) {
-    platform::Load<T, VecSize>(&bias[col_id], &bias_vec);
+    phi::Load<T, VecSize>(&bias[col_id], &bias_vec);
   }
 
   MaskStoreT mask_vec;
@@ -89,9 +88,9 @@ __forceinline__ __device__ void FusedResidualDropoutBiasOneThread(
   }
 
   // store result to global
-  platform::Store<T, VecSize>(dest_vec, &dst[row_id * cols + col_id]);
+  phi::Store<T, VecSize>(dest_vec, &dst[row_id * cols + col_id]);
   if (!is_test) {
-    platform::Store<MaskType, VecSize>(mask_vec, &mask[row_id * cols + col_id]);
+    phi::Store<MaskType, VecSize>(mask_vec, &mask[row_id * cols + col_id]);
   }
 }
 
@@ -176,21 +175,21 @@ __global__ void FusedResidualDropoutGrad(const T *dout, const MaskType *mask,
                                          T *dx) {
   int64_t idx = blockDim.x * blockIdx.x + threadIdx.x;
 
-  using LoadT = platform::AlignedVector<T, VecSize>;
-  using StoreT = platform::AlignedVector<T, VecSize>;
-  using MaskLoadT = platform::AlignedVector<MaskType, VecSize>;
+  using LoadT = phi::AlignedVector<T, VecSize>;
+  using StoreT = phi::AlignedVector<T, VecSize>;
+  using MaskLoadT = phi::AlignedVector<MaskType, VecSize>;
   for (int i = idx * VecSize; i < size; i += blockDim.x * gridDim.x * VecSize) {
     LoadT dout_vec;
     MaskLoadT mask_vec;
-    platform::Load<T, VecSize>(&dout[i], &dout_vec);
-    platform::Load<MaskType, VecSize>(&mask[i], &mask_vec);
+    phi::Load<T, VecSize>(&dout[i], &dout_vec);
+    phi::Load<MaskType, VecSize>(&mask[i], &mask_vec);
 
     StoreT dx_vec;
 #pragma unroll
     for (int ii = 0; ii < VecSize; ii++) {
       dx_vec[ii] = dout_vec[ii] * static_cast<T>(mask_vec[ii]) * factor;
     }
-    platform::Store<T, VecSize>(dx_vec, &dx[i]);
+    phi::Store<T, VecSize>(dx_vec, &dx[i]);
   }
 }
 
@@ -209,9 +208,9 @@ __global__ void FusedResidualDropoutBiasGrad(const T *dout,
                                              T *dbias) {
   int64_t col_id = blockIdx.x * blockDim.x + threadIdx.x;
 
-  using LoadT = platform::AlignedVector<T, VecSize>;
-  using StoreT = platform::AlignedVector<T, VecSize>;
-  using MaskLoadT = platform::AlignedVector<MaskType, VecSize>;
+  using LoadT = phi::AlignedVector<T, VecSize>;
+  using StoreT = phi::AlignedVector<T, VecSize>;
+  using MaskLoadT = phi::AlignedVector<MaskType, VecSize>;
 
   T tmp_sum[VecSize] = {static_cast<T>(0)};
   // calculate the dx and temporary sum
@@ -221,8 +220,8 @@ __global__ void FusedResidualDropoutBiasGrad(const T *dout,
       LoadT out_vec;
       MaskLoadT mask_vec;
       StoreT dx_vec;
-      platform::Load<T, VecSize>(&dout[index], &out_vec);
-      platform::Load<MaskType, VecSize>(&mask[index], &mask_vec);
+      phi::Load<T, VecSize>(&dout[index], &out_vec);
+      phi::Load<MaskType, VecSize>(&mask[index], &mask_vec);
 
 #pragma unroll
       for (int i = 0; i < VecSize; i++) {
@@ -230,7 +229,7 @@ __global__ void FusedResidualDropoutBiasGrad(const T *dout,
         tmp_sum[i] += out_vec[i];
       }
 
-      platform::Store<T, VecSize>(dx_vec, &dx[index]);
+      phi::Store<T, VecSize>(dx_vec, &dx[index]);
     }
   }
 
diff --git a/paddle/fluid/operators/fused/fused_residual_dropout_bias_test.cu b/paddle/fluid/operators/fused/fused_residual_dropout_bias_test.cu
index 1a12e6b565f02035b3fb9673636c2344823f288e..5dff5e2225f4f3bf3a20daa02b2b4194bd8cb99e 100644
--- a/paddle/fluid/operators/fused/fused_residual_dropout_bias_test.cu
+++ b/paddle/fluid/operators/fused/fused_residual_dropout_bias_test.cu
@@ -19,6 +19,12 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/fused/fused_dropout_test.h"
 #include "paddle/fluid/operators/fused/fused_residual_dropout_bias.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+PD_DECLARE_KERNEL(dropout, GPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(dropout_grad, GPU, ALL_LAYOUT);
+#endif
 
 namespace framework = paddle::framework;
 namespace platform = paddle::platform;
diff --git a/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc b/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc
index 88fb7349d538afd6d7bf4fa6947ac21307db66d8..1000d0488dc3ffcf6cde977be47ce77d2bc947a7 100644
--- a/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc
+++ b/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc
@@ -14,10 +14,10 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.h"
 #include <string>
-#include "paddle/fluid/operators/math/cpu_vec.h"
 #include "paddle/fluid/operators/math/fc.h"
 #include "paddle/fluid/platform/cpu_info.h"
 #include "paddle/phi/kernels/funcs/blas/blas.h"
+#include "paddle/phi/kernels/funcs/cpu_vec.h"
 
 namespace paddle {
 namespace operators {
@@ -196,10 +196,10 @@ class FusionSeqExpandConcatFCOpKernel : public framework::OpKernel<T> {
     std::function<void(const int, const T*, T*)> fc_act;
     auto& fc_act_str = ctx.Attr<std::string>("fc_activation");
     if (platform::MayIUse(platform::avx)) {
-      math::VecActivations<T, platform::avx> act_functor;
+      phi::funcs::VecActivations<T, platform::avx> act_functor;
       fc_act = act_functor(fc_act_str);
     } else {
-      math::VecActivations<T, platform::isa_any> act_functor;
+      phi::funcs::VecActivations<T, platform::isa_any> act_functor;
       fc_act = act_functor(fc_act_str);
     }
 
diff --git a/paddle/fluid/operators/gather_nd_op.cc b/paddle/fluid/operators/gather_nd_op.cc
index 8da900d84f9bcedd5e4b318837fe1bb29697a6be..7d7d6ae81a0935402f94cbc16e31fbba8009ce9c 100644
--- a/paddle/fluid/operators/gather_nd_op.cc
+++ b/paddle/fluid/operators/gather_nd_op.cc
@@ -12,11 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/gather_nd_op.h"
-#include <memory>
-#include <string>
-#include <vector>
-#include "paddle/phi/core/ddim.h"
+#include "paddle/fluid/framework/infershape_utils.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/phi/infermeta/backward.h"
+#include "paddle/phi/infermeta/binary.h"
 
 namespace paddle {
 namespace operators {
@@ -25,48 +24,10 @@ class GatherNdOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true,
-                      platform::errors::InvalidArgument(
-                          "Input(X) of GatherNdOp should not be null."));
-    PADDLE_ENFORCE_EQ(ctx->HasInput("Index"), true,
-                      platform::errors::InvalidArgument(
-                          "Input(Index) of GatherNdOp should not be null."));
-    PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true,
-                      platform::errors::InvalidArgument(
-                          "Output(Out) of GatherNdOp should not be null."));
-
-    auto x_dims = ctx->GetInputDim("X");
-    auto x_dims_size = x_dims.size();
-    auto index_dims = ctx->GetInputDim("Index");
-    auto index_dims_size = index_dims.size();
-
-    PADDLE_ENFORCE_LE(
-        index_dims[index_dims_size - 1], x_dims_size,
-        platform::errors::InvalidArgument(
-            "Input(Index).shape[-1] should be no greater than Input(X).rank"));
-    PADDLE_ENFORCE_GE(index_dims_size, 1UL,
-                      platform::errors::InvalidArgument(
-                          "The rank of Input(Index) should be greater than 1"));
-
-    std::vector<int64_t> result_dims;
-    // The result dims is
-    //   Index.shape[:-1] + X.shape[Index.shape[-1]:]
-    for (int i = 0; i < index_dims_size - 1; ++i) {
-      result_dims.emplace_back(index_dims[i]);
-    }
-    for (int i = index_dims[index_dims_size - 1]; i < x_dims_size; ++i) {
-      result_dims.emplace_back(x_dims[i]);
-    }
-
-    ctx->SetOutputDim("Out", phi::make_ddim(result_dims));
-    ctx->ShareLoD("X", /*->*/ "Out");
-  }
-
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    auto* x = ctx.Input<Tensor>("X");
+    auto* x = ctx.Input<framework::Tensor>("X");
     const auto& x_type = OperatorWithKernel::IndicateVarDataType(ctx, "X");
     return framework::OpKernelType(
         x_type,
@@ -80,11 +41,6 @@ class GatherNdGradOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
-    ctx->ShareLoD("X", /*-->*/ framework::GradVarName("X"));
-  }
-
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
@@ -173,23 +129,17 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(GatherNdGradNoNeedBufferVarInferer, "X");
 
 namespace ops = paddle::operators;
 
+DECLARE_INFER_SHAPE_FUNCTOR(gather_nd, GatherNdInferShapeFunctor,
+                            PD_INFER_META(phi::GatherNdInferMeta));
+
+DECLARE_INFER_SHAPE_FUNCTOR(gather_nd_grad, GatherNdGradInferShapeFunctor,
+                            PD_INFER_META(phi::GatherNdGradInferMeta));
+
 REGISTER_OPERATOR(gather_nd, ops::GatherNdOp, ops::GatherNdOpMaker,
                   ops::GatherNdGradOpMaker<paddle::framework::OpDesc>,
-                  ops::GatherNdGradOpMaker<paddle::imperative::OpBase>);
+                  ops::GatherNdGradOpMaker<paddle::imperative::OpBase>,
+                  GatherNdInferShapeFunctor);
 
 REGISTER_OPERATOR(gather_nd_grad, ops::GatherNdGradOp,
-                  ops::GatherNdGradNoNeedBufferVarInferer);
-
-REGISTER_OP_CPU_KERNEL(gather_nd, ops::GatherNdOpKernel<float>,
-                       ops::GatherNdOpKernel<double>,
-                       ops::GatherNdOpKernel<int64_t>,
-                       ops::GatherNdOpKernel<int>,
-                       ops::GatherNdOpKernel<int16_t>,
-                       ops::GatherNdOpKernel<bool>,
-                       ops::GatherNdOpKernel<uint8_t>);
-
-REGISTER_OP_CPU_KERNEL(gather_nd_grad, ops::GatherNdGradOpKernel<float>,
-                       ops::GatherNdGradOpKernel<double>,
-                       ops::GatherNdGradOpKernel<int64_t>,
-                       ops::GatherNdGradOpKernel<int>,
-                       ops::GatherNdGradOpKernel<uint8_t>);
+                  ops::GatherNdGradNoNeedBufferVarInferer,
+                  GatherNdGradInferShapeFunctor);
diff --git a/paddle/fluid/operators/gather_nd_op.cu b/paddle/fluid/operators/gather_nd_op.cu
deleted file mode 100644
index 338c44116183415ab09881c470e6d34283b015ed..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/gather_nd_op.cu
+++ /dev/null
@@ -1,109 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/operators/gather_nd_op.h"
-#include "paddle/phi/kernels/funcs/gather.cu.h"
-#include "paddle/phi/kernels/funcs/scatter.cu.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T>
-class GatherNdOpCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true,
-                      platform::errors::PreconditionNotMet(
-                          "This kernel only runs on GPU device."));
-    auto *x = ctx.Input<Tensor>("X");
-    auto *index = ctx.Input<Tensor>("Index");
-    auto *output = ctx.Output<Tensor>("Out");
-
-    output->mutable_data<T>(ctx.GetPlace());
-    if (x->numel() == 0) return;
-    const auto &index_type = index->dtype();
-    bool index_type_match = index_type == phi::DataType::INT32 ||
-                            index_type == phi::DataType::INT64;
-    PADDLE_ENFORCE_EQ(
-        index_type_match, true,
-        platform::errors::InvalidArgument(
-            "Index holds the wrong type, it holds [%s], but "
-            "desires to be [%s] or [%s].",
-            index_type, phi::DataType::INT32, phi::DataType::INT64));
-    auto &dev_ctx = ctx.cuda_device_context();
-    if (index_type == phi::DataType::INT32) {
-      phi::funcs::GPUGatherNd<T, int>(dev_ctx, *x, *index, output);
-    } else if (index_type == phi::DataType::INT64) {
-      phi::funcs::GPUGatherNd<T, int64_t>(dev_ctx, *x, *index, output);
-    }
-  }
-};
-
-template <typename T>
-class GatherNdGradOpCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true,
-                      platform::errors::PreconditionNotMet(
-                          "This kernel only runs on GPU device."));
-    auto *index = ctx.Input<Tensor>("Index");
-    auto *dX = ctx.Output<Tensor>(framework::GradVarName("X"));
-    auto *dO = ctx.Input<Tensor>(framework::GradVarName("Out"));
-
-    dX->mutable_data<T>(ctx.GetPlace());
-    auto dxt = framework::EigenVector<T>::Flatten(*dX);
-    auto &place = *ctx.template device_context<platform::CUDADeviceContext>()
-                       .eigen_device();
-    dxt.device(place) = dxt.constant(static_cast<T>(0));
-    if (dO->numel() == 0) return;
-
-    const auto &index_type = index->dtype();
-    bool index_type_match = index_type == phi::DataType::INT32 ||
-                            index_type == phi::DataType::INT64;
-
-    PADDLE_ENFORCE_EQ(
-        index_type_match, true,
-        platform::errors::InvalidArgument(
-            "Index holds the wrong type, it holds [%s],"
-            "but desires to be [%s] or [%s].",
-            index_type, phi::DataType::INT32, phi::DataType::INT64));
-
-    auto &dev_ctx = ctx.cuda_device_context();
-    if (index_type == phi::DataType::INT32) {
-      phi::funcs::GPUScatterNdAdd<T, int>(dev_ctx, *dO, *index, dX);
-    } else if (index_type == phi::DataType::INT64) {
-      phi::funcs::GPUScatterNdAdd<T, int64_t>(dev_ctx, *dO, *index, dX);
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-REGISTER_OP_CUDA_KERNEL(gather_nd, ops::GatherNdOpCUDAKernel<float>,
-                        ops::GatherNdOpCUDAKernel<double>,
-                        ops::GatherNdOpCUDAKernel<int64_t>,
-                        ops::GatherNdOpCUDAKernel<int>,
-                        ops::GatherNdOpCUDAKernel<int16_t>,
-                        ops::GatherNdOpCUDAKernel<bool>,
-                        ops::GatherNdOpCUDAKernel<plat::float16>);
-
-REGISTER_OP_CUDA_KERNEL(gather_nd_grad, ops::GatherNdGradOpCUDAKernel<float>,
-                        ops::GatherNdGradOpCUDAKernel<double>,
-                        ops::GatherNdGradOpCUDAKernel<int64_t>,
-                        ops::GatherNdGradOpCUDAKernel<int>,
-                        ops::GatherNdGradOpCUDAKernel<plat::float16>);
diff --git a/paddle/fluid/operators/gather_nd_op.h b/paddle/fluid/operators/gather_nd_op.h
deleted file mode 100644
index d54261008e47b89151248a8372ede4b524d999bf..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/gather_nd_op.h
+++ /dev/null
@@ -1,97 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/phi/kernels/funcs/gather.h"
-#include "paddle/phi/kernels/funcs/scatter.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-
-template <typename T>
-class GatherNdOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    PADDLE_ENFORCE_EQ(
-        platform::is_cpu_place(ctx.GetPlace()), true,
-        platform::errors::PreconditionNotMet("This kernel only runs on CPU."));
-
-    auto *x = ctx.Input<Tensor>("X");
-    auto *index = ctx.Input<Tensor>("Index");
-    auto *output = ctx.Output<Tensor>("Out");
-
-    output->mutable_data<T>(ctx.GetPlace());
-    if (x->numel() == 0) return;
-
-    auto index_type = index->dtype();
-    bool index_type_match = index_type == phi::DataType::INT32 ||
-                            index_type == phi::DataType::INT64;
-    PADDLE_ENFORCE_EQ(
-        index_type_match, true,
-        platform::errors::InvalidArgument(
-            "Index holds the wrong type, it holds [%s],"
-            "but desires to be [%s] or [%s]",
-            index_type, phi::DataType::INT32, phi::DataType::INT64));
-    auto &dev_ctx = ctx.template device_context<phi::CPUContext>();
-    if (index_type == phi::DataType::INT32) {
-      phi::funcs::CPUGatherNd<T, int>(dev_ctx, *x, *index, output);
-    } else if (index_type == phi::DataType::INT64) {
-      phi::funcs::CPUGatherNd<T, int64_t>(dev_ctx, *x, *index, output);
-    }
-  }
-};
-
-template <typename T>
-class GatherNdGradOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    PADDLE_ENFORCE_EQ(
-        platform::is_cpu_place(ctx.GetPlace()), true,
-        platform::errors::PreconditionNotMet("This kernel only runs on CPU."));
-
-    auto *index = ctx.Input<Tensor>("Index");
-    auto *dX = ctx.Output<Tensor>(framework::GradVarName("X"));
-    auto *dO = ctx.Input<Tensor>(framework::GradVarName("Out"));
-    dX->mutable_data<T>(ctx.GetPlace());
-    auto dxt = framework::EigenVector<T>::Flatten(*dX);
-    auto &place = *ctx.template device_context<platform::CPUDeviceContext>()
-                       .eigen_device();
-    dxt.device(place) = dxt.constant(static_cast<T>(0));
-    if (dO->numel() == 0) return;
-
-    auto index_type = index->dtype();
-    bool index_type_match = index_type == phi::DataType::INT32 ||
-                            index_type == phi::DataType::INT64;
-    PADDLE_ENFORCE_EQ(
-        index_type_match, true,
-        platform::errors::InvalidArgument(
-            "Index holds the wrong type, it holds [%s],"
-            "but desires to be [%s] or [%s]",
-            index_type, phi::DataType::INT32, phi::DataType::INT64));
-
-    auto &dev_ctx = ctx.template device_context<phi::CPUContext>();
-    if (index_type == phi::DataType::INT32) {
-      phi::funcs::ScatterNdAdd<T, int32_t>(dev_ctx, *dO, *index, dX);
-    } else if (index_type == phi::DataType::INT64) {
-      phi::funcs::ScatterNdAdd<T, int64_t>(dev_ctx, *dO, *index, dX);
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/gather_nd_op_npu.cc b/paddle/fluid/operators/gather_nd_op_npu.cc
index 995ab5d0ddf0fda19a163ec31a00a14985b5dbb9..c916f44b874a08a13fb967aae1f8b6a136023b31 100644
--- a/paddle/fluid/operators/gather_nd_op_npu.cc
+++ b/paddle/fluid/operators/gather_nd_op_npu.cc
@@ -12,8 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/gather_nd_op.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/platform/device/npu/npu_op_runner.h"
+#include "paddle/fluid/platform/device_context.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/gather_nd_op_xpu.cc b/paddle/fluid/operators/gather_nd_op_xpu.cc
index 9f4c522bd145bedd09fd746781cef5efec15c139..d4cb799e825b640a2a4e0a464e18d63c5e5ed516 100644
--- a/paddle/fluid/operators/gather_nd_op_xpu.cc
+++ b/paddle/fluid/operators/gather_nd_op_xpu.cc
@@ -11,7 +11,10 @@ limitations under the License. */
 
 #ifdef PADDLE_WITH_XPU
 
-#include "paddle/fluid/operators/gather_nd_op.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/platform/device_context.h"
 
 namespace paddle {
 namespace operators {
@@ -20,9 +23,9 @@ template <typename T>
 class GatherNdXPUKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext &ctx) const override {
-    auto *x = ctx.Input<Tensor>("X");
-    auto *index = ctx.Input<Tensor>("Index");
-    auto *out = ctx.Output<Tensor>("Out");
+    auto *x = ctx.Input<framework::Tensor>("X");
+    auto *index = ctx.Input<framework::Tensor>("Index");
+    auto *out = ctx.Output<framework::Tensor>("Out");
 
     out->template mutable_data<T>(ctx.GetPlace());
     if (x->numel() == 0) return;
diff --git a/paddle/fluid/operators/gather_op.cu b/paddle/fluid/operators/gather_op.cu
index 8f1d9284c503813ef3dd9688891048a5bca57b29..e0db2f26d3e0534f924cc709b98689fb3f1a5cc6 100644
--- a/paddle/fluid/operators/gather_op.cu
+++ b/paddle/fluid/operators/gather_op.cu
@@ -45,6 +45,8 @@ class GatherOpCUDAKernel : public framework::OpKernel<T> {
         axis = static_cast<int>(cpu_axis.data<int32_t>()[0]);
       } else if (axis_type == framework::proto::VarType::INT64) {
         axis = static_cast<int>(cpu_axis.data<int64_t>()[0]);
+      } else if (axis_type == framework::proto::VarType::INT16) {
+        axis = static_cast<int>(cpu_axis.data<int16_t>()[0]);
       }
     }
     const auto &place = ctx.GetPlace();
@@ -57,6 +59,9 @@ class GatherOpCUDAKernel : public framework::OpKernel<T> {
       } else if (index_type == framework::proto::VarType::INT64) {
         phi::funcs::GatherV2CUDAFunction<T, int64_t>(x, index, axis, output,
                                                      dev_ctx);
+      } else if (index_type == framework::proto::VarType::INT16) {
+        phi::funcs::GatherV2CUDAFunction<T, int16_t>(x, index, axis, output,
+                                                     dev_ctx);
       }
       return;
     }
@@ -67,6 +72,8 @@ class GatherOpCUDAKernel : public framework::OpKernel<T> {
       phi::funcs::GPUGather<T, int>(dev_ctx, *x, *index, output);
     } else if (index_type == framework::proto::VarType::INT64) {
       phi::funcs::GPUGather<T, int64_t>(dev_ctx, *x, *index, output);
+    } else if (index_type == framework::proto::VarType::INT16) {
+      phi::funcs::GPUGather<T, int16_t>(dev_ctx, *x, *index, output);
     }
   }
 };
@@ -134,6 +141,7 @@ REGISTER_OP_CUDA_KERNEL(gather, ops::GatherOpCUDAKernel<float>,
                         ops::GatherOpCUDAKernel<double>,
                         ops::GatherOpCUDAKernel<int64_t>,
                         ops::GatherOpCUDAKernel<int>,
+                        ops::GatherOpCUDAKernel<int16_t>,
                         ops::GatherOpCUDAKernel<plat::float16>,
                         ops::GatherOpCUDAKernel<plat::bfloat16>);
 REGISTER_OP_CUDA_KERNEL(gather_grad, ops::GatherGradOpCUDAKernel<float>,
diff --git a/paddle/fluid/operators/gather_op_npu.cc b/paddle/fluid/operators/gather_op_npu.cc
index a83abb245224baf837296aa6be8f6ceb96ac700c..21093f585b59eea24a231b4dcdf264dc16178fbd 100644
--- a/paddle/fluid/operators/gather_op_npu.cc
+++ b/paddle/fluid/operators/gather_op_npu.cc
@@ -17,7 +17,6 @@ limitations under the License. */
 #include <string>
 #include <vector>
 #include "paddle/fluid/framework/tensor_util.h"
-#include "paddle/fluid/operators/kron_op.h"
 #include "paddle/fluid/platform/device/npu/npu_info.h"
 #include "paddle/fluid/platform/device/npu/npu_op_runner.h"
 
diff --git a/paddle/fluid/operators/gather_tree_op.cc b/paddle/fluid/operators/gather_tree_op.cc
index 2868c3697eda19ed3e7cc1fb4c74e9beeaca9c0d..c84e94f5c71277c4fe8f25b73b266169f0d0877a 100644
--- a/paddle/fluid/operators/gather_tree_op.cc
+++ b/paddle/fluid/operators/gather_tree_op.cc
@@ -12,7 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/phi/core/infermeta_utils.h"
+#include "paddle/phi/infermeta/binary.h"
 
 namespace paddle {
 namespace operators {
@@ -21,20 +24,6 @@ class GatherTreeOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    OP_INOUT_CHECK(ctx->HasInput("Ids"), "Input", "Ids", "GatherTree");
-    OP_INOUT_CHECK(ctx->HasInput("Parents"), "Input", "Parents", "GatherTree");
-    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "GatherTree");
-
-    auto ids_dims = ctx->GetInputDim("Ids");
-    auto parents_dims = ctx->GetInputDim("Parents");
-    PADDLE_ENFORCE_EQ(ids_dims == parents_dims, true,
-                      platform::errors::InvalidArgument(
-                          "The shape of Input(Parents) must be same with the "
-                          "shape of Input(Ids)."));
-    ctx->SetOutputDim("Out", ids_dims);
-  }
-
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
@@ -72,4 +61,8 @@ selected ids.
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OPERATOR(gather_tree, ops::GatherTreeOp, ops::GatherTreeOpMaker);
+DECLARE_INFER_SHAPE_FUNCTOR(gather_tree, GatherTreeInferShapeFunctor,
+                            PD_INFER_META(phi::GatherTreeMeta));
+
+REGISTER_OPERATOR(gather_tree, ops::GatherTreeOp, ops::GatherTreeOpMaker,
+                  GatherTreeInferShapeFunctor);
diff --git a/paddle/fluid/operators/gaussian_random_op.cc b/paddle/fluid/operators/gaussian_random_op.cc
index 774ff0bd065995916562061784f5218336a9da93..66eecc13d04d1aa7d4532b69f7a2fbe8c62b8e6f 100644
--- a/paddle/fluid/operators/gaussian_random_op.cc
+++ b/paddle/fluid/operators/gaussian_random_op.cc
@@ -15,38 +15,19 @@ limitations under the License. */
 #include <random>
 
 #include "paddle/fluid/framework/generator.h"
+#include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/operators/fill_constant_op.h"
 #ifdef PADDLE_WITH_MKLDNN
 #include "paddle/fluid/platform/mkldnn_helper.h"
 #endif
+#include "paddle/phi/infermeta/nullary.h"
 
 namespace paddle {
 namespace operators {
 
 using Tensor = framework::Tensor;
-template <typename T>
-class CPUGaussianRandomKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    float mean = context.Attr<float>("mean");
-    float std = context.Attr<float>("std");
-    auto* tensor = context.Output<framework::Tensor>("Out");
-
-    std::normal_distribution<T> dist(mean, std);
-    auto shape = GetShape(context);
-    tensor->Resize(shape);
-    int64_t size = tensor->numel();
-    T* data = tensor->mutable_data<T>(context.GetPlace());
-    unsigned int seed = static_cast<unsigned int>(context.Attr<int>("seed"));
-    auto engine = framework::GetCPURandomEngine(seed);
-
-    for (int64_t i = 0; i < size; ++i) {
-      data[i] = dist(*engine);
-    }
-  }
-};  // namespace operators
 
 template <typename T>
 class CPUGaussianRandomBatchSizeLikeKernel : public framework::OpKernel<T> {
@@ -75,38 +56,6 @@ class GaussianRandomOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "GaussianRandom");
-
-    auto shape = ctx->Attrs().Get<std::vector<int64_t>>("shape");
-    std::vector<int64_t> temp;
-    temp.reserve(shape.size());
-    for (auto dim : shape) {
-      temp.push_back(static_cast<int64_t>(dim));
-    }
-    if (shape.empty() && ctx->HasInput("ShapeTensor")) {
-      auto shape_dims = ctx->GetInputDim("ShapeTensor");
-      int num_ele = 1;
-      for (int i = 0; i < shape_dims.size(); ++i) {
-        num_ele *= shape_dims[i];
-      }
-      auto vec_dims = std::vector<int>(num_ele, -1);
-      ctx->SetOutputDim("Out", phi::make_ddim(vec_dims));
-
-      return;
-    }
-    if (!ctx->HasInput("ShapeTensor") && !ctx->HasInputs("ShapeTensorList")) {
-      PADDLE_ENFORCE_GT(
-          shape.size(), 0UL,
-          platform::errors::InvalidArgument(
-              "Attribute(shape) of GaussianRandomOp must be set "
-              "and shape.size() > 0, but reveived shape.size() is %d",
-              shape.size()));
-    }
-
-    ctx->SetOutputDim("Out", phi::make_ddim(temp));
-  }
-
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
@@ -192,13 +141,20 @@ Used to initialize tensors with gaussian random generator.
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP_WITHOUT_GRADIENT(gaussian_random, ops::GaussianRandomOp,
-                             ops::GaussianRandomOpMaker);
-REGISTER_OP_CPU_KERNEL(gaussian_random, ops::CPUGaussianRandomKernel<float>,
-                       ops::CPUGaussianRandomKernel<double>);
+
+DECLARE_INFER_SHAPE_FUNCTOR(gaussian_random, GaussianRandomInferShapeFunctor,
+                            PD_INFER_META(phi::GaussianRandomInferMeta));
+
+REGISTER_OPERATOR(
+    gaussian_random, ops::GaussianRandomOp, ops::GaussianRandomOpMaker,
+    paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
+    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
+    GaussianRandomInferShapeFunctor);
+
 REGISTER_OP_CPU_KERNEL(gaussian_random_batch_size_like,
                        ops::CPUGaussianRandomBatchSizeLikeKernel<float>,
                        ops::CPUGaussianRandomBatchSizeLikeKernel<double>);
+
 REGISTER_OP_VERSION(gaussian_random)
     .AddCheckpoint(
         R"ROC(
diff --git a/paddle/fluid/operators/gaussian_random_op.cu b/paddle/fluid/operators/gaussian_random_op.cu
index 21d827c79200c4a368ce7677b01b18ee4ddedb8d..00ce10bfe3bccb404bce9f681ee3c7030e0fa4c4 100644
--- a/paddle/fluid/operators/gaussian_random_op.cu
+++ b/paddle/fluid/operators/gaussian_random_op.cu
@@ -19,9 +19,10 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/operators/amp/fp16_type_traits.h"
-#include "paddle/fluid/operators/distribution_helper.h"
 #include "paddle/fluid/operators/fill_constant_op.h"
-#include "paddle/fluid/operators/index_impl.cu.h"
+
+#include "paddle/phi/kernels/funcs/distribution_helper.h"
+#include "paddle/phi/kernels/funcs/index_impl.cu.h"
 
 DECLARE_bool(use_curand);
 
@@ -44,7 +45,8 @@ struct GaussianGenerator {
     thrust::minstd_rand rng;
     rng.seed(seed_);
     using MT = typename details::MPTypeTrait<T>::Type;
-    thrust::normal_distribution<MT> dist(mean_, std_);
+    thrust::normal_distribution<MT> dist(static_cast<MT>(mean_),
+                                         static_cast<MT>(std_));
     unsigned int new_n = n + offset_;
     rng.discard(new_n);
     MT out = dist(rng);
@@ -52,53 +54,6 @@ struct GaussianGenerator {
   }
 };
 
-template <typename T>
-class GPUGaussianRandomKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* tensor = context.Output<framework::Tensor>("Out");
-    unsigned int seed = static_cast<unsigned int>(context.Attr<int>("seed"));
-    bool seed_flag = false;
-    if (seed == 0) {
-      std::random_device rd;
-      seed = rd();
-      seed_flag = true;
-    }
-    T mean = static_cast<T>(context.Attr<float>("mean"));
-    T std = static_cast<T>(context.Attr<float>("std"));
-    auto shape = GetShape(context);
-    tensor->Resize(shape);
-
-    auto& dev_cxt =
-        context.template device_context<platform::CUDADeviceContext>();
-    T* data = tensor->mutable_data<T>(dev_cxt.GetPlace());
-
-    int64_t size = tensor->numel();
-
-    int device_id = context.GetPlace().GetDeviceId();
-    auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id);
-
-    if (gen_cuda->GetIsInitPy() && seed_flag) {
-      if (FLAGS_use_curand) {
-        using MT = typename details::MPTypeTrait<T>::Type;
-        distribution::normal_distribution<MT> dist;
-        distribution::normal_transform<MT> trans(mean, std);
-        distribution::distribution_and_transform<T>(dev_cxt, tensor, dist,
-                                                    trans);
-      } else {
-        auto seed_offset = gen_cuda->IncrementOffset(1);
-        int64_t gen_offset = size * seed_offset.second;
-        auto func =
-            GaussianGenerator<T>(mean, std, seed_offset.first, gen_offset);
-        IndexKernel<T, GaussianGenerator<T>>(dev_cxt, tensor, func);
-      }
-    } else {
-      auto func = GaussianGenerator<T>(mean, std, seed);
-      IndexKernel<T, GaussianGenerator<T>>(dev_cxt, tensor, func);
-    }
-  }
-};
-
 template <typename T>
 class GPUGaussianRandomBatchSizeLikeKernel : public framework::OpKernel<T> {
  public:
@@ -126,21 +81,16 @@ class GPUGaussianRandomBatchSizeLikeKernel : public framework::OpKernel<T> {
       int64_t gen_offset = size * seed_offset.second;
       auto func = GaussianGenerator<T>(mean, std, seed_offset.first,
                                        seed_offset.second);
-      IndexKernel<T, GaussianGenerator<T>>(dev_cxt, tensor, func);
+      phi::IndexKernel<T, GaussianGenerator<T>>(dev_cxt, tensor, func);
     } else {
       auto func = GaussianGenerator<T>(mean, std, seed);
-      IndexKernel<T, GaussianGenerator<T>>(dev_cxt, tensor, func);
+      phi::IndexKernel<T, GaussianGenerator<T>>(dev_cxt, tensor, func);
     }
   }
 };
 }  // namespace operators
 }  // namespace paddle
 
-REGISTER_OP_CUDA_KERNEL(
-    gaussian_random,
-    paddle::operators::GPUGaussianRandomKernel<paddle::platform::float16>,
-    paddle::operators::GPUGaussianRandomKernel<float>,
-    paddle::operators::GPUGaussianRandomKernel<double>);
 REGISTER_OP_CUDA_KERNEL(
     gaussian_random_batch_size_like,
     paddle::operators::GPUGaussianRandomBatchSizeLikeKernel<
diff --git a/paddle/fluid/operators/gelu_op.cu b/paddle/fluid/operators/gelu_op.cu
index 6b778eee4345170a0288bc5741c6c1078615022f..ef836ab72f001a540e081d7e9975ca5ee28758be 100644
--- a/paddle/fluid/operators/gelu_op.cu
+++ b/paddle/fluid/operators/gelu_op.cu
@@ -58,7 +58,7 @@ static __global__ void FP16FastGeluFwdCUDAKernel(const __half* x, __half* y,
       static_cast<size_t>(threadIdx.x + blockIdx.x * blockDim.x) * VecSize;
   size_t stride = static_cast<size_t>(blockDim.x * gridDim.x) * VecSize;
   for (; offset < n; offset += stride) {
-    using ArrT = platform::AlignedVector<__half, VecSize>;
+    using ArrT = phi::AlignedVector<__half, VecSize>;
     ArrT in_arr = *reinterpret_cast<const ArrT*>(x + offset);
 #pragma unroll
     for (int i = 0; i < VecSize; ++i) {
@@ -77,7 +77,7 @@ static __global__ void FP16FastGeluBwdCUDAKernel(const __half* x,
       static_cast<size_t>(threadIdx.x + blockIdx.x * blockDim.x) * VecSize;
   size_t stride = static_cast<size_t>(blockDim.x * gridDim.x) * VecSize;
   for (; offset < n; offset += stride) {
-    using ArrT = platform::AlignedVector<__half, VecSize>;
+    using ArrT = phi::AlignedVector<__half, VecSize>;
     ArrT x_in_arr = *reinterpret_cast<const ArrT*>(x + offset);
     ArrT y_g_in_arr = *reinterpret_cast<const ArrT*>(y_g + offset);
 #pragma unroll
@@ -103,7 +103,7 @@ static bool TryLaunchFP16FastGeluFwdVectorizeCUDAKernel(
 #define PD_LAUNCH_FP16_FAST_GELU_FWD_KERNEL(__vec_size, __use_fast_math)      \
   do {                                                                        \
     constexpr auto kAlignment =                                               \
-        alignof(platform::AlignedVector<__half, __vec_size>);                 \
+        alignof(phi::AlignedVector<__half, __vec_size>);                      \
     if (n % __vec_size == 0 && is_aligned(x, kAlignment) &&                   \
         is_aligned(y, kAlignment)) {                                          \
       size_t thread = std::min<size_t>(512, dev_ctx.GetMaxThreadsPerBlock()); \
@@ -138,7 +138,7 @@ static bool TryLaunchFP16FastGeluBwdVectorizeCUDAKernel(
 #define PD_LAUNCH_FP16_FAST_GELU_BWD_KERNEL(__vec_size, __use_fast_math)      \
   do {                                                                        \
     constexpr auto kAlignment =                                               \
-        alignof(platform::AlignedVector<__half, __vec_size>);                 \
+        alignof(phi::AlignedVector<__half, __vec_size>);                      \
     if (n % __vec_size == 0 && is_aligned(x, kAlignment) &&                   \
         is_aligned(x, kAlignment) && is_aligned(y_g, kAlignment) &&           \
         is_aligned(x_g, kAlignment)) {                                        \
diff --git a/paddle/fluid/operators/gelu_op_npu_test.cc b/paddle/fluid/operators/gelu_op_npu_test.cc
index 00ff7ad2166dcf99d7b60ec45adfe70b478dedf8..f3ac53138328dbfad12c6d530a6517f40c658677 100644
--- a/paddle/fluid/operators/gelu_op_npu_test.cc
+++ b/paddle/fluid/operators/gelu_op_npu_test.cc
@@ -24,7 +24,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/operators/dropout_op.h"
 #include "paddle/fluid/string/printf.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
diff --git a/paddle/fluid/operators/graph_send_recv_op.cc b/paddle/fluid/operators/graph_send_recv_op.cc
index 6af8388d9eba4e4ea8fbb833f84a5c06e182b1f2..f7c006dbcb1a9a23ec619c8d790df8a093530eee 100644
--- a/paddle/fluid/operators/graph_send_recv_op.cc
+++ b/paddle/fluid/operators/graph_send_recv_op.cc
@@ -12,7 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/graph_send_recv_op.h"
+#include "paddle/fluid/framework/infershape_utils.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/phi/core/infermeta_utils.h"
+#include "paddle/phi/infermeta/ternary.h"
 
 namespace paddle {
 namespace operators {
@@ -21,59 +24,6 @@ class GraphSendRecvOP : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "GraphSendRecv");
-    OP_INOUT_CHECK(ctx->HasInput("Src_index"), "Input", "Src_index",
-                   "GraphSendRecv");
-    OP_INOUT_CHECK(ctx->HasInput("Dst_index"), "Input", "Dst_index",
-                   "GraphSendRecv");
-    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "GraphSendRecv");
-
-    auto src_index_dims = ctx->GetInputDim("Src_index");
-    if (src_index_dims.size() == 2) {
-      PADDLE_ENFORCE_EQ(src_index_dims[1], 1,
-                        platform::errors::InvalidArgument(
-                            "The last dim of Src_index should be 1 when it "
-                            "is 2D, but we get %d",
-                            src_index_dims[1]));
-    } else {
-      PADDLE_ENFORCE_EQ(
-          src_index_dims.size(), 1,
-          platform::errors::InvalidArgument(
-              "The Src_index should be 1D, when it is not 2D, but we get %d",
-              src_index_dims.size()));
-    }
-
-    auto dst_index_dims = ctx->GetInputDim("Dst_index");
-    if (dst_index_dims.size() == 2) {
-      PADDLE_ENFORCE_EQ(dst_index_dims[1], 1,
-                        platform::errors::InvalidArgument(
-                            "The last dim of Dst_index should be 1 when it "
-                            "is 2D, but we get %d",
-                            dst_index_dims[1]));
-    } else {
-      PADDLE_ENFORCE_EQ(
-          dst_index_dims.size(), 1,
-          platform::errors::InvalidArgument("The Dst_index should be 1D, "
-                                            "when it is not 2D, but we get %d",
-                                            dst_index_dims.size()));
-    }
-
-    PADDLE_ENFORCE_EQ(
-        src_index_dims[0], dst_index_dims[0],
-        platform::errors::InvalidArgument(
-            "Src_index and Dst_index should have the same shape."));
-
-    auto dims = ctx->GetInputDim("X");
-    ctx->SetOutputDim("Out", dims);
-
-    if (ctx->Attrs().Get<std::string>("pool_type") == "MEAN") {
-      OP_INOUT_CHECK(ctx->HasOutput("Dst_count"), "Output", "Dst_count",
-                     "GraphSendRecv");
-      ctx->SetOutputDim("Dst_count", {dims[0]});
-    }
-  }
-
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
@@ -164,20 +114,12 @@ class GraphSendRecvGradOpMaker : public framework::SingleGradOpMaker<T> {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-using CPU = paddle::platform::CPUDeviceContext;
 
+DECLARE_INFER_SHAPE_FUNCTOR(graph_send_recv, GraphSendRecvInferShapeFunctor,
+                            PD_INFER_META(phi::GraphSendRecvInferMeta));
 REGISTER_OPERATOR(graph_send_recv, ops::GraphSendRecvOP,
                   ops::GraphSendRecvOpMaker,
                   ops::GraphSendRecvGradOpMaker<paddle::framework::OpDesc>,
-                  ops::GraphSendRecvGradOpMaker<paddle::imperative::OpBase>);
+                  ops::GraphSendRecvGradOpMaker<paddle::imperative::OpBase>,
+                  GraphSendRecvInferShapeFunctor);
 REGISTER_OPERATOR(graph_send_recv_grad, ops::GraphSendRecvGradOp);
-REGISTER_OP_CPU_KERNEL(graph_send_recv, ops::GraphSendRecvOpKernel<CPU, float>,
-                       ops::GraphSendRecvOpKernel<CPU, double>,
-                       ops::GraphSendRecvOpKernel<CPU, int>,
-                       ops::GraphSendRecvOpKernel<CPU, int64_t>);
-
-REGISTER_OP_CPU_KERNEL(graph_send_recv_grad,
-                       ops::GraphSendRecvGradOpKernel<CPU, float>,
-                       ops::GraphSendRecvGradOpKernel<CPU, double>,
-                       ops::GraphSendRecvGradOpKernel<CPU, int>,
-                       ops::GraphSendRecvGradOpKernel<CPU, int64_t>);
diff --git a/paddle/fluid/operators/graph_send_recv_op.cu b/paddle/fluid/operators/graph_send_recv_op.cu
deleted file mode 100644
index f43d31814ac38430d2d473eeca548b63e1a5c1fa..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/graph_send_recv_op.cu
+++ /dev/null
@@ -1,419 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <thrust/device_vector.h>
-#include <thrust/fill.h>
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/operators/graph_send_recv_op.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
-#include "paddle/fluid/platform/place.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-
-template <typename T, typename IndexT>
-struct GraphSendRecvSumCUDAFunctor {
-  DEVICE inline void operator()(const T* params, T* output, const IndexT& in_i,
-                                const IndexT& out_i) {
-    paddle::platform::CudaAtomicAdd(output + out_i, *(params + in_i));
-  }
-};
-
-template <typename T, typename IndexT>
-struct GraphSendRecvMaxCUDAFunctor {
-  DEVICE inline void operator()(const T* params, T* output, const IndexT& in_i,
-                                const IndexT& out_i) {
-    paddle::platform::CudaAtomicMax(output + out_i, *(params + in_i));
-  }
-};
-
-template <typename T, typename IndexT>
-struct GraphSendRecvMinCUDAFunctor {
-  DEVICE inline void operator()(const T* params, T* output, const IndexT& in_i,
-                                const IndexT& out_i) {
-    paddle::platform::CudaAtomicMin(output + out_i, *(params + in_i));
-  }
-};
-
-template <typename T, typename IndexT, typename Functor>
-__global__ void GraphSendRecvCUDAKernel(const T* params,
-                                        const IndexT* src_indices,
-                                        const IndexT* dst_indices, T* output,
-                                        size_t index_size, size_t slice_size,
-                                        Functor functor) {
-  CUDA_KERNEL_LOOP_TYPE(i, index_size * slice_size, int64_t) {
-    int64_t indices_i = i / slice_size;
-    int64_t slice_i = i - indices_i * slice_size;
-    IndexT src_i = src_indices[indices_i];
-    IndexT dst_i = dst_indices[indices_i];
-    int64_t in_i = src_i * slice_size + slice_i;
-    int64_t out_i = dst_i * slice_size + slice_i;
-    functor(params, output, in_i, out_i);
-  }
-}
-
-// For max
-template <typename T>
-__global__ void InputResetMaxCUDAKernel(T* output, size_t input_size,
-                                        size_t slice_size) {
-  CUDA_KERNEL_LOOP_TYPE(i, input_size * slice_size, int64_t) {
-    if (*(output + i) == std::numeric_limits<T>::min()) {
-      *(output + i) = 0;
-    }
-  }
-}
-
-// For min
-template <typename T>
-__global__ void InputResetMinCUDAKernel(T* output, size_t input_size,
-                                        size_t slice_size) {
-  CUDA_KERNEL_LOOP_TYPE(i, input_size * slice_size, int64_t) {
-    if (*(output + i) == std::numeric_limits<T>::max()) {
-      *(output + i) = 0;
-    }
-  }
-}
-
-// Get dst_count
-template <typename T, typename IndexT>
-__global__ void ComputeCountCUDAKernel(int* count, const IndexT* dst_indices,
-                                       size_t index_size) {
-  CUDA_KERNEL_LOOP_TYPE(i, index_size, int64_t) {
-    IndexT dst_i = dst_indices[i];
-    paddle::platform::CudaAtomicAdd(count + dst_i, 1);
-  }
-}
-
-// For forward mean
-template <typename T>
-__global__ void ManipulateMeanCUDAKernel(T* output, int* count,
-                                         size_t input_size, size_t slice_size) {
-  CUDA_KERNEL_LOOP_TYPE(i, input_size * slice_size, int64_t) {
-    int64_t c_index = i / slice_size;
-    if (*(count + c_index) > 1) {
-      *(output + i) = *(output + i) / *(count + c_index);
-    }
-  }
-}
-
-// For backward mean
-template <typename T, typename IndexT>
-__global__ void ManipulateMeanGradCUDAKernel(
-    const T* params, const IndexT* src_indices, const IndexT* dst_indices,
-    T* output, size_t index_size, size_t slice_size, const int* dst_count) {
-  CUDA_KERNEL_LOOP_TYPE(i, index_size * slice_size, int64_t) {
-    int64_t indices_i = i / slice_size;
-    int64_t slice_i = i - indices_i * slice_size;
-    IndexT src_i = src_indices[indices_i];
-    IndexT dst_i = dst_indices[indices_i];
-    int64_t in_i = src_i * slice_size + slice_i;
-    int64_t out_i = dst_i * slice_size + slice_i;
-    paddle::platform::CudaAtomicAdd(output + out_i,
-                                    *(params + in_i) / dst_count[src_i]);
-  }
-}
-
-// For backward min and max
-template <typename T, typename IndexT>
-__global__ void ManipulateMinMaxGradCUDAKernel(
-    const T* params, const IndexT* src_indices, const IndexT* dst_indices,
-    T* output, size_t index_size, size_t slice_size, const T* ptr_input,
-    const T* ptr_output) {
-  CUDA_KERNEL_LOOP_TYPE(i, index_size * slice_size, int64_t) {
-    int64_t indices_i = i / slice_size;
-    int64_t slice_i = i - indices_i * slice_size;
-    IndexT src_i = src_indices[indices_i];
-    IndexT dst_i = dst_indices[indices_i];
-    int64_t in_i = src_i * slice_size + slice_i;
-    int64_t out_i = dst_i * slice_size + slice_i;
-    paddle::platform::CudaAtomicAdd(
-        output + out_i,
-        *(params + in_i) * (*(ptr_input + out_i) == *(ptr_output + in_i)));
-  }
-}
-
-template <typename DeviceContext, typename T, typename IndexT>
-void GraphSendRecvOpCUDAKernelLaunchHelper(
-    const framework::ExecutionContext& ctx, const Tensor& src_index,
-    const Tensor& dst_index) {
-  auto* X = ctx.Input<Tensor>("X");
-  auto* Y = ctx.Output<Tensor>("Out");
-  std::string pool_type = ctx.Attr<std::string>("pool_type");
-
-  const int& index_size = src_index.dims()[0];
-
-  T* p_output = Y->mutable_data<T>(ctx.GetPlace());
-  const auto& src_dims = X->dims();
-  int64_t memset_size = 1;
-  for (int i = 0; i < src_dims.size(); ++i) {
-    memset_size *= src_dims[i];
-  }
-  const size_t& memset_bytes = memset_size * sizeof(T);
-  if (pool_type == "SUM" || pool_type == "MEAN") {
-#ifdef PADDLE_WITH_HIP
-    hipMemset(p_output, 0, memset_bytes);
-#else
-    cudaMemset(p_output, 0, memset_bytes);
-#endif
-  } else if (pool_type == "MAX") {
-    thrust::device_ptr<T> p_output_ptr(p_output);
-    thrust::fill(thrust::device, p_output_ptr, p_output_ptr + memset_size,
-                 std::numeric_limits<T>::min());
-  } else if (pool_type == "MIN") {
-    thrust::device_ptr<T> p_output_ptr(p_output);
-    thrust::fill(thrust::device, p_output_ptr, p_output_ptr + memset_size,
-                 std::numeric_limits<T>::max());
-  }
-
-  if (index_size == 0) return;
-
-  int64_t slice_size = 1;
-  for (int i = 1; i < src_dims.size(); ++i) {
-    slice_size *= src_dims[i];
-  }
-  const T* p_src = X->data<T>();
-  const IndexT* s_index = src_index.data<IndexT>();
-  const IndexT* d_index = dst_index.data<IndexT>();
-
-#ifdef PADDLE_WITH_HIP
-  int block = 256;
-#else
-  int block = 1024;
-#endif
-  int64_t n = slice_size * index_size;
-  const auto& dev_ctx = ctx.cuda_device_context();
-  int64_t max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize()[0];
-  int64_t grid_tmp = (n + block - 1) / block;
-  int64_t grid = grid_tmp < max_grid_dimx ? grid_tmp : max_grid_dimx;
-  int64_t input_size = src_dims[0];
-  if (pool_type == "SUM") {
-    GraphSendRecvSumCUDAFunctor<T, IndexT> functor;
-    GraphSendRecvCUDAKernel<T, IndexT,
-                            GraphSendRecvSumCUDAFunctor<T, IndexT>><<<
-        grid, block, 0, reinterpret_cast<const platform::CUDADeviceContext&>(
-                            ctx.device_context())
-                            .stream()>>>(p_src, s_index, d_index, p_output,
-                                         index_size, slice_size, functor);
-  } else if (pool_type == "MAX") {
-    GraphSendRecvMaxCUDAFunctor<T, IndexT> functor;
-    GraphSendRecvCUDAKernel<T, IndexT,
-                            GraphSendRecvMaxCUDAFunctor<T, IndexT>><<<
-        grid, block, 0, reinterpret_cast<const platform::CUDADeviceContext&>(
-                            ctx.device_context())
-                            .stream()>>>(p_src, s_index, d_index, p_output,
-                                         index_size, slice_size, functor);
-
-    int64_t grid_max_tmp = (input_size * slice_size + block - 1) / block;
-    int64_t grid_max =
-        grid_max_tmp < max_grid_dimx ? grid_max_tmp : max_grid_dimx;
-    InputResetMaxCUDAKernel<
-        T><<<grid_max, block, 0,
-             reinterpret_cast<const platform::CUDADeviceContext&>(
-                 ctx.device_context())
-                 .stream()>>>(p_output, input_size, slice_size);
-  } else if (pool_type == "MIN") {
-    GraphSendRecvMinCUDAFunctor<T, IndexT> functor;
-    GraphSendRecvCUDAKernel<T, IndexT,
-                            GraphSendRecvMinCUDAFunctor<T, IndexT>><<<
-        grid, block, 0, reinterpret_cast<const platform::CUDADeviceContext&>(
-                            ctx.device_context())
-                            .stream()>>>(p_src, s_index, d_index, p_output,
-                                         index_size, slice_size, functor);
-
-    int64_t grid_min_tmp = (input_size * slice_size + block - 1) / block;
-    int64_t grid_min =
-        grid_min_tmp < max_grid_dimx ? grid_min_tmp : max_grid_dimx;
-    InputResetMinCUDAKernel<
-        T><<<grid_min, block, 0,
-             reinterpret_cast<const platform::CUDADeviceContext&>(
-                 ctx.device_context())
-                 .stream()>>>(p_output, input_size, slice_size);
-  } else if (pool_type == "MEAN") {
-    GraphSendRecvSumCUDAFunctor<T, IndexT> functor;
-    GraphSendRecvCUDAKernel<T, IndexT,
-                            GraphSendRecvSumCUDAFunctor<T, IndexT>><<<
-        grid, block, 0, reinterpret_cast<const platform::CUDADeviceContext&>(
-                            ctx.device_context())
-                            .stream()>>>(p_src, s_index, d_index, p_output,
-                                         index_size, slice_size, functor);
-
-    auto* dst_count = ctx.Output<Tensor>("Dst_count");
-    int* p_dst_count = dst_count->mutable_data<int>(ctx.GetPlace());
-
-#ifdef PADDLE_WITH_HIP
-    hipMemset(p_dst_count, 0, input_size * sizeof(int));
-#else
-    cudaMemset(p_dst_count, 0, input_size * sizeof(int));
-#endif
-
-    int64_t grid_count = (index_size + block - 1) / block;
-    ComputeCountCUDAKernel<
-        T, IndexT><<<grid_count, block, 0,
-                     reinterpret_cast<const platform::CUDADeviceContext&>(
-                         ctx.device_context())
-                         .stream()>>>(p_dst_count, d_index, index_size);
-
-    int64_t grid_mean_tmp = (input_size * slice_size + block - 1) / block;
-    int64_t grid_mean =
-        grid_mean_tmp < max_grid_dimx ? grid_mean_tmp : max_grid_dimx;
-    ManipulateMeanCUDAKernel<
-        T><<<grid_mean, block, 0,
-             reinterpret_cast<const platform::CUDADeviceContext&>(
-                 ctx.device_context())
-                 .stream()>>>(p_output, p_dst_count, input_size, slice_size);
-  }
-}
-
-template <typename DeviceContext, typename T, typename IndexT>
-void GraphSendRecvGradOpCUDAKernelLaunchHelper(
-    const framework::ExecutionContext& ctx, const Tensor& src_index,
-    const Tensor& dst_index) {
-  auto* X = ctx.Input<Tensor>(framework::GradVarName("Out"));
-  auto* Y = ctx.Output<Tensor>(framework::GradVarName("X"));
-  std::string pool_type = ctx.Attr<std::string>("pool_type");
-
-  const int& index_size = src_index.dims()[0];
-
-  T* p_output = Y->mutable_data<T>(ctx.GetPlace());
-  const auto& src_dims = X->dims();
-  int64_t memset_size = 1;
-  for (int i = 0; i < src_dims.size(); ++i) {
-    memset_size *= src_dims[i];
-  }
-  const size_t& memset_bytes = memset_size * sizeof(T);
-
-#ifdef PADDLE_WITH_HIP
-  hipMemset(p_output, 0, memset_bytes);
-#else
-  cudaMemset(p_output, 0, memset_bytes);
-#endif
-
-  if (index_size == 0) return;
-
-  int64_t slice_size = 1;
-  for (int i = 1; i < src_dims.size(); ++i) {
-    slice_size *= src_dims[i];
-  }
-  const T* p_src = X->data<T>();
-  const IndexT* s_index = src_index.data<IndexT>();
-  const IndexT* d_index = dst_index.data<IndexT>();
-
-#ifdef PADDLE_WITH_HIP
-  int block = 256;
-#else
-  int block = 1024;
-#endif
-  int64_t n = slice_size * index_size;
-  const auto& dev_ctx = ctx.cuda_device_context();
-  int64_t max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize()[0];
-  int64_t grid_tmp = (n + block - 1) / block;
-  int64_t grid = grid_tmp < max_grid_dimx ? grid_tmp : max_grid_dimx;
-  int64_t input_size = src_dims[0];
-  if (pool_type == "SUM") {
-    GraphSendRecvSumCUDAFunctor<T, IndexT> functor;
-    GraphSendRecvCUDAKernel<T, IndexT,
-                            GraphSendRecvSumCUDAFunctor<T, IndexT>><<<
-        grid, block, 0, reinterpret_cast<const platform::CUDADeviceContext&>(
-                            ctx.device_context())
-                            .stream()>>>(p_src, s_index, d_index, p_output,
-                                         index_size, slice_size, functor);
-  } else if (pool_type == "MEAN") {
-    auto* dst_count = ctx.Input<Tensor>("Dst_count");
-    const int* s_count = dst_count->data<int>();
-    ManipulateMeanGradCUDAKernel<T, IndexT><<<
-        grid, block, 0, reinterpret_cast<const platform::CUDADeviceContext&>(
-                            ctx.device_context())
-                            .stream()>>>(p_src, s_index, d_index, p_output,
-                                         index_size, slice_size, s_count);
-  } else if (pool_type == "MAX" || pool_type == "MIN") {
-    auto* input = ctx.Input<Tensor>("X");
-    auto* output = ctx.Input<Tensor>("Out");
-    const T* ptr_input = input->data<T>();
-    const T* ptr_output = output->data<T>();
-    ManipulateMinMaxGradCUDAKernel<T, IndexT><<<
-        grid, block, 0, reinterpret_cast<const platform::CUDADeviceContext&>(
-                            ctx.device_context())
-                            .stream()>>>(p_src, s_index, d_index, p_output,
-                                         index_size, slice_size, ptr_input,
-                                         ptr_output);
-  }
-}
-
-template <typename DeviceContext, typename T>
-class GraphSendRecvOpCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* src_index = ctx.Input<Tensor>("Src_index");
-    auto* dst_index = ctx.Input<Tensor>("Dst_index");
-    auto index_type = framework::TransToProtoVarType(src_index->dtype());
-
-    if (index_type == framework::proto::VarType::INT32) {
-      GraphSendRecvOpCUDAKernelLaunchHelper<DeviceContext, T, int>(
-          ctx, *src_index, *dst_index);
-    } else if (index_type == framework::proto::VarType::INT64) {
-      GraphSendRecvOpCUDAKernelLaunchHelper<DeviceContext, T, int64_t>(
-          ctx, *src_index, *dst_index);
-    } else {
-      PADDLE_THROW(platform::errors::InvalidArgument(
-          "Unsupported Src_index or Dst_index dtype, expected int, int64, but "
-          "got %s.",
-          index_type));
-    }
-  }
-};
-
-template <typename DeviceContext, typename T>
-class GraphSendRecvGradOpCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* src_index = ctx.Input<Tensor>("Dst_index");
-    auto* dst_index = ctx.Input<Tensor>("Src_index");
-    auto index_type = framework::TransToProtoVarType(src_index->dtype());
-
-    if (index_type == framework::proto::VarType::INT32) {
-      GraphSendRecvGradOpCUDAKernelLaunchHelper<DeviceContext, T, int>(
-          ctx, *src_index, *dst_index);
-    } else if (index_type == framework::proto::VarType::INT64) {
-      GraphSendRecvGradOpCUDAKernelLaunchHelper<DeviceContext, T, int64_t>(
-          ctx, *src_index, *dst_index);
-    } else {
-      PADDLE_THROW(platform::errors::InvalidArgument(
-          "Unsupported Src_index or Dst_index dtype, expected int, int64, but "
-          "got %s.",
-          index_type));
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-using CUDA = paddle::platform::CUDADeviceContext;
-namespace ops = paddle::operators;
-
-REGISTER_OP_CUDA_KERNEL(graph_send_recv,
-                        ops::GraphSendRecvOpCUDAKernel<CUDA, float>,
-                        ops::GraphSendRecvOpCUDAKernel<CUDA, double>,
-                        ops::GraphSendRecvOpCUDAKernel<CUDA, int>,
-                        ops::GraphSendRecvOpCUDAKernel<CUDA, int64_t>);
-
-REGISTER_OP_CUDA_KERNEL(graph_send_recv_grad,
-                        ops::GraphSendRecvGradOpCUDAKernel<CUDA, float>,
-                        ops::GraphSendRecvGradOpCUDAKernel<CUDA, double>,
-                        ops::GraphSendRecvGradOpCUDAKernel<CUDA, int>,
-                        ops::GraphSendRecvGradOpCUDAKernel<CUDA, int64_t>);
diff --git a/paddle/fluid/operators/graph_send_recv_op.h b/paddle/fluid/operators/graph_send_recv_op.h
deleted file mode 100644
index 8d8111e0ee845bf6828ee53459e6d86bdebba484..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/graph_send_recv_op.h
+++ /dev/null
@@ -1,291 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/platform/place.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-
-template <typename T>
-struct GraphSendRecvSumFunctor {
-  void operator()(const bool& first_flag, const Tensor& src_slice,
-                  Tensor* dst_slice) {
-    auto eigen_src = framework::EigenVector<T>::Flatten(src_slice);
-    auto eigen_dst = framework::EigenVector<T>::Flatten(*dst_slice);
-    eigen_dst += eigen_src;
-  }
-};
-
-template <typename T>
-struct GraphSendRecvMinFunctor {
-  void operator()(const bool& first_flag, const Tensor& src_slice,
-                  Tensor* dst_slice) {
-    auto eigen_src = framework::EigenVector<T>::Flatten(src_slice);
-    auto eigen_dst = framework::EigenVector<T>::Flatten(*dst_slice);
-    if (first_flag) {
-      eigen_dst += eigen_src;
-    } else {
-      eigen_dst = eigen_dst.cwiseMin(eigen_src);
-    }
-  }
-};
-
-template <typename T>
-struct GraphSendRecvMaxFunctor {
-  void operator()(const int& first_flag, const Tensor& src_slice,
-                  Tensor* dst_slice) {
-    auto eigen_src = framework::EigenVector<T>::Flatten(src_slice);
-    auto eigen_dst = framework::EigenVector<T>::Flatten(*dst_slice);
-    if (first_flag) {
-      eigen_dst += eigen_src;
-    } else {
-      eigen_dst = eigen_dst.cwiseMax(eigen_src);
-    }
-  }
-};
-
-template <typename T, typename IndexT, typename Functor>
-void elementwise_inner_operation(const Tensor& src, Tensor* dst,
-                                 const IndexT& src_index,
-                                 const IndexT& dst_index,
-                                 const bool& first_flag, Functor functor) {
-  auto src_slice = src.Slice(src_index, src_index + 1);
-  auto dst_slice = dst->Slice(dst_index, dst_index + 1);
-
-  functor(first_flag, src_slice, &dst_slice);
-}
-
-template <typename T, typename IndexT, typename Functor>
-void graph_send_recv_cpu_for_loop(const int& input_size, const int& index_size,
-                                  const IndexT* s_index, const IndexT* d_index,
-                                  const Tensor& src, Tensor* dst,
-                                  const std::string& pool_type,
-                                  int* dst_count = nullptr) {
-  Functor functor;
-  if (pool_type == "SUM") {
-    for (int i = 0; i < index_size; ++i) {
-      const IndexT& src_idx = s_index[i];
-      const IndexT& dst_idx = d_index[i];
-      elementwise_inner_operation<T, IndexT, Functor>(src, dst, src_idx,
-                                                      dst_idx, false, functor);
-    }
-  } else if (pool_type == "MEAN") {
-    for (int i = 0; i < index_size; ++i) {
-      const IndexT& src_idx = s_index[i];
-      const IndexT& dst_idx = d_index[i];
-      elementwise_inner_operation<T, IndexT, Functor>(src, dst, src_idx,
-                                                      dst_idx, false, functor);
-    }
-    for (int i = 0; i < index_size; ++i) {
-      IndexT dst_idx = d_index[i];
-      *(dst_count + dst_idx) += 1;
-    }
-    for (int i = 0; i < input_size; ++i) {
-      if (*(dst_count + i) == 0) continue;
-      auto dst_slice = dst->Slice(i, i + 1);
-      auto eigen_dst = framework::EigenVector<T>::Flatten(dst_slice);
-      eigen_dst = eigen_dst / static_cast<T>(*(dst_count + i));
-    }
-  } else if (pool_type == "MIN" || pool_type == "MAX") {
-    std::set<IndexT> existed_dst;
-    for (int i = 0; i < index_size; ++i) {
-      const IndexT& src_idx = s_index[i];
-      const IndexT& dst_idx = d_index[i];
-      bool in_set = existed_dst.find(dst_idx) != existed_dst.end();
-      if (!in_set) {
-        elementwise_inner_operation<T, IndexT, Functor>(src, dst, src_idx,
-                                                        dst_idx, true, functor);
-        existed_dst.emplace(dst_idx);
-      } else {
-        elementwise_inner_operation<T, IndexT, Functor>(
-            src, dst, src_idx, dst_idx, false, functor);
-      }
-    }
-  }
-}
-
-template <typename T, typename IndexT, typename Functor>
-void graph_send_recv_cpu_for_loop_grad(
-    const int& input_size, const int& index_size, const IndexT* s_index,
-    const IndexT* d_index, const Tensor& src, Tensor* dst,
-    const std::string& pool_type, const int* dst_count = nullptr,
-    const Tensor* input = nullptr, const Tensor* output = nullptr) {
-  if (pool_type == "SUM") {
-    Functor functor;
-    for (int i = 0; i < index_size; ++i) {
-      const IndexT& src_idx = s_index[i];
-      const IndexT& dst_idx = d_index[i];
-      elementwise_inner_operation<T, IndexT, Functor>(src, dst, src_idx,
-                                                      dst_idx, false, functor);
-    }
-  } else if (pool_type == "MEAN") {
-    for (int i = 0; i < index_size; ++i) {
-      const IndexT& src_idx = s_index[i];
-      const IndexT& dst_idx = d_index[i];
-      auto src_slice = src.Slice(src_idx, src_idx + 1);
-      auto dst_slice = dst->Slice(dst_idx, dst_idx + 1);
-      auto eigen_src = framework::EigenVector<T>::Flatten(src_slice);
-      auto eigen_dst = framework::EigenVector<T>::Flatten(dst_slice);
-      eigen_dst += (eigen_src / static_cast<T>(dst_count[src_idx]));
-    }
-  } else if (pool_type == "MIN" || pool_type == "MAX") {
-    for (int i = 0; i < index_size; ++i) {
-      const IndexT& forward_src_idx = d_index[i];
-      const IndexT& forward_dst_idx = s_index[i];
-      auto input_slice = input->Slice(forward_src_idx, forward_src_idx + 1);
-      auto output_slice = output->Slice(forward_dst_idx, forward_dst_idx + 1);
-      auto eigen_input = framework::EigenVector<T>::Flatten(input_slice);
-      auto eigen_output = framework::EigenVector<T>::Flatten(output_slice);
-
-      auto src_slice = src.Slice(forward_dst_idx, forward_dst_idx + 1);
-      auto dst_slice = dst->Slice(forward_src_idx, forward_src_idx + 1);
-      auto eigen_src = framework::EigenVector<T>::Flatten(src_slice);
-      auto eigen_dst = framework::EigenVector<T>::Flatten(dst_slice);
-      eigen_dst += eigen_src * (eigen_output == eigen_input);
-    }
-  }
-}
-
-template <typename DeviceContext, typename T, typename IndexT>
-void GraphSendRecvOpKernelLaunchHelper(const framework::ExecutionContext& ctx,
-                                       const Tensor& src_index) {
-  auto* X = ctx.Input<Tensor>("X");
-  auto* dst_index = ctx.Input<Tensor>("Dst_index");
-  auto* Y = ctx.Output<Tensor>("Out");
-
-  const int& index_size = src_index.dims()[0];
-
-  T* p_output = Y->mutable_data<T>(ctx.GetPlace());
-  const auto& src_dims = X->dims();
-  int64_t memset_size = 1;
-  for (int i = 0; i < src_dims.size(); ++i) memset_size *= src_dims[i];
-  const size_t& memset_bytes = memset_size * sizeof(T);
-  memset(p_output, 0, memset_bytes);
-
-  if (index_size == 0) return;
-
-  const IndexT* s_index = src_index.data<IndexT>();
-  const IndexT* d_index = dst_index->data<IndexT>();
-  const std::string& pool_type = ctx.Attr<std::string>("pool_type");
-  if (pool_type == "SUM") {
-    graph_send_recv_cpu_for_loop<T, IndexT, GraphSendRecvSumFunctor<T>>(
-        src_dims[0], index_size, s_index, d_index, *X, Y, pool_type);
-  } else if (pool_type == "MIN") {
-    graph_send_recv_cpu_for_loop<T, IndexT, GraphSendRecvMinFunctor<T>>(
-        src_dims[0], index_size, s_index, d_index, *X, Y, pool_type);
-  } else if (pool_type == "MAX") {
-    graph_send_recv_cpu_for_loop<T, IndexT, GraphSendRecvMaxFunctor<T>>(
-        src_dims[0], index_size, s_index, d_index, *X, Y, pool_type);
-  } else if (pool_type == "MEAN") {
-    auto* dst_count = ctx.Output<Tensor>("Dst_count");
-    int* p_dst_count = dst_count->mutable_data<int>(ctx.GetPlace());
-    memset(p_dst_count, 0, src_dims[0] * sizeof(int));
-    graph_send_recv_cpu_for_loop<T, IndexT, GraphSendRecvSumFunctor<T>>(
-        src_dims[0], index_size, s_index, d_index, *X, Y, pool_type,
-        p_dst_count);
-  }
-}
-
-template <typename DeviceContext, typename T, typename IndexT>
-void GraphSendRecvGradOpKernelLaunchHelper(
-    const framework::ExecutionContext& ctx, const Tensor& src_index) {
-  auto* X = ctx.Input<Tensor>(framework::GradVarName("Out"));
-  auto* dst_index = ctx.Input<Tensor>("Src_index");
-  auto* Y = ctx.Output<Tensor>(framework::GradVarName("X"));
-
-  const int& index_size = src_index.dims()[0];
-
-  T* p_output = Y->mutable_data<T>(ctx.GetPlace());
-  const auto& src_dims = X->dims();
-  int64_t memset_size = 1;
-  for (int i = 0; i < src_dims.size(); ++i) memset_size *= src_dims[i];
-  const size_t& memset_bytes = memset_size * sizeof(T);
-  memset(p_output, 0, memset_bytes);
-
-  if (index_size == 0) return;
-
-  const IndexT* s_index = src_index.data<IndexT>();
-  const IndexT* d_index = dst_index->data<IndexT>();
-
-  const std::string& pool_type = ctx.Attr<std::string>("pool_type");
-  if (pool_type == "SUM") {
-    graph_send_recv_cpu_for_loop_grad<T, IndexT, GraphSendRecvSumFunctor<T>>(
-        src_dims[0], index_size, s_index, d_index, *X, Y, pool_type);
-  } else if (pool_type == "MEAN") {
-    auto* dst_count = ctx.Input<Tensor>("Dst_count");
-    const int* s_count = dst_count->data<int>();
-    // Functor not used here.
-    graph_send_recv_cpu_for_loop_grad<T, IndexT, GraphSendRecvSumFunctor<T>>(
-        src_dims[0], index_size, s_index, d_index, *X, Y, pool_type, s_count);
-  } else if (pool_type == "MIN" || pool_type == "MAX") {
-    const auto* input = ctx.Input<Tensor>("X");
-    const auto* output = ctx.Input<Tensor>("Out");
-    // Functor not used here.
-    graph_send_recv_cpu_for_loop_grad<T, IndexT, GraphSendRecvMinFunctor<T>>(
-        src_dims[0], index_size, s_index, d_index, *X, Y, pool_type, nullptr,
-        input, output);
-  }
-}
-
-template <typename DeviceContext, typename T>
-class GraphSendRecvOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* src_index = ctx.Input<Tensor>("Src_index");
-    auto index_type = framework::TransToProtoVarType(src_index->dtype());
-
-    if (index_type == framework::proto::VarType::INT32) {
-      GraphSendRecvOpKernelLaunchHelper<DeviceContext, T, int>(ctx, *src_index);
-    } else if (index_type == framework::proto::VarType::INT64) {
-      GraphSendRecvOpKernelLaunchHelper<DeviceContext, T, int64_t>(ctx,
-                                                                   *src_index);
-    } else {
-      PADDLE_THROW(platform::errors::InvalidArgument(
-          "Unsupported Src_index or Dst_index type, Expected int, int64, but "
-          "got %s.",
-          index_type));
-    }
-  }
-};
-
-template <typename DeviceContext, typename T>
-class GraphSendRecvGradOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* src_index = ctx.Input<Tensor>("Dst_index");
-    auto index_type = framework::TransToProtoVarType(src_index->dtype());
-
-    if (index_type == framework::proto::VarType::INT32) {
-      GraphSendRecvGradOpKernelLaunchHelper<DeviceContext, T, int>(ctx,
-                                                                   *src_index);
-    } else if (index_type == framework::proto::VarType::INT64) {
-      GraphSendRecvGradOpKernelLaunchHelper<DeviceContext, T, int64_t>(
-          ctx, *src_index);
-    } else {
-      PADDLE_THROW(platform::errors::InvalidArgument(
-          "Unsupported Src_index or Dst_index type, Expected int, int64, but "
-          "got %s.",
-          index_type));
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/group_norm_op.cc b/paddle/fluid/operators/group_norm_op.cc
index 2d284fb516e62b08fb48ab96d2478675c495c6f6..4331523d26edc1012ff67e4a08f69d682753bb7a 100644
--- a/paddle/fluid/operators/group_norm_op.cc
+++ b/paddle/fluid/operators/group_norm_op.cc
@@ -167,9 +167,11 @@ class GroupNormGradOp : public framework::OperatorWithKernel {
 
   void InferShape(framework::InferShapeContext *ctx) const override {
     // check input
+    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "GroupNormGrad");
     OP_INOUT_CHECK(ctx->HasInput("Y"), "Input", "Y", "GroupNormGrad");
     OP_INOUT_CHECK(ctx->HasInput("Variance"), "Input", "Variance",
                    "GroupNormGrad");
+    OP_INOUT_CHECK(ctx->HasInput("Mean"), "Input", "Mean", "GroupNormGrad");
     OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Y")), "Input",
                    framework::GradVarName("Y"), "GroupNormGrad");
 
@@ -216,10 +218,12 @@ class GroupNormGradMaker : public framework::SingleGradOpMaker<T> {
 
   void Apply(GradOpPtr<T> op) const override {
     op->SetType("group_norm_grad");
+    op->SetInput("X", this->Input("X"));
     op->SetInput("Scale", this->Input("Scale"));
     op->SetInput("Bias", this->Input("Bias"));
     op->SetInput(framework::GradVarName("Y"), this->OutputGrad("Y"));
     op->SetInput("Y", this->Output("Y"));
+    op->SetInput("Mean", this->Output("Mean"));
     op->SetInput("Variance", this->Output("Variance"));
 
     op->SetOutput(framework::GradVarName("X"), this->InputGrad("X"));
diff --git a/paddle/fluid/operators/group_norm_op.cu b/paddle/fluid/operators/group_norm_op.cu
index b376334f1e93cc3be9e716d808525011edb29b94..ab8c50d90b8ece68b8e4e05d46cecd13fa84d7e0 100644
--- a/paddle/fluid/operators/group_norm_op.cu
+++ b/paddle/fluid/operators/group_norm_op.cu
@@ -81,46 +81,74 @@ __global__ void GroupNormForwardGetMeanAndVar(const T* x, int N, int C, int W,
   CudaAtomicAddWithWarp(&var[bid * groups + gid], x_var);
 }
 
-template <typename T, typename AccT, int VecSize>
-__device__ __forceinline__ void ThreadReduce(const T* input, int size,
-                                             const int offset, AccT* mean,
-                                             AccT* var) {
+template <typename T, typename AccT, int VecSize, int Num>
+__device__ __forceinline__ void ThreadReduce(phi::Array<const T*, Num> arrs,
+                                             int size, const int offset,
+                                             AccT* out_mean, AccT* out_var) {
+  const T* x = arrs[0];
+  const T* y;
+  if (Num == 2) {
+    y = arrs[1];
+  }
   using VecT = kps::details::VectorType<T, VecSize>;
   int tid = threadIdx.x;
   if (offset > 0) {
-    input -= offset;
+    x -= offset;
+    if (Num == 2) {
+      y -= offset;
+    }
     size += offset;
     if (tid >= offset) {
-      AccT temp = input[tid];
-      *mean += temp;
-      *var += temp * temp;
+      if (Num == 1) {
+        *out_mean += x[tid];
+        *out_var += x[tid] * x[tid];
+      } else if (Num == 2) {
+        *out_mean += y[tid];
+        *out_var += y[tid] * x[tid];
+      }
     }
     size -= blockDim.x;
-    input += blockDim.x;
+    x += blockDim.x;
+    if (Num == 2) {
+      y += blockDim.x;
+    }
   }
   int remain = size % (VecSize * blockDim.x);
 
-  T ins[VecSize];
-  VecT* ins_vec = reinterpret_cast<VecT*>(&ins);
+  T ins_x[VecSize];
+  T ins_y[VecSize];
+  VecT* ins_vec_x = reinterpret_cast<VecT*>(&ins_x);
+  VecT* ins_vec_y = reinterpret_cast<VecT*>(&ins_y);
 
   // vector part
   for (; VecSize * tid < (size - remain); tid += blockDim.x) {
-    *ins_vec = reinterpret_cast<const VecT*>(input)[tid];
+    *ins_vec_x = reinterpret_cast<const VecT*>(x)[tid];
+    if (Num == 2) {
+      *ins_vec_y = reinterpret_cast<const VecT*>(y)[tid];
+    }
 
 #pragma unroll
     for (int i = 0; i < VecSize; ++i) {
-      AccT temp = ins[i];
-      *mean += temp;
-      *var += temp * temp;
+      if (Num == 1) {
+        *out_mean += ins_x[i];
+        *out_var += ins_x[i] * ins_x[i];
+      } else if (Num == 2) {
+        *out_mean += ins_y[i];
+        *out_var += ins_y[i] * ins_x[i];
+      }
     }
   }
 
   // scalar part
   tid = size - remain + threadIdx.x;
   for (; tid < size; tid += blockDim.x) {
-    AccT temp = input[tid];
-    *mean += temp;
-    *var += temp * temp;
+    if (Num == 1) {
+      *out_mean += x[tid];
+      *out_var += x[tid] * x[tid];
+    } else if (Num == 2) {
+      *out_mean += y[tid];
+      *out_var += y[tid] * x[tid];
+    }
   }
 }
 
@@ -148,7 +176,10 @@ __global__ void VectorizedGetMeanAndVarNCHW(const T* x, T* mean, T* var,
   AccT x_var = static_cast<AccT>(0);
   const int input_offset = ((uint64_t)x) % ALIGN_BYTES / sizeof(T);
   x += i * size;
-  ThreadReduce<T, AccT, VecSize>(x, size, input_offset, &x_mean, &x_var);
+  phi::Array<const T*, 1> ins;
+  ins[0] = x;
+  ThreadReduce<T, AccT, VecSize, 1>(ins, size, input_offset, &x_mean, &x_var);
+
   x_mean = kps::details::BlockXReduce<AccT, kps::AddFunctor<AccT>>(
       x_mean, kps::AddFunctor<AccT>());
   x_var = kps::details::BlockXReduce<AccT, kps::AddFunctor<AccT>>(
@@ -310,10 +341,12 @@ class GroupNormKernel<platform::CUDADeviceContext, T>
 };
 
 template <typename T, int flags>
-__global__ void GroupNormBackwardGetMeanAndVar(
-    const T* x, const T* scale, const T* bias, const T* d_y, int N, int C,
-    int W, int imsize, int groups, int group_size, T epsilon, T* d_mean,
-    T* d_var, T* d_scale, T* d_bias, const DataLayout data_layout) {
+__global__ void GroupNormBackwardGetMeanAndVar(const T* x, const T* scale,
+                                               const T* bias, const T* d_y,
+                                               int N, int C, int W, int imsize,
+                                               int groups, int group_size,
+                                               T epsilon, T* d_mean, T* d_var,
+                                               T* d_scale, T* d_bias) {
   int gid = blockIdx.y;
   int cid = blockIdx.x;
   int bid = blockIdx.z;
@@ -329,15 +362,11 @@ __global__ void GroupNormBackwardGetMeanAndVar(
 
   for (int imid = threadIdx.x; imid < imsize; imid += blockDim.x) {
     T val, dval;
-    if (data_layout == DataLayout::kNCHW) {
-      val = x[(bid * C + ccid) * imsize + imid] - x_bias;
-      dval = d_y[(bid * C + ccid) * imsize + imid];
-    } else {
-      int hid = imid / W;
-      int wid = imid % W;
-      val = x[(bid * H + hid) * W * C + wid * C + ccid] - x_bias;
-      dval = d_y[(bid * H + hid) * W * C + wid * C + ccid];
-    }
+
+    int hid = imid / W;
+    int wid = imid % W;
+    val = x[(bid * H + hid) * W * C + wid * C + ccid] - x_bias;
+    dval = d_y[(bid * H + hid) * W * C + wid * C + ccid];
 
     d_var_data += val * dval;
     d_mean_data += dval * x_scale;
@@ -357,8 +386,7 @@ __global__ void GroupNormBackward(const T* x, const T* d_y, const T* scale,
                                   const T* bias, const T* var, const T* d_mean,
                                   const T* d_var, int N, int C, int W,
                                   int imsize, int groups, int group_size,
-                                  T epsilon, T* d_x,
-                                  const DataLayout data_layout) {
+                                  T epsilon, T* d_x) {
   int gid = blockIdx.y;
   int cid = blockIdx.x;
   int bid = blockIdx.z;
@@ -379,26 +407,142 @@ __global__ void GroupNormBackward(const T* x, const T* d_y, const T* scale,
   if (x_scale != 0) x_scale_inv = 1.0 / x_scale;
 
   for (int imid = threadIdx.x; imid < imsize; imid += blockDim.x) {
-    if (data_layout == DataLayout::kNCHW) {
-      T tmp = x[(bid * C + ccid) * imsize + imid];
-      T v_y = (tmp - x_bias) * x_scale_inv;
-      T dly = d_y[(bid * C + ccid) * imsize + imid];
-      d_x[(bid * C + ccid) * imsize + imid] =
-          x_var_inv *
-          (dly * x_scale - number_inv * d_x_var * v_y - number_inv * d_x_mean);
-    } else {
-      int hid = imid / W;
-      int wid = imid % W;
-      T tmp = x[(bid * H + hid) * W * C + wid * C + ccid];
-      T v_y = (tmp - x_bias) * x_scale_inv;
-      T dly = d_y[(bid * H + hid) * W * C + wid * C + ccid];
-      d_x[(bid * H + hid) * W * C + wid * C + ccid] =
-          x_var_inv *
-          (dly * x_scale - number_inv * d_x_var * v_y - number_inv * d_x_mean);
+    int hid = imid / W;
+    int wid = imid % W;
+    T tmp = x[(bid * H + hid) * W * C + wid * C + ccid];
+    T v_y = (tmp - x_bias) * x_scale_inv;
+    T dly = d_y[(bid * H + hid) * W * C + wid * C + ccid];
+    d_x[(bid * H + hid) * W * C + wid * C + ccid] =
+        x_var_inv *
+        (dly * x_scale - number_inv * d_x_var * v_y - number_inv * d_x_mean);
+  }
+}
+
+template <typename T, typename AccT, int VecSize>
+__global__ void VectorizedGetDsDbCUDAKernel(int imsize, const T* x, const T* dy,
+                                            T* ds, T* db) {
+  int i = blockIdx.x;
+  AccT ds_sum = static_cast<AccT>(0);
+  AccT db_sum = static_cast<AccT>(0);
+  const int input_offset = ((uint64_t)x) % ALIGN_BYTES / sizeof(T);
+  x += i * imsize;
+
+  phi::Array<const T*, 2> ins;
+  ins[0] = x;
+  ins[1] = dy;
+  ThreadReduce<T, AccT, VecSize, 2>(ins, imsize, input_offset, &db_sum,
+                                    &ds_sum);
+
+  ds_sum = kps::details::BlockXReduce<AccT, kps::AddFunctor<AccT>>(
+      ds_sum, kps::AddFunctor<AccT>());
+  db_sum = kps::details::BlockXReduce<AccT, kps::AddFunctor<AccT>>(
+      db_sum, kps::AddFunctor<AccT>());
+  __syncthreads();
+  if (threadIdx.x == 0) {
+    ds[i] = ds_sum;
+    db[i] = db_sum;
+  }
+}
+
+template <typename T>
+__global__ void ScalarGetDsDbCUDAKernel(int imsize, const T* x, const T* dy,
+                                        T* ds, T* db) {
+  const int nc = blockIdx.x;
+  T ds_sum = 0;
+  T db_sum = 0;
+  for (int i = threadIdx.x; i < imsize; i += blockDim.x) {
+    const int index = nc * imsize + i;
+    ds_sum += dy[index] * x[index];
+    db_sum += dy[index];
+  }
+  CudaAtomicAddWithWarp(&ds[nc], ds_sum);
+  CudaAtomicAddWithWarp(&db[nc], db_sum);
+}
+
+template <typename T>
+__global__ void GetScaleBiasGradientCUDAKernel(int N, int C, int group,
+                                               T epsilon, const T* mean,
+                                               const T* var, const T* ds,
+                                               const T* db, T* d_scale,
+                                               T* d_bias) {
+  const int c = blockIdx.x * blockDim.x + threadIdx.x;
+  if (c < C) {
+    const int G = group;
+    const int D = C / G;
+    T sum1 = 0;
+    T sum2 = 0;
+    for (int n = 0; n < N; ++n) {
+      const int nc = n * C + c;
+      const int ng = n * G + c / D;
+      sum1 += (d_scale == nullptr)
+                  ? T(0)
+                  : ((ds[nc] - db[nc] * static_cast<T>(mean[ng])) *
+                     static_cast<T>(rsqrt(var[ng] + epsilon)));
+      sum2 += (d_bias == nullptr) ? T(0) : db[nc];
+    }
+    if (d_scale != nullptr) {
+      d_scale[c] = sum1;
+    }
+    if (d_bias != nullptr) {
+      d_bias[c] = sum2;
     }
   }
 }
 
+template <typename T, int BlockDim>
+__global__ void GetBackwardParamsCUDAKernel(int imsize, int groups,
+                                            int group_size, T epsilon,
+                                            const T* mean, const T* var,
+                                            const T* scale, const T* ds,
+                                            const T* db, T* p1, T* p2, T* p3) {
+  const int n = blockIdx.x;
+  const int g = blockIdx.y;
+  const int ng = n * groups + g;
+  T sum1 = 0;
+  T sum2 = 0;
+  T var_inv = rsqrt(var[ng] + epsilon);
+  for (int64_t i = threadIdx.x; i < group_size; i += blockDim.x) {
+    const int64_t index = ng * group_size + i;
+    const int64_t c = g * group_size + i;
+    const T scale_v = scale == nullptr ? T(1) : static_cast<T>(scale[c]);
+    sum1 += ds[index] * scale_v;
+    sum2 += db[index] * scale_v;
+    const T scale_c = scale == nullptr ? T(0) : static_cast<T>(scale[c]);
+    p1[index] = scale_c * var_inv;
+  }
+
+  typedef cub::BlockReduce<T, BlockDim> BlockReduce;
+  __shared__ typename BlockReduce::TempStorage ds_storage;
+  __shared__ typename BlockReduce::TempStorage db_storage;
+  sum1 = BlockReduce(ds_storage).Reduce(sum1, cub::Sum());
+  sum2 = BlockReduce(db_storage).Reduce(sum2, cub::Sum());
+
+  if (threadIdx.x == 0) {
+    const T s = T(1) / static_cast<T>(group_size * imsize);
+    const T x = (sum2 * static_cast<T>(mean[ng]) - sum1) *
+                static_cast<T>(var_inv) * static_cast<T>(var_inv) *
+                static_cast<T>(var_inv) * s;
+    p2[ng] = x;
+    p3[ng] = -x * static_cast<T>(mean[ng]) - sum2 * static_cast<T>(var_inv) * s;
+  }
+}
+
+template <typename T>
+__global__ void GetXGradientCUDAKernel(int imsize, int C, int group_size,
+                                       int groups, T* p1, T* p2, T* p3,
+                                       const T* x, const T* dy, T* dx) {
+  int cid = blockIdx.x;
+  int gid = blockIdx.y;
+  int bid = blockIdx.z;
+  int ccid = bid * C + gid * group_size + cid;
+  int ng = bid * groups + gid;
+  int nc = gid * group_size + cid;
+  for (int imid = threadIdx.x; imid < imsize; imid += blockDim.x) {
+    int index = (bid * C + nc) * imsize + imid;
+    dx[index] = p1[ccid] * dy[index] + p2[ng] * x[index] + p3[ng];
+  }
+}
+
 template <typename T>
 class GroupNormGradKernel<platform::CUDADeviceContext, T>
     : public framework::OpKernel<T> {
@@ -408,7 +552,9 @@ class GroupNormGradKernel<platform::CUDADeviceContext, T>
     const DataLayout data_layout =
         framework::StringToDataLayout(data_layout_str);
     const float epsilon = ctx.Attr<float>("epsilon");
-    auto* x = ctx.Input<Tensor>("Y");
+    auto* x = ctx.Input<Tensor>("X");
+    auto* y = ctx.Input<Tensor>("Y");
+    auto* mean = ctx.Input<Tensor>("Mean");
     auto* var = ctx.Input<Tensor>("Variance");
     auto* scale = ctx.Input<Tensor>("Scale");
     auto* bias = ctx.Input<Tensor>("Bias");
@@ -433,31 +579,27 @@ class GroupNormGradKernel<platform::CUDADeviceContext, T>
     phi::funcs::SetConstant<platform::CUDADeviceContext, T> set_zero;
     auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
 
-    Tensor temp_var;
-    temp_var.mutable_data<T>(var->dims(), ctx.GetPlace());
-    set_zero(dev_ctx, &temp_var, static_cast<T>(0));
-    T* temp_var_data = temp_var.data<T>();
-
-    Tensor temp_mean;
-    temp_mean.mutable_data<T>(var->dims(), ctx.GetPlace());
-    set_zero(dev_ctx, &temp_mean, static_cast<T>(0));
-    T* temp_mean_data = temp_mean.data<T>();
+    Tensor ds, db;
+    ds.mutable_data<T>({x_dims[0], C}, ctx.GetPlace());
+    db.mutable_data<T>({x_dims[0], C}, ctx.GetPlace());
+    T* ds_data = ds.data<T>();
+    T* db_data = db.data<T>();
 
+    auto* y_data = y->data<T>();
     auto* x_data = x->data<T>();
     T* d_x_data = nullptr;
     if (d_x) d_x_data = d_x->data<T>();
-    auto* y_data = d_y->data<T>();
+    auto* dy_data = d_y->data<T>();
     auto* var_data = var->data<T>();
+    auto* mean_data = mean->data<T>();
     T* d_scale_data = nullptr;
     if (d_scale) {
       d_scale->mutable_data<T>(ctx.GetPlace());
-      set_zero(dev_ctx, d_scale, static_cast<T>(0));
       d_scale_data = d_scale->data<T>();
     }
     T* d_bias_data = nullptr;
     if (d_bias) {
       d_bias->mutable_data<T>(ctx.GetPlace());
-      set_zero(dev_ctx, d_bias, static_cast<T>(0));
       d_bias_data = d_bias->data<T>();
     }
 
@@ -479,22 +621,103 @@ class GroupNormGradKernel<platform::CUDADeviceContext, T>
 
 #ifdef __HIPCC__
     int block_size = std::max(std::min(256, imsize), 64);
+    const int block_dims = 256;
 #else
     int block_size = std::min(1024, imsize);
+    const int block_dims = 1024;
 #endif
     dim3 grid(group_size, groups, x_dims[0]);
     dim3 threads(block_size, 1, 1);
     int flags =
         (scale_data != nullptr) * kHasScale + (bias_data != nullptr) * kHasBias;
-    UNROLL_ALL_CASES(flags, GroupNormBackwardGetMeanAndVar, x_data, scale_data,
-                     bias_data, y_data, x_dims[0], C, W, imsize, groups,
-                     group_size, epsilon, temp_mean_data, temp_var_data,
-                     d_scale_data, d_bias_data, data_layout);
-    if (d_x_data != nullptr) {
-      UNROLL_ALL_CASES(flags, GroupNormBackward, x_data, y_data, scale_data,
-                       bias_data, var_data, temp_mean_data, temp_var_data,
-                       x_dims[0], C, W, imsize, groups, group_size, epsilon,
-                       d_x_data, data_layout);
+    if (data_layout == DataLayout::kNCHW) {
+      using AccT = typename details::MPTypeTrait<T>::Type;
+      constexpr int vec_size = sizeof(float4) / sizeof(T);
+      const int max_num_threads = 1024;
+      int max_block_size = std::min(imsize / vec_size, max_num_threads);
+      int block_size_nchw = 1;
+      while (block_size_nchw < max_block_size) {
+        block_size_nchw *= 2;
+      }
+      block_size_nchw = std::max(block_size_nchw, kps::details::kWarpSize);
+      dim3 blocks(block_size_nchw);
+      if (imsize < vec_size) {
+        if (d_scale) {
+          set_zero(dev_ctx, d_scale, static_cast<T>(0));
+        }
+        if (d_bias) {
+          set_zero(dev_ctx, d_bias, static_cast<T>(0));
+        }
+        ScalarGetDsDbCUDAKernel<
+            T><<<x_dims[0] * C, blocks, 0, dev_ctx.stream()>>>(
+            imsize, x_data, dy_data, ds_data, db_data);
+      } else {
+        VectorizedGetDsDbCUDAKernel<
+            T, AccT, vec_size><<<x_dims[0] * C, blocks, 0, dev_ctx.stream()>>>(
+            imsize, x_data, dy_data, ds_data, db_data);
+      }
+
+      if (d_scale || d_bias) {
+        const int block = 256;
+        GetScaleBiasGradientCUDAKernel<
+            T><<<(C + block - 1) / block, block, 0, dev_ctx.stream()>>>(
+            x_dims[0], C, groups, epsilon, mean_data, var_data, ds_data,
+            db_data, d_scale_data, d_bias_data);
+      }
+
+      if (d_x_data != nullptr) {
+        // p1 * dy + p2 * x + p3,
+        // p1, p2, p3 represent the reverse calculation of temporary variables
+        // p1 = scale * var_inv
+        // p2 = (db * scale * mean - ds * scale) * pow(var_inv, 3) * (1/n)
+        // p3 = -p2 * mean[ng] - db * scale * var_inv * (1/n);
+        Tensor p1, p2, p3;
+        p1.mutable_data<T>({x_dims[0] * C}, ctx.GetPlace());
+        p2.mutable_data<T>({x_dims[0], groups}, ctx.GetPlace());
+        p3.mutable_data<T>({x_dims[0], groups}, ctx.GetPlace());
+        T* p1_data = p1.data<T>();
+        T* p2_data = p2.data<T>();
+        T* p3_data = p3.data<T>();
+
+        GetBackwardParamsCUDAKernel<T, block_dims><<<
+            dim3(x_dims[0], groups), block_dims, 0, dev_ctx.stream()>>>(
+            imsize, groups, group_size, epsilon, mean_data, var_data,
+            scale_data, ds_data, db_data, p1_data, p2_data, p3_data);
+        GetXGradientCUDAKernel<T><<<grid, threads, 0, dev_ctx.stream()>>>(
+            imsize, C, group_size, groups, p1_data, p2_data, p3_data, x_data,
+            dy_data, d_x_data);
+      }
+
+    } else {
+      if (d_scale) {
+        set_zero(dev_ctx, d_scale, static_cast<T>(0));
+      }
+      if (d_bias) {
+        set_zero(dev_ctx, d_bias, static_cast<T>(0));
+      }
+
+      Tensor temp_var;
+      temp_var.mutable_data<T>(var->dims(), ctx.GetPlace());
+      set_zero(dev_ctx, &temp_var, static_cast<T>(0));
+      T* temp_var_data = temp_var.data<T>();
+
+      Tensor temp_mean;
+      temp_mean.mutable_data<T>(var->dims(), ctx.GetPlace());
+      set_zero(dev_ctx, &temp_mean, static_cast<T>(0));
+      T* temp_mean_data = temp_mean.data<T>();
+
+      int flags = (scale_data != nullptr) * kHasScale +
+                  (bias_data != nullptr) * kHasBias;
+      UNROLL_ALL_CASES(flags, GroupNormBackwardGetMeanAndVar, y_data,
+                       scale_data, bias_data, dy_data, x_dims[0], C, W, imsize,
+                       groups, group_size, epsilon, temp_mean_data,
+                       temp_var_data, d_scale_data, d_bias_data);
+      if (d_x_data != nullptr) {
+        UNROLL_ALL_CASES(flags, GroupNormBackward, y_data, dy_data, scale_data,
+                         bias_data, var_data, temp_mean_data, temp_var_data,
+                         x_dims[0], C, W, imsize, groups, group_size, epsilon,
+                         d_x_data);
+      }
     }
   }
 };
diff --git a/paddle/fluid/operators/gumbel_softmax_op.cc b/paddle/fluid/operators/gumbel_softmax_op.cc
index f8f8f3fd789ad61a99bcc17bc073b6cfd099f639..524f2d6c9d719468876d8a586b6eea13f99a7b79 100644
--- a/paddle/fluid/operators/gumbel_softmax_op.cc
+++ b/paddle/fluid/operators/gumbel_softmax_op.cc
@@ -90,11 +90,11 @@ class GumbelSoftmaxGradOpMaker : public framework::SingleGradOpMaker<T> {
 
 namespace ops = paddle::operators;
 
-DELCARE_INFER_SHAPE_FUNCTOR(gumbel_softmax, GumbelSoftmaxInferShapeFunctor,
-                            PT_INFER_META(phi::GumbelSoftmaxInferMeta));
-DELCARE_INFER_SHAPE_FUNCTOR(gumbel_softmax_grad,
+DECLARE_INFER_SHAPE_FUNCTOR(gumbel_softmax, GumbelSoftmaxInferShapeFunctor,
+                            PD_INFER_META(phi::GumbelSoftmaxInferMeta));
+DECLARE_INFER_SHAPE_FUNCTOR(gumbel_softmax_grad,
                             GumbelSoftmaxGradInferShapeFunctor,
-                            PT_INFER_META(phi::GumbelSoftmaxGradInferMeta));
+                            PD_INFER_META(phi::GumbelSoftmaxGradInferMeta));
 
 REGISTER_OPERATOR(gumbel_softmax, ops::GumbelSoftmaxOp,
                   ops::GumbelSoftmaxOpMaker,
diff --git a/paddle/fluid/operators/huber_loss_op.cc b/paddle/fluid/operators/huber_loss_op.cc
index 3915ce5809c394738c58e80accccac531c268c23..3c9bbc753f29b1cf104a085d340ddc75cf2790f8 100644
--- a/paddle/fluid/operators/huber_loss_op.cc
+++ b/paddle/fluid/operators/huber_loss_op.cc
@@ -112,8 +112,8 @@ class HuberLossGradOpMaker : public framework::SingleGradOpMaker<T> {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-DELCARE_INFER_SHAPE_FUNCTOR(huber_loss, HuberLossInferShapeFunctor,
-                            PT_INFER_META(phi::HuberLossInferMeta));
+DECLARE_INFER_SHAPE_FUNCTOR(huber_loss, HuberLossInferShapeFunctor,
+                            PD_INFER_META(phi::HuberLossInferMeta));
 
 REGISTER_OPERATOR(huber_loss, ops::HuberLossOp, ops::HuberLossOpMaker<float>,
                   ops::HuberLossGradOpMaker<paddle::framework::OpDesc>,
diff --git a/paddle/fluid/operators/imag_op.cc b/paddle/fluid/operators/imag_op.cc
index 567a69f383d1cc20cc1fc5b8c0a5c6f8368824af..16968876ac96cac2fa1b009ea40b939f1e11a953 100644
--- a/paddle/fluid/operators/imag_op.cc
+++ b/paddle/fluid/operators/imag_op.cc
@@ -82,8 +82,8 @@ DECLARE_INPLACE_OP_INFERER(ImagGradOpInplaceInferer,
 }  // namespace operators
 }  // namespace paddle
 
-DELCARE_INFER_SHAPE_FUNCTOR(imag, ImagInferShapeFunctor,
-                            PT_INFER_META(phi::RealAndImagInferMeta));
+DECLARE_INFER_SHAPE_FUNCTOR(imag, ImagInferShapeFunctor,
+                            PD_INFER_META(phi::RealAndImagInferMeta));
 
 namespace ops = paddle::operators;
 
diff --git a/paddle/fluid/operators/increment_op.cc b/paddle/fluid/operators/increment_op.cc
index 105d818e197434c4ed85126228e06d45bf06e498..e2efaa1759b008dd0055bb6e06917cbd4fc1932f 100644
--- a/paddle/fluid/operators/increment_op.cc
+++ b/paddle/fluid/operators/increment_op.cc
@@ -87,8 +87,8 @@ class IncrementGradOpMaker : public framework::SingleGradOpMaker<T> {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-DELCARE_INFER_SHAPE_FUNCTOR(increment, IncrementInferShapeFunctor,
-                            PT_INFER_META(phi::IncrementInferMeta));
+DECLARE_INFER_SHAPE_FUNCTOR(increment, IncrementInferShapeFunctor,
+                            PD_INFER_META(phi::IncrementInferMeta));
 REGISTER_OPERATOR(increment, ops::IncrementOp, ops::IncrementOpMaker,
                   ops::IncrementGradOpMaker<paddle::framework::OpDesc>,
                   ops::IncrementGradOpMaker<paddle::imperative::OpBase>,
diff --git a/paddle/fluid/operators/increment_op_npu_test.cc b/paddle/fluid/operators/increment_op_npu_test.cc
index 09f4e63943ad3784a598524273831bf875ed9213..8324a6215bca8145ba36dabb3d8108006a57e829 100644
--- a/paddle/fluid/operators/increment_op_npu_test.cc
+++ b/paddle/fluid/operators/increment_op_npu_test.cc
@@ -24,7 +24,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/operators/dropout_op.h"
 #include "paddle/fluid/string/printf.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
diff --git a/paddle/fluid/operators/index_impl.cu.h b/paddle/fluid/operators/index_impl.cu.h
index 2e3e6569ef5a88f8dfcb6646974b70bcc6c0c95f..bb26e2f445e7034b8f982594216eacfd3007a24f 100644
--- a/paddle/fluid/operators/index_impl.cu.h
+++ b/paddle/fluid/operators/index_impl.cu.h
@@ -19,11 +19,11 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/operators/amp/fp16_type_traits.h"
-#include "paddle/fluid/operators/distribution_helper.h"
 #include "paddle/fluid/operators/fill_constant_op.h"
-#include "paddle/fluid/platform/aligned_vector.h"
 #include "paddle/phi/backends/gpu/gpu_launch_config.h"
 #include "paddle/phi/core/hostdevice.h"
+#include "paddle/phi/kernels/funcs/aligned_vector.h"
+#include "paddle/phi/kernels/funcs/distribution_helper.h"
 #include "paddle/phi/kernels/primitive/kernel_primitives.h"
 
 namespace paddle {
@@ -58,7 +58,7 @@ void IndexKernel(const KPDevice &dev_ctx, Tensor *out, Functor func) {
   int numel = out->numel();
   T *out_data = out->mutable_data<T>(dev_ctx.GetPlace());
   if (numel <= 0) return;
-  int vec_size = paddle::platform::GetVectorizedSize(out_data);
+  int vec_size = phi::GetVectorizedSize(out_data);
 #ifdef PADDLE_WITH_XPU_KP
   int block = 64;
   int grid = 8;
diff --git a/paddle/fluid/operators/index_sample_op.cc b/paddle/fluid/operators/index_sample_op.cc
index 68d002fceea70fd032d7613802d095770d3d4754..d17c6368c7537b93ceb6f1d75b6d73467bd207ac 100644
--- a/paddle/fluid/operators/index_sample_op.cc
+++ b/paddle/fluid/operators/index_sample_op.cc
@@ -100,8 +100,8 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(IndexSampleGradNoNeedBufferVarInferer, "X");
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-DELCARE_INFER_SHAPE_FUNCTOR(index_sample, IndexSampleInferShapeFunctor,
-                            PT_INFER_META(phi::IndexSampleInferMeta));
+DECLARE_INFER_SHAPE_FUNCTOR(index_sample, IndexSampleInferShapeFunctor,
+                            PD_INFER_META(phi::IndexSampleInferMeta));
 REGISTER_OPERATOR(index_sample, ops::IndexSampleOp, ops::IndexSampleOpMaker,
                   ops::IndexSampleGradMaker<paddle::framework::OpDesc>,
                   ops::IndexSampleGradMaker<paddle::imperative::OpBase>,
diff --git a/paddle/fluid/operators/inplace_abn_op.cc b/paddle/fluid/operators/inplace_abn_op.cc
index e0779249c41adc5005bbaba6e19127d2ced3a9ec..7f5136969980b887bb7bbe013690898e66abeac1 100644
--- a/paddle/fluid/operators/inplace_abn_op.cc
+++ b/paddle/fluid/operators/inplace_abn_op.cc
@@ -17,6 +17,8 @@
 #include <string>
 #include <vector>
 #include "paddle/fluid/operators/batch_norm_op.h"
+#include "paddle/phi/kernels/batch_norm_grad_kernel.h"
+#include "paddle/phi/kernels/batch_norm_kernel.h"
 
 namespace paddle {
 namespace operators {
@@ -202,8 +204,7 @@ class InplaceABNOpGradMaker : public framework::SingleGradOpMaker<T> {
 };
 
 template <typename DeviceContext, typename T>
-class InplaceABNKernel
-    : public paddle::operators::BatchNormKernel<DeviceContext, T> {
+class InplaceABNKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     auto* x = ctx.Input<Tensor>("X");
@@ -213,7 +214,33 @@ class InplaceABNKernel
     auto activation =
         GetInplaceABNActivationType(ctx.Attr<std::string>("activation"));
     auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
-    BatchNormKernel<DeviceContext, T>::Compute(ctx);
+
+    auto* scale = ctx.Input<Tensor>("Scale");
+    auto* bias = ctx.Input<Tensor>("Bias");
+    auto* mean = ctx.Input<Tensor>("Mean");
+    auto* variance = ctx.Input<Tensor>("Variance");
+
+    auto momentum = ctx.Attr<float>("momentum");
+    auto epsilon = ctx.Attr<float>("epsilon");
+    auto data_layout = ctx.Attr<std::string>("data_layout");
+    auto is_test = ctx.Attr<bool>("is_test");
+    auto use_global_stats = ctx.Attr<bool>("use_global_stats");
+    auto trainable_statistics = ctx.Attr<bool>("trainable_statistics");
+    auto fuse_with_relu = ctx.Attr<bool>("fuse_with_relu");
+
+    auto* mean_out = ctx.Output<Tensor>("MeanOut");
+    auto* variance_out = ctx.Output<Tensor>("VarianceOut");
+    auto* saved_mean = ctx.Output<Tensor>("SavedMean");
+    auto* saved_variance = ctx.Output<Tensor>("SavedVariance");
+    auto* reserve_space = ctx.Output<Tensor>("ReserveSpace");
+
+    auto& dev_ctx = ctx.device_context<DeviceContext>();
+    phi::BatchNormKernel<T>(
+        static_cast<const typename framework::ConvertToPhiContext<
+            DeviceContext>::TYPE&>(dev_ctx),
+        *x, *scale, *bias, *mean, *variance, momentum, epsilon, data_layout,
+        is_test, use_global_stats, trainable_statistics, fuse_with_relu, y,
+        mean_out, variance_out, saved_mean, saved_variance, reserve_space);
 
     auto cur_y = EigenVector<T>::Flatten(*y);
     InplaceABNActivation<DeviceContext, T> functor;
@@ -222,8 +249,7 @@ class InplaceABNKernel
 };
 
 template <typename DeviceContext, typename T>
-class InplaceABNGradKernel
-    : public paddle::operators::BatchNormGradKernel<DeviceContext, T> {
+class InplaceABNGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     auto* y = ctx.Input<Tensor>("Y");
@@ -244,7 +270,52 @@ class InplaceABNGradKernel
     InplaceABNActivation<DeviceContext, T> functor;
     functor.GradCompute(ctx, activation, place, cur_y, cur_y, cur_dy, cur_dy);
 
-    BatchNormGradKernel<DeviceContext, T>::Compute(ctx);
+    // BatchNormGradKernel<DeviceContext, T>::Compute(ctx);
+
+    auto* scale = ctx.Input<Tensor>("Scale");
+    auto* bias = ctx.Input<Tensor>("Bias");
+    auto* saved_mean = ctx.Input<Tensor>("SavedMean");
+    auto* saved_variance = ctx.Input<Tensor>("SavedVariance");
+
+    auto momentum = ctx.Attr<float>("momentum");
+    auto epsilon = ctx.Attr<float>("epsilon");
+    auto data_layout = ctx.Attr<std::string>("data_layout");
+    auto is_test = ctx.Attr<bool>("is_test");
+    auto use_global_stats = ctx.Attr<bool>("use_global_stats");
+    auto trainable_statistics = ctx.Attr<bool>("trainable_statistics");
+    auto fuse_with_relu = ctx.Attr<bool>("fuse_with_relu");
+
+    auto* scale_grad = ctx.Output<Tensor>(framework::GradVarName("Scale"));
+    auto* bias_grad = ctx.Output<Tensor>(framework::GradVarName("Bias"));
+
+    auto* reserve_space = ctx.Input<Tensor>("ReserveSpace");
+    auto* mean = ctx.Input<Tensor>("ReserveSpace");
+    auto* variance = ctx.Input<Tensor>("ReserveSpace");
+
+    paddle::optional<const Tensor&> space_opt = paddle::none;
+    paddle::optional<const Tensor&> mean_opt = paddle::none;
+    paddle::optional<const Tensor&> variance_opt = paddle::none;
+
+    if (reserve_space != nullptr) {
+      space_opt = *reserve_space;
+    }
+
+    if (mean != nullptr) {
+      mean_opt = *mean;
+    }
+
+    if (variance != nullptr) {
+      variance_opt = *variance;
+    }
+
+    auto& dev_ctx = ctx.device_context<DeviceContext>();
+    phi::BatchNormGradRawKernel<T>(
+        static_cast<const typename framework::ConvertToPhiContext<
+            DeviceContext>::TYPE&>(dev_ctx),
+        *d_y, *y, *scale, *bias, *saved_mean, *saved_variance, space_opt,
+        mean_opt, variance_opt, momentum, epsilon, data_layout, is_test,
+        use_global_stats, trainable_statistics, fuse_with_relu, true, d_x,
+        scale_grad, bias_grad);
   }
 };
 
diff --git a/paddle/fluid/operators/inplace_abn_op.cu b/paddle/fluid/operators/inplace_abn_op.cu
index be7a7bd71711e379ef4d98eb1f9ac5ee2caaace6..db8f8c72d13f8e46f6f9e332c5c2f5164b6d0836 100644
--- a/paddle/fluid/operators/inplace_abn_op.cu
+++ b/paddle/fluid/operators/inplace_abn_op.cu
@@ -15,14 +15,15 @@ limitations under the License. */
 #include "paddle/fluid/operators/batch_norm_op.h"
 #include "paddle/fluid/operators/inplace_abn_op.h"
 #include "paddle/fluid/operators/sync_batch_norm_op.cu.h"
+#include "paddle/phi/kernels/batch_norm_grad_kernel.h"
+#include "paddle/phi/kernels/batch_norm_kernel.h"
 
 namespace paddle {
 namespace operators {
 
 template <typename DeviceContext, typename T>
 class InplaceABNKernel
-    : public paddle::operators::SyncBatchNormKernel<DeviceContext, T>,
-      public paddle::operators::BatchNormKernel<DeviceContext, T> {
+    : public paddle::operators::SyncBatchNormKernel<DeviceContext, T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     auto* y = ctx.Output<Tensor>("Y");
@@ -36,7 +37,33 @@ class InplaceABNKernel
     if (ctx.Attr<bool>("use_sync_bn")) {
       SyncBatchNormKernel<DeviceContext, T>::Compute(ctx);
     } else {
-      BatchNormKernel<DeviceContext, T>::Compute(ctx);
+      // BatchNormKernel<DeviceContext, T>::Compute(ctx);
+      auto* scale = ctx.Input<Tensor>("Scale");
+      auto* bias = ctx.Input<Tensor>("Bias");
+      auto* mean = ctx.Input<Tensor>("Mean");
+      auto* variance = ctx.Input<Tensor>("Variance");
+
+      auto momentum = ctx.Attr<float>("momentum");
+      auto epsilon = ctx.Attr<float>("epsilon");
+      auto data_layout = ctx.Attr<std::string>("data_layout");
+      auto is_test = ctx.Attr<bool>("is_test");
+      auto use_global_stats = ctx.Attr<bool>("use_global_stats");
+      auto trainable_statistics = ctx.Attr<bool>("trainable_statistics");
+      auto fuse_with_relu = ctx.Attr<bool>("fuse_with_relu");
+
+      auto* mean_out = ctx.Output<Tensor>("MeanOut");
+      auto* variance_out = ctx.Output<Tensor>("VarianceOut");
+      auto* saved_mean = ctx.Output<Tensor>("SavedMean");
+      auto* saved_variance = ctx.Output<Tensor>("SavedVariance");
+      auto* reserve_space = ctx.Output<Tensor>("ReserveSpace");
+
+      auto& dev_ctx = ctx.device_context<DeviceContext>();
+      phi::BatchNormKernel<T>(
+          static_cast<const typename framework::ConvertToPhiContext<
+              DeviceContext>::TYPE&>(dev_ctx),
+          *x, *scale, *bias, *mean, *variance, momentum, epsilon, data_layout,
+          is_test, use_global_stats, trainable_statistics, fuse_with_relu, y,
+          mean_out, variance_out, saved_mean, saved_variance, reserve_space);
     }
 
     auto cur_y = EigenVector<T>::Flatten(*y);
@@ -49,8 +76,7 @@ class InplaceABNKernel
 // https://kevinzakka.github.io/2016/09/14/batch_normalization/
 template <typename DeviceContext, typename T>
 class InplaceABNGradKernel
-    : public paddle::operators::SyncBatchNormGradKernel<DeviceContext, T>,
-      public paddle::operators::BatchNormGradKernel<DeviceContext, T> {
+    : public paddle::operators::SyncBatchNormGradKernel<DeviceContext, T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     const auto* y = ctx.Input<Tensor>("Y");
@@ -74,7 +100,50 @@ class InplaceABNGradKernel
     if (ctx.Attr<bool>("use_sync_bn")) {
       SyncBatchNormGradKernel<DeviceContext, T>::Compute(ctx);
     } else {
-      BatchNormGradKernel<DeviceContext, T>::Compute(ctx);
+      auto* scale = ctx.Input<Tensor>("Scale");
+      auto* bias = ctx.Input<Tensor>("Bias");
+      auto* saved_mean = ctx.Input<Tensor>("SavedMean");
+      auto* saved_variance = ctx.Input<Tensor>("SavedVariance");
+
+      auto momentum = ctx.Attr<float>("momentum");
+      auto epsilon = ctx.Attr<float>("epsilon");
+      auto data_layout = ctx.Attr<std::string>("data_layout");
+      auto is_test = ctx.Attr<bool>("is_test");
+      auto use_global_stats = ctx.Attr<bool>("use_global_stats");
+      auto trainable_statistics = ctx.Attr<bool>("trainable_statistics");
+      auto fuse_with_relu = ctx.Attr<bool>("fuse_with_relu");
+
+      auto* scale_grad = ctx.Output<Tensor>(framework::GradVarName("Scale"));
+      auto* bias_grad = ctx.Output<Tensor>(framework::GradVarName("Bias"));
+
+      auto* reserve_space = ctx.Input<Tensor>("ReserveSpace");
+      auto* mean = ctx.Input<Tensor>("ReserveSpace");
+      auto* variance = ctx.Input<Tensor>("ReserveSpace");
+
+      paddle::optional<const Tensor&> space_opt = paddle::none;
+      paddle::optional<const Tensor&> mean_opt = paddle::none;
+      paddle::optional<const Tensor&> variance_opt = paddle::none;
+
+      if (reserve_space != nullptr) {
+        space_opt = *reserve_space;
+      }
+
+      if (mean != nullptr) {
+        mean_opt = *mean;
+      }
+
+      if (variance != nullptr) {
+        variance_opt = *variance;
+      }
+
+      auto& dev_ctx = ctx.device_context<DeviceContext>();
+      phi::BatchNormGradRawKernel<T>(
+          static_cast<const typename framework::ConvertToPhiContext<
+              DeviceContext>::TYPE&>(dev_ctx),
+          *d_y, *y, *scale, *bias, *saved_mean, *saved_variance, space_opt,
+          mean_opt, variance_opt, momentum, epsilon, data_layout, is_test,
+          use_global_stats, trainable_statistics, fuse_with_relu, true, d_x,
+          scale_grad, bias_grad);
     }
   }
 };
diff --git a/paddle/fluid/operators/interpolate_v2_op.cu b/paddle/fluid/operators/interpolate_v2_op.cu
index d61eb46d97e98972963f5871a4c6e7b06468337c..cd297c53f89a0f7efc622de7c385b9f75dc7462b 100644
--- a/paddle/fluid/operators/interpolate_v2_op.cu
+++ b/paddle/fluid/operators/interpolate_v2_op.cu
@@ -61,13 +61,13 @@ inline platform::GpuLaunchConfig GetGpuLaunchConfig3D(
 
 template <typename T>
 __forceinline__ __device__ void PreCalculatorForLinearInterpInputIndex(
-    int* in_img_idx, int* w_id, T* w1lambda, T* w2lambda, T src_w,
-    const int in_img_w) {
-  src_w = (src_w > 0) ? src_w : 0.f;
-  *in_img_idx = static_cast<int>(src_w);
-  *w_id = (*in_img_idx < in_img_w - 1) ? 1 : 0;
-  *w1lambda = src_w - *in_img_idx;
-  *w2lambda = 1.f - *w1lambda;
+    int* in_img_idx, int* x_id, T* lambda1, T* lambda2, T src_x,
+    const int in_img_x) {
+  src_x = (src_x > 0) ? src_x : 0.f;
+  *in_img_idx = static_cast<int>(src_x);
+  *x_id = (*in_img_idx < in_img_x - 1) ? 1 : 0;
+  *lambda1 = src_x - *in_img_idx;
+  *lambda2 = 1.f - *lambda1;
 }
 
 struct FastDivModForInterpolate {
@@ -670,83 +670,102 @@ __global__ void KeBilinearInterpBwShareMemory(
   }
 }
 
+__device__ __forceinline__ int GetInputIndex(const size_t nc, const int height,
+                                             const int width, const int h,
+                                             const int w) {
+  return (nc * height + h) * width + w;
+}
+
+template <typename T>
+__global__ void KeBilinearInterpNCHWBw(T* in, const int in_h, const int in_w,
+                                       const int out_h, const int out_w,
+                                       const int n, const int num_channels,
+                                       float ratio_h, float ratio_w,
+                                       const T* __restrict__ out,
+                                       const T align_type_value) {
+  int index = threadIdx.x + blockDim.x * blockIdx.x;
+  int stride = blockDim.x * gridDim.x;
+  int num_out = n * num_channels * out_h * out_w;
+  int num_in = n * num_channels * in_h * in_w;
+
+  for (; index < num_out; index += stride) {
+    int index_tmp = index;
+    int w2 = index_tmp % out_w;
+    index_tmp /= out_w;
+    int h2 = index_tmp % out_h;
+    int nc = index_tmp / out_h;
+
+    int h1, y_id;
+    T h1lambda, h0lambda;
+    T src_y = ratio_h * (h2 + align_type_value) - align_type_value;
+
+    PreCalculatorForLinearInterpInputIndex(&h1, &y_id, &h1lambda, &h0lambda,
+                                           src_y, in_h);
+    int w1, x_id;
+    T w1lambda, w0lambda;
+    T src_x = ratio_w * (w2 + align_type_value) - align_type_value;
+    PreCalculatorForLinearInterpInputIndex(&w1, &x_id, &w1lambda, &w0lambda,
+                                           src_x, in_w);
+
+    T d2val = out[index];
+
+    platform::CudaAtomicAdd(in + GetInputIndex(nc, in_h, in_w, h1, w1),
+                            h0lambda * w0lambda * d2val);
+    platform::CudaAtomicAdd(in + GetInputIndex(nc, in_h, in_w, h1, w1 + x_id),
+                            h0lambda * w1lambda * d2val);
+    platform::CudaAtomicAdd(in + GetInputIndex(nc, in_h, in_w, h1 + y_id, w1),
+                            h1lambda * w0lambda * d2val);
+    platform::CudaAtomicAdd(
+        in + GetInputIndex(nc, in_h, in_w, h1 + y_id, w1 + x_id),
+        h1lambda * w1lambda * d2val);
+  }
+}
+
 template <typename T>
 __global__ void KeBilinearInterpBw(T* in, const int in_h, const int in_w,
                                    const T* __restrict__ out, const int out_h,
                                    const int out_w, const int n,
-                                   const int num_channels, float ratio_h,
-                                   float ratio_w, const T align_type_value,
-                                   bool is_nchw) {
+                                   const int out_chw, const int num_channels,
+                                   float ratio_h, float ratio_w,
+                                   const T align_type_value,
+                                   FastDivModForInterpolate divmods) {
   int tid = blockIdx.x * blockDim.x + threadIdx.x;
   int stride = blockDim.x * gridDim.x;
   int in_chw = in_h * in_w * num_channels;
-  int out_chw = num_channels * out_h * out_w;
   int nthreads = n * out_chw;
 
-  if (is_nchw) {
-    for (; tid < nthreads; tid += stride) {
-      int out_id_h = tid / out_chw;
-      int out_id_w = tid % out_chw;
-      const int in_img_size = in_h * in_w;
-      const int out_img_size = out_h * out_w;
-      T value = out[out_id_h * out_chw + out_id_w];
-
-      int channel_id = out_id_w / out_img_size;
-      int out_img_idy = (out_id_w % out_img_size) / out_w;
-      int out_img_idx = tid % out_w;
-      int in_img_idx, in_img_idy, w_id, h_id;
-      T w1lambda, h1lambda, w2lambda, h2lambda;
-
-      T src_w = ratio_w * (out_img_idx + align_type_value) - align_type_value;
-      T src_h = ratio_h * (out_img_idy + align_type_value) - align_type_value;
-
-      PreCalculatorForLinearInterpInputIndex(&in_img_idx, &w_id, &w1lambda,
-                                             &w2lambda, src_w, in_w);
-      PreCalculatorForLinearInterpInputIndex(&in_img_idy, &h_id, &h1lambda,
-                                             &h2lambda, src_h, in_h);
-
-      T* in_pos = &in[out_id_h * in_chw + channel_id * in_img_size +
-                      in_img_idy * in_w + in_img_idx];
-      platform::CudaAtomicAdd(&in_pos[0], h2lambda * w2lambda * value);
-      platform::CudaAtomicAdd(&in_pos[w_id], h2lambda * w1lambda * value);
-      platform::CudaAtomicAdd(&in_pos[h_id * in_w],
-                              h1lambda * w2lambda * value);
-      platform::CudaAtomicAdd(&in_pos[h_id * in_w + w_id],
-                              h1lambda * w1lambda * value);
-    }
-  } else {
-    for (; tid < nthreads; tid += stride) {
-      int out_id_h = tid / out_chw;
-      int out_id_w = tid % out_chw;
-      const int in_img_size = in_h * in_w;
-      const int out_img_size = out_h * out_w;
-      T value = out[out_id_h * out_chw + out_id_w];
-
-      int out_img_idy = out_id_w / (out_w * num_channels);
-      int out_img_idx = out_id_w % (out_w * num_channels) / num_channels;
-      int channel_id = tid % num_channels;
-
-      int in_img_idx, in_img_idy, w_id, h_id;
-      T w1lambda, h1lambda, w2lambda, h2lambda;
-      T src_w = ratio_w * (out_img_idx + align_type_value) - align_type_value;
-      T src_h = ratio_h * (out_img_idy + align_type_value) - align_type_value;
-
-      PreCalculatorForLinearInterpInputIndex(&in_img_idx, &w_id, &w1lambda,
-                                             &w2lambda, src_w, in_w);
-      PreCalculatorForLinearInterpInputIndex(&in_img_idy, &h_id, &h1lambda,
-                                             &h2lambda, src_h, in_h);
-
-      T* in_pos = &in[out_id_h * in_chw + in_img_idy * in_w * num_channels +
-                      in_img_idx * num_channels + channel_id];
-      platform::CudaAtomicAdd(&in_pos[0], h2lambda * w2lambda * value);
-      platform::CudaAtomicAdd(&in_pos[w_id * num_channels],
-                              h2lambda * w1lambda * value);
-      platform::CudaAtomicAdd(&in_pos[h_id * in_w * num_channels],
-                              h1lambda * w2lambda * value);
-      platform::CudaAtomicAdd(
-          &in_pos[h_id * in_w * num_channels + w_id * num_channels],
-          h1lambda * w1lambda * value);
-    }
+  for (; tid < nthreads; tid += stride) {
+    auto out_id_divmod = divmods.output_w_div.Divmod(tid);
+    int out_id_h = out_id_divmod.val[0];
+    int out_id_w = out_id_divmod.val[1];
+
+    int channel_id = divmods.channels_div.Divmod(tid).val[1];
+    auto outimg_id_divmod = divmods.output_wc_div.Divmod(out_id_w);
+    int out_img_idy = outimg_id_divmod.val[0];
+    int out_img_idx =
+        divmods.channels_div.Divmod(outimg_id_divmod.val[1]).val[0];
+
+    int in_img_idx, in_img_idy, w_id, h_id;
+    T w1lambda, h1lambda, w2lambda, h2lambda;
+    T src_w = ratio_w * (out_img_idx + align_type_value) - align_type_value;
+    T src_h = ratio_h * (out_img_idy + align_type_value) - align_type_value;
+
+    PreCalculatorForLinearInterpInputIndex(&in_img_idx, &w_id, &w1lambda,
+                                           &w2lambda, src_w, in_w);
+    PreCalculatorForLinearInterpInputIndex(&in_img_idy, &h_id, &h1lambda,
+                                           &h2lambda, src_h, in_h);
+
+    T value = out[tid];
+    T* in_pos = &in[out_id_h * in_chw + in_img_idy * in_w * num_channels +
+                    in_img_idx * num_channels + channel_id];
+    platform::CudaAtomicAdd(&in_pos[0], h2lambda * w2lambda * value);
+    platform::CudaAtomicAdd(&in_pos[w_id * num_channels],
+                            h2lambda * w1lambda * value);
+    platform::CudaAtomicAdd(&in_pos[h_id * in_w * num_channels],
+                            h1lambda * w2lambda * value);
+    platform::CudaAtomicAdd(
+        &in_pos[h_id * in_w * num_channels + w_id * num_channels],
+        h1lambda * w1lambda * value);
   }
 }
 
@@ -1907,11 +1926,23 @@ static void Interpolate2DCUDABwd(const framework::ExecutionContext& ctx,
                ctx.cuda_device_context().stream()>>>(
           input_grad_data, in_h, in_w, output_grad_data, out_h, out_w, n, c,
           ratio_h, ratio_w, align_type_value, is_nchw);
+    } else if (!optimize_flag & is_nchw) {
+      //
+      const int num_kernels = n * c * out_h * out_w;
+      const int num_threads =
+          std::min(ctx.cuda_device_context().GetMaxThreadsPerBlock(), 1024);
+      KeBilinearInterpNCHWBw<
+          T><<<platform::DivUp(num_kernels, num_threads), num_threads, 0,
+               ctx.cuda_device_context().stream()>>>(
+          input_grad_data, in_h, in_w, out_h, out_w, n, c, ratio_h, ratio_w,
+          output_grad_data, align_type_value);
     } else {
+      int64_t cw = c * out_w;
+      auto interp_divmods = FastDivModForInterpolate(c, out_chw, cw);
       KeBilinearInterpBw<T><<<config.block_per_grid, config.thread_per_block, 0,
                               ctx.cuda_device_context().stream()>>>(
-          input_grad_data, in_h, in_w, output_grad_data, out_h, out_w, n, c,
-          ratio_h, ratio_w, align_type_value, is_nchw);
+          input_grad_data, in_h, in_w, output_grad_data, out_h, out_w, n,
+          out_chw, c, ratio_h, ratio_w, align_type_value, interp_divmods);
     }
   } else if ("bicubic" == interp_method) {
 #ifdef __HIPCC__
diff --git a/paddle/fluid/operators/inverse_op.h b/paddle/fluid/operators/inverse_op.h
index 1e061d8b50ae02f9b87f0a0976543467aa0b7dd0..31c22915ec5d052eb11c613d476f6aea541d8c47 100644
--- a/paddle/fluid/operators/inverse_op.h
+++ b/paddle/fluid/operators/inverse_op.h
@@ -15,8 +15,8 @@ limitations under the License. */
 #pragma once
 
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/matrix_inverse.h"
 #include "paddle/phi/kernels/funcs/blas/blas.h"
+#include "paddle/phi/kernels/funcs/matrix_inverse.h"
 
 namespace paddle {
 namespace operators {
@@ -30,7 +30,7 @@ class InverseKernel : public framework::OpKernel<T> {
     output->mutable_data<T>(context.GetPlace());
 
     auto& dev_ctx = context.template device_context<DeviceContext>();
-    math::MatrixInverseFunctor<DeviceContext, T> mat_inv;
+    phi::funcs::MatrixInverseFunctor<DeviceContext, T> mat_inv;
     mat_inv(dev_ctx, *input, output);
   }
 };
diff --git a/paddle/fluid/operators/is_empty_op.cc b/paddle/fluid/operators/is_empty_op.cc
index 2750367dc773925e998507db4690e39c15f985d0..c835bb3cf60bfbf71b585828c74ac45f6bc91f8b 100644
--- a/paddle/fluid/operators/is_empty_op.cc
+++ b/paddle/fluid/operators/is_empty_op.cc
@@ -12,9 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/is_empty_op.h"
+#include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
+#include "paddle/phi/core/infermeta_utils.h"
+#include "paddle/phi/infermeta/unary.h"
 
 namespace paddle {
 namespace operators {
@@ -24,12 +26,6 @@ class IsEmptyOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
  protected:
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "IsEmpty");
-    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "IsEmpty");
-    ctx->SetOutputDim("Out", {1});
-  }
-
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
     auto *x = ctx.Input<framework::LoDTensor>("X");
@@ -56,12 +52,10 @@ It will just return product(tensor.ddims()) > 0;
 }  // namespace paddle
 
 namespace ops = paddle::operators;
+DECLARE_INFER_SHAPE_FUNCTOR(is_empty, IsEmptyInferShapeFunctor,
+                            PD_INFER_META(phi::IsEmptyInferMeta));
 REGISTER_OPERATOR(
     is_empty, ops::IsEmptyOp, ops::IsEmptyOpMaker,
     paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
-    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
-REGISTER_OP_CPU_KERNEL(
-    is_empty, ops::IsEmptyOpKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::IsEmptyOpKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::IsEmptyOpKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::IsEmptyOpKernel<paddle::platform::CPUDeviceContext, int64_t>);
+    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
+    IsEmptyInferShapeFunctor);
diff --git a/paddle/fluid/operators/is_empty_op.cu.cc b/paddle/fluid/operators/is_empty_op.cu.cc
deleted file mode 100644
index 3c256503baf6ba3bc8f8dff866a2ce9c57ec5bf1..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/is_empty_op.cu.cc
+++ /dev/null
@@ -1,23 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/is_empty_op.h"
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    is_empty, ops::IsEmptyOpKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::IsEmptyOpKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::IsEmptyOpKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::IsEmptyOpKernel<paddle::platform::CUDADeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/isclose_op.cc b/paddle/fluid/operators/isclose_op.cc
index 0ae7a9fa02f1fb217555ae41d8b25cbba0e43d19..8668de4d3a6288841ad191f3e47b87a76eeb1d63 100644
--- a/paddle/fluid/operators/isclose_op.cc
+++ b/paddle/fluid/operators/isclose_op.cc
@@ -12,7 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/operators/isclose_op.h"
 #include <cmath>
 #include <string>
 #include "paddle/fluid/framework/op_registry.h"
@@ -23,45 +22,6 @@
 namespace paddle {
 namespace operators {
 
-template <typename T>
-struct GetTensorValue<platform::CPUDeviceContext, T> {
-  T operator()(const platform::CPUDeviceContext& dev_ctx,
-               const framework::Tensor& tensor) const {
-    return *(tensor.data<T>());
-  }
-};
-
-template <typename T>
-struct IscloseFunctor<platform::CPUDeviceContext, T> {
-  void operator()(const platform::CPUDeviceContext& ctx,
-                  const framework::Tensor& in, const framework::Tensor& other,
-                  const double rtol, const double atol, bool equal_nan,
-                  framework::Tensor* output) {
-    auto* in_a = in.data<T>();
-    auto* in_b = other.data<T>();
-    auto* out_data = output->mutable_data<bool>(ctx.GetPlace());
-    auto num = in.numel();
-    // *out_data = true;
-    for (int i = 0; i < num; i++) {
-      out_data[i] = true;
-    }
-    for (int i = 0; i < num; i++) {
-      const T a = in_a[i], b = in_b[i];
-      bool val;
-      if (std::isnan(a) || std::isnan(b)) {
-        val = equal_nan && std::isnan(a) == std::isnan(b);
-      } else {
-        T left = (a > b ? a - b : b - a);
-        T right = atol + (b > 0 ? rtol * b : (-rtol) * b);
-        T diff = (left > right ? left - right : right - left);
-        val = a == b || left <= right || diff <= 1e-15;
-      }
-      // *out_data &= val;
-      out_data[i] = val;
-    }
-  }
-};
-
 class IscloseOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() override {
@@ -154,12 +114,9 @@ class IscloseOpVarTypeInference : public framework::VarTypeInference {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-using CPU = paddle::platform::CPUDeviceContext;
 
 REGISTER_OPERATOR(
     isclose, ops::IscloseOp, ops::IscloseOpMaker,
     paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
     paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
     ops::IscloseOpVarTypeInference);
-REGISTER_OP_CPU_KERNEL(isclose, ops::IscloseKernel<CPU, float>,
-                       ops::IscloseKernel<CPU, double>);
diff --git a/paddle/fluid/operators/isclose_op.cu b/paddle/fluid/operators/isclose_op.cu
deleted file mode 100644
index 09710ba0c6957d39318abfc24113d4b9db11622d..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/isclose_op.cu
+++ /dev/null
@@ -1,85 +0,0 @@
-// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/operators/isclose_op.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T>
-struct GetTensorValue<platform::CUDADeviceContext, T> {
-  T operator()(const platform::CUDADeviceContext& dev_ctx,
-               const framework::Tensor& tensor) const {
-    const T* data = tensor.data<T>();
-    T value;
-    const auto gpu_place = dev_ctx.GetPlace();
-    memory::Copy(platform::CPUPlace(), &value, gpu_place, data, sizeof(T),
-                 dev_ctx.stream());
-    return value;
-  }
-};
-
-template <typename T>
-__global__ void IscloseCUDAKernel(const T* in_data, const T* other_data,
-                                  const double rtol, const double atol,
-                                  bool equal_nan, int num, bool* out_data) {
-  unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x;
-  bool val;
-  for (int i = idx; i < num; i += blockDim.x * gridDim.x) {
-    const T a = in_data[i], b = other_data[i];
-    if (isnan(a) || isnan(b)) {
-      val = equal_nan && isnan(a) == isnan(b);
-    } else {
-      T left = (a > b ? a - b : b - a);
-      T right = atol + (b > 0 ? rtol * b : (-rtol) * b);
-      T diff = (left > right ? left - right : right - left);
-      val = a == b || left <= right || diff <= 1e-15;
-    }
-    out_data[i] = val;
-    // if (!val) *out_data = false;
-  }
-}
-
-template <typename T>
-struct IscloseFunctor<platform::CUDADeviceContext, T> {
-  void operator()(const platform::CUDADeviceContext& dev_ctx,
-                  const framework::Tensor& in, const framework::Tensor& other,
-                  const double rtol, const double atol, bool equal_nan,
-                  framework::Tensor* output) {
-    int num = in.numel();
-    const T* in_data = in.data<T>();
-    const T* other_data = other.data<T>();
-    bool* out_data = output->mutable_data<bool>(dev_ctx.GetPlace());
-    int block = 1024;
-    int grid = (block - 1 + num) / block;
-    grid = (grid > block) ? block : grid;
-#ifdef PADDLE_WITH_HIP
-    hipMemset(out_data, true, num * sizeof(bool));
-#else
-    cudaMemset(out_data, true, num * sizeof(bool));
-#endif
-    IscloseCUDAKernel<T><<<grid, block, 0, dev_ctx.stream()>>>(
-        in_data, other_data, rtol, atol, equal_nan, num, out_data);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-using CUDA = paddle::platform::CUDADeviceContext;
-REGISTER_OP_CUDA_KERNEL(isclose, ops::IscloseKernel<CUDA, float>,
-                        ops::IscloseKernel<CUDA, double>);
diff --git a/paddle/fluid/operators/isclose_op.h b/paddle/fluid/operators/isclose_op.h
deleted file mode 100644
index cde5d2afbf009a16a3d0c3601697703d8ec8eb7d..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/isclose_op.h
+++ /dev/null
@@ -1,93 +0,0 @@
-// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <string>
-#include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/platform/place.h"
-
-namespace paddle {
-namespace operators {
-using Tensor = framework::Tensor;
-
-template <typename DeviceContext, typename T>
-struct GetTensorValue {
-  T operator()(const platform::DeviceContext& ctx,
-               const framework::Tensor& tensor) const;
-};
-
-template <typename DeviceContext, typename T>
-struct IscloseFunctor {
-  void operator()(const DeviceContext& ctx, const framework::Tensor& in,
-                  const framework::Tensor& other, const float rtol,
-                  const float atol, bool equal_nan, framework::Tensor* output);
-};
-
-template <typename DeviceContext, typename T>
-class IscloseKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    // get attrs
-    bool equal_nan = ctx.Attr<bool>("equal_nan");
-    // get input/output
-    const auto* input = ctx.Input<Tensor>("Input");
-    const auto* other = ctx.Input<Tensor>("Other");
-    auto* out = ctx.Output<Tensor>("Out");
-
-    double rtol_v = std::stod(ctx.Attr<std::string>("rtol"));
-    double atol_v = std::stod(ctx.Attr<std::string>("atol"));
-
-    auto& dev_ctx = ctx.template device_context<DeviceContext>();
-    GetTensorValue<DeviceContext, double> get_tensor_value;
-    if (ctx.HasInput("Rtol")) {
-      const auto* rtol = ctx.Input<Tensor>("Rtol");
-      PADDLE_ENFORCE_EQ(
-          rtol->numel(), 1,
-          platform::errors::InvalidArgument(
-              "Input(Rtol) size must be 1, but get %d.", rtol->numel()));
-      PADDLE_ENFORCE_EQ(
-          framework::TransToProtoVarType(rtol->dtype()),
-          framework::proto::VarType::FP64,
-          platform::errors::InvalidArgument(
-              "Input(Rtol) type must be double, but get %s.",
-              framework::DataTypeToString(
-                  framework::TransToProtoVarType(rtol->dtype()))));
-      rtol_v = get_tensor_value(dev_ctx, *rtol);
-    }
-    if (ctx.HasInput("Atol")) {
-      const auto* atol = ctx.Input<Tensor>("Atol");
-      PADDLE_ENFORCE_EQ(
-          atol->numel(), 1,
-          platform::errors::InvalidArgument(
-              "Input(Atol) size must be 1, but get %d", atol->numel()));
-      PADDLE_ENFORCE_EQ(
-          framework::TransToProtoVarType(atol->dtype()),
-          framework::proto::VarType::FP64,
-          platform::errors::InvalidArgument(
-              "Input(Atol) type must be double, but get %s",
-              framework::DataTypeToString(
-                  framework::TransToProtoVarType(atol->dtype()))));
-      atol_v = get_tensor_value(dev_ctx, *atol);
-    }
-
-    IscloseFunctor<DeviceContext, T>()(dev_ctx, *input, *other, rtol_v, atol_v,
-                                       equal_nan, out);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/isfinite_v2_op.cc b/paddle/fluid/operators/isfinite_v2_op.cc
index 735fffa7203b1213fccec0c4098048e85a6b24f8..cfa370ff9cb19dfb7d488b03cba52c115083cdc8 100644
--- a/paddle/fluid/operators/isfinite_v2_op.cc
+++ b/paddle/fluid/operators/isfinite_v2_op.cc
@@ -12,11 +12,13 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/operators/isfinite_v2_op.h"
-
 #include <string>
 
+#include "paddle/fluid/framework/infershape_utils.h"
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/common_infer_shape_functions.h"
+#include "paddle/phi/core/infermeta_utils.h"
+#include "paddle/phi/infermeta/unary.h"
 
 namespace paddle {
 namespace framework {
@@ -49,11 +51,6 @@ class OverflowV2Op : public framework::OperatorWithKernel {
                const framework::VariableNameMap &outputs,
                const framework::AttributeMap &attrs)
       : OperatorWithKernel(type, inputs, outputs, attrs) {}
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "isfinitev2");
-    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "isfinitev2");
-    UnaryOpUnchangedInferShape(ctx);
-  }
 
  protected:
   framework::OpKernelType GetExpectedKernelType(
@@ -104,6 +101,14 @@ element of X as a tensor.
 }  // namespace paddle
 
 namespace ops = paddle::operators;
+DECLARE_INFER_SHAPE_FUNCTOR(isinf_v2, IsinfInferShapeFunctor,
+                            PD_INFER_META(phi::IsfiniteInferMeta));
+
+DECLARE_INFER_SHAPE_FUNCTOR(isnan_v2, IsnanInferShapeFunctor,
+                            PD_INFER_META(phi::IsfiniteInferMeta));
+
+DECLARE_INFER_SHAPE_FUNCTOR(isfinite_v2, IsfiniteInferShapeFunctor,
+                            PD_INFER_META(phi::IsfiniteInferMeta));
 
 #define REGISTER_V2OP_MAKER(op_type, comment)           \
   namespace paddle {                                    \
@@ -124,50 +129,17 @@ REGISTER_V2OP_MAKER(isfinite_v2, "isfinitev2(X)");
 REGISTER_OPERATOR(
     isinf_v2, ops::OverflowV2Op, ops::_isinf_v2OverflowV2OpMaker,
     paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
-    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
+    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
+    IsinfInferShapeFunctor);
 
 REGISTER_OPERATOR(
     isnan_v2, ops::OverflowV2Op, ops::_isnan_v2OverflowV2OpMaker,
     paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
-    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
+    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
+    IsnanInferShapeFunctor);
 
 REGISTER_OPERATOR(
     isfinite_v2, ops::OverflowV2Op, ops::_isfinite_v2OverflowV2OpMaker,
     paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
-    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
-
-REGISTER_OP_CPU_KERNEL(isnan_v2,
-                       ops::OverflowKernel<paddle::platform::CPUDeviceContext,
-                                           int, ops::NANV2Functor>,
-                       ops::OverflowKernel<paddle::platform::CPUDeviceContext,
-                                           int64_t, ops::NANV2Functor>,
-                       ops::OverflowKernel<paddle::platform::CPUDeviceContext,
-                                           float, ops::NANV2Functor>,
-                       ops::OverflowKernel<paddle::platform::CPUDeviceContext,
-                                           double, ops::NANV2Functor>,
-                       ops::OverflowKernel<paddle::platform::CPUDeviceContext,
-                                           plat::float16, ops::NANV2Functor>);
-
-REGISTER_OP_CPU_KERNEL(
-    isinf_v2, ops::OverflowKernel<paddle::platform::CPUDeviceContext, int,
-                                  ops::InfinityV2Functor>,
-    ops::OverflowKernel<paddle::platform::CPUDeviceContext, int64_t,
-                        ops::InfinityV2Functor>,
-    ops::OverflowKernel<paddle::platform::CPUDeviceContext, float,
-                        ops::InfinityV2Functor>,
-    ops::OverflowKernel<paddle::platform::CPUDeviceContext, double,
-                        ops::InfinityV2Functor>,
-    ops::OverflowKernel<paddle::platform::CPUDeviceContext, plat::float16,
-                        ops::InfinityV2Functor>);
-
-REGISTER_OP_CPU_KERNEL(
-    isfinite_v2, ops::OverflowKernel<paddle::platform::CPUDeviceContext, int,
-                                     ops::IsfiniteV2Functor>,
-    ops::OverflowKernel<paddle::platform::CPUDeviceContext, int64_t,
-                        ops::IsfiniteV2Functor>,
-    ops::OverflowKernel<paddle::platform::CPUDeviceContext, float,
-                        ops::IsfiniteV2Functor>,
-    ops::OverflowKernel<paddle::platform::CPUDeviceContext, double,
-                        ops::IsfiniteV2Functor>,
-    ops::OverflowKernel<paddle::platform::CPUDeviceContext, plat::float16,
-                        ops::IsfiniteV2Functor>);
+    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
+    IsfiniteInferShapeFunctor);
diff --git a/paddle/fluid/operators/isfinite_v2_op.cu b/paddle/fluid/operators/isfinite_v2_op.cu
deleted file mode 100644
index 1b9f19d36dfa0f590f96577295ffb12e4456d2e5..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/isfinite_v2_op.cu
+++ /dev/null
@@ -1,55 +0,0 @@
-// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/operators/isfinite_v2_op.h"
-#include "paddle/fluid/platform/float16.h"
-
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-
-REGISTER_OP_CUDA_KERNEL(isnan_v2,
-                        ops::OverflowKernel<paddle::platform::CUDADeviceContext,
-                                            int, ops::NANV2Functor>,
-                        ops::OverflowKernel<paddle::platform::CUDADeviceContext,
-                                            int64_t, ops::NANV2Functor>,
-                        ops::OverflowKernel<paddle::platform::CUDADeviceContext,
-                                            float, ops::NANV2Functor>,
-                        ops::OverflowKernel<paddle::platform::CUDADeviceContext,
-                                            double, ops::NANV2Functor>,
-                        ops::OverflowKernel<paddle::platform::CUDADeviceContext,
-                                            plat::float16, ops::NANV2Functor>);
-
-REGISTER_OP_CUDA_KERNEL(
-    isinf_v2, ops::OverflowKernel<paddle::platform::CUDADeviceContext, int,
-                                  ops::InfinityV2Functor>,
-    ops::OverflowKernel<paddle::platform::CUDADeviceContext, int64_t,
-                        ops::InfinityV2Functor>,
-    ops::OverflowKernel<paddle::platform::CUDADeviceContext, float,
-                        ops::InfinityV2Functor>,
-    ops::OverflowKernel<paddle::platform::CUDADeviceContext, double,
-                        ops::InfinityV2Functor>,
-    ops::OverflowKernel<paddle::platform::CUDADeviceContext, plat::float16,
-                        ops::InfinityV2Functor>);
-
-REGISTER_OP_CUDA_KERNEL(
-    isfinite_v2, ops::OverflowKernel<paddle::platform::CUDADeviceContext, int,
-                                     ops::IsfiniteV2Functor>,
-    ops::OverflowKernel<paddle::platform::CUDADeviceContext, int64_t,
-                        ops::IsfiniteV2Functor>,
-    ops::OverflowKernel<paddle::platform::CUDADeviceContext, float,
-                        ops::IsfiniteV2Functor>,
-    ops::OverflowKernel<paddle::platform::CUDADeviceContext, double,
-                        ops::IsfiniteV2Functor>,
-    ops::OverflowKernel<paddle::platform::CUDADeviceContext, plat::float16,
-                        ops::IsfiniteV2Functor>);
diff --git a/paddle/fluid/operators/kldiv_loss_op.cc b/paddle/fluid/operators/kldiv_loss_op.cc
index a78d8ec10149db5a1f8d585cb06bb08ea6ca5a5f..dcd98054b05c314da0884e8dc6be358d3afb0483 100644
--- a/paddle/fluid/operators/kldiv_loss_op.cc
+++ b/paddle/fluid/operators/kldiv_loss_op.cc
@@ -9,7 +9,6 @@
    See the License for the specific language governing permissions and
    limitations under the License. */
 
-#include "paddle/fluid/operators/kldiv_loss_op.h"
 #include <memory>
 #include <string>
 #include "paddle/fluid/framework/op_registry.h"
@@ -177,10 +176,3 @@ REGISTER_OPERATOR(kldiv_loss, ops::KLDivLossOp, ops::KLDivLossOpMaker,
                   ops::KLDivLossOpGradMaker<paddle::imperative::OpBase>);
 REGISTER_OPERATOR(kldiv_loss_grad, ops::KLDivLossOpGrad,
                   ops::KLDivLossGradNoNeedBufferVarInferer);
-REGISTER_OP_CPU_KERNEL(
-    kldiv_loss, ops::KLDivLossKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::KLDivLossKernel<paddle::platform::CPUDeviceContext, double>);
-REGISTER_OP_CPU_KERNEL(
-    kldiv_loss_grad,
-    ops::KLDivLossGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::KLDivLossGradKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/kldiv_loss_op.cu b/paddle/fluid/operators/kldiv_loss_op.cu
deleted file mode 100644
index 5226cb8c08e3db4a0bfbbe4440c27264903f06e3..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/kldiv_loss_op.cu
+++ /dev/null
@@ -1,22 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include "paddle/fluid/operators/kldiv_loss_op.h"
-
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-REGISTER_OP_CUDA_KERNEL(
-    kldiv_loss,
-    ops::KLDivLossKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::KLDivLossKernel<paddle::platform::CUDADeviceContext, double>);
-REGISTER_OP_CUDA_KERNEL(
-    kldiv_loss_grad,
-    ops::KLDivLossGradKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::KLDivLossGradKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/kldiv_loss_op.h b/paddle/fluid/operators/kldiv_loss_op.h
deleted file mode 100644
index 5a6ef06f5eb1e855c8a528664528c9919304c7b9..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/kldiv_loss_op.h
+++ /dev/null
@@ -1,119 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-   http://www.apache.org/licenses/LICENSE-2.0
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
-
-#pragma once
-#include <string>
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/phi/core/hostdevice.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-using Array1 = Eigen::DSizes<int64_t, 1>;
-
-template <typename T>
-struct KLDivLossForward {
-  HOSTDEVICE KLDivLossForward() {}
-
-  HOSTDEVICE T operator()(const T& target, const T& input) const {
-    if (target <= 0) {
-      return 0;
-    } else {
-      return target * (std::log(target) - input);
-    }
-  }
-};
-
-template <typename T>
-struct KLDivLossBackward {
-  HOSTDEVICE KLDivLossBackward() {}
-
-  HOSTDEVICE T operator()(const T& target, const T& grad) const {
-    if (target <= 0) {
-      return 0;
-    } else {
-      return static_cast<T>(-1.) * grad;
-    }
-  }
-};
-
-template <typename DeviceContext, typename T>
-class KLDivLossKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
-    auto* input = ctx.Input<Tensor>("X");
-    auto* target = ctx.Input<Tensor>("Target");
-    auto* loss = ctx.Output<Tensor>("Loss");
-    auto reduction = ctx.Attr<std::string>("reduction");
-
-    const int n = input->dims()[0];
-
-    loss->mutable_data<T>(ctx.GetPlace());
-    auto input_t = framework::EigenVector<T>::Flatten(*input);
-    auto target_t = framework::EigenVector<T>::Flatten(*target);
-    auto loss_t = framework::EigenVector<T>::Flatten(*loss);
-    auto output = target_t.binaryExpr(input_t, KLDivLossForward<T>());
-    if ("none" == reduction) {
-      loss_t.device(place) = output;
-    } else if ("batchmean" == reduction) {
-      auto output_sum = output.sum();
-      if (n > 0) {
-        loss_t.device(place) = output_sum / output_sum.constant(n);
-      } else {
-        loss_t.device(place) = output_sum;
-      }
-    } else if ("mean" == reduction) {
-      loss_t.device(place) = output.mean();
-    } else if ("sum" == reduction) {
-      loss_t.device(place) = output.sum();
-    }
-  }
-};
-
-template <typename DeviceContext, typename T>
-class KLDivLossGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
-    auto* target = ctx.Input<Tensor>("Target");
-    auto reduction = ctx.Attr<std::string>("reduction");
-    auto* input_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
-    auto* loss_grad = ctx.Input<Tensor>(framework::GradVarName("Loss"));
-
-    const int n = input_grad->dims()[0];
-    const int numel = input_grad->numel();
-    const int expand = numel / loss_grad->numel();
-
-    input_grad->mutable_data<T>(ctx.GetPlace());
-
-    auto target_t = framework::EigenVector<T>::Flatten(*target);
-
-    auto input_grad_t = framework::EigenVector<T>::Flatten(*input_grad);
-    auto loss_grad_t = framework::EigenVector<T>::Flatten(*loss_grad);
-
-    auto loss_grad_expand = loss_grad_t.broadcast(Array1(expand));
-    auto grad_t = target_t * loss_grad_expand;
-    input_grad_t.device(place) =
-        target_t.binaryExpr(grad_t, KLDivLossBackward<T>());
-
-    if ("mean" == reduction) {
-      input_grad_t.device(place) = input_grad_t / static_cast<T>(numel);
-    } else if ("batchmean" == reduction) {
-      input_grad_t.device(place) = input_grad_t / static_cast<T>(n);
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/kldiv_loss_op_npu.cc b/paddle/fluid/operators/kldiv_loss_op_npu.cc
index 322ae5df4cb877b4dde022e6c203a32cd8dd001d..eac181489aa9d09f4661c898b13e77570ad928a8 100644
--- a/paddle/fluid/operators/kldiv_loss_op_npu.cc
+++ b/paddle/fluid/operators/kldiv_loss_op_npu.cc
@@ -12,7 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the Licnse. */
 
-#include "paddle/fluid/operators/kldiv_loss_op.h"
+#include <string>
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/device/npu/npu_op_runner.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/kron_op.cc b/paddle/fluid/operators/kron_op.cc
index 58d51ab1c723f296d3728a23de95a116acbb4df3..68d0c7978b4e45f216abd5fa5c4be93f788e8f04 100644
--- a/paddle/fluid/operators/kron_op.cc
+++ b/paddle/fluid/operators/kron_op.cc
@@ -17,9 +17,7 @@ limitations under the License. */
 #include <unordered_map>
 #include <vector>
 
-#include "paddle/fluid/operators/kron_op.h"
-#include "paddle/fluid/platform/complex.h"
-#include "paddle/fluid/platform/float16.h"
+#include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
 namespace operators {
@@ -178,27 +176,4 @@ namespace ops = paddle::operators;
 REGISTER_OPERATOR(kron, ops::KronOp, ops::KronOpMaker,
                   ops::KronGradOpMaker<paddle::framework::OpDesc>,
                   ops::KronGradOpMaker<paddle::imperative::OpBase>);
-REGISTER_OP_CPU_KERNEL(
-    kron, ops::KronKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::KronKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::KronKernel<paddle::platform::CPUDeviceContext,
-                    paddle::platform::float16>,
-    ops::KronKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::KronKernel<paddle::platform::CPUDeviceContext, int64_t>,
-    ops::KronKernel<paddle::platform::CPUDeviceContext,
-                    paddle::platform::complex<float>>,
-    ops::KronKernel<paddle::platform::CPUDeviceContext,
-                    paddle::platform::complex<double>>);
-
 REGISTER_OPERATOR(kron_grad, ops::KronGradOp);
-REGISTER_OP_CPU_KERNEL(
-    kron_grad, ops::KronGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::KronGradKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::KronGradKernel<paddle::platform::CPUDeviceContext,
-                        paddle::platform::float16>,
-    ops::KronGradKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::KronGradKernel<paddle::platform::CPUDeviceContext, int64_t>,
-    ops::KronGradKernel<paddle::platform::CPUDeviceContext,
-                        paddle::platform::complex<float>>,
-    ops::KronGradKernel<paddle::platform::CPUDeviceContext,
-                        paddle::platform::complex<double>>);
diff --git a/paddle/fluid/operators/kron_op.cu b/paddle/fluid/operators/kron_op.cu
deleted file mode 100644
index e5124e65007509568ae8cd8ab65b33c504a12fe9..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/kron_op.cu
+++ /dev/null
@@ -1,42 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/kron_op.h"
-#include "paddle/fluid/platform/complex.h"
-#include "paddle/fluid/platform/float16.h"
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    kron, ops::KronKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::KronKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::KronKernel<paddle::platform::CUDADeviceContext,
-                    paddle::platform::float16>,
-    ops::KronKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::KronKernel<paddle::platform::CUDADeviceContext, int64_t>,
-    ops::KronKernel<paddle::platform::CUDADeviceContext,
-                    paddle::platform::complex<float>>,
-    ops::KronKernel<paddle::platform::CUDADeviceContext,
-                    paddle::platform::complex<double>>);
-
-REGISTER_OP_CUDA_KERNEL(
-    kron_grad, ops::KronGradKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::KronGradKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::KronGradKernel<paddle::platform::CUDADeviceContext,
-                        paddle::platform::float16>,
-    ops::KronGradKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::KronGradKernel<paddle::platform::CUDADeviceContext, int64_t>,
-    ops::KronGradKernel<paddle::platform::CUDADeviceContext,
-                        paddle::platform::complex<float>>,
-    ops::KronGradKernel<paddle::platform::CUDADeviceContext,
-                        paddle::platform::complex<double>>);
diff --git a/paddle/fluid/operators/kron_op.h b/paddle/fluid/operators/kron_op.h
deleted file mode 100644
index 274b47c03a4d3d381dceda43d502a6e2d14669a5..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/kron_op.h
+++ /dev/null
@@ -1,415 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <algorithm>
-#include <vector>
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/platform/for_range.h"
-#if defined(__NVCC__) || defined(__HIPCC__)
-#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h"
-#include "thrust/device_vector.h"
-#endif
-
-namespace paddle {
-namespace operators {
-
-// Process an element in the output, used with a parallel-for
-template <typename T>
-struct KronElemFunctor {
-  KronElemFunctor(const T* a, const T* b, T* out, const int64_t* shape_b,
-                  const int64_t* stride_a, const int64_t* stride_b,
-                  const int64_t* stride_out, int ndims)
-      : a_(a),
-        b_(b),
-        out_(out),
-        shape_b_(shape_b),
-        stride_a_(stride_a),
-        stride_b_(stride_b),
-        stride_out_(stride_out),
-        ndims_(ndims) {}
-
-  HOSTDEVICE void operator()(int64_t idx) const {
-    // it computes 1 element in the output
-    int64_t index = idx;
-    int64_t index_a = 0;
-    int64_t index_b = 0;
-    for (int i = 0; i < ndims_; i++) {
-      auto pos_i = index / stride_out_[i];
-      index = index % stride_out_[i];
-      auto pos_ai = pos_i / shape_b_[i];
-      auto pos_bi = pos_i % shape_b_[i];
-      index_a += stride_a_[i] * pos_ai;
-      index_b += stride_b_[i] * pos_bi;
-    }
-    out_[idx] = a_[index_a] * b_[index_b];
-  }
-
- private:
-  const T* a_;
-  const T* b_;
-  T* out_;
-  const int64_t* shape_b_;
-  const int64_t* stride_a_;
-  const int64_t* stride_b_;
-  const int64_t* stride_out_;
-  const int ndims_;
-};
-
-template <typename DeviceContext, typename T>
-struct KronOpFunctor {
-  void operator()(const DeviceContext& dev_ctx, const framework::Tensor& x,
-                  const framework::Tensor& y, framework::Tensor* out) {
-    int ndims = out->dims().size();
-    int64_t numel = out->numel();
-
-    const framework::DDim& dim_x = x.dims();
-    const framework::DDim& dim_y = y.dims();
-    const framework::DDim& dim_out = out->dims();
-    const framework::DDim stride_x = phi::stride(dim_x);
-    const framework::DDim stride_y = phi::stride(dim_y);
-    const framework::DDim stride_out = phi::stride(dim_out);
-
-    const int64_t *p_stride_x = nullptr, *p_stride_y = nullptr,
-                  *p_stride_out = nullptr, *p_shape_y = nullptr;
-#if defined(__NVCC__) || defined(__HIPCC__)
-    thrust::device_vector<int64_t> d_stride_x(ndims);
-    thrust::device_vector<int64_t> d_stride_y(ndims);
-    thrust::device_vector<int64_t> d_stride_out(ndims);
-    thrust::device_vector<int64_t> d_shape_y(ndims);
-    thrust::copy(stride_x.Get(), stride_x.Get() + ndims, d_stride_x.begin());
-    thrust::copy(stride_y.Get(), stride_y.Get() + ndims, d_stride_y.begin());
-    thrust::copy(stride_out.Get(), stride_out.Get() + ndims,
-                 d_stride_out.begin());
-    thrust::copy(dim_y.Get(), dim_y.Get() + ndims, d_shape_y.begin());
-
-    p_stride_x = thrust::raw_pointer_cast(d_stride_x.data());
-    p_stride_y = thrust::raw_pointer_cast(d_stride_y.data());
-    p_stride_out = thrust::raw_pointer_cast(d_stride_out.data());
-    p_shape_y = thrust::raw_pointer_cast(d_shape_y.data());
-#else
-    p_stride_x = stride_x.Get();
-    p_stride_y = stride_y.Get();
-    p_stride_out = stride_out.Get();
-    p_shape_y = dim_y.Get();
-#endif
-
-    platform::ForRange<DeviceContext> for_range(dev_ctx, numel);
-    KronElemFunctor<T> functor(x.data<T>(), y.data<T>(), out->data<T>(),
-                               p_shape_y, p_stride_x, p_stride_y, p_stride_out,
-                               ndims);
-    for_range(functor);
-  }
-};
-
-template <typename T>
-struct KronGradElemFunctor {
-  KronGradElemFunctor(const T* dout, const T* A, const T* B, T* dout_a,
-                      T* dout_b, const int64_t* stride_dout,
-                      const int64_t* stride_a, const int64_t* stride_b,
-                      const int64_t* shape_b, const int64_t numel_a,
-                      const int64_t numel_b, const int ndims)
-      : dout_(dout),
-        A_(A),
-        B_(B),
-        dout_a_(dout_a),
-        dout_b_(dout_b),
-        stride_dout_(stride_dout),
-        stride_a_(stride_a),
-        stride_b_(stride_b),
-        shape_b_(shape_b),
-        numel_a_(numel_a),
-        numel_b_(numel_b),
-        ndims_(ndims) {}
-
-  HOSTDEVICE void operator()(int64_t idx) {
-    int64_t index = idx;
-    int64_t index_a = 0;
-    int64_t index_b = 0;
-    for (int i = 0; i < ndims_; i++) {
-      auto pos_i = index / stride_dout_[i];
-      index = index % stride_dout_[i];
-      auto pos_ai = pos_i / shape_b_[i];
-      auto pos_bi = pos_i % shape_b_[i];
-      index_a += stride_a_[i] * pos_ai;
-      index_b += stride_b_[i] * pos_bi;
-    }
-
-    if (dout_a_) {
-      size_t index_out_a = index_a * numel_b_ + index_b;
-      dout_a_[index_out_a] = dout_[idx] * B_[index_b];
-    }
-    if (dout_b_) {
-      size_t index_out_b = index_b * numel_a_ + index_a;
-      dout_b_[index_out_b] = dout_[idx] * A_[index_a];
-    }
-  }
-
- private:
-  const T* dout_;
-  const T* A_;
-  const T* B_;
-  T* dout_a_;
-  T* dout_b_;
-  const int64_t* stride_dout_;
-  const int64_t* stride_a_;
-  const int64_t* stride_b_;
-  const int64_t* shape_b_;
-  const int64_t numel_a_;
-  const int64_t numel_b_;
-  const int ndims_;
-};
-
-template <typename T>
-struct KronGradElemFunctor<platform::complex<T>> {
-  KronGradElemFunctor(const platform::complex<T>* dout,
-                      const platform::complex<T>* A,
-                      const platform::complex<T>* B,
-                      platform::complex<T>* dout_a,
-                      platform::complex<T>* dout_b, const int64_t* stride_dout,
-                      const int64_t* stride_a, const int64_t* stride_b,
-                      const int64_t* shape_b, const int64_t numel_a,
-                      const int64_t numel_b, const int ndims)
-      : dout_(dout),
-        A_(A),
-        B_(B),
-        dout_a_(dout_a),
-        dout_b_(dout_b),
-        stride_dout_(stride_dout),
-        stride_a_(stride_a),
-        stride_b_(stride_b),
-        shape_b_(shape_b),
-        numel_a_(numel_a),
-        numel_b_(numel_b),
-        ndims_(ndims) {}
-
-  HOSTDEVICE void operator()(int64_t idx) {
-    int64_t index = idx;
-    int64_t index_a = 0;
-    int64_t index_b = 0;
-    for (int i = 0; i < ndims_; i++) {
-      auto pos_i = index / stride_dout_[i];
-      index = index % stride_dout_[i];
-      auto pos_ai = pos_i / shape_b_[i];
-      auto pos_bi = pos_i % shape_b_[i];
-      index_a += stride_a_[i] * pos_ai;
-      index_b += stride_b_[i] * pos_bi;
-    }
-
-    if (dout_a_) {
-      size_t index_out_a = index_a * numel_b_ + index_b;
-      dout_a_[index_out_a] =
-          dout_[idx] *
-          platform::complex<T>(B_[index_b].real, -B_[index_b].imag);
-    }
-    if (dout_b_) {
-      size_t index_out_b = index_b * numel_a_ + index_a;
-      dout_b_[index_out_b] =
-          dout_[idx] *
-          platform::complex<T>(A_[index_a].real, -A_[index_a].imag);
-    }
-  }
-
- private:
-  const platform::complex<T>* dout_;
-  const platform::complex<T>* A_;
-  const platform::complex<T>* B_;
-  platform::complex<T>* dout_a_;
-  platform::complex<T>* dout_b_;
-  const int64_t* stride_dout_;
-  const int64_t* stride_a_;
-  const int64_t* stride_b_;
-  const int64_t* shape_b_;
-  const int64_t numel_a_;
-  const int64_t numel_b_;
-  const int ndims_;
-};
-
-template <typename DeviceContext, typename T>
-struct KronGradOpFunctor {
-  void operator()(const DeviceContext& dev_ctx, const framework::Tensor& dout,
-                  const framework::Tensor& x, const framework::Tensor& y,
-                  framework::Tensor* dx, framework::Tensor* dy) {
-    int ndims = dout.dims().size();
-    int64_t numel = dout.numel();
-    int64_t numel_x = x.numel();
-    int64_t numel_y = y.numel();
-
-    const framework::DDim& dim_x = x.dims();
-    const framework::DDim& dim_y = y.dims();
-    const framework::DDim& dim_dout = dout.dims();
-
-    const framework::DDim stride_x = phi::stride(dim_x);
-    const framework::DDim stride_y = phi::stride(dim_y);
-    const framework::DDim stride_dout = phi::stride(dim_dout);
-
-    const int64_t* p_stride_x = nullptr;
-    const int64_t* p_stride_y = nullptr;
-    const int64_t* p_stride_dout = nullptr;
-    const int64_t* p_shape_y = nullptr;
-#if defined(__NVCC__) || defined(__HIPCC__)
-    thrust::device_vector<int64_t> d_stride_x(ndims);
-    thrust::device_vector<int64_t> d_stride_y(ndims);
-    thrust::device_vector<int64_t> d_stride_dout(ndims);
-    thrust::device_vector<int64_t> d_shape_y(ndims);
-    thrust::copy(stride_x.Get(), stride_x.Get() + ndims, d_stride_x.begin());
-    thrust::copy(stride_y.Get(), stride_y.Get() + ndims, d_stride_y.begin());
-    thrust::copy(stride_dout.Get(), stride_dout.Get() + ndims,
-                 d_stride_dout.begin());
-    thrust::copy(dim_y.Get(), dim_y.Get() + ndims, d_shape_y.begin());
-
-    p_stride_x = thrust::raw_pointer_cast(d_stride_x.data());
-    p_stride_y = thrust::raw_pointer_cast(d_stride_y.data());
-    p_stride_dout = thrust::raw_pointer_cast(d_stride_dout.data());
-    p_shape_y = thrust::raw_pointer_cast(d_shape_y.data());
-#else
-    p_stride_x = stride_x.Get();
-    p_stride_y = stride_y.Get();
-    p_stride_dout = stride_dout.Get();
-    p_shape_y = dim_y.Get();
-#endif
-    // dout_x: dout * kron(ones(X), Y) re-aranged in shape (numel_x, numel_y)
-    // dout_y: dout * kron(X, ones(Y)) re-aranged in shaoe (numel_y, numel_x)
-    framework::Tensor dout_x;
-    T* p_dout_x = nullptr;
-    if (dx) {
-      dout_x.mutable_data<T>({numel_x, numel_y}, dev_ctx.GetPlace());
-      p_dout_x = dout_x.data<T>();
-    }
-    framework::Tensor dout_y;
-    T* p_dout_y = nullptr;
-    if (dy) {
-      dout_y.mutable_data<T>({numel_y, numel_x}, dev_ctx.GetPlace());
-      p_dout_y = dout_y.data<T>();
-    }
-
-    platform::ForRange<DeviceContext> for_range(dev_ctx, numel);
-    KronGradElemFunctor<T> func(dout.data<T>(), x.data<T>(), y.data<T>(),
-                                p_dout_x, p_dout_y, p_stride_dout, p_stride_x,
-                                p_stride_y, p_shape_y, numel_x, numel_y, ndims);
-    for_range(func);
-
-// reduce_sum along aixs 1
-#if defined(__NVCC__) || defined(__HIPCC__)
-    auto stream = dev_ctx.stream();  // it is a cuda device_context
-    if (dx) {
-      TensorReduceImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
-          dev_ctx, dout_x, dx, kps::IdentityFunctor<T>(), {1}, stream);
-    }
-    if (dy) {
-      TensorReduceImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
-          dev_ctx, dout_y, dy, kps::IdentityFunctor<T>(), {1}, stream);
-    }
-#else
-    auto* place = dev_ctx.eigen_device();
-    Eigen::array<int, 1> reduce_dim = {1};
-    if (dx) {
-      auto eigen_dout_x = framework::EigenMatrix<T>::Reshape(dout_x, 1);
-      auto eigen_vec_dx = framework::EigenVector<T>::Flatten(*dx);
-      eigen_vec_dx.device(*place) = eigen_dout_x.sum(reduce_dim);
-    }
-    if (dy) {
-      auto eigen_dout_y = framework::EigenMatrix<T>::Reshape(dout_y, 1);
-      auto eigen_vec_dy = framework::EigenVector<T>::Flatten(*dy);
-      eigen_vec_dy.device(*place) = eigen_dout_y.sum(reduce_dim);
-    }
-#endif
-  }
-};
-
-inline framework::Tensor UnsqueezeTo(const framework::Tensor& src, int ndims) {
-  const framework::DDim& shape = src.dims();
-  int rank = shape.size();
-  framework::Tensor res;
-  res.ShareDataWith(src);
-  PADDLE_ENFORCE_LE(
-      rank, ndims,
-      platform::errors::InvalidArgument(
-          "The input Tensor's rank should be less than or equal to ndims"
-          "Received input Tensor's rank = %d, ndims = %d",
-          rank, ndims));
-  if (rank < ndims) {
-    std::vector<int64_t> new_dim(ndims, 1);
-    for (int i = ndims - rank; i < ndims; i++) {
-      new_dim[i] = shape[i - ndims + rank];
-    }
-    res.Resize(phi::make_ddim(new_dim));
-  }
-  return res;
-}
-
-template <typename DeviceContext, typename T>
-class KronKernel : public framework::OpKernel<T> {
- public:
-  virtual void Compute(const framework::ExecutionContext& ctx) const {
-    auto& dev_ctx = ctx.template device_context<DeviceContext>();
-    auto* x = ctx.Input<framework::Tensor>("X");
-    auto* y = ctx.Input<framework::Tensor>("Y");
-
-    auto* out = ctx.Output<framework::Tensor>("Out");
-    out->mutable_data<T>(ctx.GetPlace());
-
-    int ndims = out->dims().size();
-    framework::Tensor xx = UnsqueezeTo(*x, ndims);
-    framework::Tensor yy = UnsqueezeTo(*y, ndims);
-
-    KronOpFunctor<DeviceContext, T> func;
-    func(dev_ctx, xx, yy, out);
-  }
-};
-
-template <typename DeviceContext, typename T>
-class KronGradKernel : public framework::OpKernel<T> {
- public:
-  virtual void Compute(const framework::ExecutionContext& ctx) const {
-    auto& dev_ctx = ctx.template device_context<DeviceContext>();
-    auto* x = ctx.Input<framework::Tensor>("X");
-    auto* y = ctx.Input<framework::Tensor>("Y");
-    auto* dout = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
-
-    auto* dx = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
-    auto* dy = ctx.Output<framework::Tensor>(framework::GradVarName("Y"));
-    if (dx) {
-      dx->mutable_data<T>(ctx.GetPlace());
-    }
-    if (dy) {
-      dy->mutable_data<T>(ctx.GetPlace());
-    }
-
-    int ndims = dout->dims().size();
-    framework::Tensor xx = UnsqueezeTo(*x, ndims);
-    framework::Tensor yy = UnsqueezeTo(*y, ndims);
-
-    framework::Tensor* pdxx = nullptr;
-    framework::Tensor* pdyy = nullptr;
-    framework::Tensor dxx;
-    framework::Tensor dyy;
-    if (dx) {
-      dxx = UnsqueezeTo(*dx, ndims);
-      pdxx = &dxx;
-    }
-
-    if (dy) {
-      dyy = UnsqueezeTo(*dy, ndims);
-      pdyy = &dyy;
-    }
-
-    KronGradOpFunctor<DeviceContext, T> func;
-    func(dev_ctx, *dout, xx, yy, pdxx, pdyy);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/kthvalue_op.cu b/paddle/fluid/operators/kthvalue_op.cu
index 4f30c58d375008abb3509989f90bcd9fec91fb38..f6f56f70f1a11971b31e679ef879f2d1d0a96085 100644
--- a/paddle/fluid/operators/kthvalue_op.cu
+++ b/paddle/fluid/operators/kthvalue_op.cu
@@ -16,7 +16,6 @@
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/kthvalue_op.h"
 #include "paddle/fluid/operators/top_k_function_cuda.h"
-#include "paddle/fluid/operators/top_k_v2_op.h"
 #ifdef __NVCC__
 #include "cub/cub.cuh"
 #endif
diff --git a/paddle/fluid/operators/layer_norm_kernel.cu.h b/paddle/fluid/operators/layer_norm_kernel.cu.h
index 62c21dd2eee401e5f8a526870015c18cf13ee873..412ae3c49b5f3cc9fc2422aa220af324e6d99b69 100644
--- a/paddle/fluid/operators/layer_norm_kernel.cu.h
+++ b/paddle/fluid/operators/layer_norm_kernel.cu.h
@@ -22,10 +22,10 @@ limitations under the License. */
 namespace cub = hipcub;
 #endif
 
-#include "paddle/fluid/platform/aligned_vector.h"
 #include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
 #include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
 #include "paddle/phi/core/ddim.h"
+#include "paddle/phi/kernels/funcs/aligned_vector.h"
 
 namespace paddle {
 namespace operators {
@@ -186,8 +186,8 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void ln_fwd_1024_kernel(
     const ScaleT *__restrict__ gamma_ptr, const ScaleT *__restrict__ beta_ptr,
     U *__restrict__ mean_out_ptr, U *__restrict__ var_out_ptr,
     T *__restrict__ y_ptr) {
-  using Vec = platform::AlignedVector<T, VecSize>;
-  using Vec_scale = platform::AlignedVector<ScaleT, VecSize>;
+  using Vec = phi::AlignedVector<T, VecSize>;
+  using Vec_scale = phi::AlignedVector<ScaleT, VecSize>;
 
   const int tidx = threadIdx.x;
   const int bidx = blockIdx.x;
@@ -203,8 +203,8 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void ln_fwd_1024_kernel(
   Vec_scale beta[LDGS];
 #pragma unroll
   for (int it = 0, col = c; it < LDGS; it++) {
-    platform::Load<ScaleT, VecSize>(gamma_ptr + col * VecSize, &gamma[it]);
-    platform::Load<ScaleT, VecSize>(beta_ptr + col * VecSize, &beta[it]);
+    phi::Load<ScaleT, VecSize>(gamma_ptr + col * VecSize, &gamma[it]);
+    phi::Load<ScaleT, VecSize>(beta_ptr + col * VecSize, &beta[it]);
     col += THREADS_PER_ROW;
   }
 
@@ -213,8 +213,7 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void ln_fwd_1024_kernel(
     Vec x[LDGS];
 #pragma unroll
     for (int it = 0, col = c; it < LDGS; it++) {
-      platform::Load<T, VecSize>(x_ptr + row * LN_NUM_COLS + col * VecSize,
-                                 &x[it]);
+      phi::Load<T, VecSize>(x_ptr + row * LN_NUM_COLS + col * VecSize, &x[it]);
       col += THREADS_PER_ROW;
     }
     U xf[LDGS * VecSize];
@@ -276,8 +275,7 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void ln_fwd_1024_kernel(
 
 #pragma unroll
     for (int it = 0, col = c; it < LDGS; it++) {
-      platform::Store<T, VecSize>(x[it],
-                                  y_ptr + row * LN_NUM_COLS + col * VecSize);
+      phi::Store<T, VecSize>(x[it], y_ptr + row * LN_NUM_COLS + col * VecSize);
       col += THREADS_PER_ROW;
     }
   }
@@ -401,9 +399,9 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_bwd_1024_kernel(
     U *__restrict__ dgamma_temp_ptr, U *__restrict__ dbeta_temp_ptr,
     T *__restrict__ dx_ptr, const MaskType *mask_ptr = nullptr,
     T factor = static_cast<T>(0), T *d_dropout_src_ptr = nullptr) {
-  using Vec = platform::AlignedVector<T, VecSize>;
-  using Vec_scale = platform::AlignedVector<ScaleT, VecSize>;
-  using MaskLoadT = platform::AlignedVector<MaskType, VecSize>;
+  using Vec = phi::AlignedVector<T, VecSize>;
+  using Vec_scale = phi::AlignedVector<ScaleT, VecSize>;
+  using MaskLoadT = phi::AlignedVector<MaskType, VecSize>;
 
   const int tidx = threadIdx.x;
   const int bidx = blockIdx.x;
@@ -439,7 +437,7 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_bwd_1024_kernel(
   int col = c;
 #pragma unroll
   for (int it = 0; it < LDGS; it++) {
-    platform::Load<ScaleT, VecSize>(gamma_ptr + col * VecSize, &gamma[it]);
+    phi::Load<ScaleT, VecSize>(gamma_ptr + col * VecSize, &gamma[it]);
     col += THREADS_PER_ROW;
   }
 
@@ -452,12 +450,11 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_bwd_1024_kernel(
     int col = c;
 #pragma unroll
     for (int it = 0; it < LDGS; it++) {
-      platform::Load<T, VecSize>(dout_ptr + row * LN_NUM_COLS + col * VecSize,
-                                 &dout[it]);
-      platform::Load<T, VecSize>(x_ptr + row * LN_NUM_COLS + col * VecSize,
-                                 &x[it]);
+      phi::Load<T, VecSize>(dout_ptr + row * LN_NUM_COLS + col * VecSize,
+                            &dout[it]);
+      phi::Load<T, VecSize>(x_ptr + row * LN_NUM_COLS + col * VecSize, &x[it]);
       if (isFusedDropoutResidualLn) {
-        platform::Load<MaskType, VecSize>(
+        phi::Load<MaskType, VecSize>(
             mask_ptr + row * LN_NUM_COLS + col * VecSize, &mask_vec[it]);
       }
 
@@ -552,10 +549,9 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_bwd_1024_kernel(
     col = c;
 #pragma unroll
     for (int it = 0; it < LDGS; it++) {
-      platform::Store<T, VecSize>(x[it],
-                                  dx_ptr + row * LN_NUM_COLS + col * VecSize);
+      phi::Store<T, VecSize>(x[it], dx_ptr + row * LN_NUM_COLS + col * VecSize);
       if (isFusedDropoutResidualLn) {
-        platform::Store<T, VecSize>(
+        phi::Store<T, VecSize>(
             dout[it], d_dropout_src_ptr + row * LN_NUM_COLS + col * VecSize);
       }
       col += THREADS_PER_ROW;
@@ -641,7 +637,7 @@ template <
 __global__ __launch_bounds__(THREADS_PER_CTA) void ln_bwd_1024_final_kernel(
     const int rows, U *__restrict__ dg_part_, U *__restrict__ db_part_,
     ScaleT *__restrict__ dg_, ScaleT *__restrict__ db_) {
-  using Vec = platform::AlignedVector<U, VecSize>;
+  using Vec = phi::AlignedVector<U, VecSize>;
   static_assert(VEC_COLS == LN_NUM_COLS / VecSize, "");
 
   const int tidx = threadIdx.x;
@@ -669,8 +665,8 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void ln_bwd_1024_final_kernel(
     for (int row = r; row < rows; row += ROWS_PER_CTA) {
       Vec dg;
       Vec db;
-      platform::Load<U, VecSize>(dg_part_ptr, &dg);
-      platform::Load<U, VecSize>(db_part_ptr, &db);
+      phi::Load<U, VecSize>(dg_part_ptr, &dg);
+      phi::Load<U, VecSize>(db_part_ptr, &db);
       dg_part_ptr += ROWS_PER_CTA * LN_NUM_COLS;
       db_part_ptr += ROWS_PER_CTA * LN_NUM_COLS;
 
diff --git a/paddle/fluid/operators/lerp_op.cc b/paddle/fluid/operators/lerp_op.cc
index 0aaefc7ca75eb0f98e35200f0a1940aae07315b2..5e053445379118b37c9b0e0bdcb01adaec65b6c1 100644
--- a/paddle/fluid/operators/lerp_op.cc
+++ b/paddle/fluid/operators/lerp_op.cc
@@ -12,7 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/phi/core/infermeta_utils.h"
+#include "paddle/phi/infermeta/ternary.h"
 
 namespace paddle {
 namespace operators {
@@ -20,49 +23,6 @@ namespace operators {
 class LerpOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "lerp");
-    OP_INOUT_CHECK(ctx->HasInput("Y"), "Input", "Y", "lerp");
-    OP_INOUT_CHECK(ctx->HasInput("Weight"), "Input", "Weight", "lerp");
-    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "lerp");
-
-    auto x_dims = ctx->GetInputDim("X");
-    auto y_dims = ctx->GetInputDim("Y");
-    auto w_dims = ctx->GetInputDim("Weight");
-    framework::DDim out_dims;
-    out_dims = GetOutputDims(x_dims, y_dims);
-    if (w_dims.size() > 1 || w_dims[0] != 1) {
-      out_dims = GetOutputDims(out_dims, w_dims);
-    }
-
-    ctx->SetOutputDim("Out", out_dims);
-    ctx->ShareLoD("X", /*->*/ "Out");
-  }
-
- private:
-  framework::DDim GetOutputDims(const framework::DDim& s_dims,
-                                const framework::DDim& l_dims) const {
-    if (s_dims.size() > l_dims.size()) {
-      return GetOutputDims(l_dims, s_dims);
-    }
-    std::vector<int64_t> shapes = phi::vectorize<int64_t>(l_dims);
-    for (int i = s_dims.size() - 1, j = l_dims.size() - 1; i >= 0; --i, --j) {
-      int64_t s = s_dims[i];
-      int64_t l = l_dims[j];
-      if (s != l) {
-        if (l == 1) {
-          shapes[j] = s;
-        } else if (s != 1) {
-          PADDLE_THROW(platform::errors::InvalidArgument(
-              "The shape of tensor a %s:%d must match shape of tensor b "
-              "%s:%d.",
-              s_dims.to_str(), i, l_dims.to_str(), j));
-        }
-      }
-    }
-    return phi::make_ddim(shapes);
-  }
 };
 
 class LerpOpMaker : public framework::OpProtoAndCheckerMaker {
@@ -125,10 +85,12 @@ DECLARE_INPLACE_OP_INFERER(LerpInplaceInferer, {"X", "Out"});
 }  // namespace operators
 }  // namespace paddle
 
+DECLARE_INFER_SHAPE_FUNCTOR(lerp, LerpInferShapeFunctor,
+                            PD_INFER_META(phi::LerpInferMeta));
 REGISTER_OPERATOR(
     lerp, paddle::operators::LerpOp, paddle::operators::LerpOpMaker,
     paddle::operators::LerpOpGradMaker<paddle::framework::OpDesc>,
     paddle::operators::LerpOpGradMaker<paddle::imperative::OpBase>,
-    paddle::operators::LerpInplaceInferer);
+    paddle::operators::LerpInplaceInferer, LerpInferShapeFunctor);
 
 REGISTER_OPERATOR(lerp_grad, paddle::operators::LerpGradOp);
diff --git a/paddle/fluid/operators/lgamma_op.cc b/paddle/fluid/operators/lgamma_op.cc
index 148fb05afcfd9a4ef1fcbc587a2bd33947a41000..72c6b41efa98922b4ba23fa4b6e1a83f931c701e 100644
--- a/paddle/fluid/operators/lgamma_op.cc
+++ b/paddle/fluid/operators/lgamma_op.cc
@@ -12,7 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/operators/lgamma_op.h"
+#include "paddle/fluid/framework/infershape_utils.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/phi/infermeta/unary.h"
 
 namespace paddle {
 namespace operators {
@@ -35,16 +38,6 @@ $$out = log\Gamma(x)$$
 class LgammaOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Lgamma");
-    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Lgamma");
-
-    auto in_dims = ctx->GetInputDim("X");
-
-    ctx->SetOutputDim("Out", in_dims);
-    ctx->ShareLoD("X", "Out");
-  }
 };
 
 template <typename T>
@@ -83,17 +76,12 @@ class LgammaGradOp : public framework::OperatorWithKernel {
 
 namespace ops = paddle::operators;
 
+DECLARE_INFER_SHAPE_FUNCTOR(lgamma, LgammaInferShapeFunctor,
+                            PD_INFER_META(phi::UnchangedInferMeta));
+
 REGISTER_OPERATOR(lgamma, ops::LgammaOp, ops::LgammaOpMaker,
                   ops::LgammaGradMaker<paddle::framework::OpDesc>,
-                  ops::LgammaGradMaker<paddle::imperative::OpBase>);
+                  ops::LgammaGradMaker<paddle::imperative::OpBase>,
+                  LgammaInferShapeFunctor);
 
 REGISTER_OPERATOR(lgamma_grad, ops::LgammaGradOp);
-
-REGISTER_OP_CPU_KERNEL(
-    lgamma, ops::LgammaKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::LgammaKernel<paddle::platform::CPUDeviceContext, double>)
-
-REGISTER_OP_CPU_KERNEL(
-    lgamma_grad,
-    ops::LgammaGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::LgammaGradKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/lgamma_op.cu b/paddle/fluid/operators/lgamma_op.cu
deleted file mode 100644
index b9f273727b00bb5ec4398bf82b0a19737ee2387a..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/lgamma_op.cu
+++ /dev/null
@@ -1,59 +0,0 @@
-// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <unsupported/Eigen/SpecialFunctions>
-#include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h"
-#include "paddle/fluid/operators/lgamma_op.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T>
-struct CudaLgammaFunctor {
-  __device__ __forceinline__ T operator()(const T x) const {
-    return Eigen::numext::lgamma(x);
-  }
-};
-
-template <typename T>
-class LgammaKernel<platform::CUDADeviceContext, T>
-    : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    const Tensor* x = context.Input<Tensor>("X");
-    Tensor* out = context.Output<Tensor>("Out");
-    out->mutable_data<T>(context.GetPlace());
-
-    auto& dev_ctx = context.device_context<platform::CUDADeviceContext>();
-    std::vector<const framework::Tensor*> ins = {x};
-    std::vector<framework::Tensor*> outs = {out};
-    auto functor = CudaLgammaFunctor<T>();
-    paddle::operators::LaunchSameDimsElementwiseCudaKernel<T>(dev_ctx, ins,
-                                                              &outs, functor);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-REGISTER_OP_CUDA_KERNEL(
-    lgamma, ops::LgammaKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::LgammaKernel<paddle::platform::CUDADeviceContext, double>);
-
-REGISTER_OP_CUDA_KERNEL(
-    lgamma_grad,
-    ops::LgammaGradKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::LgammaGradKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/lgamma_op.h b/paddle/fluid/operators/lgamma_op.h
deleted file mode 100644
index 674054e74573208ea9bbd537419d202e1a30d8c0..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/lgamma_op.h
+++ /dev/null
@@ -1,100 +0,0 @@
-// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-#include <unsupported/Eigen/SpecialFunctions>
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/platform/for_range.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T>
-struct LgammaFunctor {
-  LgammaFunctor(const T* input, T* output, int64_t numel)
-      : input_(input), output_(output), numel_(numel) {}
-
-  HOSTDEVICE void operator()(int64_t idx) const {
-    output_[idx] = Eigen::numext::lgamma(input_[idx]);
-  }
-
- private:
-  const T* input_;
-  T* output_;
-  int64_t numel_;
-};
-
-template <typename T>
-struct LgammaGradFunctor {
-  LgammaGradFunctor(const T* dout, const T* x, T* output, int64_t numel)
-      : dout_(dout), x_(x), output_(output), numel_(numel) {}
-
-  HOSTDEVICE void operator()(int64_t idx) const {
-    output_[idx] = dout_[idx] * Eigen::numext::digamma(x_[idx]);
-  }
-
- private:
-  const T* dout_;
-  const T* x_;
-  T* output_;
-  int64_t numel_;
-};
-
-using Tensor = framework::Tensor;
-
-template <typename DeviceContext, typename T>
-class LgammaKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    const Tensor* x = context.Input<Tensor>("X");
-    Tensor* out = context.Output<Tensor>("Out");
-
-    auto numel = x->numel();
-    auto* x_data = x->data<T>();
-    auto* out_data = out->mutable_data<T>(context.GetPlace(),
-                                          size_t(x->numel() * sizeof(T)));
-
-    auto& dev_ctx = context.template device_context<DeviceContext>();
-    platform::ForRange<DeviceContext> for_range(dev_ctx, numel);
-    LgammaFunctor<T> functor(x_data, out_data, numel);
-    for_range(functor);
-  }
-};
-
-template <typename DeviceContext, typename T>
-class LgammaGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const {
-    const framework::Tensor* d_out =
-        ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
-    const framework::Tensor* x = ctx.Input<framework::Tensor>("X");
-    framework::Tensor* d_x =
-        ctx.Output<framework::Tensor>(framework::GradVarName("X"));
-
-    auto numel = d_out->numel();
-    auto* dout_data = d_out->data<T>();
-    auto* x_data = x->data<T>();
-    auto* dx_data = d_x->mutable_data<T>(
-        ctx.GetPlace(), static_cast<size_t>(numel * sizeof(T)));
-
-    auto& dev_ctx = ctx.template device_context<DeviceContext>();
-    platform::ForRange<DeviceContext> for_range(dev_ctx, numel);
-    LgammaGradFunctor<T> functor(dout_data, x_data, dx_data, numel);
-    for_range(functor);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/linspace_op.cc b/paddle/fluid/operators/linspace_op.cc
index fe271fa5e893a750bdbbdc05ac4b7835205ebe66..378c7573d6129abc28bd53dd6f964e5c726cce34 100644
--- a/paddle/fluid/operators/linspace_op.cc
+++ b/paddle/fluid/operators/linspace_op.cc
@@ -12,9 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/linspace_op.h"
 #include <string>
+
+#include "paddle/fluid/framework/infershape_utils.h"
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/op_version_registry.h"
+#include "paddle/phi/core/infermeta_utils.h"
+#include "paddle/phi/infermeta/ternary.h"
 
 namespace paddle {
 namespace operators {
@@ -23,33 +27,6 @@ class LinspaceOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    OP_INOUT_CHECK(ctx->HasInput("Start"), "Input", "Start", "linspace");
-    OP_INOUT_CHECK(ctx->HasInput("Stop"), "Input", "Stop", "linspace");
-    OP_INOUT_CHECK(ctx->HasInput("Num"), "Input", "Num", "linspace");
-    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "linspace");
-
-    auto s_dims = ctx->GetInputDim("Start");
-    PADDLE_ENFORCE_EQ((s_dims.size() == 1) && (s_dims[0] == 1), true,
-                      platform::errors::InvalidArgument(
-                          "The shape of Input(Start) must be [1],"
-                          "but received input shape is [%s].",
-                          s_dims));
-    auto e_dims = ctx->GetInputDim("Stop");
-    PADDLE_ENFORCE_EQ((e_dims.size() == 1) && (e_dims[0] == 1), true,
-                      platform::errors::InvalidArgument(
-                          "The shape of Input(Stop) must be [1],"
-                          "but received input shape is [%s].",
-                          e_dims));
-    auto step_dims = ctx->GetInputDim("Num");
-    PADDLE_ENFORCE_EQ(
-        (step_dims.size() == 1) && (step_dims[0] == 1), true,
-        platform::errors::InvalidArgument("The shape of Input(Num) must be [1],"
-                                          "but received input shape is [%s].",
-                                          step_dims));
-    ctx->SetOutputDim("Out", {-1});
-  }
-
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
@@ -88,11 +65,13 @@ class LinspaceOpMaker : public framework::OpProtoAndCheckerMaker {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP_WITHOUT_GRADIENT(linspace, ops::LinspaceOp, ops::LinspaceOpMaker);
-REGISTER_OP_CPU_KERNEL(linspace, ops::CPULinspaceKernel<float>,
-                       ops::CPULinspaceKernel<int32_t>,
-                       ops::CPULinspaceKernel<int64_t>,
-                       ops::CPULinspaceKernel<double>);
+DECLARE_INFER_SHAPE_FUNCTOR(linspace, LinspaceInferShapeFunctor,
+                            PD_INFER_META(phi::LinspaceInferMeta));
+REGISTER_OPERATOR(
+    linspace, ops::LinspaceOp, ops::LinspaceOpMaker,
+    paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
+    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
+    LinspaceInferShapeFunctor);
 
 REGISTER_OP_VERSION(linspace)
     .AddCheckpoint(
diff --git a/paddle/fluid/operators/linspace_op.cu b/paddle/fluid/operators/linspace_op.cu
deleted file mode 100644
index aa625a7f5b9df0aa76872c56a3769f1186125bf5..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/linspace_op.cu
+++ /dev/null
@@ -1,104 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/data_type_transform.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/linspace_op.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-
-template <typename T>
-__global__ void LinspaceKernel(T start, T stop, double step, int64_t size,
-                               T* out) {
-  int64_t index = blockIdx.x * blockDim.x + threadIdx.x;
-
-  for (; index < size; index += blockDim.x * gridDim.x) {
-    if (index < size / 2) {
-      out[index] = static_cast<T>(start + step * index);
-    } else {
-      out[index] = static_cast<T>(stop - step * (size - index - 1));
-    }
-  }
-}
-
-template <typename T>
-__global__ void LinspaceSpecialKernel(T start, T* out) {
-  out[0] = static_cast<T>(start);
-}
-
-template <typename T>
-class CUDALinspaceKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* pre_start = context.Input<framework::Tensor>("Start");
-    auto* pre_stop = context.Input<framework::Tensor>("Stop");
-    auto* num_t = context.Input<framework::Tensor>("Num");
-    auto* out = context.Output<framework::Tensor>("Out");
-    auto dtype = static_cast<framework::proto::VarType::Type>(
-        context.Attr<int>("dtype"));
-
-    Tensor start_t;
-    Tensor stop_t;
-    auto start_dtype = framework::OpKernelType(
-        framework::TransToProtoVarType(pre_start->dtype()), context.GetPlace());
-    auto stop_dtype = framework::OpKernelType(
-        framework::TransToProtoVarType(pre_stop->dtype()), context.GetPlace());
-    auto out_dtype = framework::OpKernelType(dtype, context.GetPlace());
-    framework::TransDataType(start_dtype, out_dtype, *pre_start, &start_t);
-    framework::TransDataType(stop_dtype, out_dtype, *pre_stop, &stop_t);
-
-    framework::Tensor n_start;
-    framework::Tensor n_stop;
-    framework::Tensor n_num;
-    framework::TensorCopy(start_t, platform::CPUPlace(), &n_start);
-    T start = n_start.data<T>()[0];
-    framework::TensorCopy(stop_t, platform::CPUPlace(), &n_stop);
-    T stop = n_stop.data<T>()[0];
-    framework::TensorCopy(*num_t, platform::CPUPlace(), &n_num);
-    int64_t num = static_cast<int64_t>(n_num.data<int32_t>()[0]);
-
-    PADDLE_ENFORCE_GT(num, 0, platform::errors::InvalidArgument(
-                                  "The num of linspace op should be larger "
-                                  "than 0, but received num is %d",
-                                  num));
-
-    out->Resize(phi::make_ddim({num}));
-    T* out_data = out->mutable_data<T>(context.GetPlace());
-
-    double step = 0;
-    auto stream = context.cuda_device_context().stream();
-    int block = 512;
-    int grid = (num + block - 1) / block;
-    if (num != 1) {
-      step = (static_cast<double>(stop - start)) / (num - 1);
-      LinspaceKernel<T><<<grid, block, 0, stream>>>(start, stop, step, num,
-                                                    out_data);
-    } else {
-      LinspaceSpecialKernel<T><<<grid, block, 0, stream>>>(start, out_data);
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(linspace, ops::CUDALinspaceKernel<float>,
-                        ops::CUDALinspaceKernel<int32_t>,
-                        ops::CUDALinspaceKernel<int64_t>,
-                        ops::CUDALinspaceKernel<double>);
diff --git a/paddle/fluid/operators/linspace_op.h b/paddle/fluid/operators/linspace_op.h
deleted file mode 100644
index ae51f1221cc09b433e784ecaf52da69e41fc3706..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/linspace_op.h
+++ /dev/null
@@ -1,76 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <functional>
-#include "paddle/fluid/framework/data_type_transform.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/phi/kernels/funcs/math_function.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-
-template <typename T>
-class CPULinspaceKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* pre_start = context.Input<framework::Tensor>("Start");
-    auto* pre_stop = context.Input<framework::Tensor>("Stop");
-    int32_t num = context.Input<framework::Tensor>("Num")->data<int32_t>()[0];
-    auto* out = context.Output<framework::Tensor>("Out");
-    auto dtype = static_cast<framework::proto::VarType::Type>(
-        context.Attr<int>("dtype"));
-
-    Tensor start_t;
-    Tensor stop_t;
-    auto start_dtype = framework::OpKernelType(
-        framework::TransToProtoVarType(pre_start->dtype()), context.GetPlace());
-    auto stop_dtype = framework::OpKernelType(
-        framework::TransToProtoVarType(pre_stop->dtype()), context.GetPlace());
-    auto out_dtype = framework::OpKernelType(dtype, context.GetPlace());
-    framework::TransDataType(start_dtype, out_dtype, *pre_start, &start_t);
-    framework::TransDataType(stop_dtype, out_dtype, *pre_stop, &stop_t);
-
-    T start = start_t.data<T>()[0];
-    T stop = stop_t.data<T>()[0];
-    PADDLE_ENFORCE_GT(num, 0, platform::errors::InvalidArgument(
-                                  "The num of linspace op should be larger "
-                                  "than 0, but received num is %d",
-                                  num));
-
-    out->Resize(phi::make_ddim({num}));
-
-    T* out_data = out->mutable_data<T>(context.GetPlace());
-
-    if (num > 1) {
-      // step should be of double type for all types
-      double step = (static_cast<double>(stop - start)) / (num - 1);
-      int half_num = num / 2;
-      for (int i = 0; i < num; ++i) {
-        if (i < half_num) {
-          out_data[i] = static_cast<T>(start + step * i);
-        } else {
-          out_data[i] = static_cast<T>(stop - step * (num - i - 1));
-        }
-      }
-    } else {
-      out_data[0] = static_cast<T>(start);
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/log_loss_op.cc b/paddle/fluid/operators/log_loss_op.cc
index df4d0ebbccd5e3fb4dd6131fb5fbcaa9056bd9d6..883e3597d8a31138a6ff1e4cfcb05a165eafc4a6 100644
--- a/paddle/fluid/operators/log_loss_op.cc
+++ b/paddle/fluid/operators/log_loss_op.cc
@@ -12,8 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/log_loss_op.h"
 #include <memory>
+#include "paddle/fluid/framework/infershape_utils.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/phi/core/infermeta_utils.h"
+#include "paddle/phi/infermeta/binary.h"
 
 namespace paddle {
 namespace operators {
@@ -21,43 +24,6 @@ namespace operators {
 class LogLossOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    OP_INOUT_CHECK(ctx->HasInput("Predicted"), "Input", "Predicted", "LogLoss");
-    OP_INOUT_CHECK(ctx->HasInput("Labels"), "Input", "Labels", "LogLoss");
-
-    auto pred_dims = ctx->GetInputDim("Predicted");
-    auto label_dims = ctx->GetInputDim("Labels");
-
-    if (ctx->IsRuntime() ||
-        (phi::product(pred_dims) > 0 && phi::product(label_dims) > 0)) {
-      PADDLE_ENFORCE_EQ(
-          pred_dims, label_dims,
-          platform::errors::InvalidArgument(
-              "The dimensions of Input(Predicted) must be equal to the"
-              "dimensions of Input(Labels), but received dimensions of "
-              "Input(Predicted)"
-              "is [%s], received dimensions of Input(Labels) is [%s].",
-              pred_dims, label_dims));
-    }
-    PADDLE_ENFORCE_EQ(pred_dims.size(), 2,
-                      platform::errors::InvalidArgument(
-                          "The dimensions of Input(Predicted) must be 2,"
-                          "But received dimensions of Input(Predicted)"
-                          "is [%d]",
-                          pred_dims.size()));
-    if (ctx->IsRuntime()) {
-      PADDLE_ENFORCE_EQ(
-          pred_dims[1], 1,
-          platform::errors::InvalidArgument(
-              "Each row of Input(Predicted) contains a real value, "
-              "so the 2nd dimension of Input(X) must be 1,"
-              "But got [%d]",
-              pred_dims[1]));
-    }
-    ctx->SetOutputDim("Loss", {pred_dims[0], 1});
-    ctx->ShareLoD("Predicted", "Loss");
-  }
 };
 
 template <typename AttrType>
@@ -145,17 +111,10 @@ class LogLossGradMaker : public framework::SingleGradOpMaker<T> {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
+DECLARE_INFER_SHAPE_FUNCTOR(log_loss, LogLossInferShapeFunctor,
+                            PD_INFER_META(phi::LogLossInferMeta));
 REGISTER_OPERATOR(log_loss, ops::LogLossOp, ops::LogLossOpMaker<float>,
                   ops::LogLossGradMaker<paddle::framework::OpDesc>,
-                  ops::LogLossGradMaker<paddle::imperative::OpBase>);
+                  ops::LogLossGradMaker<paddle::imperative::OpBase>,
+                  LogLossInferShapeFunctor);
 REGISTER_OPERATOR(log_loss_grad, ops::LogLossGradOp);
-REGISTER_OP_CPU_KERNEL(
-    log_loss, ops::LogLossKernel<paddle::platform::CPUDeviceContext, float>);
-REGISTER_OP_CPU_KERNEL(
-    log_loss_grad,
-    ops::LogLossGradKernel<paddle::platform::CPUDeviceContext, float>);
-REGISTER_OP_CUDA_KERNEL(
-    log_loss, ops::LogLossKernel<paddle::platform::CUDADeviceContext, float>);
-REGISTER_OP_CUDA_KERNEL(
-    log_loss_grad,
-    ops::LogLossGradKernel<paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/fluid/operators/log_loss_op.h b/paddle/fluid/operators/log_loss_op.h
deleted file mode 100644
index e7985ab810b138da62390fae29eb4a6cf638c897..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/log_loss_op.h
+++ /dev/null
@@ -1,74 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/eigen/eigen_function.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
-
-template <typename DeviceContext, typename T, typename AttrType = T>
-class LogLossKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* loss_out = ctx.Output<Tensor>("Loss");
-
-    loss_out->mutable_data<T>(ctx.GetPlace());
-
-    auto epsilon = static_cast<T>(ctx.Attr<AttrType>("epsilon"));
-
-    auto prediction = EigenVector<T>::Flatten(*ctx.Input<Tensor>("Predicted"));
-    auto label = EigenVector<T>::Flatten(*ctx.Input<Tensor>("Labels"));
-
-    auto loss = EigenVector<T>::Flatten(*loss_out);
-    auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
-
-    EigenLogLoss<std::decay_t<decltype(place)>, T>::Eval(
-        place, loss, prediction, label, epsilon);
-  }
-};
-
-template <typename DeviceContext, typename T, typename AttrType = T>
-class LogLossGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto epsilon = static_cast<T>(ctx.Attr<AttrType>("epsilon"));
-
-    auto prediction = EigenVector<T>::Flatten(*ctx.Input<Tensor>("Predicted"));
-    auto label = EigenVector<T>::Flatten(*ctx.Input<Tensor>("Labels"));
-
-    auto* dloss = ctx.Input<Tensor>(framework::GradVarName("Loss"));
-    auto* dpred = ctx.Output<Tensor>(framework::GradVarName("Predicted"));
-
-    auto dl = EigenVector<T>::Flatten(*dloss);
-    auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
-
-    if (dpred) {
-      dpred->mutable_data<T>(ctx.GetPlace());
-      auto dx = framework::EigenVector<T>::Flatten(*dpred);
-      EigenLogLossGrad<std::decay_t<decltype(place)>, T>::Eval(
-          place, dx, dl, prediction, label, epsilon);
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/log_loss_op_npu.cc b/paddle/fluid/operators/log_loss_op_npu.cc
index 9775910bba5cf30096f395c20d9dff3b5b1e541f..f103a69707a214400bbe2734409df4d9de3902e8 100644
--- a/paddle/fluid/operators/log_loss_op_npu.cc
+++ b/paddle/fluid/operators/log_loss_op_npu.cc
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/log_loss_op.h"
 #include <cmath>
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/device/npu/npu_op_runner.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/log_loss_op_xpu.cc b/paddle/fluid/operators/log_loss_op_xpu.cc
index b2e68e9870d3c4f240fe35a4cbec811aefbc13f1..aa5fdd86745d6932052347f3dc11b14e3d447d20 100644
--- a/paddle/fluid/operators/log_loss_op_xpu.cc
+++ b/paddle/fluid/operators/log_loss_op_xpu.cc
@@ -10,11 +10,13 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #ifdef PADDLE_WITH_XPU
 
-#include "paddle/fluid/operators/log_loss_op.h"
 #include <memory>
+#include "paddle/fluid/framework/op_registry.h"
 namespace paddle {
 namespace operators {
 
+using Tensor = framework::Tensor;
+
 template <typename DeviceContext, typename T, typename AttrType = T>
 class LogLossXPUKernel : public framework::OpKernel<T> {
  public:
diff --git a/paddle/fluid/operators/log_softmax_op.cu b/paddle/fluid/operators/log_softmax_op.cu
index 8770abdac838f63b0c9f3a95b1ac0283a80ecbf2..26b6ce43303d181c41b60cf36c229d00acb0e626 100644
--- a/paddle/fluid/operators/log_softmax_op.cu
+++ b/paddle/fluid/operators/log_softmax_op.cu
@@ -12,459 +12,43 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include <limits>
 #include "paddle/fluid/operators/log_softmax_op.h"
-#include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
-#include "paddle/phi/common/amp_type_traits.h"
-#include "paddle/phi/kernels/funcs/elementwise_functor.h"
-#include "paddle/phi/kernels/funcs/functors.h"
+#include "paddle/phi/kernels/gpudnn/softmax_gpudnn.h"
 
 namespace paddle {
 namespace operators {
 
-#define LAUNCH_WARP_FORWAR_COMPUTE(near_greater_power_of_two)                \
-  case near_greater_power_of_two:                                            \
-    ComputeLogSoftmaxForwardInWarp<                                          \
-        T, AccT, near_greater_power_of_two><<<blocks, threads, 0, stream>>>( \
-        dst, src, outer_size, dim_size);                                     \
-    break;
-
-template <typename T, int KernelWarpSize>
-__device__ __forceinline__ T WarpReduceSum(T value) {
-#pragma unroll
-  for (int offset = KernelWarpSize / 2; offset > 0; offset /= 2) {
-    T sum_val = platform::CudaShuffleXorSync(0xFFFFFFFF, value, offset);
-    value = value + sum_val;
-  }
-  return value;
-}
-
-template <typename T, int KernelWarpSize>
-__device__ __forceinline__ T WarpReduceMax(T value) {
-#pragma unroll
-  for (int offset = KernelWarpSize / 2; offset > 0; offset /= 2) {
-    T max_val = platform::CudaShuffleXorSync(0xFFFFFFFF, value, offset);
-    value = max(value, max_val);
-  }
-  return value;
-}
-
-int GetNearGreaterPowerOfTwo(int value) {
-  int log2_value = 0;
-  while ((1 << log2_value) < value) {
-    ++log2_value;
-  }
-  return 1 << log2_value;
-}
-
-template <typename T, typename AccT, int NearGreaterPowerOfTwo>
-__global__ void ComputeLogSoftmaxForwardInWarp(T *dst, const T *src,
-                                               int batch_size,
-                                               int element_count) {
-  constexpr int near_greater_power_of_two = NearGreaterPowerOfTwo;
-  constexpr int kernel_warp_size =
-      (near_greater_power_of_two < 32) ? near_greater_power_of_two : 32;
-  constexpr int warp_iter = near_greater_power_of_two / kernel_warp_size;
-  int batch_id = blockDim.y * blockIdx.x + threadIdx.y;
-
-  int thread_in_warp_idx = threadIdx.x;
-
-  // 1.read data from global memory to registers
-  AccT elements[warp_iter];
-  // set effective_element_count as the num of elements when warps do effective
-  // work
-  // set effective_element_count as 0, when warps do ineffective work
-  int effective_element_count = (batch_id < batch_size) ? element_count : 0;
-  for (int it = 0; it < warp_iter; ++it) {
-    int element_index = thread_in_warp_idx + it * kernel_warp_size;
-    if (element_index < effective_element_count) {
-      elements[it] =
-          static_cast<AccT>(src[batch_id * element_count + element_index]);
-    } else {
-      elements[it] = -std::numeric_limits<AccT>::infinity();
-    }
-  }
-
-  // 2.compute max_value. For each thread, loop all registers to find max
-  AccT max_value = elements[0];
-#pragma unroll
-  for (int it = 1; it < warp_iter; ++it) {
-    max_value = (max_value > elements[it]) ? max_value : elements[it];
-  }
-  max_value = WarpReduceMax<AccT, kernel_warp_size>(max_value);
-
-  // 3.For each warp, accumulate all thread registers
-  AccT sum = 0.0f;
-#pragma unroll
-  for (int it = 0; it < warp_iter; ++it) {
-    sum += std::exp(elements[it] - max_value);
-  }
-  sum = WarpReduceSum<AccT, kernel_warp_size>(sum);
-
-  // 4.store result.
-  sum = std::log(sum);
-#pragma unroll
-  for (int it = 0; it < warp_iter; ++it) {
-    int element_index = thread_in_warp_idx + it * kernel_warp_size;
-    if (element_index < effective_element_count) {
-      dst[batch_id * element_count + element_index] =
-          static_cast<T>(elements[it] - max_value - sum);
-    } else {
-      break;
-    }
-  }
-}
-
-template <typename T, typename AccT>
-void LaunchSoftmaxForwardForLastAxis(T *dst, const T *src, int dim_size,
-                                     int outer_size, gpuStream_t stream) {
-  int threads_per_block = 128;
-  int near_greater_power_of_two = GetNearGreaterPowerOfTwo(dim_size);
-  int kernel_warp_size =
-      (near_greater_power_of_two < 32) ? near_greater_power_of_two : 32;
-  int warps_per_block = (threads_per_block / kernel_warp_size);
-  int blocks = (outer_size + warps_per_block - 1) / warps_per_block;
-  dim3 threads(kernel_warp_size, warps_per_block, 1);
-
-  switch (near_greater_power_of_two) {
-    LAUNCH_WARP_FORWAR_COMPUTE(1);
-    LAUNCH_WARP_FORWAR_COMPUTE(2);
-    LAUNCH_WARP_FORWAR_COMPUTE(4);     // dim_size: 3~4
-    LAUNCH_WARP_FORWAR_COMPUTE(8);     // dim_size: 5~8
-    LAUNCH_WARP_FORWAR_COMPUTE(16);    // dim_size: 9~16
-    LAUNCH_WARP_FORWAR_COMPUTE(32);    // dim_size: 17~32
-    LAUNCH_WARP_FORWAR_COMPUTE(64);    // dim_size: 33~64
-    LAUNCH_WARP_FORWAR_COMPUTE(128);   // dim_size 65~128
-    LAUNCH_WARP_FORWAR_COMPUTE(256);   // dim_size 129~256
-    LAUNCH_WARP_FORWAR_COMPUTE(512);   // dim_size 257~512
-    LAUNCH_WARP_FORWAR_COMPUTE(1024);  // dim_size 513~1024
-
-    default:
-      break;
-  }
-}
-
-// Returns the final item after reduce operation along block.x.
-// Firstly, get shared memory(smem) offset, find the starting position for every
-// y.
-// Secondly, initialise every smem position with value 'val' of thread itself.
-// Thirdly, apply standard reduction along x direction as below:
-//
-//   -> x direction
-// [o o o o o o o o]    time 0
-//  |     |/     /
-//  |    /|    /
-//  |  /  |  /
-//  |/    |/
-// [o o o o x x x x]    time 1
-//  | |/ /
-//  |/|/
-// [o o x x x x x x]    time 2
-//  |/
-// [o x x x x x x x]    time 3
-//
-// Finally, return the first item.
-// Imaging multiple reductions executed in paralell along y axis,
-// Note that when blockDim.x is not 1, it's a EVEN number in all cases,
-// and the size of shared memory is even as well.
-template <typename T, template <typename> class Functor>
-__forceinline__ __device__ T BlockReduceAlongDimX(T *shared, T val) {
-  Functor<T> func;
-  // This reduction is not Block-wise reduction, only reduce along block.x.
-  // therefore the shared mem has offsets for different block.y.
-  shared += threadIdx.y * blockDim.x;
-  shared[threadIdx.x] = val;
-  int offset = blockDim.x / 2;
-
-  while (offset > 0) {
-    __syncthreads();
-    if (threadIdx.x < offset) {
-      shared[threadIdx.x] =
-          func(shared[threadIdx.x], shared[threadIdx.x + offset]);
-    }
-    offset /= 2;
-  }
-  __syncthreads();
-  return shared[0];
-}
-
-template <typename T, typename AccT>
-__global__ void LogSoftmaxForwardCUDAKernelNotLastAxis(
-    T *output, const T *input, int outer_size, int dim_size, int inner_size) {
-  extern __shared__ unsigned char smem[];
-  auto sdata = reinterpret_cast<AccT *>(smem);
-
-  const int outer_stride = inner_size * dim_size;
-  const int dim_stride = inner_size;
-
-  for (int x_id = blockIdx.x; x_id < outer_size; x_id += gridDim.x) {
-    for (int y_id = blockIdx.y * blockDim.y + threadIdx.y; y_id < inner_size;
-         y_id += blockDim.y * gridDim.y) {
-      const int data_offset = x_id * outer_stride + y_id;
-      // When blockDim.x==1, no block.x-reduction opetaions are needed.
-      // And threadIdx.x is 0 all the time, so the for-loops below are literally
-      // loops (No parallel executions). Loop all elements along axis and
-      // calculate the Max, Sum and (input[id]-Max-log(Sum)) to get the final
-      // log_softmax values along that axis.
-      // 1. reduce max
-      AccT max_value = -std::numeric_limits<AccT>::infinity();
-      // For one thread, iterate all items it responsable for, and get
-      // max_value.
-      // If there are N threads, N max_value will be returned.
-      for (int d = threadIdx.x; d < dim_size; d += blockDim.x) {
-        const AccT value =
-            static_cast<AccT>(input[data_offset + d * dim_stride]);
-        max_value = phi::funcs::MaxFunctor<AccT>()(max_value, value);
-      }
-      // If there are more than 1 threads along block x, reduce all max_values
-      // and get the global max_value, which is the max value along "axis".
-      // If there is only one thread along block x, no need to reduce, as the
-      // 'max_value' is the global max_value.
-      if (blockDim.x > 1) {
-        max_value = BlockReduceAlongDimX<AccT, phi::funcs::MaxFunctor>(
-            sdata, max_value);
-      }
-
-      // 2. reduce sum
-      AccT sum = 0;
-      // Below is the same execution as '1. reduce max'
-      for (int d = threadIdx.x; d < dim_size; d += blockDim.x) {
-        sum += std::exp(static_cast<AccT>(input[data_offset + d * dim_stride]) -
-                        max_value);
-      }
-      if (blockDim.x > 1) {
-        sum = BlockReduceAlongDimX<AccT, phi::funcs::AddFunctor>(sdata, sum);
-      }
-
-      // 3. input-max-log_sum and write to output
-      for (int d = threadIdx.x; d < dim_size; d += blockDim.x) {
-        output[data_offset + d * dim_stride] = static_cast<T>(
-            static_cast<AccT>(input[data_offset + d * dim_stride]) - max_value -
-            std::log(sum));
-      }
-    }
-  }
-}
-
-// block.y covers inner_size. Threads along the x axis process dim_size
-// elements, and make sure not to exceed the 1024 threads per block.
-// Note that dim_threads namely blockDim.x is either 1 or a even number.
-inline dim3 GetBlockSize(int dim_size, int inner_size) {
-  int inner_threads = inner_size;
-  inner_threads = std::min(inner_threads, 1024);
-  int dim_threads = 1;
-
-  while (dim_threads * inner_threads <= 1024 && dim_threads <= dim_size) {
-    dim_threads *= 2;
-  }
-  dim_threads /= 2;
-  return dim3(dim_threads, inner_threads);
-}
-
-// First cover the y axis as many blocks as possible.
-// Then cover the x axis as many blocks as possible,
-// and make sure not to exceed the max_active_blocks.
-inline dim3 GetGridSize(dim3 block, int max_active_blocks, int outer_size,
-                        int dim_size, int inner_size) {
-  int inner_blocks = (inner_size + block.y - 1) / block.y;
-  if (inner_blocks > max_active_blocks) inner_blocks = max_active_blocks;
-
-  int outer_blocks = (max_active_blocks + inner_blocks - 1) / inner_blocks;
-  if (outer_blocks > outer_size) outer_blocks = outer_size;
-  return dim3(outer_blocks, inner_blocks);
-}
-
-// When designing grid size and block size, priority is given to block size,
-// and grid will be determined according to the maximum number of active blocks,
-// which is set by as a experience value.
-template <typename T, typename Kernel>
-void ComputeLaunchConfigure(Kernel k, int outer_size, int dim_size,
-                            int inner_size, dim3 &grid, dim3 &block,
-                            int &shared_mem, int num_sm) {
-  block = GetBlockSize(dim_size, inner_size);
-  int block_threads = block.x * block.y;
-  shared_mem = block.x == 1 ? 0 : block_threads * sizeof(T);
-  int max_active_blocks = num_sm * 2;
-  grid =
-      GetGridSize(block, max_active_blocks, outer_size, dim_size, inner_size);
-}
-
-template <typename T, typename MPDType>
-void LaunchLogSoftmaxForwardCUDAKernelNotLastAxis(T *output_data,
-                                                  const T *input_data,
-                                                  int outer_size, int dim_size,
-                                                  int inner_size, int num_sm,
-                                                  gpuStream_t stream) {
-  int shared_mem;
-  dim3 grid;
-  dim3 block;
-
-  ComputeLaunchConfigure<MPDType>(
-      &LogSoftmaxForwardCUDAKernelNotLastAxis<T, MPDType>, outer_size, dim_size,
-      inner_size, grid, block, shared_mem, num_sm);
-
-  LogSoftmaxForwardCUDAKernelNotLastAxis<
-      T, MPDType><<<grid, block, shared_mem, stream>>>(
-      output_data, input_data, outer_size, dim_size, inner_size);
-}
+using Tensor = framework::Tensor;
 
 template <typename T>
 class LogSoftmaxKernel<platform::CUDADeviceContext, T>
     : public framework::OpKernel<T> {
-  using MPDType = typename phi::dtype::MPTypeTrait<T>::Type;
-
  public:
-  void Compute(const framework::ExecutionContext &context) const override {
-    const auto *x = context.Input<framework::Tensor>("X");
-    auto *out = context.Output<framework::Tensor>("Out");
-    const auto *input_data = x->data<T>();
-    auto *output_data = out->mutable_data<T>(context.GetPlace());
-
-    const int rank = x->dims().size();
-    const int axis = CanonicalAxis(context.Attr<int>("axis"), rank);
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    auto *x = ctx.Input<Tensor>("X");
+    auto *out = ctx.Output<Tensor>("Out");
+    out->mutable_data<T>(ctx.GetPlace());
 
-    int dim_size = x->dims()[axis];
-    int inner_size = 1;
-    for (int i = axis + 1; i < x->dims().size(); ++i) {
-      inner_size *= x->dims()[i];
-    }
-    int outer_size = SizeToAxis(axis, x->dims());
-    gpuStream_t stream = context.cuda_device_context().stream();
-    int num_sm = context.cuda_device_context().GetSMCount();
-
-    if (inner_size == 1 && dim_size <= 1024 && dim_size * sizeof(T) <= 4096) {
-      LaunchSoftmaxForwardForLastAxis<T, MPDType>(output_data, input_data,
-                                                  dim_size, outer_size, stream);
-    } else {
-      LaunchLogSoftmaxForwardCUDAKernelNotLastAxis<T, MPDType>(
-          output_data, input_data, outer_size, dim_size, inner_size, num_sm,
-          stream);
-    }
+    int input_axis = ctx.Attr<int>("axis");
+    auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
+    phi::SoftmaxForwardCUDAKernelDriver<T, true>(dev_ctx, *x, input_axis, out);
   }
 };
 
-// Backward below
-#define LAUNCH_WARP_BACKWARD_COMPUTE(near_greater_power_of_two)              \
-  case near_greater_power_of_two:                                            \
-    ComputeLogSoftmaxBackwardInWarp<                                         \
-        T, AccT, near_greater_power_of_two><<<blocks, threads, 0, stream>>>( \
-        output, grad_output, grad_input, outer_size, dim_size);              \
-    break;
-
-template <typename T, typename AccT, int NearGreaterPowerOfTwo>
-__global__ void ComputeLogSoftmaxBackwardInWarp(const T *output,
-                                                const T *grad_output,
-                                                T *grad_input, int batch_size,
-                                                int element_count) {
-  constexpr int near_greater_power_of_two = NearGreaterPowerOfTwo;
-  constexpr int kernel_warp_size =
-      (near_greater_power_of_two < 32) ? near_greater_power_of_two : 32;
-  constexpr int warp_iter = near_greater_power_of_two / kernel_warp_size;
-  int batch_id = blockDim.y * blockIdx.x + threadIdx.y;
-
-  int thread_in_warp_idx = threadIdx.x;
-
-  // 1.read data from global memory to registers
-  AccT output_register[warp_iter];
-  AccT grad_output_register[warp_iter];
-  int effective_element_count = (batch_id < batch_size) ? element_count : 0;
-  for (int iter = 0; iter < warp_iter; ++iter) {
-    int element_index = thread_in_warp_idx + iter * kernel_warp_size;
-    if (element_index < effective_element_count) {
-      output_register[iter] =
-          static_cast<AccT>(output[batch_id * element_count + element_index]);
-      grad_output_register[iter] = static_cast<AccT>(
-          grad_output[batch_id * element_count + element_index]);
-    } else {
-      output_register[iter] = static_cast<AccT>(0);
-      grad_output_register[iter] = static_cast<AccT>(0);
-    }
-  }
-
-  // 2. For each warp, accumulate all thread registers
-  AccT sum = grad_output_register[0];
-#pragma unroll
-  for (int iter = 1; iter < warp_iter; ++iter) {
-    sum += grad_output_register[iter];
-  }
-  sum = WarpReduceSum<AccT, kernel_warp_size>(sum);
-
-// 3. write result in grad_input
-#pragma unroll
-  for (int iter = 0; iter < warp_iter; ++iter) {
-    int element_index = thread_in_warp_idx + iter * kernel_warp_size;
-    if (element_index < effective_element_count) {
-      grad_input[batch_id * element_count + element_index] = static_cast<T>(
-          (grad_output_register[iter] - std::exp(output_register[iter]) * sum));
-    }
-  }
-}
-
-template <typename T, typename AccT>
-void LaunchSoftmaxBackwardForLastAxis(T *grad_input, const T *grad_output,
-                                      const T *output, int dim_size,
-                                      int outer_size, gpuStream_t stream) {
-  int threads_per_block = 128;
-  int near_greater_power_of_two = GetNearGreaterPowerOfTwo(dim_size);
-  int kernel_warp_size =
-      (near_greater_power_of_two < 32) ? near_greater_power_of_two : 32;
-  int warps_per_block = (threads_per_block / kernel_warp_size);
-  int blocks = (outer_size + warps_per_block - 1) / warps_per_block;
-  dim3 threads(kernel_warp_size, warps_per_block, 1);
-
-  switch (near_greater_power_of_two) {
-    LAUNCH_WARP_BACKWARD_COMPUTE(1);     // dim_size: 1
-    LAUNCH_WARP_BACKWARD_COMPUTE(2);     // dim_size: 2
-    LAUNCH_WARP_BACKWARD_COMPUTE(4);     // dim_size: 3~4
-    LAUNCH_WARP_BACKWARD_COMPUTE(8);     // dim_size: 5~8
-    LAUNCH_WARP_BACKWARD_COMPUTE(16);    // dim_size: 9~16
-    LAUNCH_WARP_BACKWARD_COMPUTE(32);    // dim_size: 17~32
-    LAUNCH_WARP_BACKWARD_COMPUTE(64);    // dim_size: 33~64
-    LAUNCH_WARP_BACKWARD_COMPUTE(128);   // dim_size: 65~128
-    LAUNCH_WARP_BACKWARD_COMPUTE(256);   // dim_size: 129~256
-    LAUNCH_WARP_BACKWARD_COMPUTE(512);   // dim_size: 257~512
-    LAUNCH_WARP_BACKWARD_COMPUTE(1024);  // dim_size: 513~1024
-
-    default:
-      break;
-  }
-}
-
 template <typename T>
 class LogSoftmaxGradKernel<platform::CUDADeviceContext, T>
     : public framework::OpKernel<T> {
-  using MPDType = typename phi::dtype::MPTypeTrait<T>::Type;
-
  public:
-  void Compute(const framework::ExecutionContext &context) const override {
-    const auto *out = context.Input<framework::Tensor>("Out");
-    const auto *d_out =
-        context.Input<framework::Tensor>(framework::GradVarName("Out"));
-    auto *d_x = context.Output<framework::Tensor>(framework::GradVarName("X"));
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    auto *out = ctx.Input<Tensor>("Out");
+    auto *dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
+    auto *dx = ctx.Output<Tensor>(framework::GradVarName("X"));
+    dx->mutable_data<T>(ctx.GetPlace());
 
-    const auto *out_data = out->data<T>();
-    const auto *d_out_data = d_out->data<T>();
-    auto *d_x_data = d_x->mutable_data<T>(context.GetPlace());
-
-    const int rank = out->dims().size();
-    const int axis = CanonicalAxis(context.Attr<int>("axis"), rank);
-
-    int dim_size = out->dims()[axis];
-    int inner_size = 1;
-    for (int i = axis + 1; i < out->dims().size(); ++i) {
-      inner_size *= out->dims()[i];
-    }
-    int outer_size = SizeToAxis(axis, out->dims());
-    gpuStream_t stream = context.cuda_device_context().stream();
-
-    if (inner_size == 1 && dim_size <= 1024 && dim_size * sizeof(T) <= 4096) {
-      LaunchSoftmaxBackwardForLastAxis<T, MPDType>(
-          d_x_data, d_out_data, out_data, dim_size, outer_size, stream);
-    } else {
-      LogSoftmaxGradFunctor<platform::CUDADeviceContext, T>()(
-          context.template device_context<platform::CUDADeviceContext>(), out,
-          d_out, d_x, axis);
-    }
+    int input_axis = ctx.Attr<int>("axis");
+    auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
+    phi::SoftmaxBackwardCUDAKernelDriver<T, true>(dev_ctx, *out, *dout,
+                                                  input_axis, dx);
   }
 };
 
@@ -473,6 +57,17 @@ class LogSoftmaxGradKernel<platform::CUDADeviceContext, T>
 
 namespace ops = paddle::operators;
 namespace plat = paddle::platform;
+
+#ifdef PADDLE_WITH_HIP
+REGISTER_OP_CUDA_KERNEL(
+    log_softmax, ops::LogSoftmaxKernel<plat::CUDADeviceContext, float>,
+    ops::LogSoftmaxKernel<plat::CUDADeviceContext, plat::float16>,
+    ops::LogSoftmaxKernel<plat::CUDADeviceContext, plat::bfloat16>);
+REGISTER_OP_CUDA_KERNEL(
+    log_softmax_grad, ops::LogSoftmaxGradKernel<plat::CUDADeviceContext, float>,
+    ops::LogSoftmaxGradKernel<plat::CUDADeviceContext, plat::float16>,
+    ops::LogSoftmaxGradKernel<plat::CUDADeviceContext, plat::bfloat16>);
+#else
 REGISTER_OP_CUDA_KERNEL(
     log_softmax, ops::LogSoftmaxKernel<plat::CUDADeviceContext, float>,
     ops::LogSoftmaxKernel<plat::CUDADeviceContext, double>,
@@ -483,3 +78,4 @@ REGISTER_OP_CUDA_KERNEL(
     ops::LogSoftmaxGradKernel<plat::CUDADeviceContext, double>,
     ops::LogSoftmaxGradKernel<plat::CUDADeviceContext, plat::float16>,
     ops::LogSoftmaxGradKernel<plat::CUDADeviceContext, plat::bfloat16>);
+#endif
diff --git a/paddle/fluid/operators/lstsq_op.cu b/paddle/fluid/operators/lstsq_op.cu
index 92c9857f0b942f00c348a6199ea4b9789b398328..10e2867bf2953f5c6fbc3d50bd8156fa3b0266e9 100644
--- a/paddle/fluid/operators/lstsq_op.cu
+++ b/paddle/fluid/operators/lstsq_op.cu
@@ -17,9 +17,11 @@
 
 #include <string>
 #include <vector>
+#include "paddle/fluid/framework/phi_utils.h"
 #include "paddle/fluid/operators/lstsq_op.h"
 #include "paddle/fluid/operators/qr_op.h"
 #include "paddle/fluid/platform/dynload/cusolver.h"
+#include "paddle/phi/kernels/triangular_solve_kernel.h"
 
 namespace paddle {
 namespace operators {
@@ -70,6 +72,10 @@ class LstsqCUDAKernel : public framework::OpKernel<T> {
     Tensor tau = dito.Fill(tau_dims_vec, 0);
     auto tau_data = tau.mutable_data<T>(context.GetPlace());
 
+    using Context =
+        typename framework::ConvertToPhiContext<DeviceContext>::TYPE;
+    auto& phi_dev_ctx = static_cast<const Context&>(dev_ctx);
+
     if (m >= n) {
       Tensor tmp_x = dito.Transpose(new_x);
       Tensor tmp_y = dito.Transpose(new_y);
@@ -93,8 +99,9 @@ class LstsqCUDAKernel : public framework::OpKernel<T> {
       Tensor slice_y = dito.Slice(trans_y, {-2}, {0}, {min_mn});
 
       // Step 3, solve R X = Y
-      triangular_solve<DeviceContext, T>(dev_ctx, res_r, slice_y, solution,
-                                         true, false, false);
+      phi::TriangularSolveKernel<T, Context>(phi_dev_ctx, res_r, slice_y, true,
+                                             false, false, solution);
+
     } else {
       auto x_data = new_x.mutable_data<T>(context.GetPlace());
       auto y_data = new_y.mutable_data<T>(context.GetPlace());
@@ -105,8 +112,8 @@ class LstsqCUDAKernel : public framework::OpKernel<T> {
 
       // Step 2, solve R^H Z = Y
       Tensor trans_r = dito.Transpose(new_x);
-      triangular_solve<DeviceContext, T>(dev_ctx, trans_r, new_y, solution,
-                                         true, true, false);
+      phi::TriangularSolveKernel<T, Context>(phi_dev_ctx, trans_r, new_y, true,
+                                             true, false, solution);
 
       // Step 3, X <- Q Z
       BatchedOrgqr<DeviceContext, T>(dev_ctx, batch_count, n, n, min_mn, x_data,
diff --git a/paddle/fluid/operators/lstsq_op.h b/paddle/fluid/operators/lstsq_op.h
index 3cbbc62e7bec92f329535e788f19d439c9341a0e..520722dafcbea3ce8c545389317516cc22f7689f 100644
--- a/paddle/fluid/operators/lstsq_op.h
+++ b/paddle/fluid/operators/lstsq_op.h
@@ -22,7 +22,6 @@
 #include "paddle/fluid/operators/math/matrix_solve.h"
 #include "paddle/fluid/operators/svd_helper.h"
 #include "paddle/fluid/operators/transpose_op.h"
-#include "paddle/fluid/operators/triangular_solve_op.h"
 #include "paddle/fluid/platform/for_range.h"
 #include "paddle/phi/kernels/funcs/complex_functors.h"
 #include "paddle/phi/kernels/funcs/lapack/lapack_function.h"
diff --git a/paddle/fluid/operators/lu_op.h b/paddle/fluid/operators/lu_op.h
index f323e2e041d994eb01c9d4e934984b8a005ffcec..214b2eccae9f75e9bfcfa3df0b823918e2b0c353 100644
--- a/paddle/fluid/operators/lu_op.h
+++ b/paddle/fluid/operators/lu_op.h
@@ -15,12 +15,13 @@ limitations under the License. */
 #pragma once
 
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/phi_utils.h"
 #include "paddle/fluid/operators/set_value_op.h"
 #include "paddle/fluid/operators/svd_helper.h"
-#include "paddle/fluid/operators/triangular_solve_op.h"
 #include "paddle/fluid/operators/tril_triu_op.h"
 #include "paddle/phi/kernels/funcs/lapack/lapack_function.h"
 #include "paddle/phi/kernels/math_kernel.h"
+#include "paddle/phi/kernels/triangular_solve_kernel.h"
 
 namespace paddle {
 namespace operators {
@@ -555,6 +556,11 @@ class LUGradKernel : public framework::OpKernel<T> {
 
     framework::Tensor Pmat;
     Unpack_Pivot<DeviceContext, T>(dev_ctx, *P, &Pmat, m, k);
+
+    using Context =
+        typename framework::ConvertToPhiContext<DeviceContext>::TYPE;
+    auto& phi_dev_ctx = static_cast<const Context&>(dev_ctx);
+
     if (m <= n) {
       if (k < n) {
         framework::Tensor U_complement, U_grad_complement, phi_complement,
@@ -605,8 +611,9 @@ class LUGradKernel : public framework::OpKernel<T> {
       framework::Tensor psi_principal, phi_mH, psi_tmp;
       Tensor_Conj<DeviceContext, T>(dev_ctx, phi, &phi_mH);
       phi_mH = helper.Transpose(phi_mH);
-      triangular_solve<DeviceContext, T>(dev_ctx, U_narrow, phi_mH,
-                                         &psi_principal, true, false, false);
+
+      phi::TriangularSolveKernel<T, Context>(
+          phi_dev_ctx, U_narrow, phi_mH, true, false, false, &psi_principal);
 
       Tensor_Conj<DeviceContext, T>(dev_ctx, psi_principal, &psi_principal);
       psi_principal = helper.Transpose(psi_principal);
@@ -620,8 +627,9 @@ class LUGradKernel : public framework::OpKernel<T> {
       SetValueCompute_dispatch<DeviceContext, T>(ctx, &psi, &psi_principal,
                                                  &psi, axes, &slice_starts,
                                                  &slice_ends, valuedims, xrank);
-      triangular_solve<DeviceContext, T>(dev_ctx, L_narrow_mH, psi, &psi_tmp,
-                                         true, false, true);
+
+      phi::TriangularSolveKernel<T, Context>(phi_dev_ctx, L_narrow_mH, psi,
+                                             true, false, true, &psi_tmp);
 
       auto mat_dim_p =
           phi::funcs::CreateMatrixDescriptor(Pmat.dims(), 0, false);
@@ -672,8 +680,10 @@ class LUGradKernel : public framework::OpKernel<T> {
                                                  &psi, axes, &slice_starts,
                                                  &slice_ends, valuedims, xrank);
       framework::Tensor psi_principal, phi_mH, psi_tmp, U_narrow_mH;
-      triangular_solve<DeviceContext, T>(dev_ctx, L_narrow_mH, phi,
-                                         &psi_principal, true, false, true);
+
+      phi::TriangularSolveKernel<T, Context>(phi_dev_ctx, L_narrow_mH, phi,
+                                             true, false, true, &psi_principal);
+
       slice_starts[0] = 0;
       slice_starts[1] = 0;
       slice_ends[0] = k;
@@ -695,8 +705,8 @@ class LUGradKernel : public framework::OpKernel<T> {
       psi_tmp = helper.Transpose(psi_tmp);
 
       Tensor_Conj<DeviceContext, T>(dev_ctx, U_narrow, &U_narrow_mH);
-      triangular_solve<DeviceContext, T>(dev_ctx, U_narrow_mH, psi_tmp, &psi,
-                                         true, false, false);
+      phi::TriangularSolveKernel<T, Context>(phi_dev_ctx, U_narrow_mH, psi_tmp,
+                                             true, false, false, &psi);
       *dx = helper.Transpose(psi);
     }
   }
diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt
index ba047355ad7e0e7991e841cecd79e7e0b03c5911..af1069cb867993160d7346779d7de8161e37438c 100644
--- a/paddle/fluid/operators/math/CMakeLists.txt
+++ b/paddle/fluid/operators/math/CMakeLists.txt
@@ -5,6 +5,8 @@ endif()
 # please add new math_library in alphabetical order
 if (WITH_ASCEND_CL)
 math_library(concat_and_split DEPS concat_and_split_functor npu_op_runner)
+elseif (WITH_MLU)
+math_library(concat_and_split DEPS concat_and_split_functor mlu_baseop)
 else()
 math_library(concat_and_split DEPS concat_and_split_functor)
 endif()
@@ -18,7 +20,6 @@ math_library(sampler DEPS generator)
 
 # math_library(math_function DEPS blas dense_tensor tensor)
 math_library(maxouting)
-math_library(pooling)
 
 if(WITH_MKLDNN)
     math_library(selected_rows_functor DEPS selected_rows_utils math_function blas mkldnn_axpy_handler)
@@ -44,8 +45,6 @@ math_library(vol2col)
 math_library(prelu)
 math_library(bert_encoder_functor)
 math_library(tree2col DEPS math_function)
-math_library(matrix_inverse)
-math_library(segment_pooling)
 math_library(matrix_solve)
 
 cc_test(selected_rows_functor_test SRCS selected_rows_functor_test.cc DEPS selected_rows_functor)
@@ -70,7 +69,6 @@ if(WITH_GPU AND (NOT WITH_ROCM))
     endif()
 endif()
 
-cc_test(cpu_vec_test SRCS cpu_vec_test.cc DEPS blas cpu_info)
 if(WITH_TESTING AND TEST im2col_test)
     set_tests_properties(im2col_test PROPERTIES TIMEOUT 120)
 endif()
diff --git a/paddle/fluid/operators/math/concat_and_split.cc b/paddle/fluid/operators/math/concat_and_split.cc
index 46126ac59c892787d2f63956983404843e518ae7..c9308d27c0a3490d9c0094f45a1a9c2d894bbf57 100644
--- a/paddle/fluid/operators/math/concat_and_split.cc
+++ b/paddle/fluid/operators/math/concat_and_split.cc
@@ -18,6 +18,9 @@ limitations under the License. */
 #ifdef PADDLE_WITH_ASCEND_CL
 #include "paddle/fluid/platform/device/npu/npu_op_runner.h"
 #endif
+#ifdef PADDLE_WITH_MLU
+#include "paddle/fluid/operators/mlu/mlu_baseop.h"
+#endif
 #include "paddle/phi/common/bfloat16.h"
 #include "paddle/phi/common/float16.h"
 
@@ -226,6 +229,90 @@ class SplitFunctor<platform::NPUDeviceContext, T> {
 };
 #endif
 
+#ifdef PADDLE_WITH_MLU
+template <typename T>
+class ConcatFunctor<platform::MLUDeviceContext, T> {
+ public:
+  void operator()(const platform::MLUDeviceContext& context,
+                  const std::vector<framework::Tensor>& input, int axis,
+                  framework::Tensor* output) {
+    int dev_id = context.GetPlace().GetDeviceId();
+    platform::MLUDeviceGuard guard(dev_id);
+
+    auto ins_size = input.size();
+
+    const int axis_t = axis;
+    const int ins_size_t = ins_size;
+    auto place = context.GetPlace();
+    output->mutable_data<T>(place);
+
+    // mlu should do sth
+    // init ins tensors
+    std::vector<const void*> inputs;
+    std::vector<MLUCnnlTensorDesc> input_descs;
+    std::vector<cnnlTensorDescriptor_t> desc_vector;
+    for (size_t i = 0; i < ins_size; i++) {
+      input_descs.emplace_back(MLUCnnlTensorDesc(
+          input[i], CNNL_LAYOUT_ARRAY, ToCnnlDataType(input[i].dtype())));
+      desc_vector.push_back(input_descs.back().get());
+      inputs.push_back(input[i].data());
+    }
+    // init out tensors
+    MLUCnnlTensorDesc output_desc(*output, CNNL_LAYOUT_ARRAY,
+                                  ToCnnlDataType(output->dtype()));
+
+    // MLU should do sth
+    MLUCnnl::Concat(context, ins_size_t, axis_t, desc_vector.data(),
+                    inputs.data(), output_desc.get(), GetBasePtr(output));
+  }
+};
+
+template <typename T>
+class SplitFunctor<platform::MLUDeviceContext, T> {
+ public:
+  void operator()(const platform::MLUDeviceContext& context,
+                  const framework::Tensor& input,
+                  const std::vector<const framework::Tensor*>& ref_inputs,
+                  const int axis, std::vector<framework::Tensor*>* outputs) {
+    if (input.numel() == 0) {
+      return;
+    }
+
+    int dev_id = context.GetPlace().GetDeviceId();
+    platform::MLUDeviceGuard guard(dev_id);
+
+    auto in_dims = input.dims();
+    auto out_size = outputs->size();
+
+    std::vector<framework::DDim> outs_dims(out_size, in_dims);
+    for (size_t i = 0; i < out_size; ++i) {
+      outs_dims[i][axis] = ref_inputs[i]->dims()[axis];
+    }
+
+    // init out tensors
+    std::vector<void*> vct_tensor;
+    std::vector<MLUCnnlTensorDesc> output_descs;
+    std::vector<cnnlTensorDescriptor_t> desc_vector;
+    for (size_t i = 0; i < out_size; i++) {
+      (*outputs)[i]->Resize(outs_dims[i]);
+      (*outputs)[i]->mutable_data<T>(context.GetPlace());
+      output_descs.emplace_back(
+          MLUCnnlTensorDesc(*(*outputs)[i], CNNL_LAYOUT_ARRAY,
+                            ToCnnlDataType((*outputs)[i]->dtype())));
+      desc_vector.push_back(output_descs.back().get());
+      vct_tensor.push_back(GetBasePtr((*outputs)[i]));
+    }
+    // init in tensors
+    MLUCnnlTensorDesc input_desc(input, CNNL_LAYOUT_ARRAY,
+                                 ToCnnlDataType(input.dtype()));
+
+    // MLU should do sth
+    MLUCnnl::Split(context, out_size, axis, input_desc.get(), input.data(),
+                   desc_vector.data(), vct_tensor.data());
+  }
+};
+#endif
+
 #define DEFINE_FUNCTOR(type)                                      \
   template class ConcatFunctor<platform::CPUDeviceContext, type>; \
   template class SplitFunctor<platform::CPUDeviceContext, type>;
@@ -248,6 +335,19 @@ DEFINE_XPU_FUNCTOR(float)
 FOR_ALL_TYPES(DEFINE_NPU_FUNCTOR)
 #endif
 
+#ifdef PADDLE_WITH_MLU
+#define DEFINE_MLU_FUNCTOR(type)                                  \
+  template class ConcatFunctor<platform::MLUDeviceContext, type>; \
+  template class SplitFunctor<platform::MLUDeviceContext, type>;
+DEFINE_MLU_FUNCTOR(float)
+DEFINE_MLU_FUNCTOR(platform::float16)
+DEFINE_MLU_FUNCTOR(int64_t)
+DEFINE_MLU_FUNCTOR(bool)
+DEFINE_MLU_FUNCTOR(int)
+DEFINE_MLU_FUNCTOR(int8_t)
+DEFINE_MLU_FUNCTOR(int16_t)
+DEFINE_MLU_FUNCTOR(uint8_t)
+#endif
 }  // namespace math
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/math/depthwise_conv.h b/paddle/fluid/operators/math/depthwise_conv.h
deleted file mode 100644
index e41f0aedf39ef582b4533b1eeb6ccda1e8ed7e49..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/depthwise_conv.h
+++ /dev/null
@@ -1,72 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <vector>
-#include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/platform/device_context.h"
-#include "paddle/phi/core/hostdevice.h"
-
-namespace paddle {
-namespace operators {
-namespace math {
-
-using DataLayout = framework::DataLayout;
-
-/*
- * \brief Compute the depthwise convolution which include
- * forward process and backpropagation process
- */
-template <typename DeviceContext, typename T,
-          bool fuse_relu_before_conv = false>
-class DepthwiseConvFunctor {
- public:
-  void operator()(const DeviceContext& context, const framework::Tensor& input,
-                  const framework::Tensor& filter,
-                  const std::vector<int>& strides,
-                  const std::vector<int>& paddings,
-                  const std::vector<int>& dilations, framework::Tensor* output,
-                  const DataLayout data_layout = DataLayout::kNCHW);
-};
-
-template <typename DeviceContext, typename T,
-          bool fuse_relu_before_conv = false>
-class DepthwiseConvInputGradFunctor {
- public:
-  void operator()(const DeviceContext& context, const framework::Tensor& input,
-                  const framework::Tensor& filter,
-                  const framework::Tensor& output_grad,
-                  const std::vector<int>& strides,
-                  const std::vector<int>& paddings,
-                  const std::vector<int>& dilations,
-                  framework::Tensor* input_grad,
-                  const DataLayout data_layout = DataLayout::kNCHW);
-};
-
-template <typename DeviceContext, typename T,
-          bool fuse_relu_before_conv = false>
-class DepthwiseConvFilterGradFunctor {
- public:
-  void operator()(const DeviceContext& context, const framework::Tensor& input,
-                  const framework::Tensor& output_grad,
-                  const std::vector<int>& strides,
-                  const std::vector<int>& paddings,
-                  const std::vector<int>& dilations,
-                  framework::Tensor* filter_grad,
-                  const DataLayout data_layout = DataLayout::kNCHW);
-};
-
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/math/inclusive_scan.h b/paddle/fluid/operators/math/inclusive_scan.h
index 9994ccc10cb13b2f692b18f16182f6bcdad7efa5..b77e23450360c836ae3efe0a6dc2c77216e660f0 100644
--- a/paddle/fluid/operators/math/inclusive_scan.h
+++ b/paddle/fluid/operators/math/inclusive_scan.h
@@ -34,10 +34,10 @@ namespace paddle {
 namespace operators {
 namespace math {
 
-template <typename InputIterator, typename OutputIterator, typename BinaryOp>
+template <typename InputIterator, typename OutputIterator, typename BinaryOp,
+          typename Context>
 static void CubInclusiveScan(InputIterator x_iter, OutputIterator y_iter,
-                             size_t n, BinaryOp op,
-                             const platform::CUDADeviceContext &dev_ctx) {
+                             size_t n, BinaryOp op, const Context &dev_ctx) {
   memory::AllocationPtr allocation;
   void *temp_storage = nullptr;
   size_t temp_storage_bytes = 0;
@@ -185,11 +185,10 @@ static __global__ void InclusiveScanInnerDimCUDAKernel(const T *x, T *y,
   }
 }
 
-template <typename T, typename BinaryOp>
+template <typename T, typename BinaryOp, typename Context>
 static void InclusiveScanInnerDim(const T *x, T *y, size_t outer_dim,
                                   size_t inner_dim, T init, BinaryOp op,
-                                  bool reverse,
-                                  const platform::CUDADeviceContext &dev_ctx) {
+                                  bool reverse, const Context &dev_ctx) {
   constexpr size_t kThreadNumX = 16;
   constexpr size_t kThreadNumY = 32;
 
@@ -209,10 +208,10 @@ static void InclusiveScanInnerDim(const T *x, T *y, size_t outer_dim,
   }
 }
 
-template <typename T, typename BinaryOp>
+template <typename T, typename BinaryOp, typename Context>
 void InclusiveScan(const T *x, T *y, size_t outer_dim, size_t mid_dim,
                    size_t inner_dim, T init, BinaryOp op, bool reverse,
-                   const platform::CUDADeviceContext &dev_ctx) {
+                   const Context &dev_ctx) {
   if (outer_dim == 0 || mid_dim == 0 || inner_dim == 0) return;
 
   if (outer_dim == 1 && inner_dim == 1) {
@@ -224,8 +223,7 @@ void InclusiveScan(const T *x, T *y, size_t outer_dim, size_t mid_dim,
       CubInclusiveScan(x, y, mid_dim, op, dev_ctx);
     }
   } else if (inner_dim != 1) {
-    platform::ForRange<platform::CUDADeviceContext> for_range(
-        dev_ctx, outer_dim * inner_dim);
+    platform::ForRange<Context> for_range(dev_ctx, outer_dim * inner_dim);
     if (reverse) {
       for_range(
           InclusiveScanOuterOrMidDimFunctor<T, BinaryOp, /*kReverse=*/true>(
diff --git a/paddle/fluid/operators/math/matrix_inverse.cc b/paddle/fluid/operators/math/matrix_inverse.cc
deleted file mode 100644
index 1b36e615c68df814015a2c308ed74b755f6bc635..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/matrix_inverse.cc
+++ /dev/null
@@ -1,38 +0,0 @@
-/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/math/matrix_inverse.h"
-#include "Eigen/Core"
-#include "Eigen/LU"
-#include "paddle/phi/kernels/funcs/blas/blas.h"
-
-namespace paddle {
-namespace operators {
-namespace math {
-
-template <typename T>
-class MatrixInverseFunctor<platform::CPUDeviceContext, T> {
- public:
-  void operator()(const platform::CPUDeviceContext& context,
-                  const framework::Tensor& a, framework::Tensor* a_inv) {
-    compute_inverse_eigen<platform::CPUDeviceContext, T>(context, a, a_inv);
-  }
-};
-
-template class MatrixInverseFunctor<platform::CPUDeviceContext, float>;
-template class MatrixInverseFunctor<platform::CPUDeviceContext, double>;
-
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/math/matrix_inverse.cu.cc b/paddle/fluid/operators/math/matrix_inverse.cu.cc
deleted file mode 100644
index 41335a69417a94a567119bb8f37378af957be541..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/matrix_inverse.cu.cc
+++ /dev/null
@@ -1,124 +0,0 @@
-/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/math/matrix_inverse.h"
-#include "paddle/phi/kernels/funcs/blas/blas.h"
-
-namespace paddle {
-namespace platform {
-class CUDADeviceContext;
-}  // namespace platform
-}  // namespace paddle
-
-namespace paddle {
-namespace operators {
-namespace math {
-
-template <typename DeviceContext, typename T>
-class MatrixInverseFunctor;
-
-template <typename T>
-class MatrixInverseFunctor<platform::CUDADeviceContext, T> {
- public:
-  void operator()(const platform::CUDADeviceContext& context,
-                  const framework::Tensor& a, framework::Tensor* a_inv) {
-#ifndef PADDLE_WITH_HIP
-    const auto& mat_dims = a.dims();
-    const int rank = mat_dims.size();
-    int n = mat_dims[rank - 1];
-    int batch_size = rank > 2 ? a.numel() / (n * n) : 1;
-
-    memory::allocation::AllocationPtr tmp_gpu_mat_data;
-    const T* gpu_mat = a.data<T>();
-    if (n >= 32) {
-      // Copy all elements of input matrix A to a temporary memory space to
-      // avoid being overriden by getrf.
-      tmp_gpu_mat_data = memory::Alloc(context, a.numel() * sizeof(T));
-      memory::Copy(context.GetPlace(), tmp_gpu_mat_data->ptr(),
-                   context.GetPlace(), a.data(), a.numel() * sizeof(T),
-                   context.stream());
-      gpu_mat = reinterpret_cast<const T*>(tmp_gpu_mat_data->ptr());
-    }
-
-    std::vector<const T*> cpu_ptrs(batch_size * 2);
-    for (int i = 0; i < batch_size; ++i) {
-      cpu_ptrs[i] = gpu_mat + i * n * n;
-      cpu_ptrs[i + batch_size] = a_inv->data<T>() + i * n * n;
-    }
-
-    // Copy the addresses of A and A_inv from host to device.
-    memory::allocation::AllocationPtr tmp_gpu_ptrs_data =
-        memory::Alloc(context, cpu_ptrs.size() * sizeof(T*));
-    memory::Copy(context.GetPlace(), tmp_gpu_ptrs_data->ptr(),
-                 platform::CPUPlace(), static_cast<void*>(cpu_ptrs.data()),
-                 cpu_ptrs.size() * sizeof(T*), context.stream());
-    T** gpu_inv_ptrs =
-        reinterpret_cast<T**>(tmp_gpu_ptrs_data->ptr()) + batch_size;
-
-    // Allocate device memory for info and pivots.
-    int num_ints = n < 32 ? batch_size : batch_size * (n + 1);
-    memory::allocation::AllocationPtr tmp_gpu_info_data =
-        memory::Alloc(context, num_ints * sizeof(int));
-    int* gpu_info_ptr = reinterpret_cast<int*>(tmp_gpu_info_data->ptr());
-
-    auto blas = phi::funcs::GetBlas<platform::CUDADeviceContext, T>(context);
-
-    std::vector<int> info;  // only for singular checking
-    info.resize(batch_size);
-    // This functions in cuBLAS is intended to be used for matrices of small
-    // sizes where the launch overhead is a significant factor.
-    // TODO(Xreki): call function in cusolver for large matrices.
-    if (n < 32) {
-      // cublas<S/D>matinvBatched is a short cut of cublas<S/D>getrfBatched
-      // plus cublas<S/D>getriBatched.
-      // However it only works if N is less than 32. If not, we need to
-      // go through cublas<S/D>getrfBatched and cublas<S/D>getriBatched.
-      blas.BatchedMatInv(n,
-                         reinterpret_cast<const T**>(tmp_gpu_ptrs_data->ptr()),
-                         gpu_inv_ptrs, gpu_info_ptr, batch_size);
-    } else {
-      // This function performs the LU factorization of each matrix A by the
-      // equation P * A = L * U. L and U are written back to original matrix A,
-      // and diagonal elements of L are discarded.
-      int* gpu_pivot_ptr =
-          reinterpret_cast<int*>(tmp_gpu_info_data->ptr()) + batch_size;
-      blas.BatchedGETRF(n, reinterpret_cast<T**>(tmp_gpu_ptrs_data->ptr()),
-                        gpu_pivot_ptr, gpu_info_ptr, batch_size);
-
-      blas.BatchedGETRI(n,
-                        reinterpret_cast<const T**>(tmp_gpu_ptrs_data->ptr()),
-                        gpu_pivot_ptr, gpu_inv_ptrs, gpu_info_ptr, batch_size);
-    }
-    memory::Copy(platform::CPUPlace(), info.data(), context.GetPlace(),
-                 gpu_info_ptr, sizeof(int) * batch_size, context.stream());
-    for (int i = 0; i < batch_size; ++i) {
-      PADDLE_ENFORCE_EQ(info[i], 0,
-                        platform::errors::PreconditionNotMet(
-                            "For batch [%d]: U(%d, %d) is zero, singular U. "
-                            "Please check the matrix value and change it to a "
-                            "non-singular matrix",
-                            i, info[i], info[i]));
-    }
-#else
-    compute_inverse_eigen<platform::CUDADeviceContext, T>(context, a, a_inv);
-#endif
-  }
-};
-
-template class MatrixInverseFunctor<platform::CUDADeviceContext, float>;
-template class MatrixInverseFunctor<platform::CUDADeviceContext, double>;
-
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/math/matrix_solve.cc b/paddle/fluid/operators/math/matrix_solve.cc
index 883ee9b148654f8621b26942739730426ba7fc7d..7b239b8166644697581d0051f12b6abacc6832fa 100644
--- a/paddle/fluid/operators/math/matrix_solve.cc
+++ b/paddle/fluid/operators/math/matrix_solve.cc
@@ -34,45 +34,6 @@ class MatrixSolveFunctor<platform::CPUDeviceContext, T> {
 template class MatrixSolveFunctor<platform::CPUDeviceContext, float>;
 template class MatrixSolveFunctor<platform::CPUDeviceContext, double>;
 
-template <typename T>
-class TriangularSolveFunctor<platform::CPUDeviceContext, T> {
- public:
-  void operator()(const platform::CPUDeviceContext& context,
-                  const framework::Tensor* a, framework::Tensor* b, bool left,
-                  bool upper, bool transpose, bool unitriangular) {
-    CBLAS_SIDE side = left ? CblasLeft : CblasRight;
-    CBLAS_UPLO uplo = upper ? CblasUpper : CblasLower;
-    CBLAS_TRANSPOSE transA = transpose ? CblasTrans : CblasNoTrans;
-    CBLAS_DIAG diag = unitriangular ? CblasUnit : CblasNonUnit;
-
-    const T* a_data = a->data<T>();
-    T* b_data = b->mutable_data<T>(context.GetPlace());
-
-    int a_dim_size = a->dims().size();
-    int b_dim_size = b->dims().size();
-
-    int M = static_cast<int>(b->dims()[b_dim_size - 2]);
-    int N = static_cast<int>(b->dims()[b_dim_size - 1]);
-    auto lda = left ? std::max(1, M) : std::max(1, N);
-    auto ldb = std::max(1, N);
-
-    int batch_size = 1;
-    auto& a_dim = a->dims();
-    for (int i = 0; i < a_dim_size - 2; i++) {
-      batch_size *= a_dim[i];
-    }
-
-    auto blas = phi::funcs::GetBlas<platform::CPUDeviceContext, T>(context);
-    for (int i = 0; i < batch_size; i++) {
-      blas.TRSM(side, uplo, transA, diag, M, N, T(1), a_data + i * M * M, lda,
-                b_data + i * N * M, ldb);
-    }
-  }
-};
-
-template class TriangularSolveFunctor<platform::CPUDeviceContext, float>;
-template class TriangularSolveFunctor<platform::CPUDeviceContext, double>;
-
 }  // namespace math
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/math/matrix_solve.cu.cc b/paddle/fluid/operators/math/matrix_solve.cu.cc
index d3490ead212731f3fc6a75d61a31c11c72c9129d..737196dde1dfc26269fe083fe17037c829ef8109 100644
--- a/paddle/fluid/operators/math/matrix_solve.cu.cc
+++ b/paddle/fluid/operators/math/matrix_solve.cu.cc
@@ -161,67 +161,6 @@ class MatrixSolveFunctor<platform::CUDADeviceContext, T> {
 template class MatrixSolveFunctor<platform::CUDADeviceContext, float>;
 template class MatrixSolveFunctor<platform::CUDADeviceContext, double>;
 
-template <typename T>
-class TriangularSolveFunctor<platform::CUDADeviceContext, T> {
- public:
-  void operator()(const platform::CUDADeviceContext& context, const Tensor* a,
-                  Tensor* b, bool left, bool upper, bool transpose,
-                  bool unitriangular) {
-    CBLAS_SIDE side = left ? CblasLeft : CblasRight;
-    CBLAS_UPLO uplo = upper ? CblasUpper : CblasLower;
-    CBLAS_TRANSPOSE transA = transpose ? CblasTrans : CblasNoTrans;
-    CBLAS_DIAG diag = unitriangular ? CblasUnit : CblasNonUnit;
-
-    const T* a_data = a->data<T>();
-    T* b_data = b->mutable_data<T>(context.GetPlace());
-
-    int a_dim_size = a->dims().size();
-    int b_dim_size = b->dims().size();
-
-    int M = static_cast<int>(b->dims()[b_dim_size - 2]);
-    int N = static_cast<int>(b->dims()[b_dim_size - 1]);
-    auto lda = left ? std::max(1, M) : std::max(1, N);
-    auto ldb = std::max(1, N);
-
-    int batch_size = 1;
-    auto& a_dim = a->dims();
-    for (int i = 0; i < a_dim_size - 2; i++) {
-      batch_size *= a_dim[i];
-    }
-
-    auto blas = phi::funcs::GetBlas<platform::CUDADeviceContext, T>(context);
-    if (batch_size <= 8 && M >= 64) {
-      for (auto i = 0; i < batch_size; i++) {
-        blas.TRSM(side, uplo, transA, diag, M, N, static_cast<T>(1.0),
-                  a_data + i * M * M, lda, b_data + i * N * M, ldb);
-      }
-    } else {
-      std::vector<const T*> cpu_ptrs(batch_size * 2);
-      for (int i = 0; i < batch_size; ++i) {
-        cpu_ptrs[i] = a_data + i * M * M;
-        cpu_ptrs[i + batch_size] = b_data + i * M * N;
-      }
-
-      // Copy the addresses of A and tmp_b from host to device.
-      memory::allocation::AllocationPtr tmp_gpu_ptrs_data =
-          memory::Alloc(context, cpu_ptrs.size() * sizeof(T*));
-      memory::Copy(context.GetPlace(), tmp_gpu_ptrs_data->ptr(),
-                   platform::CPUPlace(), static_cast<void*>(cpu_ptrs.data()),
-                   cpu_ptrs.size() * sizeof(T*), context.stream());
-
-      const T** gpu_a_ptrs =
-          reinterpret_cast<const T**>(tmp_gpu_ptrs_data->ptr());
-      T** gpu_b_ptrs =
-          reinterpret_cast<T**>(tmp_gpu_ptrs_data->ptr()) + batch_size;
-      blas.BatchedTRSM(side, uplo, transA, diag, M, N, static_cast<T>(1.0),
-                       gpu_a_ptrs, lda, gpu_b_ptrs, ldb, batch_size);
-    }
-  }
-};
-
-template class TriangularSolveFunctor<platform::CUDADeviceContext, float>;
-template class TriangularSolveFunctor<platform::CUDADeviceContext, double>;
-
 }  // namespace math
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/math/matrix_solve.h b/paddle/fluid/operators/math/matrix_solve.h
index 1dc43205592f69cc105b43fe49b2f7872f8251c3..415d0c6dd8e0cf51958783c32aa49c66cce9e15c 100644
--- a/paddle/fluid/operators/math/matrix_solve.h
+++ b/paddle/fluid/operators/math/matrix_solve.h
@@ -117,14 +117,6 @@ class MatrixSolveFunctor {
                   const framework::Tensor& b, framework::Tensor* out);
 };
 
-template <typename DeviceContext, typename T>
-class TriangularSolveFunctor {
- public:
-  void operator()(const DeviceContext& context, const framework::Tensor* a,
-                  framework::Tensor* b, bool left, bool upper, bool transpose,
-                  bool unitriangular);
-};
-
 }  // namespace math
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/math/maxouting.cc b/paddle/fluid/operators/math/maxouting.cc
index 45556e97d1d7afb81d626c99b078cbc215c0195f..28ec3a871022f4b9ec4dce9d9310fd630f10e473 100644
--- a/paddle/fluid/operators/math/maxouting.cc
+++ b/paddle/fluid/operators/math/maxouting.cc
@@ -14,106 +14,107 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/math/maxouting.h"
 
+#include "paddle/phi/backends/cpu/cpu_context.h"
+
 namespace paddle {
 namespace operators {
 namespace math {
 
 // All tensors are in NCHW or NHWC format, and the groups must be greater than 1
-template <typename T>
-class MaxOutFunctor<platform::CPUDeviceContext, T> {
- public:
-  void operator()(const platform::CPUDeviceContext& context,
-                  const framework::Tensor& input, framework::Tensor* output,
-                  const int groups, const int axis) {
-    const int batch_size = input.dims()[0];
-    const int input_height = (axis == 1 ? input.dims()[2] : input.dims()[1]);
-    const int input_width = (axis == 1 ? input.dims()[3] : input.dims()[2]);
-    const int output_channels = output->dims()[axis];
-    int fea_size = input_height * input_width;
-    // c_size means the output size of each sample
-    int c_size = fea_size * output_channels;
-    const T* input_data = input.data<T>();
-    T* output_data = output->mutable_data<T>(context.GetPlace());
-    for (int i = 0; i < batch_size; ++i) {
-      int new_bindex = c_size * i;
-      for (int c = 0; c < output_channels; ++c) {
-        int new_cindex = fea_size * c;
-        for (int f = 0; f < fea_size; ++f) {
-          T ele = static_cast<T>(-FLT_MAX);
-          int input_idx, output_idx;
-          for (int ph = 0; ph < groups; ++ph) {
-            if (axis == 1) {
-              input_idx =
-                  (new_bindex + new_cindex) * groups + ph * fea_size + f;
-            } else {
-              input_idx = (new_bindex + f * output_channels + c) * groups + ph;
-            }
-            T x = input_data[input_idx];
-            ele = ele > x ? ele : x;
-          }
+template <typename DeviceContext, typename T>
+void MaxOutFunctor<DeviceContext, T>::operator()(const DeviceContext& context,
+                                                 const framework::Tensor& input,
+                                                 framework::Tensor* output,
+                                                 const int groups,
+                                                 const int axis) {
+  const int batch_size = input.dims()[0];
+  const int input_height = (axis == 1 ? input.dims()[2] : input.dims()[1]);
+  const int input_width = (axis == 1 ? input.dims()[3] : input.dims()[2]);
+  const int output_channels = output->dims()[axis];
+  int fea_size = input_height * input_width;
+  // c_size means the output size of each sample
+  int c_size = fea_size * output_channels;
+  const T* input_data = input.data<T>();
+  T* output_data = output->mutable_data<T>(context.GetPlace());
+  for (int i = 0; i < batch_size; ++i) {
+    int new_bindex = c_size * i;
+    for (int c = 0; c < output_channels; ++c) {
+      int new_cindex = fea_size * c;
+      for (int f = 0; f < fea_size; ++f) {
+        T ele = static_cast<T>(-FLT_MAX);
+        int input_idx, output_idx;
+        for (int ph = 0; ph < groups; ++ph) {
           if (axis == 1) {
-            output_idx = new_bindex + new_cindex + f;
+            input_idx = (new_bindex + new_cindex) * groups + ph * fea_size + f;
           } else {
-            output_idx = new_bindex + f * output_channels + c;
+            input_idx = (new_bindex + f * output_channels + c) * groups + ph;
           }
-          output_data[output_idx] = ele;
+          T x = input_data[input_idx];
+          ele = ele > x ? ele : x;
         }
+        if (axis == 1) {
+          output_idx = new_bindex + new_cindex + f;
+        } else {
+          output_idx = new_bindex + f * output_channels + c;
+        }
+        output_data[output_idx] = ele;
       }
     }
   }
-};
+}
 
-template <class T>
-class MaxOutGradFunctor<platform::CPUDeviceContext, T> {
- public:
-  void operator()(const platform::CPUDeviceContext& context,
-                  const framework::Tensor& input, framework::Tensor* input_grad,
-                  const framework::Tensor& output,
-                  const framework::Tensor& output_grad, const int groups,
-                  const int axis) {
-    const int batch_size = input.dims()[0];
-    const int input_height = (axis == 1 ? input.dims()[2] : input.dims()[1]);
-    const int input_width = (axis == 1 ? input.dims()[3] : input.dims()[2]);
-    const int output_channels = output.dims()[axis];
-    int fea_size = input_height * input_width;
-    const T* input_data = input.data<T>();
-    const T* output_data = output.data<T>();
-    const T* output_grad_data = output_grad.data<T>();
-    T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
+template <typename DeviceContext, typename T>
+void MaxOutGradFunctor<DeviceContext, T>::operator()(
+    const DeviceContext& context, const framework::Tensor& input,
+    framework::Tensor* input_grad, const framework::Tensor& output,
+    const framework::Tensor& output_grad, const int groups, const int axis) {
+  const int batch_size = input.dims()[0];
+  const int input_height = (axis == 1 ? input.dims()[2] : input.dims()[1]);
+  const int input_width = (axis == 1 ? input.dims()[3] : input.dims()[2]);
+  const int output_channels = output.dims()[axis];
+  int fea_size = input_height * input_width;
+  const T* input_data = input.data<T>();
+  const T* output_data = output.data<T>();
+  const T* output_grad_data = output_grad.data<T>();
+  T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
 
-    for (int i = 0; i < batch_size; ++i) {
-      int blen = fea_size * output_channels * i;
-      for (int c = 0; c < output_channels; ++c) {
-        int clen = fea_size * c;
-        for (int f = 0; f < fea_size; ++f) {
-          int input_idx0, output_idx;
-          bool continue_match = true;
-          if (axis == 1) {
-            input_idx0 = (blen + clen) * groups + f;
-            output_idx = blen + clen + f;
-          } else {
-            input_idx0 = (blen + f * output_channels + c) * groups;
-            output_idx = blen + f * output_channels + c;
-          }
-          for (int g = 0; g < groups && continue_match; ++g) {
-            int idx_offset = (axis == 1 ? fea_size * g : g);
-            int input_idx = input_idx0 + idx_offset;
-            if (input_data[input_idx] == output_data[output_idx]) {
-              input_grad_data[input_idx] += output_grad_data[output_idx];
-              continue_match = false;
-            }
+  for (int i = 0; i < batch_size; ++i) {
+    int blen = fea_size * output_channels * i;
+    for (int c = 0; c < output_channels; ++c) {
+      int clen = fea_size * c;
+      for (int f = 0; f < fea_size; ++f) {
+        int input_idx0, output_idx;
+        bool continue_match = true;
+        if (axis == 1) {
+          input_idx0 = (blen + clen) * groups + f;
+          output_idx = blen + clen + f;
+        } else {
+          input_idx0 = (blen + f * output_channels + c) * groups;
+          output_idx = blen + f * output_channels + c;
+        }
+        for (int g = 0; g < groups && continue_match; ++g) {
+          int idx_offset = (axis == 1 ? fea_size * g : g);
+          int input_idx = input_idx0 + idx_offset;
+          if (input_data[input_idx] == output_data[output_idx]) {
+            input_grad_data[input_idx] += output_grad_data[output_idx];
+            continue_match = false;
           }
         }
       }
     }
   }
-};
+}
 
 template class MaxOutGradFunctor<platform::CPUDeviceContext, float>;
 template class MaxOutGradFunctor<platform::CPUDeviceContext, double>;
 template class MaxOutFunctor<platform::CPUDeviceContext, float>;
 template class MaxOutFunctor<platform::CPUDeviceContext, double>;
 
+template class MaxOutGradFunctor<phi::CPUContext, float>;
+template class MaxOutGradFunctor<phi::CPUContext, double>;
+template class MaxOutFunctor<phi::CPUContext, float>;
+template class MaxOutFunctor<phi::CPUContext, double>;
+
 }  // namespace math
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/math/maxouting.cu b/paddle/fluid/operators/math/maxouting.cu
index 1856fb4eb48c73f96d4f6428ba890c821a61292c..1d0478db5ef4a80d955d1166ffa21ff39f6bd184 100644
--- a/paddle/fluid/operators/math/maxouting.cu
+++ b/paddle/fluid/operators/math/maxouting.cu
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/math/maxouting.h"
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/phi/backends/gpu/gpu_context.h"
 
 namespace paddle {
 namespace operators {
@@ -95,61 +96,57 @@ __global__ void KernelMaxoutGrad(const int nthreads, const T* input_data,
 /*
  * All tensors are in NCHW or NHWC format.
  */
-template <typename T>
-class MaxOutFunctor<platform::CUDADeviceContext, T> {
- public:
-  void operator()(const platform::CUDADeviceContext& context,
-                  const framework::Tensor& input, framework::Tensor* output,
-                  const int groups, const int axis) {
-    const int batch_size = input.dims()[0];
-    const int input_channels = input.dims()[axis];
-    const int input_height = (axis == 1 ? input.dims()[2] : input.dims()[1]);
-    const int input_width = (axis == 1 ? input.dims()[3] : input.dims()[2]);
-    const int output_channels = output->dims()[axis];
-
-    const T* input_data = input.data<T>();
-    T* output_data = output->mutable_data<T>(context.GetPlace());
-    int nthreads = output->numel();
-    int blocks = (nthreads + 1024 - 1) / 1024;
-    dim3 threads(1024, 1);
-    dim3 grid(blocks, 1);
-
-    KernelMaxOut<T><<<grid, threads, 0, context.stream()>>>(
-        nthreads, input_data, input_channels, input_height, input_width, groups,
-        axis, output_data);
-  }
-};
+template <typename DeviceContext, typename T>
+void MaxOutFunctor<DeviceContext, T>::operator()(const DeviceContext& context,
+                                                 const framework::Tensor& input,
+                                                 framework::Tensor* output,
+                                                 const int groups,
+                                                 const int axis) {
+  const int batch_size = input.dims()[0];
+  const int input_channels = input.dims()[axis];
+  const int input_height = (axis == 1 ? input.dims()[2] : input.dims()[1]);
+  const int input_width = (axis == 1 ? input.dims()[3] : input.dims()[2]);
+  const int output_channels = output->dims()[axis];
+
+  const T* input_data = input.data<T>();
+  T* output_data = output->mutable_data<T>(context.GetPlace());
+  int nthreads = output->numel();
+  int blocks = (nthreads + 1024 - 1) / 1024;
+  dim3 threads(1024, 1);
+  dim3 grid(blocks, 1);
+
+  KernelMaxOut<T><<<grid, threads, 0, context.stream()>>>(
+      nthreads, input_data, input_channels, input_height, input_width, groups,
+      axis, output_data);
+}
+
 /*
  * All tensors are in NCHW or NHWC format.
  */
-template <typename T>
-class MaxOutGradFunctor<platform::CUDADeviceContext, T> {
- public:
-  void operator()(const platform::CUDADeviceContext& context,
-                  const framework::Tensor& input, framework::Tensor* input_grad,
-                  const framework::Tensor& output,
-                  const framework::Tensor& output_grad, const int groups,
-                  const int axis) {
-    const int batch_size = input.dims()[0];
-    const int input_channels = input.dims()[axis];
-    const int input_height = (axis == 1 ? input.dims()[2] : input.dims()[1]);
-    const int input_width = (axis == 1 ? input.dims()[3] : input.dims()[2]);
-    const int output_channels = output.dims()[axis];
-
-    const T* input_data = input.data<T>();
-    const T* output_data = output.data<T>();
-    const T* output_grad_data = output_grad.data<T>();
-    T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
-    int nthreads = output.numel();
-    int blocks = (nthreads + 1024 - 1) / 1024;
-    dim3 threads(1024, 1);
-    dim3 grid(blocks, 1);
-
-    KernelMaxoutGrad<T><<<grid, threads, 0, context.stream()>>>(
-        nthreads, input_data, output_data, output_grad_data, input_grad_data,
-        input_channels, input_height, input_width, groups, axis);
-  }
-};
+template <typename DeviceContext, typename T>
+void MaxOutGradFunctor<DeviceContext, T>::operator()(
+    const DeviceContext& context, const framework::Tensor& input,
+    framework::Tensor* input_grad, const framework::Tensor& output,
+    const framework::Tensor& output_grad, const int groups, const int axis) {
+  const int batch_size = input.dims()[0];
+  const int input_channels = input.dims()[axis];
+  const int input_height = (axis == 1 ? input.dims()[2] : input.dims()[1]);
+  const int input_width = (axis == 1 ? input.dims()[3] : input.dims()[2]);
+  const int output_channels = output.dims()[axis];
+
+  const T* input_data = input.data<T>();
+  const T* output_data = output.data<T>();
+  const T* output_grad_data = output_grad.data<T>();
+  T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
+  int nthreads = output.numel();
+  int blocks = (nthreads + 1024 - 1) / 1024;
+  dim3 threads(1024, 1);
+  dim3 grid(blocks, 1);
+
+  KernelMaxoutGrad<T><<<grid, threads, 0, context.stream()>>>(
+      nthreads, input_data, output_data, output_grad_data, input_grad_data,
+      input_channels, input_height, input_width, groups, axis);
+}
 
 template class MaxOutGradFunctor<platform::CUDADeviceContext, float>;
 template class MaxOutGradFunctor<platform::CUDADeviceContext, double>;
@@ -157,6 +154,12 @@ template class MaxOutGradFunctor<platform::CUDADeviceContext, double>;
 template class MaxOutFunctor<platform::CUDADeviceContext, float>;
 template class MaxOutFunctor<platform::CUDADeviceContext, double>;
 
+template class MaxOutGradFunctor<phi::GPUContext, float>;
+template class MaxOutGradFunctor<phi::GPUContext, double>;
+
+template class MaxOutFunctor<phi::GPUContext, float>;
+template class MaxOutFunctor<phi::GPUContext, double>;
+
 }  // namespace math
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/math/maxouting.h b/paddle/fluid/operators/math/maxouting.h
index 0d8372df8a2fec306f6091712c66d55d1e71216e..1f4964f7715426d2eab6168ae009ffbd40e1ff0a 100644
--- a/paddle/fluid/operators/math/maxouting.h
+++ b/paddle/fluid/operators/math/maxouting.h
@@ -30,7 +30,7 @@ class MaxOutFunctor {
                   const int axis = 1);
 };
 
-template <typename DeviceContext, class T>
+template <typename DeviceContext, typename T>
 class MaxOutGradFunctor {
  public:
   void operator()(const DeviceContext& context, const framework::Tensor& input,
diff --git a/paddle/fluid/operators/math/pooling.h b/paddle/fluid/operators/math/pooling.h
deleted file mode 100644
index dfd3dad38644b65ef0b5e62e1b54ce210e9c489a..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/pooling.h
+++ /dev/null
@@ -1,315 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <string>
-#include <vector>
-
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/operators/amp/fp16_type_traits.h"
-#include "paddle/fluid/platform/device_context.h"
-#include "paddle/fluid/platform/macros.h"
-#include "paddle/phi/core/hostdevice.h"
-
-namespace paddle {
-namespace operators {
-namespace math {
-
-/*
- * \brief Extracting simple operations from pooling.
- *        Both MaxPool and AvgPool need "initial", "compute" and "finalize"
- * operation.
- *        MaxPool initializes temp variable to the negative maximum to find the
- * maximum value in the pooling field.
- *        AvgPool initializes temp variable to the zero to accumulate all values
- * in pool pooling, and finally takes the average.
- *        MaxPoolGrad and AvgPoolGrad are gradient operations respectively.
- */
-template <class T>
-class MaxPool {
- public:
-  DEVICE inline T initial() { return static_cast<T>(-FLT_MAX); }
-  DEVICE inline void compute(const T& x, T* y) { *y = *y > x ? *y : x; }
-  DEVICE inline void finalize(const T& pool_field, T* y) {}
-};
-
-template <class T>
-class AvgPool {
-  using MT = typename details::MPTypeTrait<T>::Type;
-  MT intermediate_res;
-
- public:
-  DEVICE inline T initial() {
-    intermediate_res = static_cast<MT>(0.0f);
-    return static_cast<T>(0);
-  }
-
-  DEVICE inline void compute(const T& x, T* y) {
-    intermediate_res += static_cast<MT>(x);
-  }
-
-  DEVICE inline void finalize(const T& pool_field, T* y) {
-    *y = static_cast<T>(intermediate_res / (static_cast<MT>(pool_field)));
-  }
-};
-
-template <class T>
-class MaxPoolGrad {
- public:
-  static constexpr bool use_x = true;
-  HOSTDEVICE inline void compute(const T& x, const T& y, const T& dy, T scale,
-                                 T* dx) {
-    *dx += dy * static_cast<T>(x == y);
-  }
-};
-
-template <class T>
-class AvgPoolGrad {
- public:
-  static constexpr bool use_x = false;
-  HOSTDEVICE inline void compute(const T& x, const T& y, const T& dy, T scale,
-                                 T* dx) {
-    *dx += (scale * dy);
-  }
-};
-
-/* used for adaptive pool to calculate start and end index of each divided grid
- */
-HOSTDEVICE inline int AdaptStartIndex(int ph, int input_size, int output_size) {
-  return static_cast<int>(
-      floor(static_cast<double>(ph * input_size) / output_size));
-}
-
-HOSTDEVICE inline int AdaptEndIndex(int ph, int input_size, int output_size) {
-  return static_cast<int>(
-      ceil(static_cast<double>((ph + 1) * input_size) / output_size));
-}
-
-/*
- * \brief Getting pooling results, and calculating gradient.
- *
- * In pool2d, all Tensors are in NCHW or NHWC format. Where N is batch size, C
- * is the number of channels, H and W is the height and width of feature.
- * In pool3d, all Tensors are in NCDHW or NDHWC format. Where N is batch size, C
- * is the number of channels, D, H and W is the depth, height and width of
- * feature.
- *
- * In max pooling, it is possible that the pooling region has multiple maximum
- * elements. In this case, we should compute the gradient of the first maximum
- * element.
- * This is different from average pooling. So we rewrite the max_pool_grad:
- * MaxPool2dGradFunctor, MaxPool3dGradFunctor.
- */
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-template <typename PoolProcess, typename T>
-class Pool2dDirectCUDAFunctor {
- public:
-  void operator()(const T* input, const std::vector<int>& input_shape,
-                  const std::vector<int>& output_shape,
-                  const std::vector<int>& ksize,
-                  const std::vector<int>& strides,
-                  const std::vector<int>& paddings, bool exclusive,
-                  bool adaptive, T* output, gpuStream_t stream,
-                  PoolProcess pool_compute);
-};
-#endif
-
-template <typename DeviceContext, typename PoolProcess, typename T>
-class Pool2dFunctor {
- public:
-  void operator()(const DeviceContext& context, const framework::Tensor& input,
-                  const std::vector<int>& ksize,
-                  const std::vector<int>& strides,
-                  const std::vector<int>& paddings, bool exclusive,
-                  bool adaptive, framework::Tensor* output,
-                  PoolProcess pool_compute);
-
-  // overload operator() to support argument data_format
-  void operator()(const DeviceContext& context, const framework::Tensor& input,
-                  const std::vector<int>& ksize,
-                  const std::vector<int>& strides,
-                  const std::vector<int>& paddings,
-                  const std::string data_format, bool exclusive, bool adaptive,
-                  framework::Tensor* output, PoolProcess pool_compute);
-};
-
-template <typename DeviceContext, typename PoolProcess, typename T>
-class Pool2dGradFunctor {
- public:
-  void operator()(const DeviceContext& context, const framework::Tensor& input,
-                  const framework::Tensor& output,
-                  const framework::Tensor& output_grad,
-                  const std::vector<int>& ksize,
-                  const std::vector<int>& strides,
-                  const std::vector<int>& paddings, bool exclusive,
-                  bool adaptive, framework::Tensor* input_grad,
-                  PoolProcess pool_compute);
-  // overload operator() to support argument data_format
-  void operator()(const DeviceContext& context, const framework::Tensor& input,
-                  const framework::Tensor& output,
-                  const framework::Tensor& output_grad,
-                  const std::vector<int>& ksize,
-                  const std::vector<int>& strides,
-                  const std::vector<int>& paddings,
-                  const std::string data_format, bool exclusive, bool adaptive,
-                  framework::Tensor* input_grad, PoolProcess pool_compute);
-};
-
-template <typename DeviceContext, class T>
-class MaxPool2dGradFunctor {
- public:
-  void operator()(const DeviceContext& context, const framework::Tensor& input,
-                  const framework::Tensor& output,
-                  const framework::Tensor& output_grad,
-                  const std::vector<int>& ksize,
-                  const std::vector<int>& strides,
-                  const std::vector<int>& paddings,
-                  framework::Tensor* input_grad);
-  // overload operator() to support argument data_format
-  void operator()(const DeviceContext& context, const framework::Tensor& input,
-                  const framework::Tensor& output,
-                  const framework::Tensor& output_grad,
-                  const std::vector<int>& ksize,
-                  const std::vector<int>& strides,
-                  const std::vector<int>& paddings,
-                  const std::string data_format, framework::Tensor* input_grad);
-};
-
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-template <typename PoolProcess, typename T>
-class Pool3dDirectCUDAFunctor {
- public:
-  void operator()(const T* input, const std::vector<int>& input_shape,
-                  const std::vector<int>& output_shape,
-                  const std::vector<int>& ksize,
-                  const std::vector<int>& strides,
-                  const std::vector<int>& paddings, bool exclusive,
-                  bool adaptive, T* output, gpuStream_t stream,
-                  PoolProcess pool_compute);
-};
-#endif
-
-template <typename DeviceContext, typename PoolProcess, typename T>
-class Pool3dFunctor {
- public:
-  void operator()(const DeviceContext& context, const framework::Tensor& input,
-                  const std::vector<int>& ksize,
-                  const std::vector<int>& strides,
-                  const std::vector<int>& paddings, bool exclusive,
-                  bool adaptive, framework::Tensor* output,
-                  PoolProcess pool_compute);
-  // overload operator() to support argument data_format
-  void operator()(const DeviceContext& context, const framework::Tensor& input,
-                  const std::vector<int>& ksize,
-                  const std::vector<int>& strides,
-                  const std::vector<int>& paddings,
-                  const std::string data_format, bool exclusive, bool adaptive,
-                  framework::Tensor* output, PoolProcess pool_compute);
-};
-
-template <typename DeviceContext, typename PoolProcess, typename T>
-class Pool3dGradFunctor {
- public:
-  void operator()(const DeviceContext& context, const framework::Tensor& input,
-                  const framework::Tensor& output,
-                  const framework::Tensor& output_grad,
-                  const std::vector<int>& ksize,
-                  const std::vector<int>& strides,
-                  const std::vector<int>& paddings, bool exclusive,
-                  bool adaptive, framework::Tensor* input_grad,
-                  PoolProcess pool_compute);
-  // overload operator() to support argument data_format
-  void operator()(const DeviceContext& context, const framework::Tensor& input,
-                  const framework::Tensor& output,
-                  const framework::Tensor& output_grad,
-                  const std::vector<int>& ksize,
-                  const std::vector<int>& strides,
-                  const std::vector<int>& paddings,
-                  const std::string data_format, bool exclusive, bool adaptive,
-                  framework::Tensor* input_grad, PoolProcess pool_compute);
-};
-
-template <typename DeviceContext, class T>
-class MaxPool3dGradFunctor {
- public:
-  void operator()(const DeviceContext& context, const framework::Tensor& input,
-                  const framework::Tensor& output,
-                  const framework::Tensor& output_grad,
-                  const std::vector<int>& ksize,
-                  const std::vector<int>& strides,
-                  const std::vector<int>& paddings,
-                  framework::Tensor* input_grad);
-  // overload operator() to support argument data_format
-  void operator()(const DeviceContext& context, const framework::Tensor& input,
-                  const framework::Tensor& output,
-                  const framework::Tensor& output_grad,
-                  const std::vector<int>& ksize,
-                  const std::vector<int>& strides,
-                  const std::vector<int>& paddings,
-                  const std::string data_format, framework::Tensor* input_grad);
-};
-
-/*
- * \brief Getting max pooling results and corresponding max index, and
- * calculating gradient.
- * In up-sampling-pooling, it is necessary to know max element index.
- * In pool2d, all tensors are in NCHW format. In pool3d, all tensors are in
- * NCDHW format.
- */
-template <typename DeviceContext, typename T1, typename T2>
-class MaxPool2dWithIndexFunctor {
- public:
-  void operator()(const DeviceContext& context, const framework::Tensor& input,
-                  const std::vector<int>& ksize,
-                  const std::vector<int>& strides,
-                  const std::vector<int>& paddings, bool adaptive,
-                  framework::Tensor* output, framework::Tensor* mask);
-};
-
-template <typename DeviceContext, typename T1, typename T2>
-class MaxPool2dWithIndexGradFunctor {
- public:
-  void operator()(const DeviceContext& context,
-                  const framework::Tensor& output_grad,
-                  const framework::Tensor& mask, const std::vector<int>& ksize,
-                  const std::vector<int>& strides,
-                  const std::vector<int>& paddings, bool adaptive,
-                  framework::Tensor* input_grad);
-};
-
-template <typename DeviceContext, typename T1, typename T2>
-class MaxPool3dWithIndexFunctor {
- public:
-  void operator()(const DeviceContext& context, const framework::Tensor& input,
-                  const std::vector<int>& ksize,
-                  const std::vector<int>& strides,
-                  const std::vector<int>& paddings, bool adaptive,
-                  framework::Tensor* output, framework::Tensor* mask);
-};
-
-template <typename DeviceContext, typename T1, typename T2>
-class MaxPool3dWithIndexGradFunctor {
- public:
-  void operator()(const DeviceContext& context,
-                  const framework::Tensor& output_grad,
-                  const framework::Tensor& mask, const std::vector<int>& ksize,
-                  const std::vector<int>& strides,
-                  const std::vector<int>& paddings, bool adaptive,
-                  framework::Tensor* input_grad);
-};
-
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/math/vol2col.cc b/paddle/fluid/operators/math/vol2col.cc
index 42bf1f471deb5238fdb34dcd9284972930305f58..bc5a589ed6fb137c5013253a65971dcf80d4ac72 100644
--- a/paddle/fluid/operators/math/vol2col.cc
+++ b/paddle/fluid/operators/math/vol2col.cc
@@ -14,6 +14,8 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/math/vol2col.h"
 
+#include "paddle/phi/backends/cpu/cpu_context.h"
+
 namespace paddle {
 namespace platform {
 class CPUDeviceContext;
@@ -141,6 +143,116 @@ class Vol2ColFunctor<platform::CPUDeviceContext, T> {
   }
 };
 
+template <class T>
+class Vol2ColFunctor<phi::CPUContext, T> {
+ public:
+  void operator()(const phi::CPUContext& context, const framework::Tensor& vol,
+                  const std::vector<int>& dilations,
+                  const std::vector<int>& strides,
+                  const std::vector<int>& paddings, framework::Tensor* col,
+                  const DataLayout data_layout) const {
+    PADDLE_ENFORCE_EQ(vol.dims().size(), 4,
+                      platform::errors::InvalidArgument(
+                          "The dimension of vol should be 4, but received %d.",
+                          vol.dims().size()));
+
+    PADDLE_ENFORCE_EQ(col->dims().size(), 7,
+                      platform::errors::InvalidArgument(
+                          "The dimension of col should be 7, but received %d.",
+                          col->dims().size()));
+
+    int input_channels =
+        (data_layout != DataLayout::kNHWC ? vol.dims()[0] : vol.dims()[3]);
+    int input_depth =
+        (data_layout != DataLayout::kNHWC ? vol.dims()[1] : vol.dims()[0]);
+    int input_height =
+        (data_layout != DataLayout::kNHWC ? vol.dims()[2] : vol.dims()[1]);
+    int input_width =
+        (data_layout != DataLayout::kNHWC ? vol.dims()[3] : vol.dims()[2]);
+    int filter_depth = col->dims()[1];
+    int filter_height = col->dims()[2];
+    int filter_width = col->dims()[3];
+    int output_depth = col->dims()[4];
+    int output_height = col->dims()[5];
+    int output_width = col->dims()[6];
+    int channels_col =
+        input_channels * filter_depth * filter_height * filter_width;
+
+    // changed
+    bool paddings_size_is_6 = (paddings.size() == 6);
+    int pad_d_forth = paddings_size_is_6 ? paddings[0] : paddings[0];
+    int pad_d_back = paddings_size_is_6 ? paddings[1] : paddings[0];
+    int pad_h_up = paddings_size_is_6 ? paddings[2] : paddings[1];
+    int pad_h_down = paddings_size_is_6 ? paddings[3] : paddings[1];
+    int pad_w_left = paddings_size_is_6 ? paddings[4] : paddings[2];
+    int pad_w_right = paddings_size_is_6 ? paddings[5] : paddings[2];
+
+    auto input_depth_tmp = (input_depth + pad_d_forth + pad_d_back -
+                            ((dilations[0] * (filter_depth - 1) + 1))) /
+                               strides[0] +
+                           1;
+    PADDLE_ENFORCE_EQ(
+        input_depth_tmp, output_depth,
+        platform::errors::InvalidArgument(
+            "input_depth(%d) and output_depth(%d) are mismatching.",
+            input_depth_tmp, output_depth));
+    auto input_height_tmp = (input_height + pad_h_up + pad_h_down -
+                             ((dilations[1] * (filter_height - 1) + 1))) /
+                                strides[1] +
+                            1;
+    PADDLE_ENFORCE_EQ(
+        input_height_tmp, output_height,
+        platform::errors::InvalidArgument(
+            "input_height(%d) and output_height(%d) are mismatching.",
+            input_height_tmp, output_height));
+    auto input_width_tmp = (input_width + pad_w_left + pad_w_right -
+                            ((dilations[2] * (filter_width - 1) + 1))) /
+                               strides[2] +
+                           1;
+    PADDLE_ENFORCE_EQ(
+        input_width_tmp, output_width,
+        platform::errors::InvalidArgument(
+            "input_width(%d) and output_width(%d) are mismatching.",
+            input_width_tmp, output_width));
+    const T* vol_data = vol.data<T>();
+    T* col_data = col->data<T>();
+
+    for (int c = 0; c < channels_col; ++c) {
+      int w_offset = c % filter_width;
+      int h_offset = (c / filter_width) % filter_height;
+      int d_offset = (c / filter_width / filter_height) % filter_depth;
+      int c_in = c / filter_width / filter_height / filter_depth;
+      for (int d = 0; d < output_depth; ++d) {
+        int d_pad = d * strides[0] - pad_d_forth + d_offset * dilations[0];
+        for (int h = 0; h < output_height; ++h) {
+          int h_pad = h * strides[1] - pad_h_up + h_offset * dilations[1];
+          for (int w = 0; w < output_width; ++w) {
+            int w_pad = w * strides[2] - pad_w_left + w_offset * dilations[2];
+
+            int col_idx =
+                ((c * output_depth + d) * output_height + h) * output_width + w;
+            int vol_idx;
+            if (data_layout != DataLayout::kNHWC) {
+              vol_idx = ((c_in * input_depth + d_pad) * input_height + h_pad) *
+                            input_width +
+                        w_pad;
+            } else {
+              vol_idx = ((d_pad * input_height + h_pad) * input_width + w_pad) *
+                            input_channels +
+                        c_in;
+            }
+            col_data[col_idx] =
+                (h_pad < 0 || h_pad >= input_height || w_pad < 0 ||
+                 w_pad >= input_width || d_pad < 0 || d_pad >= input_depth)
+                    ? static_cast<T>(0)
+                    : vol_data[vol_idx];
+          }
+        }
+      }
+    }
+  }
+};
+
 /*
  * vol = [input_channels,input_depth, input_height, input_width]
  * col =
@@ -258,10 +370,125 @@ class Col2VolFunctor<platform::CPUDeviceContext, T> {
   }
 };
 
+template <class T>
+class Col2VolFunctor<phi::CPUContext, T> {
+ public:
+  void operator()(const phi::CPUContext& context, const framework::Tensor& col,
+                  const std::vector<int>& dilations,
+                  const std::vector<int>& strides,
+                  const std::vector<int>& paddings, framework::Tensor* vol,
+                  const DataLayout data_layout) const {
+    PADDLE_ENFORCE_EQ(vol->dims().size(), 4,
+                      platform::errors::InvalidArgument(
+                          "The dimension of vol should be 4, but received %d.",
+                          vol->dims().size()));
+
+    PADDLE_ENFORCE_EQ(col.dims().size(), 7,
+                      platform::errors::InvalidArgument(
+                          "The dimension of col  should be 7, but received %d.",
+                          col.dims().size()));
+
+    int input_channels =
+        (data_layout != DataLayout::kNHWC ? vol->dims()[0] : vol->dims()[3]);
+    int input_depth =
+        (data_layout != DataLayout::kNHWC ? vol->dims()[1] : vol->dims()[0]);
+    int input_height =
+        (data_layout != DataLayout::kNHWC ? vol->dims()[2] : vol->dims()[1]);
+    int input_width =
+        (data_layout != DataLayout::kNHWC ? vol->dims()[3] : vol->dims()[2]);
+    int filter_depth = col.dims()[1];
+    int filter_height = col.dims()[2];
+    int filter_width = col.dims()[3];
+    int output_depth = col.dims()[4];
+    int output_height = col.dims()[5];
+    int output_width = col.dims()[6];
+    int channels_col =
+        input_channels * filter_depth * filter_height * filter_width;
+
+    bool paddings_size_is_6 = (paddings.size() == 6);
+    int pad_d_forth = paddings_size_is_6 ? paddings[0] : paddings[0];
+    int pad_d_back = paddings_size_is_6 ? paddings[1] : paddings[0];
+    int pad_h_up = paddings_size_is_6 ? paddings[2] : paddings[1];
+    int pad_h_down = paddings_size_is_6 ? paddings[3] : paddings[1];
+    int pad_w_left = paddings_size_is_6 ? paddings[4] : paddings[2];
+    int pad_w_right = paddings_size_is_6 ? paddings[5] : paddings[2];
+
+    auto input_depth_tmp = (input_depth + pad_d_forth + pad_d_back -
+                            ((dilations[0] * (filter_depth - 1) + 1))) /
+                               strides[0] +
+                           1;
+    PADDLE_ENFORCE_EQ(
+        input_depth_tmp, output_depth,
+        platform::errors::InvalidArgument(
+            "input_depth(%d) and output_depth(%d) are mismatching.",
+            input_depth_tmp, output_depth));
+    auto input_height_tmp = (input_height + pad_h_up + pad_h_down -
+                             ((dilations[1] * (filter_height - 1) + 1))) /
+                                strides[1] +
+                            1;
+    PADDLE_ENFORCE_EQ(
+        input_height_tmp, output_height,
+        platform::errors::InvalidArgument(
+            "input_height(%d) and output_height(%d) are mismatching.",
+            input_height_tmp, output_height));
+    auto input_width_tmp = (input_width + pad_w_left + pad_w_right -
+                            ((dilations[2] * (filter_width - 1) + 1))) /
+                               strides[2] +
+                           1;
+    PADDLE_ENFORCE_EQ(
+        input_width_tmp, output_width,
+        platform::errors::InvalidArgument(
+            "input_width(%d)  and output_width(%d) are mismatching.",
+            input_width_tmp, output_width));
+    T* vol_data = vol->data<T>();
+    const T* col_data = col.data<T>();
+
+    for (int c = 0; c < channels_col; ++c) {
+      int w_offset = c % filter_width;
+      int h_offset = (c / filter_width) % filter_height;
+      int d_offset = (c / filter_width / filter_height) % filter_depth;
+      int cIm = c / filter_width / filter_height / filter_depth;
+      for (int d = 0; d < output_depth; ++d) {
+        int d_pad = d * strides[0] - pad_d_forth + d_offset * dilations[0];
+        for (int h = 0; h < output_height; ++h) {
+          int h_pad = h * strides[1] - pad_h_up + h_offset * dilations[1];
+          for (int w = 0; w < output_width; ++w) {
+            int w_pad = w * strides[2] - pad_w_left + w_offset * dilations[2];
+
+            if (h_pad >= 0 && h_pad < input_height && w_pad >= 0 &&
+                w_pad < input_width && d_pad >= 0 && d_pad < input_depth) {
+              int vol_idx;
+              if (data_layout != DataLayout::kNHWC) {
+                vol_idx = ((cIm * input_depth + d_pad) * input_height + h_pad) *
+                              input_width +
+                          w_pad;
+              } else {
+                vol_idx =
+                    ((d_pad * input_height + h_pad) * input_width + w_pad) *
+                        input_channels +
+                    cIm;
+              }
+              int col_idx =
+                  ((c * output_depth + d) * output_height + h) * output_width +
+                  w;
+              vol_data[vol_idx] += col_data[col_idx];
+            }
+          }
+        }
+      }
+    }
+  }
+};
+
 template class Vol2ColFunctor<platform::CPUDeviceContext, float>;
 template class Vol2ColFunctor<platform::CPUDeviceContext, double>;
+template class Vol2ColFunctor<phi::CPUContext, float>;
+template class Vol2ColFunctor<phi::CPUContext, double>;
+
 template class Col2VolFunctor<platform::CPUDeviceContext, float>;
 template class Col2VolFunctor<platform::CPUDeviceContext, double>;
+template class Col2VolFunctor<phi::CPUContext, float>;
+template class Col2VolFunctor<phi::CPUContext, double>;
 
 }  // namespace math
 }  // namespace operators
diff --git a/paddle/fluid/operators/matmul_v2_op.cc b/paddle/fluid/operators/matmul_v2_op.cc
index 788dbb2204109dd4f215730e4234e3fec8aef702..01fa01e3c6ed04c151f709dd5fbebe387c32bde3 100644
--- a/paddle/fluid/operators/matmul_v2_op.cc
+++ b/paddle/fluid/operators/matmul_v2_op.cc
@@ -524,8 +524,8 @@ REGISTER_OPERATOR(matmul_v2, ops::MatMulV2Op, ops::MatMulV2OpMaker,
                   ops::MatMulV2GradOpMaker<paddle::framework::OpDesc>,
                   ops::MatMulV2GradOpMaker<paddle::imperative::OpBase>);
 
-DELCARE_INFER_SHAPE_FUNCTOR(matmul_v2_grad, MatMulV2GradInferShapeFunctor,
-                            PT_INFER_META(phi::GeneralBinaryGradInferMeta));
+DECLARE_INFER_SHAPE_FUNCTOR(matmul_v2_grad, MatMulV2GradInferShapeFunctor,
+                            PD_INFER_META(phi::GeneralBinaryGradInferMeta));
 REGISTER_OPERATOR(matmul_v2_grad, ops::MatMulV2OpGrad,
                   ops::MatMulV2OpDoubleGradMaker<paddle::framework::OpDesc>,
                   ops::MatMulV2OpDoubleGradMaker<paddle::imperative::OpBase>,
diff --git a/paddle/fluid/operators/matmul_v2_op_xpu.cc b/paddle/fluid/operators/matmul_v2_op_xpu.cc
index 1524a50f1ac6d6afa67722bc5d1c16a581395bb2..87df75ac465042a0f7894abecb4be4c213e5d807 100644
--- a/paddle/fluid/operators/matmul_v2_op_xpu.cc
+++ b/paddle/fluid/operators/matmul_v2_op_xpu.cc
@@ -38,7 +38,7 @@ static void MatMulXPUFunction(const Tensor* x, const Tensor* y, Tensor* out,
   auto mat_dim_b = phi::funcs::CreateMatrixDescriptor(
       ColumnMatrixFromVector(y_dims), 0, trans_y);
 
-  if (x_dims.size() == 3 && y_dims.size() <= 2) {
+  if (x_dims.size() >= 3 && y_dims.size() <= 2) {
     // if transpose_X is true, the transpose cost much time
     if (!trans_x) {
       mat_dim_a.height_ *= mat_dim_a.batch_size_;
diff --git a/paddle/fluid/operators/matrix_power_op.cc b/paddle/fluid/operators/matrix_power_op.cc
index c65af3129f3646163925be95b27b9fec25207f8c..cdf204628b638f877c92e35a8941487aa39b5427 100644
--- a/paddle/fluid/operators/matrix_power_op.cc
+++ b/paddle/fluid/operators/matrix_power_op.cc
@@ -12,7 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/operators/matrix_power_op.h"
+#include <memory>
+#include <vector>
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/tensor_util.h"
 
 namespace paddle {
 namespace operators {
@@ -119,13 +122,3 @@ REGISTER_OPERATOR(matrix_power, ops::MatrixPowerOp, ops::MatrixPowerOpMaker,
                   ops::MatrixPowerGradOpMaker<paddle::imperative::OpBase>);
 
 REGISTER_OPERATOR(matrix_power_grad, ops::MatrixPowerGradOp);
-
-REGISTER_OP_CPU_KERNEL(
-    matrix_power,
-    ops::MatrixPowerKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::MatrixPowerKernel<paddle::platform::CPUDeviceContext, double>);
-
-REGISTER_OP_CPU_KERNEL(
-    matrix_power_grad,
-    ops::MatrixPowerGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::MatrixPowerGradKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/matrix_power_op.h b/paddle/fluid/operators/matrix_power_op.h
deleted file mode 100644
index d2c67d80b4f5a562d47e56173ecf1ea2f99bff56..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/matrix_power_op.h
+++ /dev/null
@@ -1,277 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <memory>
-#include <vector>
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/tensor_util.h"
-#include "paddle/fluid/operators/math/matrix_inverse.h"
-#include "paddle/fluid/platform/for_range.h"
-#include "paddle/phi/kernels/funcs/blas/blas.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-
-template <typename T>
-struct IdentityMatrixFunctor {
-  IdentityMatrixFunctor(const int m, T* output) : m_(m), output_(output) {}
-
-  HOSTDEVICE void operator()(size_t index) const {
-    const int row = index / m_ % m_;
-    const int col = index % m_;
-    output_[index] = col == row ? static_cast<T>(1) : static_cast<T>(0);
-  }
-
-  const int m_;
-  T* output_;
-};
-
-template <typename DeviceContext, typename T>
-void MatrixPowerFunction(const Tensor* X, const int n, Tensor* Out,
-                         const paddle::framework::ExecutionContext& ctx) {
-  const auto& x_dims = X->dims();
-  const int x_ndim = x_dims.size();
-  T* out_data = Out->mutable_data<T>(ctx.GetPlace());
-
-  auto& dev_ctx = ctx.template device_context<DeviceContext>();
-  platform::ForRange<DeviceContext> for_range(dev_ctx, X->numel());
-
-  if (n == 0) {
-    // Out = Identity Matrix
-    IdentityMatrixFunctor<T> functor(x_dims[x_ndim - 1], out_data);
-    for_range(functor);
-    return;
-  }
-
-  auto blas = phi::funcs::GetBlas<DeviceContext, T>(dev_ctx);
-
-  Tensor new_x = ctx.AllocateTmpTensor<T, DeviceContext>(X->dims(), dev_ctx);
-  int new_n = n;
-  if (n > 0) {
-    // newX = X
-    framework::TensorCopy(*X, ctx.GetPlace(), dev_ctx, &new_x);
-  } else {
-    // newX = X^{-1}, n = -n
-    math::MatrixInverseFunctor<DeviceContext, T> mat_inv;
-    mat_inv(dev_ctx, *X, &new_x);
-    new_n = -n;
-  }
-
-  if (new_n == 1) {
-    framework::TensorCopy(new_x, ctx.GetPlace(), dev_ctx, Out);
-    return;
-  }
-
-  auto no_trans_desc = phi::funcs::CreateMatrixDescriptor(x_dims, 0, false);
-
-  if (new_n == 2) {
-    // Out = newX * newX
-    Out->mutable_data<T>(ctx.GetPlace());
-    blas.MatMul(new_x, no_trans_desc, new_x, no_trans_desc, static_cast<T>(1),
-                Out, static_cast<T>(0));
-    return;
-  } else if (new_n == 3) {
-    // Out = (newX * newX) * newX
-    // Note: C[i] matrices in MatMul must not overlap, i.e. the individual
-    // gemm operations must be computable independently; otherwise,
-    // undefined behavior is expected.
-    Tensor temp = ctx.AllocateTmpTensor<T, DeviceContext>(X->dims(), dev_ctx);
-    blas.MatMul(new_x, no_trans_desc, new_x, no_trans_desc, static_cast<T>(1),
-                &temp, static_cast<T>(0));
-    blas.MatMul(temp, no_trans_desc, new_x, no_trans_desc, static_cast<T>(1),
-                Out, static_cast<T>(0));
-    return;
-  } else if (new_n == 4) {
-    // Out = (newX * newX) * (newX * newX)
-    Tensor temp = ctx.AllocateTmpTensor<T, DeviceContext>(X->dims(), dev_ctx);
-    blas.MatMul(new_x, no_trans_desc, new_x, no_trans_desc, static_cast<T>(1),
-                &temp, static_cast<T>(0));
-    blas.MatMul(temp, no_trans_desc, temp, no_trans_desc, static_cast<T>(1),
-                Out, static_cast<T>(0));
-    return;
-  }
-
-  // Calculate Out = newX^{n} for abs(n) > 4 with time complexity as O(logN)
-  int bit = 0;
-  Tensor z = Tensor(X->dtype());
-  bool out_inited = false;
-  Tensor temp_out = ctx.AllocateTmpTensor<T, DeviceContext>(X->dims(), dev_ctx);
-  Tensor temp_z = ctx.AllocateTmpTensor<T, DeviceContext>(X->dims(), dev_ctx);
-  while (new_n > 0) {
-    bit = new_n & 0x1;
-    new_n >>= 1;
-    if (z.IsInitialized()) {
-      blas.MatMul(z, no_trans_desc, z, no_trans_desc, static_cast<T>(1),
-                  &temp_z, static_cast<T>(0));
-      framework::TensorCopy(temp_z, ctx.GetPlace(), dev_ctx, &z);
-    } else {
-      z = ctx.AllocateTmpTensor<T, DeviceContext>(X->dims(), dev_ctx);
-      framework::TensorCopy(new_x, ctx.GetPlace(), dev_ctx, &z);
-    }
-    if (bit == 1) {
-      if (out_inited == true) {
-        blas.MatMul(*Out, no_trans_desc, z, no_trans_desc, static_cast<T>(1),
-                    &temp_out, static_cast<T>(0));
-        framework::TensorCopy(temp_out, ctx.GetPlace(), dev_ctx, Out);
-      } else {
-        framework::TensorCopy(z, ctx.GetPlace(), dev_ctx, Out);
-        out_inited = true;
-      }
-    }
-  }
-  return;
-}
-
-template <typename DeviceContext, typename T>
-class MatrixPowerKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const paddle::framework::ExecutionContext& ctx) const override {
-    const Tensor* X = ctx.Input<Tensor>("X");
-    Tensor* Out = ctx.Output<Tensor>("Out");
-    int n = ctx.Attr<int>("n");
-
-    const auto& x_dims = X->dims();
-    const int x_ndim = x_dims.size();
-    PADDLE_ENFORCE_EQ(
-        x_dims[x_ndim - 2], x_dims[x_ndim - 1],
-        platform::errors::InvalidArgument(
-            "The inner-most 2 dimensions of Input(X) should be equal."
-            "X's shape[-2] = %d and shape[-1] = %d.",
-            x_dims[x_ndim - 2], x_dims[x_ndim - 1]));
-
-    MatrixPowerFunction<DeviceContext, T>(X, n, Out, ctx);
-  }
-};
-
-template <typename DeviceContext, typename T>
-void MatrixPowerGradFunction(const Tensor* X, const Tensor* Out,
-                             const Tensor* dOut, const int n, Tensor* dX,
-                             const paddle::framework::ExecutionContext& ctx) {
-  dX->mutable_data<T>(ctx.GetPlace());
-  const auto& x_dims = X->dims();
-
-  auto& dev_ctx = ctx.template device_context<DeviceContext>();
-  auto blas = phi::funcs::GetBlas<DeviceContext, T>(dev_ctx);
-
-  if (n == 0) {
-    // \nabla X = O
-    phi::funcs::SetConstant<DeviceContext, T> zero;
-    zero(dev_ctx, dX, static_cast<T>(0));
-    return;
-  } else if (n == 1) {
-    // \nabla X = \nabla Out
-    framework::TensorCopy(*dOut, ctx.GetPlace(), dev_ctx, dX);
-    return;
-  }
-
-  auto trans_desc = phi::funcs::CreateMatrixDescriptor(x_dims, 0, true);
-  auto no_trans_desc = phi::funcs::CreateMatrixDescriptor(x_dims, 0, false);
-
-  if (n == -1) {
-    // \nabla X = Out^{T} * \nabla Out * Out^{T}
-    Tensor temp_dx =
-        ctx.AllocateTmpTensor<T, DeviceContext>(X->dims(), dev_ctx);
-    blas.MatMul(*Out, trans_desc, *dOut, no_trans_desc, static_cast<T>(-1),
-                &temp_dx, static_cast<T>(0));
-    blas.MatMul(temp_dx, no_trans_desc, *Out, trans_desc, static_cast<T>(1), dX,
-                static_cast<T>(0));
-    return;
-  }
-
-  Tensor new_x = ctx.AllocateTmpTensor<T, DeviceContext>(X->dims(), dev_ctx);
-  int new_n = n;
-  if (n > 0) {
-    // newX = X
-    framework::TensorCopy(*X, ctx.GetPlace(), dev_ctx, &new_x);
-  } else {
-    // newX = X^{-1}, n = -n
-    math::MatrixInverseFunctor<DeviceContext, T> mat_inv;
-    mat_inv(dev_ctx, *X, &new_x);
-    new_n = -n;
-  }
-
-  // Use chain rule blow to compute \nabla newX^{n}
-  // First, Get newX^{0}, newX^{1}, ..., newX^{n - 1},
-  // Note that newX^{0} can be omitted
-  std::vector<std::shared_ptr<Tensor>> tensor_list(new_n - 1);
-  tensor_list[0] = std::make_shared<Tensor>(new_x);
-  int index = 1;
-  while (index < new_n - 1) {
-    tensor_list[index] = std::make_shared<Tensor>(
-        ctx.AllocateTmpTensor<T, DeviceContext>(X->dims(), dev_ctx));
-    blas.MatMul(*tensor_list[index - 1], no_trans_desc, new_x, no_trans_desc,
-                static_cast<T>(1), tensor_list[index].get(), static_cast<T>(0));
-    index++;
-  }
-
-  // Second, \nabla newX = \sum_{i = 0}^{n - 1} (newX^{T}^{i}
-  //                      * \nabla Out
-  //                      * (newX^{T}^{n - i - 1})
-  Tensor dx_new = ctx.AllocateTmpTensor<T, DeviceContext>(X->dims(), dev_ctx);
-  blas.MatMul(*tensor_list[new_n - 2], trans_desc, *dOut, no_trans_desc,
-              static_cast<T>(1), &dx_new, static_cast<T>(0));
-  Tensor da_an_minus1 =
-      ctx.AllocateTmpTensor<T, DeviceContext>(X->dims(), dev_ctx);
-  blas.MatMul(*dOut, no_trans_desc, *tensor_list[new_n - 2], trans_desc,
-              static_cast<T>(1), &da_an_minus1, static_cast<T>(0));
-  blas.AXPY(X->numel(), static_cast<T>(1), da_an_minus1.data<T>(),
-            dx_new.data<T>());
-  int start = 0;
-  while (start < new_n - 2) {
-    Tensor a_da = ctx.AllocateTmpTensor<T, DeviceContext>(X->dims(), dev_ctx);
-    Tensor a_da_a = ctx.AllocateTmpTensor<T, DeviceContext>(X->dims(), dev_ctx);
-    blas.MatMul(*tensor_list[start], trans_desc, *dOut, no_trans_desc,
-                static_cast<T>(1), &a_da, static_cast<T>(0));
-    blas.MatMul(a_da, no_trans_desc, *tensor_list[new_n - 3 - start],
-                trans_desc, static_cast<T>(1), &a_da_a, static_cast<T>(0));
-    blas.AXPY(X->numel(), static_cast<T>(1), a_da_a.data<T>(),
-              dx_new.data<T>());
-    start++;
-  }
-
-  if (n > 0) {
-    // \nabla X = \nabla newX
-    framework::TensorCopy(dx_new, ctx.GetPlace(), dev_ctx, dX);
-  } else {
-    // \nabla X = newX^{T} * \nabla newX * newX^{T}
-    Tensor temp_dx =
-        ctx.AllocateTmpTensor<T, DeviceContext>(X->dims(), dev_ctx);
-    blas.MatMul(new_x, trans_desc, dx_new, no_trans_desc, static_cast<T>(-1),
-                &temp_dx, static_cast<T>(0));
-    blas.MatMul(temp_dx, no_trans_desc, new_x, trans_desc, static_cast<T>(1),
-                dX, static_cast<T>(0));
-  }
-  return;
-}
-
-template <typename DeviceContext, typename T>
-class MatrixPowerGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    const Tensor* X = ctx.Input<Tensor>("X");
-    const Tensor* Out = ctx.Input<Tensor>("Out");
-    const Tensor* dOut = ctx.Input<Tensor>(framework::GradVarName("Out"));
-    const int n = ctx.Attr<int>("n");
-    Tensor* dX = ctx.Output<Tensor>(framework::GradVarName("X"));
-
-    MatrixPowerGradFunction<DeviceContext, T>(X, Out, dOut, n, dX, ctx);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/matrix_rank_op.cc b/paddle/fluid/operators/matrix_rank_op.cc
index 65599259e2237387ad0dd85b5a9772733e3d7a1a..e7d08b6597360bb0431add6ae63eb99f401c8ce0 100644
--- a/paddle/fluid/operators/matrix_rank_op.cc
+++ b/paddle/fluid/operators/matrix_rank_op.cc
@@ -12,11 +12,11 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/operators/matrix_rank_op.h"
 #include <memory>
 #include <string>
 #include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
 #include "paddle/fluid/operators/svd_helper.h"
+#include "paddle/phi/kernels/funcs/compare_functors.h"
 
 #ifdef PADDLE_WITH_MKLDNN
 #include "paddle/fluid/platform/mkldnn_helper.h"
@@ -69,9 +69,9 @@ class MatrixRankeOp : public framework::OperatorWithKernel {
         std::vector<int> x_batch_dims_array(max_dim);
         std::vector<int> tol_dims_array(max_dim);
         std::vector<int> out_dims_array(max_dim);
-        GetBroadcastDimsArrays(dim_x_batch, dim_tol, x_batch_dims_array.data(),
-                               tol_dims_array.data(), out_dims_array.data(),
-                               max_dim, axis);
+        phi::funcs::GetBroadcastDimsArrays(
+            dim_x_batch, dim_tol, x_batch_dims_array.data(),
+            tol_dims_array.data(), out_dims_array.data(), max_dim, axis);
         ctx->SetOutputDim("Out", phi::make_ddim(out_dims_array));
       }
     } else {
@@ -114,141 +114,9 @@ class MatrixRankeOpMaker : public framework::OpProtoAndCheckerMaker {
   }
 };
 
-template <typename T>
-void BatchEigenvalues(const T* x_data, T* eigenvalues_data, int batches,
-                      int rows, int cols, int k) {
-  // Eigen::Matrix API need non-const pointer.
-  T* input = const_cast<T*>(x_data);
-  int stride = rows * cols;
-  for (int i = 0; i < batches; i++) {
-    auto m = Eigen::Map<
-        Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>(
-        input + i * stride, rows, rows);
-    Eigen::SelfAdjointEigenSolver<
-        Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>
-        eigen_solver(m);
-    auto eigenvalues = eigen_solver.eigenvalues().cwiseAbs();
-    for (int j = 0; j < k; j++) {
-      *(eigenvalues_data + i * k + j) = eigenvalues[j];
-    }
-  }
-}
-
-template <typename T>
-void BatchSVD(const T* x_data, T* eigenvalues_data, int batches, int rows,
-              int cols, int k) {
-  // Eigen::Matrix API need non-const pointer.
-  T* input = const_cast<T*>(x_data);
-  int stride = rows * cols;
-  Eigen::BDCSVD<
-      Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>
-      svd;
-  for (int i = 0; i < batches; i++) {
-    auto m = Eigen::Map<
-        Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>(
-        input + i * stride, rows, cols);
-    svd.compute(m);
-    auto res_s = svd.singularValues();
-    for (int j = 0; j < k; j++) {
-      eigenvalues_data[i * k + j] = res_s[j];
-    }
-  }
-}
-
-template <typename T>
-class MatrixRankCPUKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    const Tensor* x = context.Input<Tensor>("X");
-    auto* x_data = x->data<T>();
-    auto* out = context.Output<Tensor>("Out");
-    out->mutable_data<int64_t>(context.GetPlace());
-    bool hermitian = context.Attr<bool>("hermitian");
-
-    auto dim_x = x->dims();
-    auto dim_out = out->dims();
-    int rows = dim_x[dim_x.size() - 2];
-    int cols = dim_x[dim_x.size() - 1];
-    int k = std::min(rows, cols);
-    auto numel = x->numel();
-    int batches = numel / (rows * cols);
-
-    bool use_default_tol = context.Attr<bool>("use_default_tol");
-    const Tensor* atol_tensor = nullptr;
-    Tensor temp_tensor;
-    T rtol_T = 0;
-    if (use_default_tol) {
-      framework::TensorFromVector<T>(std::vector<T>{0},
-                                     context.device_context(), &temp_tensor);
-      atol_tensor = &temp_tensor;
-      rtol_T = std::numeric_limits<T>::epsilon() * std::max(rows, cols);
-    } else if (context.HasInput("TolTensor")) {
-      atol_tensor = context.Input<Tensor>("TolTensor");
-    } else {
-      framework::TensorFromVector<T>(std::vector<T>{context.Attr<float>("tol")},
-                                     context.device_context(), &temp_tensor);
-      atol_tensor = &temp_tensor;
-    }
-
-    Tensor eigenvalue_tensor;
-    auto* eigenvalue_data = eigenvalue_tensor.mutable_data<T>(
-        detail::GetEigenvalueDim(dim_x, k), context.GetPlace());
-    if (hermitian) {
-      BatchEigenvalues<T>(x_data, eigenvalue_data, batches, rows, cols, k);
-    } else {
-      BatchSVD<T>(x_data, eigenvalue_data, batches, rows, cols, k);
-    }
-
-    auto dito_T =
-        math::DeviceIndependenceTensorOperations<platform::CPUDeviceContext, T>(
-            context);
-    std::vector<int> max_eigenvalue_shape =
-        phi::vectorize<int>(detail::RemoveLastDim(eigenvalue_tensor.dims()));
-    Tensor max_eigenvalue_tensor =
-        dito_T.ReduceMax(eigenvalue_tensor, max_eigenvalue_shape);
-
-    Tensor temp_rtol_tensor;
-    framework::TensorFromVector<T>(std::vector<T>{rtol_T}, &temp_rtol_tensor);
-    Tensor rtol_tensor = dito_T.Mul(temp_rtol_tensor, max_eigenvalue_tensor);
-    Tensor tol_tensor;
-    tol_tensor.mutable_data<T>(dim_out, context.GetPlace());
-    ElementwiseComputeEx<GreaterElementFunctor<T>, platform::CPUDeviceContext,
-                         T, T>(context, atol_tensor, &rtol_tensor, -1,
-                               GreaterElementFunctor<T>(), &tol_tensor);
-
-    tol_tensor.Resize(detail::NewAxisDim(tol_tensor.dims(), 1));
-
-    Tensor compare_result;
-    compare_result.mutable_data<int64_t>(detail::NewAxisDim(dim_out, k),
-                                         context.GetPlace());
-
-    int axis = -1;
-    if (eigenvalue_tensor.dims().size() >= tol_tensor.dims().size()) {
-      ElementwiseComputeEx<GreaterThanFunctor<T, int64_t>,
-                           platform::CPUDeviceContext, T, int>(
-          context, &eigenvalue_tensor, &tol_tensor, axis,
-          GreaterThanFunctor<T, int64_t>(), &compare_result);
-    } else {
-      ElementwiseComputeEx<LessThanFunctor<T, int64_t>,
-                           platform::CPUDeviceContext, T, int>(
-          context, &eigenvalue_tensor, &tol_tensor, axis,
-          LessThanFunctor<T, int64_t>(), &compare_result);
-    }
-    auto dito_int =
-        math::DeviceIndependenceTensorOperations<platform::CPUDeviceContext,
-                                                 int64_t>(context);
-    std::vector<int> result_shape = phi::vectorize<int>(dim_out);
-    Tensor result = dito_int.ReduceSum(compare_result, result_shape);
-    out->ShareDataWith(result);
-  }
-};
-
 }  // namespace operators
 }  // namespace paddle
 
 namespace ops = paddle::operators;
 
 REGISTER_OPERATOR(matrix_rank, ops::MatrixRankeOp, ops::MatrixRankeOpMaker);
-
-REGISTER_OP_CPU_KERNEL(matrix_rank, ops::MatrixRankCPUKernel<float>,
-                       ops::MatrixRankCPUKernel<double>);
diff --git a/paddle/fluid/operators/matrix_rank_op.cu b/paddle/fluid/operators/matrix_rank_op.cu
deleted file mode 100644
index b1800c9c0c9f8b68f7f62de42943f7a425fa0ddb..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/matrix_rank_op.cu
+++ /dev/null
@@ -1,315 +0,0 @@
-/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef PADDLE_WITH_HIP
-// HIP not support cusolver
-#include <algorithm>
-#include <vector>
-#include "paddle/fluid/memory/memory.h"
-#include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
-#include "paddle/fluid/operators/matrix_rank_op.h"
-#include "paddle/fluid/operators/svd_helper.h"
-#include "paddle/fluid/platform/dynload/cusolver.h"
-#include "paddle/fluid/platform/for_range.h"
-#include "paddle/phi/kernels/funcs/complex_functors.h"
-#include "paddle/phi/kernels/funcs/math_function.h"
-
-namespace paddle {
-namespace operators {
-namespace detail {
-DDim GetUDDim(const DDim& x_dim, int k) {
-  auto x_vec = phi::vectorize(x_dim);
-  x_vec[x_vec.size() - 1] = k;
-  return phi::make_ddim(x_vec);
-}
-
-DDim GetVHDDim(const DDim& x_dim, int k) {
-  auto x_vec = phi::vectorize(x_dim);
-  x_vec[x_vec.size() - 2] = k;
-  return phi::make_ddim(x_vec);
-}
-}  // namespace detail
-
-template <typename T>
-class MatrixRankGPUKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto& dev_ctx =
-        context.template device_context<platform::CUDADeviceContext>();
-
-    const Tensor* x = context.Input<Tensor>("X");
-    auto* x_data = x->data<T>();
-    auto* out = context.Output<Tensor>("Out");
-    out->mutable_data<int64_t>(context.GetPlace());
-    bool hermitian = context.Attr<bool>("hermitian");
-
-    auto dim_x = x->dims();
-    auto dim_out = out->dims();
-    int rows = dim_x[dim_x.size() - 2];
-    int cols = dim_x[dim_x.size() - 1];
-    int k = std::min(rows, cols);
-    auto numel = x->numel();
-    int batches = numel / (rows * cols);
-
-    bool use_default_tol = context.Attr<bool>("use_default_tol");
-    const Tensor* atol_tensor = nullptr;
-    Tensor temp_tensor;
-    T rtol_T = 0;
-    if (use_default_tol) {
-      framework::TensorFromVector<T>(std::vector<T>{0},
-                                     context.device_context(), &temp_tensor);
-      atol_tensor = &temp_tensor;
-      rtol_T = std::numeric_limits<T>::epsilon() * std::max(rows, cols);
-    } else if (context.HasInput("TolTensor")) {
-      atol_tensor = context.Input<Tensor>("TolTensor");
-    } else {
-      framework::TensorFromVector<T>(std::vector<T>{context.Attr<float>("tol")},
-                                     context.device_context(), &temp_tensor);
-      atol_tensor = &temp_tensor;
-    }
-
-    // Must Copy X once, because the gesvdj will destory the content when exit.
-    Tensor x_tmp;
-    paddle::framework::TensorCopy(*x, context.GetPlace(), &x_tmp);
-    auto info = memory::Alloc(dev_ctx, sizeof(int) * batches);
-    int* info_ptr = reinterpret_cast<int*>(info->ptr());
-
-    Tensor eigenvalue_tensor;
-    auto* eigenvalue_data = eigenvalue_tensor.mutable_data<T>(
-        detail::GetEigenvalueDim(dim_x, k), context.GetPlace());
-    if (hermitian) {
-      SyevjBatched(dev_ctx, batches, rows, x_tmp.data<T>(), eigenvalue_data,
-                   info_ptr);
-      platform::ForRange<platform::CUDADeviceContext> for_range(
-          dev_ctx, eigenvalue_tensor.numel());
-      phi::funcs::AbsFunctor<T> functor(eigenvalue_data, eigenvalue_data,
-                                        eigenvalue_tensor.numel());
-      for_range(functor);
-    } else {
-      Tensor U, VH;
-      auto* u_data =
-          U.mutable_data<T>(detail::GetUDDim(dim_x, k), context.GetPlace());
-      auto* vh_data =
-          VH.mutable_data<T>(detail::GetVHDDim(dim_x, k), context.GetPlace());
-      GesvdjBatched(dev_ctx, batches, cols, rows, k, x_tmp.data<T>(), vh_data,
-                    u_data, eigenvalue_data, info_ptr, 1);
-    }
-
-    auto dito_T =
-        math::DeviceIndependenceTensorOperations<platform::CUDADeviceContext,
-                                                 T>(context);
-    std::vector<int> max_eigenvalue_shape =
-        phi::vectorize<int>(detail::RemoveLastDim(eigenvalue_tensor.dims()));
-    Tensor max_eigenvalue_tensor =
-        dito_T.ReduceMax(eigenvalue_tensor, max_eigenvalue_shape);
-    Tensor temp_rtol_tensor;
-    framework::TensorFromVector<T>(std::vector<T>{rtol_T},
-                                   context.device_context(), &temp_rtol_tensor);
-    Tensor rtol_tensor = dito_T.Mul(temp_rtol_tensor, max_eigenvalue_tensor);
-    Tensor tol_tensor;
-    tol_tensor.mutable_data<T>(dim_out, context.GetPlace());
-    ElementwiseComputeEx<GreaterElementFunctor<T>, platform::CUDADeviceContext,
-                         T, T>(context, atol_tensor, &rtol_tensor, -1,
-                               GreaterElementFunctor<T>(), &tol_tensor);
-
-    tol_tensor.Resize(detail::NewAxisDim(tol_tensor.dims(), 1));
-
-    Tensor compare_result;
-    compare_result.mutable_data<int64_t>(detail::NewAxisDim(dim_out, k),
-                                         context.GetPlace());
-    int axis = -1;
-    ElementwiseComputeEx<GreaterThanFunctor<T, int64_t>,
-                         platform::CUDADeviceContext, T, int64_t>(
-        context, &eigenvalue_tensor, &tol_tensor, axis,
-        GreaterThanFunctor<T, int64_t>(), &compare_result);
-    auto dito_int =
-        math::DeviceIndependenceTensorOperations<platform::CUDADeviceContext,
-                                                 int64_t>(context);
-    std::vector<int> result_shape = phi::vectorize<int>(dim_out);
-    Tensor result = dito_int.ReduceSum(compare_result, result_shape);
-    out->ShareDataWith(result);
-  }
-
-  void GesvdjBatched(const platform::CUDADeviceContext& dev_ctx, int batchSize,
-                     int m, int n, int k, T* A, T* U, T* V, T* S, int* info,
-                     int thin_UV = 1) const;
-
-  void SyevjBatched(const platform::CUDADeviceContext& dev_ctx, int batchSize,
-                    int n, T* A, T* W, int* info) const;
-};
-
-template <>
-void MatrixRankGPUKernel<float>::GesvdjBatched(
-    const platform::CUDADeviceContext& dev_ctx, int batchSize, int m, int n,
-    int k, float* A, float* U, float* V, float* S, int* info,
-    int thin_UV) const {
-  // do not compute singular vectors
-  const cusolverEigMode_t jobz = CUSOLVER_EIG_MODE_NOVECTOR;
-  gesvdjInfo_t gesvdj_params = NULL;
-  int lda = m;
-  int ldu = m;
-  int ldt = n;
-  int lwork = 0;
-  auto handle = dev_ctx.cusolver_dn_handle();
-  PADDLE_ENFORCE_GPU_SUCCESS(
-      platform::dynload::cusolverDnCreateGesvdjInfo(&gesvdj_params));
-  PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnSgesvdj_bufferSize(
-      handle, jobz, thin_UV, m, n, A, lda, S, U, ldu, V, ldt, &lwork,
-      gesvdj_params));
-  auto workspace = memory::Alloc(dev_ctx, lwork * sizeof(float));
-  float* workspace_ptr = reinterpret_cast<float*>(workspace->ptr());
-  int stride_A = lda * n;
-  int stride_U = ldu * (thin_UV ? k : m);
-  int stride_V = ldt * (thin_UV ? k : n);
-  for (int i = 0; i < batchSize; i++) {
-    PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnSgesvdj(
-        handle, jobz, thin_UV, m, n, A + stride_A * i, lda, S + k * i,
-        U + stride_U * i, ldu, V + stride_V * i, ldt, workspace_ptr, lwork,
-        info, gesvdj_params));
-    int error_info;
-    memory::Copy(platform::CPUPlace(), &error_info, dev_ctx.GetPlace(), info,
-                 sizeof(int), dev_ctx.stream());
-    PADDLE_ENFORCE_EQ(
-        error_info, 0,
-        platform::errors::PreconditionNotMet(
-            "For batch [%d]: CUSolver SVD is not zero. [%d]", i, error_info));
-  }
-  PADDLE_ENFORCE_GPU_SUCCESS(
-      platform::dynload::cusolverDnDestroyGesvdjInfo(gesvdj_params));
-}
-
-template <>
-void MatrixRankGPUKernel<double>::GesvdjBatched(
-    const platform::CUDADeviceContext& dev_ctx, int batchSize, int m, int n,
-    int k, double* A, double* U, double* V, double* S, int* info,
-    int thin_UV) const {
-  // do not compute singular vectors
-  const cusolverEigMode_t jobz = CUSOLVER_EIG_MODE_NOVECTOR;
-  gesvdjInfo_t gesvdj_params = NULL;
-  int lda = m;
-  int ldu = m;
-  int ldt = n;
-  int lwork = 0;
-  auto handle = dev_ctx.cusolver_dn_handle();
-  PADDLE_ENFORCE_GPU_SUCCESS(
-      platform::dynload::cusolverDnCreateGesvdjInfo(&gesvdj_params));
-  PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnDgesvdj_bufferSize(
-      handle, jobz, thin_UV, m, n, A, lda, S, U, ldu, V, ldt, &lwork,
-      gesvdj_params));
-  auto workspace = memory::Alloc(dev_ctx, lwork * sizeof(double));
-  double* workspace_ptr = reinterpret_cast<double*>(workspace->ptr());
-  int stride_A = lda * n;
-  int stride_U = ldu * (thin_UV ? k : m);
-  int stride_V = ldt * (thin_UV ? k : n);
-  for (int i = 0; i < batchSize; ++i) {
-    PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnDgesvdj(
-        handle, jobz, thin_UV, m, n, A + stride_A * i, lda, S + k * i,
-        U + stride_U * i, ldu, V + stride_V * i, ldt, workspace_ptr, lwork,
-        info, gesvdj_params));
-    // check the error info
-    int error_info;
-    memory::Copy(platform::CPUPlace(), &error_info, dev_ctx.GetPlace(), info,
-                 sizeof(int), dev_ctx.stream());
-    PADDLE_ENFORCE_EQ(
-        error_info, 0,
-        platform::errors::PreconditionNotMet(
-            "For batch [%d]: CUSolver SVD is not zero. [%d]", i, error_info));
-  }
-  PADDLE_ENFORCE_GPU_SUCCESS(
-      platform::dynload::cusolverDnDestroyGesvdjInfo(gesvdj_params));
-}
-
-template <>
-void MatrixRankGPUKernel<float>::SyevjBatched(
-    const platform::CUDADeviceContext& dev_ctx, int batchSize, int n, float* A,
-    float* W, int* info) const {
-  auto handle = dev_ctx.cusolver_dn_handle();
-  // Compute eigenvalues only
-  const cusolverEigMode_t jobz = CUSOLVER_EIG_MODE_NOVECTOR;
-  // matrix is saved as column-major in cusolver.
-  // numpy and torch use lower triangle to compute eigenvalues, so here use
-  // upper triangle
-  cublasFillMode_t uplo = CUBLAS_FILL_MODE_UPPER;
-  int lda = n;
-  int stride_A = lda * n;
-  int lwork = 0;
-  syevjInfo_t params = NULL;
-  PADDLE_ENFORCE_GPU_SUCCESS(
-      platform::dynload::cusolverDnCreateSyevjInfo(&params));
-  PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnSsyevj_bufferSize(
-      handle, jobz, uplo, n, A, lda, W, &lwork, params));
-  auto workspace = memory::Alloc(dev_ctx, lwork * sizeof(float));
-  float* workspace_ptr = reinterpret_cast<float*>(workspace->ptr());
-  for (int i = 0; i < batchSize; i++) {
-    PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnSsyevj(
-        handle, jobz, uplo, n, A + stride_A * i, lda, W + n * i, workspace_ptr,
-        lwork, info, params));
-
-    int error_info;
-    memory::Copy(platform::CPUPlace(), &error_info, dev_ctx.GetPlace(), info,
-                 sizeof(int), dev_ctx.stream());
-    PADDLE_ENFORCE_EQ(
-        error_info, 0,
-        platform::errors::PreconditionNotMet(
-            "For batch [%d]: CUSolver eigenvalues is not zero. [%d]", i,
-            error_info));
-  }
-  PADDLE_ENFORCE_GPU_SUCCESS(
-      platform::dynload::cusolverDnDestroySyevjInfo(params));
-}
-
-template <>
-void MatrixRankGPUKernel<double>::SyevjBatched(
-    const platform::CUDADeviceContext& dev_ctx, int batchSize, int n, double* A,
-    double* W, int* info) const {
-  auto handle = dev_ctx.cusolver_dn_handle();
-  // Compute eigenvalues only
-  const cusolverEigMode_t jobz = CUSOLVER_EIG_MODE_NOVECTOR;
-  //  upper triangle of A is stored
-  cublasFillMode_t uplo = CUBLAS_FILL_MODE_UPPER;
-  int lda = n;
-  int stride_A = lda * n;
-  int lwork = 0;
-  syevjInfo_t params = NULL;
-  PADDLE_ENFORCE_GPU_SUCCESS(
-      platform::dynload::cusolverDnCreateSyevjInfo(&params));
-  PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnDsyevj_bufferSize(
-      handle, jobz, uplo, n, A, lda, W, &lwork, params));
-  auto workspace = memory::Alloc(dev_ctx, lwork * sizeof(double));
-  double* workspace_ptr = reinterpret_cast<double*>(workspace->ptr());
-
-  for (int i = 0; i < batchSize; i++) {
-    PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnDsyevj(
-        handle, jobz, uplo, n, A + stride_A * i, lda, W + n * i, workspace_ptr,
-        lwork, info, params));
-    int error_info;
-    memory::Copy(platform::CPUPlace(), &error_info, dev_ctx.GetPlace(), info,
-                 sizeof(int), dev_ctx.stream());
-    PADDLE_ENFORCE_EQ(
-        error_info, 0,
-        platform::errors::PreconditionNotMet(
-            "For batch [%d]: CUSolver eigenvalues is not zero. [%d]", i,
-            error_info));
-  }
-  PADDLE_ENFORCE_GPU_SUCCESS(
-      platform::dynload::cusolverDnDestroySyevjInfo(params));
-}
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(matrix_rank, ops::MatrixRankGPUKernel<float>,
-                        ops::MatrixRankGPUKernel<double>);
-#endif  // not PADDLE_WITH_HIP
diff --git a/paddle/fluid/operators/maxout_op.cc b/paddle/fluid/operators/maxout_op.cc
index bd9ebd29777def2fafca648ad80bc57bef8df316..e55369e0691ee5e36da76c53c6dd5d13288231f4 100644
--- a/paddle/fluid/operators/maxout_op.cc
+++ b/paddle/fluid/operators/maxout_op.cc
@@ -12,14 +12,14 @@
  *     See the License for the specific language governing permissions and
  *     limitations under the License. */
 
-#include "paddle/fluid/operators/maxout_op.h"
 #include <vector>
 
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/op_version_registry.h"
+
 namespace paddle {
 namespace operators {
 
-using framework::Tensor;
-
 class MaxOutOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() override {
@@ -130,10 +130,3 @@ REGISTER_OPERATOR(
     paddle::framework::DefaultGradOpMaker<paddle::framework::OpDesc, true>,
     paddle::framework::DefaultGradOpMaker<paddle::imperative::OpBase, true>);
 REGISTER_OPERATOR(maxout_grad, ops::MaxOutOpGrad);
-REGISTER_OP_CPU_KERNEL(
-    maxout, ops::MaxOutKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::MaxOutKernel<paddle::platform::CPUDeviceContext, double>);
-REGISTER_OP_CPU_KERNEL(
-    maxout_grad,
-    ops::MaxOutGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::MaxOutGradKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/maxout_op.h b/paddle/fluid/operators/maxout_op.h
deleted file mode 100644
index 922998293943ed5ee1ebcd08b5bcd93467496cb9..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/maxout_op.h
+++ /dev/null
@@ -1,72 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/maxouting.h"
-#include "paddle/phi/kernels/funcs/math_function.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-
-template <typename DeviceContext, typename T>
-class MaxOutKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    const Tensor* in_x = context.Input<Tensor>("X");
-    Tensor* out = context.Output<Tensor>("Out");
-    int groups = context.template Attr<int>("groups");
-    int axis = context.template Attr<int>("axis");
-    if (axis < 0) {
-      axis += in_x->dims().size();
-    }
-
-    math::MaxOutFunctor<DeviceContext, T> maxout_forward;
-    maxout_forward(context.template device_context<DeviceContext>(), *in_x, out,
-                   groups, axis);
-  }
-};
-
-template <typename DeviceContext, typename T>
-class MaxOutGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    const Tensor* in_x = context.Input<Tensor>("X");
-    const Tensor* out = context.Input<Tensor>("Out");
-    const Tensor* out_grad =
-        context.Input<Tensor>(framework::GradVarName("Out"));
-    Tensor* in_x_grad = context.Output<Tensor>(framework::GradVarName("X"));
-    int groups = context.template Attr<int>("groups");
-    int axis = context.template Attr<int>("axis");
-    if (axis < 0) {
-      axis += in_x->dims().size();
-    }
-
-    auto& device_ctx = context.template device_context<DeviceContext>();
-    phi::funcs::SetConstant<DeviceContext, T> zero;
-    if (in_x_grad) {
-      in_x_grad->mutable_data<T>(context.GetPlace());
-      zero(device_ctx, in_x_grad, static_cast<T>(0.0));
-      math::MaxOutGradFunctor<DeviceContext, T> maxout_backward;
-      maxout_backward(device_ctx, *in_x, in_x_grad, *out, *out_grad, groups,
-                      axis);
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/metrics/accuracy_op.cc b/paddle/fluid/operators/metrics/accuracy_op.cc
index 3692ace8bb5a46b06bd10a07a5d5d95d8825bdc6..32ef052119883944abc1876f8bf3a8c028ddc57a 100644
--- a/paddle/fluid/operators/metrics/accuracy_op.cc
+++ b/paddle/fluid/operators/metrics/accuracy_op.cc
@@ -12,7 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/metrics/accuracy_op.h"
+#include "paddle/fluid/framework/infershape_utils.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/phi/infermeta/ternary.h"
 
 namespace paddle {
 namespace operators {
@@ -21,69 +23,6 @@ class AccuracyOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE_EQ(
-        ctx->HasInput("Out"), true,
-        platform::errors::NotFound("Input (Out) of AccuracyOp is not found."));
-    PADDLE_ENFORCE_EQ(ctx->HasInput("Indices"), true,
-                      platform::errors::NotFound(
-                          "Input (Indices) of AccuracyOp is not found."));
-    PADDLE_ENFORCE_EQ(ctx->HasInput("Label"), true,
-                      platform::errors::NotFound(
-                          "Input (Label) of AccuracyOp is not found."));
-    PADDLE_ENFORCE_EQ(ctx->HasOutput("Accuracy"), true,
-                      platform::errors::NotFound(
-                          "Output (Accuracy) of AccuracyOp is not found."));
-    PADDLE_ENFORCE_EQ(ctx->HasOutput("Correct"), true,
-                      platform::errors::NotFound(
-                          "Output (Correct) of AccuracyOp is not found."));
-    PADDLE_ENFORCE_EQ(ctx->HasOutput("Total"), true,
-                      platform::errors::NotFound(
-                          "Output (Total) of AccuracyOp is not found."));
-
-    OP_INOUT_CHECK(ctx->HasInput("Out"), "Input", "Out", "Accuracy");
-    OP_INOUT_CHECK(ctx->HasInput("Indices"), "Input", "Indices", "Accuracy");
-    OP_INOUT_CHECK(ctx->HasInput("Label"), "Input", "Label", "Accuracy");
-    OP_INOUT_CHECK(ctx->HasOutput("Accuracy"), "Output", "Accuracy",
-                   "Accuracy");
-    OP_INOUT_CHECK(ctx->HasOutput("Correct"), "Output", "Correct", "Accuracy");
-    OP_INOUT_CHECK(ctx->HasOutput("Total"), "Output", "Total", "Accuracy");
-
-    auto inference_dim = ctx->GetInputDim("Out");
-    auto label_dim = ctx->GetInputDim("Label");
-    // Assume indices has same shape as inference, because
-    // it's the output of topk.
-
-    PADDLE_ENFORCE_EQ(
-        label_dim.size(), 2,
-        platform::errors::InvalidArgument(
-            "ShapeError: label's dimensions of AccuracyOp must be 2. "
-            "But received label's dimensions = %d, label's shape = [%s]",
-            label_dim.size(), label_dim));
-    if (ctx->IsRuntime()) {
-      PADDLE_ENFORCE_EQ(label_dim[1], 1,
-                        platform::errors::InvalidArgument(
-                            "ShapeError: label's second dimension of "
-                            "AccuracyOp must be 1. But received label's "
-                            "second dimension is = %d, label's shape = [%s]",
-                            label_dim[1], label_dim));
-      PADDLE_ENFORCE_EQ(
-          inference_dim[0], label_dim[0],
-          platform::errors::InvalidArgument(
-              "ShapeError: the output's num_rows of AccuracyOp must be"
-              " the same as label's num_rows. But received output's "
-              "shape = [%s], label's shape = [%s], output's num_rows = %d, "
-              "label's "
-              "num_rows = %d",
-              inference_dim, label_dim, inference_dim[0], label_dim[0]));
-    }
-
-    ctx->SetOutputDim("Accuracy", {1});
-    ctx->SetOutputDim("Correct", {1});
-    ctx->SetOutputDim("Total", {1});
-    ctx->ShareLoD("Out", /*->*/ "Accuracy");
-  }
-
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
@@ -123,13 +62,13 @@ with the input Out(Inference).
 }  // namespace operators
 }  // namespace paddle
 
+// FIXME(typhoonzero): types of T is for infernece data.
+// label data is always int.
+DECLARE_INFER_SHAPE_FUNCTOR(accuracy, AccuracyInferShapeFunctor,
+                            PD_INFER_META(phi::AccuracyInferMeta));
 namespace ops = paddle::operators;
 REGISTER_OPERATOR(
     accuracy, ops::AccuracyOp, ops::AccuracyOpMaker,
     paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
-    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
-// FIXME(typhoonzero): types of T is for infernece data.
-// label data is always int.
-REGISTER_OP_CPU_KERNEL(accuracy,
-                       ops::AccuracyKernel<paddle::platform::CPUPlace, float>,
-                       ops::AccuracyKernel<paddle::platform::CPUPlace, double>);
+    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
+    AccuracyInferShapeFunctor);
diff --git a/paddle/fluid/operators/metrics/accuracy_op.cu b/paddle/fluid/operators/metrics/accuracy_op.cu
deleted file mode 100644
index 6f19100fa9d37e2efedad60a982bf19b09cac736..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/metrics/accuracy_op.cu
+++ /dev/null
@@ -1,110 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <thrust/execution_policy.h>
-#include <thrust/reduce.h>
-#include "paddle/fluid/operators/metrics/accuracy_op.h"
-#include "paddle/fluid/platform/device/gpu/gpu_info.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
-#include "paddle/fluid/platform/float16.h"
-
-namespace paddle {
-namespace operators {
-using platform::PADDLE_CUDA_NUM_THREADS;
-
-template <int BlockSize>
-__global__ void AccuracyCudaKernel(const int N, const int D,
-                                   const int64_t* Xdata,
-                                   const int64_t* labeldata, int* correct_data,
-                                   float* accuracy, int* total_data) {
-  int count = 0;
-  __shared__ int total[BlockSize];
-
-  // support only 1 block
-  for (int i = threadIdx.x; i < (N); i += BlockSize) {
-    for (int j = 0; j < D; ++j) {
-      if (Xdata[i * D + j] == labeldata[i]) {
-        ++count;
-        break;
-      }
-    }
-  }
-  total[threadIdx.x] = count;
-  __syncthreads();
-
-// reduce the count with init value 0, and output accuracy.
-#ifdef PADDLE_WITH_CUDA
-  int result = thrust::reduce(thrust::device, total, total + BlockSize, 0);
-#else
-  // HIP thrust::reduce not support __device__
-  for (int s = BlockSize / 2; s > 0; s >>= 1) {
-    if (threadIdx.x < s) {
-      total[threadIdx.x] += total[threadIdx.x + s];
-    }
-    __syncthreads();
-  }
-  int result = total[0];
-#endif
-  if (threadIdx.x == 0) {
-    *correct_data = result;
-    *accuracy = static_cast<float>(result) / static_cast<float>(N);
-    *total_data = N;
-  }
-}
-
-template <typename T>
-class AccuracyOpCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* inference = ctx.Input<Tensor>("Out");
-    auto* indices = ctx.Input<Tensor>("Indices");
-    auto* label = ctx.Input<Tensor>("Label");
-
-    auto* accuracy = ctx.Output<Tensor>("Accuracy");
-    auto* correct = ctx.Output<Tensor>("Correct");
-    auto* total = ctx.Output<Tensor>("Total");
-    // FIXME(typhoonzero): only support indices currently
-    // if add support for output values, how to detect the data type?
-    const int64_t* indices_data = indices->data<int64_t>();
-    const int64_t* label_data = label->data<int64_t>();
-
-    int* correct_data = correct->mutable_data<int>(ctx.GetPlace());
-    int* total_data = total->mutable_data<int>(ctx.GetPlace());
-    float* accuracy_data = accuracy->mutable_data<float>(ctx.GetPlace());
-
-    int num_samples = static_cast<int>(inference->dims()[0]);
-    size_t infer_width = inference->dims()[1];
-    auto stream = ctx.cuda_device_context().stream();
-    platform::GpuMemsetAsync(accuracy_data, 0, sizeof(float), stream);
-
-    if (num_samples == 0) {
-      return;
-    }
-
-    AccuracyCudaKernel<
-        PADDLE_CUDA_NUM_THREADS><<<1, PADDLE_CUDA_NUM_THREADS, 0, stream>>>(
-        num_samples, infer_width, indices_data, label_data, correct_data,
-        accuracy_data, total_data);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-// FIXME(typhoonzero): types of T is for inference data.
-// label data is always int64
-REGISTER_OP_CUDA_KERNEL(
-    accuracy, paddle::operators::AccuracyOpCUDAKernel<float>,
-    paddle::operators::AccuracyOpCUDAKernel<double>,
-    paddle::operators::AccuracyOpCUDAKernel<paddle::platform::float16>);
diff --git a/paddle/fluid/operators/metrics/accuracy_op.h b/paddle/fluid/operators/metrics/accuracy_op.h
deleted file mode 100644
index 94e5bf8257e67b9fd01aa9ae45a25d90963fef13..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/metrics/accuracy_op.h
+++ /dev/null
@@ -1,74 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <algorithm>
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-
-template <typename DeviceContext, typename T>
-class AccuracyKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* inference = ctx.Input<Tensor>("Out");
-    auto* indices = ctx.Input<Tensor>("Indices");
-    auto* label = ctx.Input<Tensor>("Label");
-    auto* accuracy = ctx.Output<Tensor>("Accuracy");
-    auto* correct = ctx.Output<Tensor>("Correct");
-    auto* total = ctx.Output<Tensor>("Total");
-
-    int* correct_data = correct->mutable_data<int>(ctx.GetPlace());
-    int* total_data = total->mutable_data<int>(ctx.GetPlace());
-    float* accuracy_data = accuracy->mutable_data<float>(ctx.GetPlace());
-
-    const int64_t* indices_data = indices->data<int64_t>();
-    const int64_t* label_data = label->data<int64_t>();
-
-    size_t num_samples = inference->dims()[0];
-    size_t class_dim = inference->dims()[1];
-    *accuracy_data = 0.0f;
-
-    if (num_samples == 0) {
-      return;
-    }
-
-    int num_correct = 0;
-    // assume inference is already the topk of the output
-    for (size_t i = 0; i < num_samples; ++i) {
-      PADDLE_ENFORCE_GE(
-          label_data[i], 0,
-          platform::errors::InvalidArgument(
-              "label of AccuracyOp must >= 0, But received label[%d] is %d", i,
-              label_data[i]));
-      for (size_t j = 0; j < class_dim; ++j) {
-        if (indices_data[i * class_dim + j] == label_data[i]) {
-          ++num_correct;
-          break;
-        }
-      }
-    }
-
-    *correct_data = num_correct;
-    *total_data = num_samples;
-    *accuracy_data =
-        static_cast<float>(num_correct) / static_cast<float>(num_samples);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/metrics/accuracy_op_mlu.cc b/paddle/fluid/operators/metrics/accuracy_op_mlu.cc
index 2598d3b0277c94a52e1fa14b04c00b595071f312..1ce02ff4525c9692f88ed42b79ff336cc0113c41 100644
--- a/paddle/fluid/operators/metrics/accuracy_op_mlu.cc
+++ b/paddle/fluid/operators/metrics/accuracy_op_mlu.cc
@@ -12,7 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/metrics/accuracy_op.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/operators/mlu/mlu_baseop.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/metrics/accuracy_op_npu.cc b/paddle/fluid/operators/metrics/accuracy_op_npu.cc
index 63bccc2e6e065a639c86a647894d2a0c124f0e54..9f2ca4165f33a28902bfe20207b12bad2af49fad 100644
--- a/paddle/fluid/operators/metrics/accuracy_op_npu.cc
+++ b/paddle/fluid/operators/metrics/accuracy_op_npu.cc
@@ -12,8 +12,8 @@ limitations under the License. */
 #include <memory>
 #include <string>
 
-#include "paddle/fluid/operators/controlflow/compare_op.h"
-#include "paddle/fluid/operators/metrics/accuracy_op.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/platform/device/npu/npu_op_runner.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/metrics/accuracy_op_xpu.cc b/paddle/fluid/operators/metrics/accuracy_op_xpu.cc
index de71312d78df99adc3b3663f2fcbb3943373982e..3cc1be4de8a82ff263824ab4852178f735596d45 100644
--- a/paddle/fluid/operators/metrics/accuracy_op_xpu.cc
+++ b/paddle/fluid/operators/metrics/accuracy_op_xpu.cc
@@ -14,12 +14,14 @@ limitations under the License. */
 
 #ifdef PADDLE_WITH_XPU
 
-#include "paddle/fluid/operators/metrics/accuracy_op.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/platform/device/xpu/xpu_header.h"
 
 namespace paddle {
 namespace operators {
 
+using Tensor = paddle::framework::Tensor;
 template <typename DeviceContext, typename T>
 class AccuracyXPUKernel : public framework::OpKernel<T> {
  public:
diff --git a/paddle/fluid/operators/metrics/auc_op.cc b/paddle/fluid/operators/metrics/auc_op.cc
index 2a3a0fa5d1fe50c93686c76571d812cab18c1d38..f3ed98c3f4d1e47a8b7dff81a998c7574859baa2 100644
--- a/paddle/fluid/operators/metrics/auc_op.cc
+++ b/paddle/fluid/operators/metrics/auc_op.cc
@@ -12,7 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/metrics/auc_op.h"
+#include "paddle/fluid/framework/infershape_utils.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/phi/core/infermeta_utils.h"
+#include "paddle/phi/infermeta/multiary.h"
 
 namespace paddle {
 namespace operators {
@@ -21,70 +24,6 @@ class AucOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
- protected:
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    OP_INOUT_CHECK(ctx->HasInput("Predict"), "Input", "Predict", "Auc");
-    OP_INOUT_CHECK(ctx->HasInput("Label"), "Input", "Label", "Auc");
-    auto predict_dims = ctx->GetInputDim("Predict");
-    auto label_dims = ctx->GetInputDim("Label");
-    PADDLE_ENFORCE_GE(
-        predict_dims.size(), 2,
-        platform::errors::InvalidArgument(
-            "The Input(Predict) has not been initialized properly. The "
-            "shape of Input(Predict) = [%s], the shape size must be "
-            "greater_equal 2.",
-            predict_dims));
-    auto predict_width = predict_dims[1];
-    PADDLE_ENFORCE_NE(
-        phi::product(predict_dims), 0,
-        platform::errors::InvalidArgument(
-            "The Input(Predict) has not been initialized properly. The "
-            "shape of Input(Predict) = [%s], the shape can not involes 0.",
-            predict_dims));
-    PADDLE_ENFORCE_NE(
-        phi::product(label_dims), 0,
-        platform::errors::InvalidArgument(
-            "The Input(Label) has not been initialized properly. The "
-            "shape of Input(Label) = [%s], the shape can not involes 0.",
-            label_dims));
-    if (ctx->IsRuntime()) {
-      PADDLE_ENFORCE_LE(predict_width, 2,
-                        platform::errors::InvalidArgument(
-                            "Only support binary classification,"
-                            "prediction dims[1] should be 1 or 2"));
-    }
-    auto predict_height = ctx->GetInputDim("Predict")[0];
-    auto label_height = ctx->GetInputDim("Label")[0];
-
-    if (ctx->IsRuntime()) {
-      PADDLE_ENFORCE_EQ(predict_height, label_height,
-                        platform::errors::InvalidArgument(
-                            "Out and Label should have same height."));
-    }
-
-    int num_pred_buckets = ctx->Attrs().Get<int>("num_thresholds") + 1;
-    int slide_steps = ctx->Attrs().Get<int>("slide_steps");
-
-    PADDLE_ENFORCE_GE(
-        num_pred_buckets, 1,
-        platform::errors::InvalidArgument("num_thresholds must larger than 1"));
-    PADDLE_ENFORCE_GE(slide_steps, 0,
-                      platform::errors::InvalidArgument(
-                          "slide_steps must be natural number"));
-
-    ctx->SetOutputDim("AUC", {1});
-
-    if (slide_steps) {
-      ctx->SetOutputDim("StatPosOut",
-                        {(1 + slide_steps) * num_pred_buckets + 1});
-      ctx->SetOutputDim("StatNegOut",
-                        {(1 + slide_steps) * num_pred_buckets + 1});
-    } else {
-      ctx->SetOutputDim("StatPosOut", {1, num_pred_buckets});
-      ctx->SetOutputDim("StatNegOut", {1, num_pred_buckets});
-    }
-  }
-
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
@@ -145,5 +84,7 @@ There are two types of possible curves:
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP_WITHOUT_GRADIENT(auc, ops::AucOp, ops::AucOpMaker);
-REGISTER_OP_CPU_KERNEL(auc, ops::AucKernel<paddle::platform::CPUPlace, float>);
+DECLARE_INFER_SHAPE_FUNCTOR(auc, AucInferShapeFunctor,
+                            PD_INFER_META(phi::AucInferMeta));
+REGISTER_OP_WITHOUT_GRADIENT(auc, ops::AucOp, ops::AucOpMaker,
+                             AucInferShapeFunctor);
diff --git a/paddle/fluid/operators/metrics/auc_op.cu b/paddle/fluid/operators/metrics/auc_op.cu
deleted file mode 100644
index 1cb7eba8775e814b1150929de4a341c466ee4583..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/metrics/auc_op.cu
+++ /dev/null
@@ -1,232 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/metrics/auc_op.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
-
-namespace paddle {
-namespace operators {
-using platform::PADDLE_CUDA_NUM_THREADS;
-using Tensor = framework::Tensor;
-using LoDTensor = framework::LoDTensor;
-
-__global__ void ClearObsoleteDataKernel(int64_t *pos, int64_t *neg,
-                                        const int bucket_length,
-                                        const int slide_steps) {
-  int cur_step_index =
-      static_cast<int>(pos[(slide_steps + 1) * bucket_length]) % slide_steps;
-  int cur_step_begin = cur_step_index * bucket_length;
-  int sum_step_begin = slide_steps * bucket_length;
-  CUDA_KERNEL_LOOP(i, bucket_length) {
-    pos[sum_step_begin + i] -= pos[cur_step_begin + i];
-    neg[sum_step_begin + i] -= neg[cur_step_begin + i];
-    pos[cur_step_begin + i] = neg[cur_step_begin + i] = 0;
-  }
-}
-
-__global__ void UpdateSumDataKernel(int64_t *pos, int64_t *neg,
-                                    const int bucket_length,
-                                    const int slide_steps) {
-  int cur_step_index =
-      static_cast<int>(pos[(slide_steps + 1) * bucket_length]) % slide_steps;
-  int cur_step_begin = cur_step_index * bucket_length;
-  int sum_step_begin = slide_steps * bucket_length;
-  CUDA_KERNEL_LOOP(i, bucket_length) {
-    pos[sum_step_begin + i] += pos[cur_step_begin + i];
-    neg[sum_step_begin + i] += neg[cur_step_begin + i];
-  }
-}
-
-template <typename T>
-__global__ void AddDataKernel(const int64_t *label_data, const T *pred_data,
-                              const int inference_width,
-                              const int num_thresholds, int64_t *pos,
-                              int64_t *neg, const int numel,
-                              const int slide_steps) {
-  int cur_step_begin = 0;
-  if (slide_steps > 0) {
-    int cur_step_index =
-        static_cast<int>(pos[(slide_steps + 1) * (1 + num_thresholds)]) %
-        slide_steps;
-    cur_step_begin = cur_step_index * (1 + num_thresholds);
-  }
-  CUDA_KERNEL_LOOP(i, numel) {
-    auto predict_data = pred_data[i * inference_width + (inference_width - 1)];
-    PADDLE_ENFORCE(predict_data <= 1, "The predict data must less or equal 1.");
-    PADDLE_ENFORCE(predict_data >= 0,
-                   "The predict data must gather or equal 0.");
-    uint32_t binIdx = static_cast<uint32_t>(predict_data * num_thresholds);
-    if (label_data[i]) {
-      paddle::platform::CudaAtomicAdd(pos + cur_step_begin + binIdx, 1);
-    } else {
-      paddle::platform::CudaAtomicAdd(neg + cur_step_begin + binIdx, 1);
-    }
-  }
-}
-__global__ void CalcAucKernel(int64_t *stat_pos, int64_t *stat_neg,
-                              int num_thresholds, double *auc,
-                              bool need_add_batch_num) {
-  *auc = 0.0f;
-  double totPos = 0.0;
-  double totNeg = 0.0;
-  double totPosPrev = 0.0;
-  double totNegPrev = 0.0;
-
-  int idx = num_thresholds;
-
-  while (idx >= 0) {
-    totPosPrev = totPos;
-    totNegPrev = totNeg;
-    totPos += stat_pos[idx];
-    totNeg += stat_neg[idx];
-    *auc += (totNeg - totNegPrev) * (totPos + totPosPrev) / 2.0;
-    --idx;
-  }
-
-  if (totPos > 0.0 && totNeg > 0.0) {
-    *auc = *auc / totPos / totNeg;
-  }
-  if (need_add_batch_num) {
-    stat_pos[num_thresholds + 1] += 1;
-    stat_neg[num_thresholds + 1] += 1;
-  }
-}
-
-template <typename DeviceContext, typename T>
-class AucCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    auto *predict = ctx.Input<Tensor>("Predict");
-    auto *label = ctx.Input<Tensor>("Label");
-
-    int num_thresholds = ctx.Attr<int>("num_thresholds");
-    int slide_steps = ctx.Attr<int>("slide_steps");
-
-    // Only use output var for now, make sure it's persistable and
-    // not cleaned up for each batch.
-    auto *auc_tensor = ctx.Output<Tensor>("AUC");
-    auto *stat_pos = ctx.Output<Tensor>("StatPosOut");
-    auto *stat_neg = ctx.Output<Tensor>("StatNegOut");
-
-    auto *origin_stat_pos = stat_pos->mutable_data<int64_t>(ctx.GetPlace());
-    auto *origin_stat_neg = stat_neg->mutable_data<int64_t>(ctx.GetPlace());
-    auto *auc_value = auc_tensor->mutable_data<double>(ctx.GetPlace());
-
-    auto *stat_pos_in_tensor = ctx.Input<Tensor>("StatPos");
-    auto *pos_in_data = stat_pos_in_tensor->data<int64_t>();
-    auto *stat_neg_in_tensor = ctx.Input<Tensor>("StatNeg");
-    auto *neg_in_data = stat_neg_in_tensor->data<int64_t>();
-#ifdef PADDLE_WITH_CUDA
-    if (stat_pos_in_tensor != stat_pos) {
-      cudaMemcpy(origin_stat_pos, pos_in_data,
-                 ((1 + slide_steps) * (num_thresholds + 1) +
-                  (slide_steps > 0 ? 1 : 0)) *
-                     sizeof(int64_t),
-                 cudaMemcpyDeviceToDevice);
-    }
-    if (stat_neg_in_tensor != stat_neg) {
-      cudaMemcpy(origin_stat_neg, neg_in_data,
-                 ((1 + slide_steps) * (num_thresholds + 1) +
-                  (slide_steps > 0 ? 1 : 0)) *
-                     sizeof(int64_t),
-                 cudaMemcpyDeviceToDevice);
-    }
-#else
-    if (stat_pos_in_tensor != stat_pos) {
-      hipMemcpy(origin_stat_pos, pos_in_data,
-                ((1 + slide_steps) * (num_thresholds + 1) +
-                 (slide_steps > 0 ? 1 : 0)) *
-                    sizeof(int64_t),
-                hipMemcpyDeviceToDevice);
-    }
-    if (stat_neg_in_tensor != stat_neg) {
-      hipMemcpy(origin_stat_neg, neg_in_data,
-                ((1 + slide_steps) * (num_thresholds + 1) +
-                 (slide_steps > 0 ? 1 : 0)) *
-                    sizeof(int64_t),
-                hipMemcpyDeviceToDevice);
-    }
-#endif
-
-    statAuc(ctx, label, predict, num_thresholds, slide_steps, origin_stat_pos,
-            origin_stat_neg);
-    int sum_offset = slide_steps * (num_thresholds + 1);
-    auto stream =
-        ctx.template device_context<platform::CUDADeviceContext>().stream();
-    CalcAucKernel<<<1, 1, 0, stream>>>(
-        origin_stat_pos + sum_offset, origin_stat_neg + sum_offset,
-        num_thresholds, auc_value, slide_steps > 0);
-  }
-
- private:
-  inline static double trapezoidArea(double X1, double X2, double Y1,
-                                     double Y2) {
-    return (X1 > X2 ? (X1 - X2) : (X2 - X1)) * (Y1 + Y2) / 2.0;
-  }
-
-  inline static void statAuc(const framework::ExecutionContext &ctx,
-                             const framework::Tensor *label,
-                             const framework::Tensor *predict,
-                             const int num_thresholds, const int slide_steps,
-                             int64_t *origin_stat_pos,
-                             int64_t *origin_stat_neg) {
-    size_t batch_size = predict->dims()[0];
-    size_t inference_width = predict->dims()[1];
-    const T *inference_data = predict->data<T>();
-    const auto *label_data = label->data<int64_t>();
-    const int bucket_length = num_thresholds + 1;
-    auto stream =
-        ctx.template device_context<platform::CUDADeviceContext>().stream();
-    if (slide_steps == 0) {
-      AddDataKernel<<<(batch_size + PADDLE_CUDA_NUM_THREADS - 1) /
-                          PADDLE_CUDA_NUM_THREADS,
-                      PADDLE_CUDA_NUM_THREADS, 0, stream>>>(
-          label_data, inference_data, inference_width, num_thresholds,
-          origin_stat_pos, origin_stat_neg, batch_size, slide_steps);
-      return;
-    }
-    // the last number of origin_stat_pos store the index should be used in
-    // current step
-    int cur_step_index =
-        static_cast<int>(origin_stat_pos[(slide_steps + 1) * bucket_length]) %
-        slide_steps;
-    int cur_step_begin = cur_step_index * bucket_length;
-    int sum_step_begin = slide_steps * bucket_length;
-
-    ClearObsoleteDataKernel<<<(bucket_length + PADDLE_CUDA_NUM_THREADS - 1) /
-                                  PADDLE_CUDA_NUM_THREADS,
-                              PADDLE_CUDA_NUM_THREADS, 0, stream>>>(
-        origin_stat_pos, origin_stat_neg, bucket_length, slide_steps);
-
-    AddDataKernel<<<(batch_size + PADDLE_CUDA_NUM_THREADS - 1) /
-                        PADDLE_CUDA_NUM_THREADS,
-                    PADDLE_CUDA_NUM_THREADS, 0, stream>>>(
-        label_data, inference_data, inference_width, num_thresholds,
-        origin_stat_pos, origin_stat_neg, batch_size, slide_steps);
-    UpdateSumDataKernel<<<(bucket_length + PADDLE_CUDA_NUM_THREADS - 1) /
-                              PADDLE_CUDA_NUM_THREADS,
-                          PADDLE_CUDA_NUM_THREADS, 0, stream>>>(
-        origin_stat_pos, origin_stat_neg, bucket_length, slide_steps);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(auc,
-                        ops::AucCUDAKernel<paddle::platform::CUDAPlace, float>);
diff --git a/paddle/fluid/operators/metrics/auc_op.h b/paddle/fluid/operators/metrics/auc_op.h
deleted file mode 100644
index 10403472c69b57723bc714703c115f07d8640f7e..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/metrics/auc_op.h
+++ /dev/null
@@ -1,186 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <string>
-#include <vector>
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-
-template <typename DeviceContext, typename T>
-class AucKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    auto *predict = ctx.Input<Tensor>("Predict");
-    auto *label = ctx.Input<Tensor>("Label");
-
-    int num_thresholds = ctx.Attr<int>("num_thresholds");
-    int slide_steps = ctx.Attr<int>("slide_steps");
-
-    // Only use output var for now, make sure it's persistable and
-    // not cleaned up for each batch.
-    auto *auc_tensor = ctx.Output<Tensor>("AUC");
-    auto *stat_pos = ctx.Output<Tensor>("StatPosOut");
-    auto *stat_neg = ctx.Output<Tensor>("StatNegOut");
-
-    auto *origin_stat_pos = stat_pos->mutable_data<int64_t>(ctx.GetPlace());
-    auto *origin_stat_neg = stat_neg->mutable_data<int64_t>(ctx.GetPlace());
-    auto *auc_value = auc_tensor->mutable_data<double>(ctx.GetPlace());
-
-    // Just for pass UT, since UT's input & output connot be set same var
-    auto *stat_pos_in_tensor = ctx.Input<Tensor>("StatPos");
-    auto *pos_in_data = stat_pos_in_tensor->data<int64_t>();
-    auto *stat_neg_in_tensor = ctx.Input<Tensor>("StatNeg");
-    auto *neg_in_data = stat_neg_in_tensor->data<int64_t>();
-    if (stat_pos_in_tensor != stat_pos) {
-      memcpy(origin_stat_pos, pos_in_data,
-             ((1 + slide_steps) * (num_thresholds + 1) +
-              (slide_steps > 0 ? 1 : 0)) *
-                 sizeof(int64_t));
-    }
-    if (stat_neg_in_tensor != stat_neg) {
-      memcpy(origin_stat_neg, neg_in_data,
-             ((1 + slide_steps) * (num_thresholds + 1) +
-              (slide_steps > 0 ? 1 : 0)) *
-                 sizeof(int64_t));
-    }
-    statAuc(label, predict, num_thresholds, slide_steps, origin_stat_pos,
-            origin_stat_neg);
-
-    int sum_offset = slide_steps * (num_thresholds + 1);
-    calcAuc(origin_stat_pos + sum_offset, origin_stat_neg + sum_offset,
-            num_thresholds, auc_value);
-    if (slide_steps) {
-      origin_stat_pos[(slide_steps + 1) * (num_thresholds + 1)] += 1;
-      origin_stat_neg[(slide_steps + 1) * (num_thresholds + 1)] += 1;
-    }
-  }
-
- private:
-  inline static double trapezoidArea(double X1, double X2, double Y1,
-                                     double Y2) {
-    return (X1 > X2 ? (X1 - X2) : (X2 - X1)) * (Y1 + Y2) / 2.0;
-  }
-
-  inline static void statAuc(const framework::Tensor *label,
-                             const framework::Tensor *predict,
-                             const int num_thresholds, const int slide_steps,
-                             int64_t *origin_stat_pos,
-                             int64_t *origin_stat_neg) {
-    size_t batch_size = predict->dims()[0];
-    size_t inference_width = predict->dims()[1];
-    const T *inference_data = predict->data<T>();
-    const auto *label_data = label->data<int64_t>();
-    const int bucket_length = num_thresholds + 1;
-    if (slide_steps == 0) {
-      for (size_t i = 0; i < batch_size; i++) {
-        // if predict_data[i] has dim of 2, then predict_data[i][1] is pos prob
-        // if predict_data[i] has dim of 1, then predict_data[i][0] is pos prob
-        auto predict_data =
-            inference_data[i * inference_width + (inference_width - 1)];
-        PADDLE_ENFORCE_LE(predict_data, 1,
-                          platform::errors::PreconditionNotMet(
-                              "The predict data must less or equal 1."));
-        PADDLE_ENFORCE_GE(predict_data, 0,
-                          platform::errors::PreconditionNotMet(
-                              "The predict data must gather or equal 0."));
-
-        uint32_t binIdx = static_cast<uint32_t>(predict_data * num_thresholds);
-        if (label_data[i] > 0) {
-          origin_stat_pos[binIdx] += 1;
-        } else if (label_data[i] == 0) {
-          origin_stat_neg[binIdx] += 1;
-        }
-      }
-      return;
-    }
-    // the last number of origin_stat_pos store the index should be used in
-    // current step
-    int cur_step_index =
-        static_cast<int>(origin_stat_pos[(slide_steps + 1) * bucket_length]) %
-        slide_steps;
-    int cur_step_begin = cur_step_index * bucket_length;
-    int sum_step_begin = slide_steps * bucket_length;
-    for (int i = 0; i < bucket_length; ++i) {
-      origin_stat_pos[sum_step_begin + i] -=
-          origin_stat_pos[cur_step_begin + i];
-      origin_stat_neg[sum_step_begin + i] -=
-          origin_stat_neg[cur_step_begin + i];
-    }
-
-    std::memset(origin_stat_pos + cur_step_begin, 0,
-                bucket_length * sizeof(int64_t));
-    std::memset(origin_stat_neg + cur_step_begin, 0,
-                bucket_length * sizeof(int64_t));
-
-    for (size_t i = 0; i < batch_size; i++) {
-      // if predict_data[i] has dim of 2, then predict_data[i][1] is pos prob
-      // if predict_data[i] has dim of 1, then predict_data[i][0] is pos prob
-      auto predict_data =
-          inference_data[i * inference_width + (inference_width - 1)];
-      PADDLE_ENFORCE_LE(predict_data, 1,
-                        platform::errors::PreconditionNotMet(
-                            "The predict data must less or equal 1."));
-      PADDLE_ENFORCE_GE(predict_data, 0,
-                        platform::errors::PreconditionNotMet(
-                            "The predict data must gather or equal 0."));
-
-      uint32_t binIdx = static_cast<uint32_t>(predict_data * num_thresholds);
-      if (label_data[i] > 0) {
-        origin_stat_pos[cur_step_begin + binIdx] += 1;
-      } else if (label_data[i] == 0) {
-        origin_stat_neg[cur_step_begin + binIdx] += 1;
-      }
-    }
-    for (int i = 0; i < bucket_length; ++i) {
-      origin_stat_pos[sum_step_begin + i] +=
-          origin_stat_pos[cur_step_begin + i];
-      origin_stat_neg[sum_step_begin + i] +=
-          origin_stat_neg[cur_step_begin + i];
-    }
-  }
-
-  inline static void calcAuc(const int64_t *stat_pos, const int64_t *stat_neg,
-                             int num_thresholds, double *auc) {
-    *auc = 0.0f;
-
-    double totPos = 0.0;
-    double totNeg = 0.0;
-    double totPosPrev = 0.0;
-    double totNegPrev = 0.0;
-
-    int idx = num_thresholds;
-
-    while (idx >= 0) {
-      totPosPrev = totPos;
-      totNegPrev = totNeg;
-      totPos += stat_pos[idx];
-      totNeg += stat_neg[idx];
-      *auc += trapezoidArea(totNeg, totNegPrev, totPos, totPosPrev);
-      --idx;
-    }
-
-    if (totPos > 0.0 && totNeg > 0.0) {
-      *auc = *auc / totPos / totNeg;
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/mkldnn/layer_norm_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/layer_norm_mkldnn_op.cc
index 90e6a36220ab04087cd02abd76f6c3598425573c..812c55cdd5055186d7fd83a2057d88256f3b34a3 100644
--- a/paddle/fluid/operators/mkldnn/layer_norm_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/layer_norm_mkldnn_op.cc
@@ -150,4 +150,5 @@ class LayerNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
 // TODO(jczaja): Enable FP32 when performance is good
 namespace ops = paddle::operators;
 REGISTER_OP_KERNEL(layer_norm, MKLDNN, ::paddle::platform::CPUPlace,
+                   ops::LayerNormMKLDNNOpKernel<float>,
                    ops::LayerNormMKLDNNOpKernel<paddle::platform::bfloat16>);
diff --git a/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc
index ab02d4cfed9d54f9d168f6088df3e41d3e3e7c54..1078b451c55bae09c1274fe6ce3f45d21574d5e1 100644
--- a/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc
@@ -12,14 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/pool_op.h"
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/mkldnn_helper.h"
 #include "paddle/fluid/platform/mkldnn_reuse.h"
+#include "paddle/phi/kernels/funcs/pooling.h"
 
 namespace paddle {
 namespace operators {
 
 using framework::DataLayout;
+using framework::Tensor;
 using dnnl::memory;
 using dnnl::pooling_backward;
 using dnnl::pooling_forward;
@@ -83,11 +85,11 @@ class PoolingMKLDNNHandler
         phi::slice_ddim(input_dims, 2, input_dims.size());
 
     if (global_pooling) {
-      operators::UpdateKsize(&ksize, data_dims);
+      phi::funcs::UpdateKernelSize(&ksize, data_dims);
     }
 
-    operators::UpdatePadding(&paddings, global_pooling, 0, padding_algorithm,
-                             data_dims, strides, ksize);
+    phi::funcs::UpdatePadding(&paddings, global_pooling, 0, padding_algorithm,
+                              data_dims, strides, ksize);
 
     const auto src_tz = phi::vectorize(input->dims());
     const auto dst_tz = phi::vectorize(output->dims());
@@ -173,11 +175,11 @@ class PoolingMKLDNNHandler
     framework::DDim data_dims = phi::slice_ddim(in_x_dims, 2, in_x_dims.size());
 
     if (global_pooling) {
-      operators::UpdateKsize(&ksize, data_dims);
+      phi::funcs::UpdateKernelSize(&ksize, data_dims);
     }
 
-    operators::UpdatePadding(&paddings, global_pooling, 0, padding_algorithm,
-                             data_dims, strides, ksize);
+    phi::funcs::UpdatePadding(&paddings, global_pooling, 0, padding_algorithm,
+                              data_dims, strides, ksize);
 
     auto src_tz = phi::vectorize<int64_t>(in_x->dims());
     auto diff_src_tz = phi::vectorize<int64_t>(in_x_grad->dims());
diff --git a/paddle/fluid/operators/mkldnn/shape_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/shape_mkldnn_op.cc
index 780c6e7f153e7b1179e203bc7807dd7818aa591a..a3b764b0e1c46ab91b989ed7f7b0b5df101f7654 100644
--- a/paddle/fluid/operators/mkldnn/shape_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/shape_mkldnn_op.cc
@@ -13,19 +13,32 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/shape_op.h"
 #include "paddle/fluid/platform/mkldnn_helper.h"
 
 namespace paddle {
 namespace operators {
 
-using paddle::framework::Tensor;
+using Tensor = framework::Tensor;
+using LoDTensor = framework::LoDTensor;
+using SelectedRows = phi::SelectedRows;
 
 template <typename T>
-class ShapeMKLDNNKernel : public ShapeKernel<T> {
+class ShapeMKLDNNKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    ShapeKernel<T>::Compute(ctx);
+    auto* in_var = ctx.InputVar("Input");
+    framework::DDim in_dims;
+    if (in_var->IsType<phi::SelectedRows>()) {
+      in_dims = in_var->Get<phi::SelectedRows>().value().dims();
+    } else {
+      in_dims = in_var->Get<LoDTensor>().dims();
+    }
+    auto* out_t = ctx.Output<Tensor>("Out");
+    out_t->Resize({in_dims.size()});
+    auto out_data = out_t->mutable_data<int32_t>(platform::CPUPlace());
+    for (int i = 0; i < in_dims.size(); ++i) {
+      out_data[i] = in_dims[i];
+    }
 
     auto* out = ctx.Output<Tensor>("Out");
     out->set_layout(framework::DataLayout::kMKLDNN);
diff --git a/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc b/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc
index 2fdeecf89346fcf15f38b291ed5af49b8a2c8fc0..23428dd403e9b1ef62007c7b9193ed3b8482cab3 100644
--- a/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc
+++ b/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc
@@ -29,11 +29,11 @@ USE_OP_ITSELF(elementwise_add);
 USE_OP_DEVICE_KERNEL(elementwise_add, MKLDNN);
 USE_OP(elementwise_mul);
 USE_OP_DEVICE_KERNEL(elementwise_mul, MKLDNN);
-USE_OP(relu);
+USE_OP_ITSELF(relu);
 USE_OP_DEVICE_KERNEL(relu, MKLDNN);
 USE_OP_ITSELF(softmax);
 USE_OP_DEVICE_KERNEL(softmax, MKLDNN);
-USE_OP(conv2d);
+USE_OP_ITSELF(conv2d);
 USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN, FP32);
 
 namespace paddle {
@@ -55,7 +55,7 @@ class CacheTester {
     onednn_dev_ctx_->ResetBlobMap(nullptr);
   }
 
-  bool Analyze(unsigned short int num_entries) {
+  bool Analyze(uint16_t num_entries) {
     //  Number of created objects in cache should be as expected (num_entries)
     return onednn_dev_ctx_->GetCachedObjectsNumber() == num_entries;
   }
diff --git a/paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc b/paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc
index c776cf2a7c792c429fcf45a367d3f06bf9add5d2..4090d5ffca801512e423b02bfda3dd1a1bc49f03 100644
--- a/paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc
+++ b/paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc
@@ -24,14 +24,17 @@
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/place.h"
+#include "paddle/phi/core/kernel_registry.h"
 
 USE_OP_ITSELF(elementwise_add);
 USE_OP_DEVICE_KERNEL(elementwise_add, MKLDNN);
-USE_OP(relu);
+USE_OP_ITSELF(relu);
 USE_OP_DEVICE_KERNEL(relu, MKLDNN);
 USE_OP_ITSELF(softmax);
 USE_OP_DEVICE_KERNEL(softmax, MKLDNN);
 
+PD_DECLARE_KERNEL(softmax, CPU, ALL_LAYOUT);
+
 namespace paddle {
 namespace operators {
 
diff --git a/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc b/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc
index 3791fed23a84ff51d022dd24a6a0734a39636a70..717af61b858dc16f9bdda20f530cbf06a09908eb 100644
--- a/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc
+++ b/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc
@@ -24,14 +24,18 @@
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/place.h"
+#include "paddle/phi/core/kernel_registry.h"
 
-USE_OP(pool2d);
+USE_OP_ITSELF(pool2d);
 USE_OP_DEVICE_KERNEL(pool2d, MKLDNN);
-USE_OP(relu);
+USE_OP_ITSELF(relu);
 USE_OP_DEVICE_KERNEL(relu, MKLDNN);
 USE_OP_ITSELF(transpose);
 USE_OP_DEVICE_KERNEL(transpose, MKLDNN);
 
+PD_DECLARE_KERNEL(pool2d, CPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(relu, CPU, ALL_LAYOUT);
+
 namespace paddle {
 namespace operators {
 
diff --git a/paddle/fluid/operators/mlu/activation_op_mlu_test.cc b/paddle/fluid/operators/mlu/activation_op_mlu_test.cc
index 884521301750ce92c3f0a2e0b9468c5cc4a57790..6e3bd5e43c9c1d7e5c8a5dd4ba37afcfd7147e20 100644
--- a/paddle/fluid/operators/mlu/activation_op_mlu_test.cc
+++ b/paddle/fluid/operators/mlu/activation_op_mlu_test.cc
@@ -22,7 +22,7 @@ limitations under the License. */
 namespace fw = paddle::framework;
 namespace plat = paddle::platform;
 
-USE_OP(relu);
+USE_OP_ITSELF(relu);
 USE_OP_DEVICE_KERNEL(relu, MLU);
 
 // relu
diff --git a/paddle/fluid/operators/mlu/mlu_baseop.cc b/paddle/fluid/operators/mlu/mlu_baseop.cc
index 9de03582cbbf53e843e5f4531a6da6c1c2a87dd5..1fdaa153e3c27ed1a83696bf03d68dbfd2b93ae9 100644
--- a/paddle/fluid/operators/mlu/mlu_baseop.cc
+++ b/paddle/fluid/operators/mlu/mlu_baseop.cc
@@ -499,6 +499,27 @@ MLUCnnlTrigonDesc::~MLUCnnlTrigonDesc() {
                                         output_desc, output));
 }
 
+/* static */ void MLUCnnl::Concat(const MLUDeviceContext& dev_ctx,
+                                  const int pack_num, const int axis,
+                                  const cnnlTensorDescriptor_t inputs_desc[],
+                                  const void* const inputs[],
+                                  const cnnlTensorDescriptor_t output_desc,
+                                  void* output) {
+  cnnlHandle_t handle = dev_ctx.cnnl_handle();
+
+  size_t workspace_size = 0;
+  PADDLE_ENFORCE_MLU_SUCCESS(
+      cnnlGetConcatWorkspaceSize(handle, pack_num, &workspace_size));
+
+  Tensor workspace(paddle::experimental::DataType::INT8);
+  workspace.Resize(framework::DDim({static_cast<int64_t>(workspace_size)}));
+  void* workspace_ptr = workspace.mutable_data(dev_ctx.GetPlace());
+
+  PADDLE_ENFORCE_MLU_SUCCESS(cnnlConcat(handle, pack_num, axis, inputs_desc,
+                                        inputs, workspace_ptr, workspace_size,
+                                        output_desc, output));
+}
+
 /* static */ void MLUCnnl::Div(
     const ExecutionContext& ctx, cnnlComputationPreference_t prefer,
     const cnnlTensorDescriptor_t in0_desc, const void* in0,
@@ -977,6 +998,27 @@ MLUCnnlTrigonDesc::~MLUCnnlTrigonDesc() {
                                        output_descs, output_ptrs));
 }
 
+/* static */ void MLUCnnl::Split(const MLUDeviceContext& dev_ctx, int split_num,
+                                 int axis,
+                                 const cnnlTensorDescriptor_t input_desc,
+                                 const void* input_ptr,
+                                 const cnnlTensorDescriptor_t output_descs[],
+                                 void* output_ptrs[]) {
+  cnnlHandle_t handle = dev_ctx.cnnl_handle();
+
+  size_t workspace_size;
+  PADDLE_ENFORCE_MLU_SUCCESS(
+      cnnlGetSplitWorkspaceSize(handle, split_num, &workspace_size));
+
+  Tensor workspace(paddle::experimental::DataType::INT8);
+  workspace.Resize(framework::DDim({static_cast<int64_t>(workspace_size)}));
+  void* workspace_ptr = workspace.mutable_data(dev_ctx.GetPlace());
+
+  PADDLE_ENFORCE_MLU_SUCCESS(cnnlSplit(handle, split_num, axis, input_desc,
+                                       input_ptr, workspace_ptr, workspace_size,
+                                       output_descs, output_ptrs));
+}
+
 /* static */ void MLUCnnl::GatherFunctor(
     const ExecutionContext& ctx, const int axis, const int batch_dims,
     const cnnlTensorDescriptor_t params_desc, const void* params,
diff --git a/paddle/fluid/operators/mlu/mlu_baseop.h b/paddle/fluid/operators/mlu/mlu_baseop.h
index 2a54a8392c7c5bfbf450ee9351a9fda866a07663..b55b10686e92e2b1b5b3a7390289f8329ac04a04 100644
--- a/paddle/fluid/operators/mlu/mlu_baseop.h
+++ b/paddle/fluid/operators/mlu/mlu_baseop.h
@@ -403,6 +403,11 @@ class MLUCnnl {
                      const void* const inputs[],
                      const cnnlTensorDescriptor_t output_desc, void* output);
 
+  static void Concat(const MLUDeviceContext& dev_ctx, const int pack_num,
+                     const int axis, const cnnlTensorDescriptor_t inputs_desc[],
+                     const void* const inputs[],
+                     const cnnlTensorDescriptor_t output_desc, void* output);
+
   static void Cast(const ExecutionContext& ctx, cnnlCastDataType_t cast_type,
                    const cnnlTensorDescriptor_t input_desc, const void* input,
                    const cnnlTensorDescriptor_t output_desc, void* output);
@@ -566,6 +571,12 @@ class MLUCnnl {
                     const cnnlTensorDescriptor_t output_descs[],
                     void* output_ptrs[]);
 
+  static void Split(const MLUDeviceContext& dev_ctx, int split_num, int axis,
+                    const cnnlTensorDescriptor_t input_desc,
+                    const void* input_ptr,
+                    const cnnlTensorDescriptor_t output_descs[],
+                    void* output_ptrs[]);
+
   static void Scale(const ExecutionContext& ctx, const int axis,
                     const cnnlTensorDescriptor_t input_desc, const void* input,
                     const cnnlTensorDescriptor_t alpha_desc, const void* alpha,
diff --git a/paddle/fluid/operators/mode_op.cu b/paddle/fluid/operators/mode_op.cu
index afb949d3374c62f561e910ea77e516bdb4004ac0..2bacda8afb0eb340c4c8d4068f3013e2adbc7f91 100644
--- a/paddle/fluid/operators/mode_op.cu
+++ b/paddle/fluid/operators/mode_op.cu
@@ -24,7 +24,6 @@
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/mode_op.h"
 #include "paddle/fluid/operators/top_k_function_cuda.h"
-#include "paddle/fluid/operators/top_k_v2_op.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/multi_dot_op.cc b/paddle/fluid/operators/multi_dot_op.cc
index fe4609b3ad91e703fc28a997d5505d4cffa001a8..b309e1b87ef9033bd4302cdad4ea60a64cbf02eb 100644
--- a/paddle/fluid/operators/multi_dot_op.cc
+++ b/paddle/fluid/operators/multi_dot_op.cc
@@ -87,135 +87,6 @@ inline framework::DDim ComputeAndCheckShape(
   return out_dim;
 }
 
-template <typename DeviceContext, typename T>
-inline framework::Tensor MatMul(const framework::ExecutionContext& ctx,
-                                const framework::Tensor& matrix_a,
-                                const framework::Tensor& matrix_b,
-                                const framework::DDim& a_dim,
-                                const framework::DDim& b_dim) {
-  auto place = ctx.GetPlace();
-  auto blas = phi::funcs::GetBlas<DeviceContext, T>(ctx);
-
-  framework::Tensor matrix_c;
-  framework::DDim c_dim = phi::make_ddim({a_dim[0], b_dim[1]});
-  matrix_c.Resize(c_dim);
-  matrix_c.mutable_data<T>(place);
-
-  auto mat_dim_a = phi::funcs::CreateMatrixDescriptor(a_dim, 0, false);
-  auto mat_dim_b = phi::funcs::CreateMatrixDescriptor(b_dim, 0, false);
-  const T alpha = static_cast<T>(1.0);
-  blas.MatMul(matrix_a, mat_dim_a, matrix_b, mat_dim_b, alpha, &matrix_c, T(0));
-  return matrix_c;
-}
-
-/**
- * @brief Recursively calculate matrix multiplication according to the optimal
- * order
- * Let k = order[i,j], then ins[i...j] = ins[i...k] * ins[k+1 ...j]
- *
- * @param
- * ins: the input tensors
- * ins_dims: the shape of ins after reshape
- * order: the optimal order
- * i: the left of sub chain
- * j: the righe of sub chain
- * save_result: set true by backward
- * results: save the intermediate result during backward
- */
-template <typename DeviceContext, typename T>
-inline framework::Tensor MatChainMul(
-    const framework::ExecutionContext& ctx,
-    const std::vector<const framework::Tensor*>& ins,
-    const std::vector<framework::DDim>& ins_dims,
-    const std::vector<uint64_t>& order, const uint64_t i, const uint64_t j,
-    const bool save_result, std::vector<framework::Tensor>* results) {
-  if (i == j) {
-    return *ins[i];
-  }
-
-  const auto A = MatChainMul<DeviceContext, T>(ctx, ins, ins_dims, order, i,
-                                               order[i * ins.size() + j],
-                                               save_result, results);
-  framework::DDim a_dim = A.dims();
-  if (i == order[i * ins.size() + j]) {
-    a_dim = ins_dims[i];
-  }
-
-  const auto B = MatChainMul<DeviceContext, T>(ctx, ins, ins_dims, order,
-                                               order[i * ins.size() + j] + 1, j,
-                                               save_result, results);
-  framework::DDim b_dim = B.dims();
-  if (j == order[i * ins.size() + j] + 1) {
-    b_dim = ins_dims[j];
-  }
-
-  auto result = MatMul<DeviceContext, T>(ctx, A, B, a_dim, b_dim);
-  if (save_result) {
-    (*results)[i * ins.size() + j] = result;
-  }
-  return result;
-}
-
-/**
- * @brief get the optimal order
- */
-std::vector<uint64_t> GetOrder(const std::vector<const framework::Tensor*>& ins,
-                               const std::vector<framework::DDim>& ins_dims) {
-  auto n = ins.size();
-  // p: save the ins shape, the ins[i] shape is (p[i], p[i+1])
-  std::vector<uint64_t> p(n + 1);
-  for (uint64_t i = 0; i < n; i++) {
-    p[i] = ins_dims[i][0];
-  }
-  p[n] = ins_dims[n - 1][1];
-
-  // m[i, j]: save the lowest cost for multiplying ins[i...j]
-  std::vector<uint64_t> m(n * n, 0);
-  // define ins[i...j] means multiplying matrices from ins[i] to ins[j]
-  // order[i, j] = k, this means that ins[i...k] and ins[k...j] fist and then
-  // multiply the resulting matrices is the optimal order for ins[i...j]
-  std::vector<uint64_t> order(n * n);
-  for (uint64_t l = 1; l < n; l++) {
-    for (uint64_t i = 0; i < n - l; i++) {
-      auto j = i + l;
-      m[i * n + j] = 0xffffffff;
-      for (uint64_t k = i; k < j; k++) {
-        uint64_t q =
-            m[i * n + k] + m[(k + 1) * n + j] + p[i] * p[k + 1] * p[j + 1];
-        if (q < m[i * n + j]) {
-          m[i * n + j] = q;
-          order[i * n + j] = k;
-        }
-      }
-    }
-  }
-  return order;
-}
-
-template <typename DeviceContext, typename T>
-static inline framework::Tensor MultiDotMatChainOrder(
-    const framework::ExecutionContext& ctx,
-    const std::vector<const framework::Tensor*>& ins,
-    const std::vector<framework::DDim>& ins_dims, const bool save_result,
-    std::vector<framework::Tensor>* results) {
-  auto order = GetOrder(ins, ins_dims);
-  return MatChainMul<DeviceContext, T>(ctx, ins, ins_dims, order, 0,
-                                       ins.size() - 1, save_result, results);
-}
-
-inline void GetDims(const std::vector<const framework::Tensor*>& ins,
-                    std::vector<framework::DDim>* ins_dims) {
-  const auto n = ins.size();
-  for (size_t i = 0; i < n; i++) {
-    (*ins_dims)[i] = ins[i]->dims();
-    if (i == 0 && (*ins_dims)[i].size() == 1) {
-      (*ins_dims)[i] = phi::make_ddim({1, (*ins_dims)[i][0]});
-    } else if (i == n - 1 && (*ins_dims)[i].size() == 1) {
-      (*ins_dims)[i] = phi::make_ddim({(*ins_dims)[i][0], 1});
-    }
-  }
-}
-
 class MultiDotOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() override {
@@ -252,78 +123,6 @@ class MultiDotOp : public framework::OperatorWithKernel {
   }
 };
 
-/**
- * 1. there are only 2 matrices: direct matrix multiplication A*B
- * 2. there are only 3 matrices: calculate the cost of (A*B)*C and A*(B*C),
- *  choose the least cost order for calculation
- * 3. more than 3 matrices: call MultiDotMatChainOrder
- */
-template <typename DeviceContext, typename T>
-class MultiDotKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto ins = ctx.MultiInput<framework::Tensor>("X");
-    auto* out = ctx.Output<framework::Tensor>("Out");
-
-    auto place = ctx.GetPlace();
-    out->mutable_data<T>(place);
-
-    auto blas = phi::funcs::GetBlas<DeviceContext, T>(ctx);
-
-    auto n = ins.size();
-    std::vector<framework::DDim> ins_dims(n);
-    GetDims(ins, &ins_dims);
-
-    const T scale = static_cast<T>(1.0);
-    if (n == 2) {
-      auto mat_dim_a =
-          phi::funcs::CreateMatrixDescriptor(ins_dims[0], 0, false);
-      auto mat_dim_b =
-          phi::funcs::CreateMatrixDescriptor(ins_dims[1], 0, false);
-      blas.MatMul(*ins[0], mat_dim_a, *ins[1], mat_dim_b, scale, out, T(0));
-    } else if (n == 3) {
-      const auto Ma = ins_dims[0][0];
-      const auto Ka = ins_dims[0][1];
-      const auto Nb = ins_dims[1][1];
-      const auto Nc = ins_dims[2][1];
-      const uint64_t cost1 = Ma * Nb * (Ka + Nc);
-      const uint64_t cost2 = Ka * Nc * (Nb + Ma);
-      auto mat_dim_a =
-          phi::funcs::CreateMatrixDescriptor(ins_dims[0], 0, false);
-      auto mat_dim_b =
-          phi::funcs::CreateMatrixDescriptor(ins_dims[1], 0, false);
-      auto mat_dim_c =
-          phi::funcs::CreateMatrixDescriptor(ins_dims[2], 0, false);
-      if (cost1 < cost2) {
-        framework::Tensor tmp_out;
-        tmp_out.mutable_data<T>(place, Ma * Nb * sizeof(T));
-        framework::DDim tmp_dim = phi::make_ddim({Ma, Nb});
-        blas.MatMul(*ins[0], mat_dim_a, *ins[1], mat_dim_b, scale, &tmp_out,
-                    T(0));
-        auto mat_dim_tmp =
-            phi::funcs::CreateMatrixDescriptor(tmp_dim, 0, false);
-        blas.MatMul(tmp_out, mat_dim_tmp, *ins[2], mat_dim_c, scale, out, T(0));
-      } else {
-        framework::Tensor tmp_out;
-        tmp_out.mutable_data<T>(place, Ka * Nc * sizeof(T));
-        framework::DDim tmp_dim = phi::make_ddim({Ka, Nc});
-        blas.MatMul(*ins[1], mat_dim_b, *ins[2], mat_dim_c, scale, &tmp_out,
-                    T(0));
-        auto mat_dim_tmp =
-            phi::funcs::CreateMatrixDescriptor(tmp_dim, 0, false);
-        blas.MatMul(*ins[0], mat_dim_a, tmp_out, mat_dim_tmp, scale, out, T(0));
-      }
-    } else {
-      std::vector<framework::Tensor> results;
-      const auto tmp = MultiDotMatChainOrder<DeviceContext, T>(
-          ctx, ins, ins_dims, false, &results);
-      auto out_dim = out->dims();
-      *out = tmp;
-      out->Resize(out_dim);
-    }
-  }
-};
-
 class MultiDotOpGrad : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
@@ -341,180 +140,6 @@ class MultiDotOpGrad : public framework::OperatorWithKernel {
   }
 };
 
-template <typename DeviceContext, typename T>
-class MultiDotGradKernel : public framework::OpKernel<T> {
- public:
-  /**
-   * @brief calculate dA and dB
-   * dA = dout * transpose(B)
-   * dB = transpose(A) * dout
-   */
-  void CalcGrad(const framework::ExecutionContext& ctx,
-                const framework::Tensor& dout, const framework::Tensor& A,
-                const framework::Tensor& B, const framework::DDim& dout_dim,
-                const framework::DDim& a_dim, const framework::DDim& b_dim,
-                framework::Tensor* dA, framework::Tensor* dB) const {
-    auto mat_dim_dout = phi::funcs::CreateMatrixDescriptor(dout_dim, 0, false);
-    auto mat_dim_a = phi::funcs::CreateMatrixDescriptor(a_dim, 0, true);
-    auto mat_dim_b = phi::funcs::CreateMatrixDescriptor(b_dim, 0, true);
-    T alpha = static_cast<T>(1.0);
-    auto blas = phi::funcs::GetBlas<DeviceContext, T>(ctx);
-    blas.MatMul(A, mat_dim_a, dout, mat_dim_dout, alpha, dB, T(0));
-    blas.MatMul(dout, mat_dim_dout, B, mat_dim_b, alpha, dA, T(0));
-  }
-
-  /**
-   * @brief calculate multi matrix multiplication grad by a chain order
-   * @param
-   * dout: the grad of multi matrix multiplication out
-   * dx: the out grad of inputs
-   * ins: the input tensors
-   * ins_dims: the shape of ins after reshape
-   * order: the optimal order
-   * i: the left of sub chain
-   * j: the righe of sub chain
-   * results: the intermediate result of farward
-   */
-  void MatChainMulGrad(const framework::ExecutionContext& ctx,
-                       const framework::Tensor& dout,
-                       std::vector<framework::Tensor*>* dx,
-                       const std::vector<const framework::Tensor*>& ins,
-                       const framework::DDim& dout_dim,
-                       const std::vector<framework::DDim>& ins_dims,
-                       const std::vector<uint64_t>& order, const uint64_t i,
-                       const uint64_t j,
-                       const std::vector<framework::Tensor>& results) const {
-    if (i == j) {
-      *((*dx)[i]) = dout;
-      return;
-    }
-
-    const auto n = ins.size();
-    const auto right = order[i * n + j];
-    const auto left = order[i * n + j] + 1;
-    // get the multi result of left sub chain
-    const auto* A = &results[i * n + right];
-    framework::DDim a_dim = A->dims();
-    if (i == right) {
-      A = ins[i];
-      a_dim = ins_dims[i];
-    }
-    // get the multi result of right sub chain
-    const auto* B = &results[left * n + j];
-    framework::DDim b_dim = B->dims();
-    if (left == j) {
-      B = ins[j];
-      b_dim = ins_dims[j];
-    }
-    framework::Tensor dA, dB;
-    dA.Resize({dout_dim[0], b_dim[0]});
-    dB.Resize({a_dim[1], dout_dim[1]});
-    dA.mutable_data<T>(ctx.GetPlace());
-    dB.mutable_data<T>(ctx.GetPlace());
-
-    CalcGrad(ctx, dout, *A, *B, dout_dim, a_dim, b_dim, &dA, &dB);
-    MatChainMulGrad(ctx, dA, dx, ins, dA.dims(), ins_dims, order, i, right,
-                    results);
-    MatChainMulGrad(ctx, dB, dx, ins, dB.dims(), ins_dims, order, left, j,
-                    results);
-  }
-
-  void MultiDotGradMatChainOrder(
-      const framework::ExecutionContext& ctx, const framework::Tensor& dout,
-      const std::vector<const framework::Tensor*>& ins,
-      const framework::DDim& dout_dim,
-      const std::vector<framework::DDim>& ins_dims,
-      std::vector<framework::Tensor*>* dx) const {
-    auto order = GetOrder(ins, ins_dims);
-    auto n = ins.size();
-    std::vector<framework::Tensor> results(n * n);
-    MatChainMul<DeviceContext, T>(ctx, ins, ins_dims, order, 0, n - 1, true,
-                                  &results);
-    MatChainMulGrad(ctx, dout, dx, ins, dout_dim, ins_dims, order, 0, n - 1,
-                    results);
-  }
-
-  void Compute(const framework::ExecutionContext& ctx) const {
-    auto ins = ctx.MultiInput<framework::Tensor>("X");
-    auto dout = *ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
-    auto dx = ctx.MultiOutput<framework::Tensor>(framework::GradVarName("X"));
-
-    auto blas = phi::funcs::GetBlas<DeviceContext, T>(ctx);
-    auto place = ctx.GetPlace();
-
-    const auto n = ins.size();
-    for (size_t i = 0; i < n; i++) {
-      dx[i]->mutable_data<T>(place);
-    }
-
-    std::vector<framework::DDim> ins_dims(n);
-    GetDims(ins, &ins_dims);
-
-    framework::DDim dout_dim = dout.dims();
-    if (ins[0]->dims().size() == 1 && ins[n - 1]->dims().size() == 1) {
-      dout_dim = phi::make_ddim({1, 1});
-    } else if (ins[0]->dims().size() == 1) {
-      if (dout_dim.size() == 1) {
-        dout_dim = phi::make_ddim({1, dout_dim[0]});
-      }
-    } else if (ins[n - 1]->dims().size() == 1) {
-      if (dout_dim.size() == 1) {
-        dout_dim = phi::make_ddim({dout_dim[0], 1});
-      }
-    }
-
-    T alpha = static_cast<T>(1);
-    auto mat_dim_dout = phi::funcs::CreateMatrixDescriptor(dout_dim, 0, false);
-    if (n == 2) {
-      CalcGrad(ctx, dout, *ins[0], *ins[1], dout_dim, ins_dims[0], ins_dims[1],
-               dx[0], dx[1]);
-    } else if (n == 3) {
-      const auto Ma = ins_dims[0][0];
-      const auto Ka = ins_dims[0][1];
-      const auto Nb = ins_dims[1][1];
-      const auto Nc = ins_dims[2][1];
-      const uint64_t cost1 = Ma * Nb * (Ka + Nc);
-      const uint64_t cost2 = Ka * Nc * (Nb + Ma);
-      auto mat_dim_a =
-          phi::funcs::CreateMatrixDescriptor(ins_dims[0], 0, false);
-      auto mat_dim_b =
-          phi::funcs::CreateMatrixDescriptor(ins_dims[1], 0, false);
-      auto mat_dim_c =
-          phi::funcs::CreateMatrixDescriptor(ins_dims[2], 0, false);
-      if (cost1 < cost2) {
-        framework::Tensor tmp_out, tmp_dout;
-        tmp_out.Resize({Ma, Nb});
-        tmp_out.mutable_data<T>(place);
-        tmp_dout.Resize({mat_dim_dout.height_, Nb});
-        tmp_dout.mutable_data<T>(place);
-        blas.MatMul(*ins[0], mat_dim_a, *ins[1], mat_dim_b, alpha, &tmp_out,
-                    T(0));
-        CalcGrad(ctx, dout, tmp_out, *ins[2], dout_dim, tmp_out.dims(),
-                 ins_dims[2], &tmp_dout, dx[2]);
-        CalcGrad(ctx, tmp_dout, *ins[0], *ins[1], tmp_dout.dims(), ins_dims[0],
-                 ins_dims[1], dx[0], dx[1]);
-      } else {
-        framework::Tensor tmp_out, tmp_dout;
-        tmp_out.Resize({Ka, Nc});
-        tmp_out.mutable_data<T>(place);
-        tmp_dout.Resize({Ka, mat_dim_dout.width_});
-        tmp_dout.mutable_data<T>(place);
-        blas.MatMul(*ins[1], mat_dim_b, *ins[2], mat_dim_c, alpha, &tmp_out,
-                    T(0));
-        CalcGrad(ctx, dout, *ins[0], tmp_out, dout_dim, ins_dims[0],
-                 tmp_dout.dims(), dx[0], &tmp_dout);
-        CalcGrad(ctx, tmp_dout, *ins[1], *ins[2], tmp_dout.dims(), ins_dims[1],
-                 ins_dims[2], dx[1], dx[2]);
-      }
-    } else {
-      MultiDotGradMatChainOrder(ctx, dout, ins, dout_dim, ins_dims, &dx);
-      if (ins[n - 1]->dims().size() == 1) {
-        dx[n - 1]->Resize({dx[n - 1]->dims()[0]});
-      }
-    }
-  }
-};
-
 template <typename T>
 class MultiDotOpGradMaker : public framework::SingleGradOpMaker<T> {
  public:
@@ -552,25 +177,3 @@ REGISTER_OPERATOR(multi_dot, ops::MultiDotOp, ops::MultiDotOpMaker,
 REGISTER_OPERATOR(multi_dot_grad, ops::MultiDotOpGrad,
                   ops::MultiDotOpDoubleGradMaker<paddle::framework::OpDesc>,
                   ops::MultiDotOpDoubleGradMaker<paddle::imperative::OpBase>);
-
-REGISTER_OP_CPU_KERNEL(
-    multi_dot, ops::MultiDotKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::MultiDotKernel<paddle::platform::CPUDeviceContext, float>);
-REGISTER_OP_CPU_KERNEL(
-    multi_dot_grad,
-    ops::MultiDotGradKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::MultiDotGradKernel<paddle::platform::CPUDeviceContext, float>);
-
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-REGISTER_OP_CUDA_KERNEL(
-    multi_dot, ops::MultiDotKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::MultiDotKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::MultiDotKernel<paddle::platform::CUDADeviceContext,
-                        paddle::platform::float16>);
-REGISTER_OP_CUDA_KERNEL(
-    multi_dot_grad,
-    ops::MultiDotGradKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::MultiDotGradKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::MultiDotGradKernel<paddle::platform::CUDADeviceContext,
-                            paddle::platform::float16>);
-#endif
diff --git a/paddle/fluid/operators/multinomial_op.cc b/paddle/fluid/operators/multinomial_op.cc
index 1143f9cb37aa54bea430d3a8bca8b62b02da4e2b..0113f638b9a47d161c890a0f547f8680af4018e7 100644
--- a/paddle/fluid/operators/multinomial_op.cc
+++ b/paddle/fluid/operators/multinomial_op.cc
@@ -53,8 +53,8 @@ class MultinomialOp : public framework::OperatorWithKernel {
 
 namespace ops = paddle::operators;
 namespace plat = paddle::platform;
-DELCARE_INFER_SHAPE_FUNCTOR(multinomial, MultinomialInferShapeFunctor,
-                            PT_INFER_META(phi::MultinomialInferMeta));
+DECLARE_INFER_SHAPE_FUNCTOR(multinomial, MultinomialInferShapeFunctor,
+                            PD_INFER_META(phi::MultinomialInferMeta));
 REGISTER_OPERATOR(
     multinomial, ops::MultinomialOp, ops::MultinomialOpMaker,
     paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
diff --git a/paddle/fluid/operators/mv_op.cc b/paddle/fluid/operators/mv_op.cc
index ab9f10070fc60deab8974ae0e81e2b4c6cef2ffd..bf7222fc45c66085473eae627abe97b8a41d4268 100644
--- a/paddle/fluid/operators/mv_op.cc
+++ b/paddle/fluid/operators/mv_op.cc
@@ -16,8 +16,11 @@ limitations under the License. */
 #include <utility>
 #include <vector>
 
+#include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/op_version_registry.h"
+#include "paddle/phi/core/infermeta_utils.h"
+#include "paddle/phi/infermeta/binary.h"
 
 namespace paddle {
 namespace operators {
@@ -42,33 +45,6 @@ class MVOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
  protected:
-  void InferShape(framework::InferShapeContext *context) const override {
-    OP_INOUT_CHECK(context->HasInput("X"), "Input", "X", "mv");
-    OP_INOUT_CHECK(context->HasInput("Vec"), "Input", "Vec", "mv");
-    OP_INOUT_CHECK(context->HasOutput("Out"), "Output", "Out", "mv");
-
-    auto dim_x = context->GetInputDim("X");
-    auto dim_vec = context->GetInputDim("Vec");
-    PADDLE_ENFORCE_EQ(
-        dim_x.size(), 2,
-        platform::errors::InvalidArgument(
-            "The rank of input X should be 2, but is %d", dim_x.size()));
-    PADDLE_ENFORCE_EQ(
-        dim_vec.size(), 1,
-        platform::errors::InvalidArgument(
-            "The rank of input Vec should be 1, but is %d", dim_vec.size()));
-    PADDLE_ENFORCE_EQ(dim_x[1], dim_vec[0],
-                      platform::errors::InvalidArgument(
-                          "X's second dimension is expected to be equal to "
-                          "Vec's first dimension"
-                          "but recieved X'shape = [%s], Vec's shape = [%s]",
-                          dim_x, dim_vec));
-
-    framework::DDim dim_out = phi::make_ddim({dim_x[0]});
-
-    context->SetOutputDim("Out", dim_out);
-    context->ShareLoD("X", /*->*/ "Out");
-  }
 };
 
 template <typename T>
@@ -118,7 +94,11 @@ class MVOpGrad : public framework::OperatorWithKernel {
 namespace ops = paddle::operators;
 namespace plat = paddle::platform;
 
+DECLARE_INFER_SHAPE_FUNCTOR(mv, MvInferShapeFunctor,
+                            PD_INFER_META(phi::MvInferMeta));
+
 REGISTER_OPERATOR(mv, ops::MVOp, ops::MVOpMaker,
                   ops::MVOpGradMaker<paddle::framework::OpDesc>,
-                  ops::MVOpGradMaker<paddle::imperative::OpBase>);
+                  ops::MVOpGradMaker<paddle::imperative::OpBase>,
+                  MvInferShapeFunctor);
 REGISTER_OPERATOR(mv_grad, ops::MVOpGrad);
diff --git a/paddle/fluid/operators/nll_loss_op.cc b/paddle/fluid/operators/nll_loss_op.cc
index f510c7bebec876d034c1af923a4f7077c096000c..a4e1f7b3091a9f692e479300310333bfdd359096 100644
--- a/paddle/fluid/operators/nll_loss_op.cc
+++ b/paddle/fluid/operators/nll_loss_op.cc
@@ -12,9 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/nll_loss_op.h"
 #include <memory>
 #include <string>
+#include "paddle/fluid/framework/infershape_utils.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/phi/infermeta/ternary.h"
 
 namespace paddle {
 namespace operators {
@@ -23,77 +25,6 @@ class NLLLossOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "NLLLoss");
-    OP_INOUT_CHECK(ctx->HasInput("Label"), "Input", "Label", "NLLLoss");
-    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "NLLLoss");
-    OP_INOUT_CHECK(ctx->HasOutput("Total_weight"), "Output", "Total_weight",
-                   "NLLLoss");
-
-    auto x_dims = ctx->GetInputDim("X");
-    auto label_dims = ctx->GetInputDim("Label");
-    auto reduction = ctx->Attrs().Get<std::string>("reduction");
-
-    PADDLE_ENFORCE_EQ(x_dims.size() == 2 || x_dims.size() == 4, true,
-                      platform::errors::InvalidArgument(
-                          "The tensor rank of Input(X) must be 2 or 4."));
-    bool contain_unknown_dim = phi::contain_unknown_dim(x_dims) ||
-                               phi::contain_unknown_dim(label_dims);
-    bool check = ctx->IsRuntime() || !contain_unknown_dim;
-    if (check) {
-      PADDLE_ENFORCE_EQ(
-          x_dims[0], label_dims[0],
-          platform::errors::InvalidArgument(
-              "ShapeError: Expected input batch_size to match label batch_size,"
-              "But received: the Input(x) batch_size is [%s], the Input(label) "
-              " batch_size is [%s].",
-              x_dims[0], label_dims[0]));
-      if (ctx->HasInput("Weight")) {
-        auto w_dims = ctx->GetInputDim("Weight");
-        PADDLE_ENFORCE_EQ(w_dims.size(), 1,
-                          platform::errors::InvalidArgument(
-                              "Input(Weight) should be a 1D tensor."));
-        PADDLE_ENFORCE_EQ(
-            x_dims[1], w_dims[0],
-            platform::errors::InvalidArgument(
-                "Expected input tensor Weight's size should equal "
-                "to the first dimension of the input tensor X. But received "
-                "Weight's "
-                "size is %d, the first dimension of input X is %d",
-                w_dims[0], x_dims[1]));
-      }
-    }
-    if (x_dims.size() == 2) {
-      if (reduction == "none") {
-        ctx->SetOutputDim("Out", {x_dims[0]});
-      } else {
-        ctx->SetOutputDim("Out", {1});
-      }
-    } else if (x_dims.size() == 4) {
-      PADDLE_ENFORCE_EQ(label_dims.size(), 3,
-                        platform::errors::InvalidArgument(
-                            "Expected Input(Lable) dimensions=3, received %d.",
-                            label_dims.size()));
-      auto input0 = x_dims[0];
-      auto input2 = x_dims[2];
-      auto input3 = x_dims[3];
-      auto label0 = label_dims[0];
-      auto label1 = label_dims[1];
-      auto label2 = label_dims[2];
-      PADDLE_ENFORCE_EQ(
-          input0 == label0 && input2 == label1 && input3 == label2, true,
-          platform::errors::InvalidArgument("Input(X) tensor shape should "
-                                            "match to Input(Label) tensor "
-                                            "shape."));
-      if (reduction == "none") {
-        ctx->SetOutputDim("Out", {x_dims[0], x_dims[2], x_dims[3]});
-      } else {
-        ctx->SetOutputDim("Out", {1});
-      }
-    }
-    ctx->SetOutputDim("Total_weight", {1});
-  }
-
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
@@ -259,15 +190,11 @@ class NLLLossGradMaker : public framework::SingleGradOpMaker<T> {
 }  // namespace operators
 }  // namespace paddle
 
+DECLARE_INFER_SHAPE_FUNCTOR(nll_loss, NllLossRawInferShapeFunctor,
+                            PD_INFER_META(phi::NllLossRawInferMeta));
 namespace ops = paddle::operators;
 REGISTER_OPERATOR(nll_loss, ops::NLLLossOp, ops::NLLLossOpMaker,
                   ops::NLLLossGradMaker<paddle::framework::OpDesc>,
-                  ops::NLLLossGradMaker<paddle::imperative::OpBase>);
+                  ops::NLLLossGradMaker<paddle::imperative::OpBase>,
+                  NllLossRawInferShapeFunctor);
 REGISTER_OPERATOR(nll_loss_grad, ops::NLLLossGradOp);
-REGISTER_OP_CPU_KERNEL(
-    nll_loss, ops::NLLLossOpKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::NLLLossOpKernel<paddle::platform::CPUDeviceContext, double>);
-REGISTER_OP_CPU_KERNEL(
-    nll_loss_grad,
-    ops::NLLLossGradOpKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::NLLLossGradOpKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/nll_loss_op.h b/paddle/fluid/operators/nll_loss_op.h
deleted file mode 100644
index be6f4422d4ac6a475477c025c4b76eabdbf4f9e0..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/nll_loss_op.h
+++ /dev/null
@@ -1,306 +0,0 @@
-/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <memory>
-#include <string>
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-
-template <typename T>
-static void nll_loss_1D(T* out_data, T* total_weight_data, const T* x_data,
-                        const int64_t* label_data, const T* weight_data,
-                        const int64_t batch_size, const int64_t n_classes,
-                        const std::string reduction,
-                        const int64_t ignore_index) {
-  if (reduction == "none") {
-    for (int64_t i = 0; i < batch_size; ++i) {
-      const auto cur_label = label_data[i];
-      if (cur_label == ignore_index) {
-        out_data[i] = 0;
-        continue;
-      }
-      PADDLE_ENFORCE_EQ(cur_label >= 0 && cur_label < n_classes, true,
-                        platform::errors::InvalidArgument(
-                            "Label value is out of range. "
-                            "Expected label value in range of [0, %d), but "
-                            "received value is %d.",
-                            n_classes, cur_label));
-
-      const auto cur_weight =
-          weight_data ? weight_data[cur_label] : static_cast<T>(1);
-      out_data[i] = -x_data[i * n_classes + cur_label] * cur_weight;
-    }
-    return;
-  }
-
-  T output_val = 0;
-  T total_weight_val = 0;
-
-  for (int64_t i = 0; i < batch_size; i++) {
-    const auto cur_label = label_data[i];
-    if (cur_label == ignore_index) {
-      out_data[i] = 0;
-      continue;
-    }
-    PADDLE_ENFORCE_EQ(cur_label >= 0 && cur_label < n_classes, true,
-                      platform::errors::InvalidArgument(
-                          "label should not be out of bounds."));
-
-    const auto cur_weight =
-        weight_data ? weight_data[cur_label] : static_cast<T>(1);
-    total_weight_val += cur_weight;
-    output_val -= x_data[i * n_classes + cur_label] * cur_weight;
-  }
-  if (reduction == "mean" && total_weight_val != 0) {
-    output_val /= total_weight_val;
-  }
-  *out_data = output_val;
-  *total_weight_data = total_weight_val;
-}
-
-template <typename T>
-static void nll_loss_2D(T* out_data, T* total_weight_data, const T* x_data,
-                        const int64_t* label_data, const T* weight_data,
-                        const int64_t batch_size, const int64_t n_classes,
-                        const int64_t in_dim2, const int64_t in_dim3,
-                        const std::string reduction,
-                        const int64_t ignore_index) {
-  const auto map_size = in_dim2 * in_dim3;
-  const auto sample_size = n_classes * map_size;
-  if (reduction == "none") {
-    for (int i = 0; i < batch_size; i++) {
-      for (int h = 0; h < in_dim2; h++) {
-        for (int w = 0; w < in_dim3; w++) {
-          const auto index = i * map_size + h * in_dim3 + w;
-          const auto cur_label = label_data[index];
-          if (cur_label == ignore_index) {
-            out_data[index] = 0;
-            continue;
-          }
-          PADDLE_ENFORCE_EQ(cur_label >= 0 && cur_label < n_classes, true,
-                            platform::errors::InvalidArgument(
-                                "label should not be out of bounds."));
-          const auto cur_weight =
-              weight_data ? weight_data[cur_label] : static_cast<T>(1);
-          out_data[index] = -x_data[i * sample_size + cur_label * map_size +
-                                    h * in_dim3 + w] *
-                            cur_weight;
-        }
-      }
-    }
-    return;
-  }
-
-  T output_val = 0;
-  T total_weight_val = 0;
-
-  for (int i = 0; i < batch_size; i++) {
-    for (int h = 0; h < in_dim2; h++) {
-      for (int w = 0; w < in_dim3; w++) {
-        const auto index = i * map_size + h * in_dim3 + w;
-        const auto cur_label = label_data[index];
-        if (cur_label == ignore_index) {
-          out_data[index] = 0;
-          continue;
-        }
-        PADDLE_ENFORCE_EQ(cur_label >= 0 && cur_label < n_classes, true,
-                          platform::errors::InvalidArgument(
-                              "label should not be out of bounds."));
-        const auto cur_weight =
-            weight_data ? weight_data[cur_label] : static_cast<T>(1);
-        total_weight_val += cur_weight;
-        output_val -=
-            x_data[i * sample_size + cur_label * map_size + h * in_dim3 + w] *
-            cur_weight;
-      }
-    }
-  }
-
-  if (reduction == "mean" && total_weight_val != 0) {
-    output_val /= total_weight_val;
-  }
-  *out_data = output_val;
-  *total_weight_data = total_weight_val;
-}
-
-template <typename DeviceContext, typename T>
-class NLLLossOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* x = ctx.Input<Tensor>("X");
-    auto* labels = ctx.Input<Tensor>("Label");
-    auto* weight = ctx.Input<Tensor>("Weight");
-    auto* out = ctx.Output<Tensor>("Out");
-    auto* total_weight = ctx.Output<Tensor>("Total_weight");
-    auto reduction = ctx.Attr<std::string>("reduction");
-    auto ignore_index = ctx.Attr<int64_t>("ignore_index");
-
-    auto x_data = x->data<T>();
-    auto label_data = labels->data<int64_t>();
-    auto weight_data = weight ? weight->data<T>() : nullptr;
-    auto out_data = out->mutable_data<T>(ctx.GetPlace());
-    auto total_weight_data = total_weight->mutable_data<T>(ctx.GetPlace());
-    *total_weight_data = 0;
-
-    auto x_dims = x->dims();
-    const auto batch_size = x_dims[0];
-    const auto n_classes = x_dims[1];
-
-    if (x_dims.size() == 2) {
-      nll_loss_1D<T>(out_data, total_weight_data, x_data, label_data,
-                     weight_data, batch_size, n_classes, reduction,
-                     ignore_index);
-    } else if (x_dims.size() == 4) {
-      const auto in_dim2 = x_dims[2];
-      const auto in_dim3 = x_dims[3];
-      nll_loss_2D<T>(out_data, total_weight_data, x_data, label_data,
-                     weight_data, batch_size, n_classes, in_dim2, in_dim3,
-                     reduction, ignore_index);
-    }
-  }
-};
-
-template <typename T>
-static void nll_loss_grad_1D(T* dx_data, const T* dout_data,
-                             const int64_t* label_data, const T* weight_data,
-                             const T* total_weight_data,
-                             const int64_t batch_size, const int64_t n_classes,
-                             const std::string reduction,
-                             const int64_t ignore_index) {
-  if (reduction == "none") {
-    for (int i = 0; i < batch_size; i++) {
-      const auto cur_label = label_data[i];
-      if (cur_label == ignore_index) {
-        continue;
-      }
-      const auto cur_weight =
-          weight_data ? weight_data[cur_label] : static_cast<T>(1);
-      dx_data[i * n_classes + cur_label] = -dout_data[i] * cur_weight;
-    }
-    return;
-  }
-
-  const T dout_val = *dout_data;
-  const T total_weight_val = *total_weight_data;
-  for (int i = 0; i < batch_size; i++) {
-    const auto cur_label = label_data[i];
-    if (cur_label == ignore_index) {
-      continue;
-    }
-    const auto cur_weight =
-        weight_data ? weight_data[cur_label] : static_cast<T>(1);
-    dx_data[i * n_classes + cur_label] = -dout_val * cur_weight;
-    if (reduction == "mean") {
-      dx_data[i * n_classes + cur_label] /= total_weight_val;
-    }
-  }
-}
-
-template <typename T>
-static void nll_loss_grad_2D(T* dx_data, const T* dout_data,
-                             const int64_t* label_data, const T* weight_data,
-                             const T* total_weight_data,
-                             const int64_t batch_size, const int64_t n_classes,
-                             const int64_t in_dim2, const int64_t in_dim3,
-                             const std::string reduction,
-                             const int64_t ignore_index) {
-  const auto map_size = in_dim2 * in_dim3;
-  const auto sample_size = n_classes * map_size;
-
-  if (reduction == "none") {
-    for (int i = 0; i < batch_size; i++) {
-      for (int h = 0; h < in_dim2; h++) {
-        for (int w = 0; w < in_dim3; w++) {
-          const auto index = i * map_size + h * in_dim3 + w;
-          const auto cur_label = label_data[index];
-          if (cur_label == ignore_index) {
-            continue;
-          }
-          const auto cur_weight =
-              weight_data ? weight_data[cur_label] : static_cast<T>(1);
-          dx_data[i * sample_size + cur_label * map_size + h * in_dim3 + w] =
-              -cur_weight * dout_data[index];
-        }
-      }
-    }
-    return;
-  }
-
-  const T dout_val = *dout_data;
-  const T total_weight_val = *total_weight_data;
-  for (int i = 0; i < batch_size; i++) {
-    for (int h = 0; h < in_dim2; h++) {
-      for (int w = 0; w < in_dim3; w++) {
-        const auto index = i * map_size + h * in_dim3 + w;
-        const auto cur_label = label_data[index];
-        if (cur_label == ignore_index) {
-          continue;
-        }
-        const auto cur_weight =
-            weight_data ? weight_data[cur_label] : static_cast<T>(1);
-        const auto dx_index =
-            i * sample_size + cur_label * map_size + h * in_dim3 + w;
-        dx_data[dx_index] = -dout_val * cur_weight;
-        if (reduction == "mean") {
-          dx_data[dx_index] /= total_weight_val;
-        }
-      }
-    }
-  }
-}
-
-template <typename DeviceContext, typename T>
-class NLLLossGradOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* x = ctx.Input<Tensor>("X");
-    auto* labels = ctx.Input<Tensor>("Label");
-    auto* weight = ctx.Input<Tensor>("Weight");
-    auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
-    auto* total_weight = ctx.Input<Tensor>("Total_weight");
-    auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
-    auto ignore_index = ctx.Attr<int64_t>("ignore_index");
-    auto reduction = ctx.Attr<std::string>("reduction");
-
-    auto dx_data = dx->mutable_data<T>(ctx.GetPlace());
-    auto dout_data = dout->data<T>();
-    auto label_data = labels->data<int64_t>();
-    auto weight_data = weight ? weight->data<T>() : nullptr;
-    auto total_weight_data = total_weight->data<T>();
-    memset(dx_data, 0, dx->numel() * sizeof(T));
-
-    const auto x_dims = x->dims();
-    const auto batch_size = x_dims[0];
-    const auto n_classes = x_dims[1];
-
-    if (x_dims.size() == 2) {
-      nll_loss_grad_1D(dx_data, dout_data, label_data, weight_data,
-                       total_weight_data, batch_size, n_classes, reduction,
-                       ignore_index);
-    } else if (x_dims.size() == 4) {
-      const auto in_dim2 = x_dims[2];
-      const auto in_dim3 = x_dims[3];
-      nll_loss_grad_2D(dx_data, dout_data, label_data, weight_data,
-                       total_weight_data, batch_size, n_classes, in_dim2,
-                       in_dim3, reduction, ignore_index);
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/norm_utils.cu.h b/paddle/fluid/operators/norm_utils.cu.h
index c400a8f4239a605414bf0d99a6a89b0ddae6c535..0ed1f2719de25bd2c138c23dd69b914a66961464 100644
--- a/paddle/fluid/operators/norm_utils.cu.h
+++ b/paddle/fluid/operators/norm_utils.cu.h
@@ -389,11 +389,12 @@ __global__ void DoubleGradComputeDDYWithGlobal(
 }
 
 template <typename DeviceContext, typename T>
-void NormDoubleGradFunctor(const framework::ExecutionContext &ctx,
+void NormDoubleGradFunctor(const DeviceContext &ctx,
                            const DataLayout data_layout, const Tensor *X,
                            const Tensor *Scale, const Tensor *dY,
                            const Tensor *Saved_mean,
-                           const Tensor *Saved_variance, const double epsilon,
+                           const Tensor *Saved_variance, const Tensor *Mean,
+                           const Tensor *Variance, const double epsilon,
                            const bool use_global_stats, const Tensor *ddX,
                            const Tensor *ddScale, const Tensor *ddBias,
                            Tensor *dX, Tensor *dScale, Tensor *ddY) {
@@ -404,8 +405,7 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx,
   const T *ddscale_data = (ddScale == nullptr ? nullptr : ddScale->data<T>());
   const T *ddbias_data = (ddBias == nullptr ? nullptr : ddBias->data<T>());
 
-  auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
-  phi::funcs::SetConstant<platform::CUDADeviceContext, T> set_constant;
+  phi::funcs::SetConstant<DeviceContext, T> set_constant;
 
   auto &x_dims = X->dims();
   const int C = (data_layout == DataLayout::kNCHW ? x_dims[1]
@@ -416,7 +416,7 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx,
   Tensor scale_tmp;
   if (!Scale) {
     scale_tmp.mutable_data<T>({C}, ctx.GetPlace());
-    set_constant(dev_ctx, &scale_tmp, static_cast<T>(1));
+    set_constant(ctx, &scale_tmp, static_cast<T>(1));
   }
   const T *scale_data = Scale ? Scale->data<T>() : scale_tmp.data<T>();
 #ifdef __HIPCC__
@@ -424,15 +424,15 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx,
 #else
   const int block = 512;
 #endif
-  int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
+  int max_threads = ctx.GetMaxPhysicalThreadCount();
   const int max_blocks = std::max(max_threads / block, 1);
   int grid = std::min(C, max_blocks);
   int grid1 = (num + block - 1) / block;
 
   const T *mean_data, *variance_data;
   if (use_global_stats) {
-    const auto *running_mean = ctx.Input<Tensor>("Mean");
-    const auto *running_var = ctx.Input<Tensor>("Variance");
+    const auto *running_mean = Mean;
+    const auto *running_var = Variance;
     const auto *running_mean_data = running_mean->template data<T>();
     const auto *running_var_data = running_var->template data<T>();
     mean_data = running_mean_data;
@@ -440,34 +440,35 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx,
   } else {
     const T *smean_data = Saved_mean->data<T>();
     const T *svariance_data = Saved_variance->data<T>();
+
     mean_data = smean_data;
     variance_data = svariance_data;
   }
 
   if (dX) {
     T *dx_data = dX->mutable_data<T>(ctx.GetPlace());
-    set_constant(dev_ctx, dX, static_cast<T>(0));
+    set_constant(ctx, dX, static_cast<T>(0));
     if (use_global_stats) {
       if (data_layout == DataLayout::kNHWC) {
         DoubleGradComputeDXWithGlobal<
-            T, DataLayout::kNHWC><<<grid1, block, 0, dev_ctx.stream()>>>(
+            T, DataLayout::kNHWC><<<grid1, block, 0, ctx.stream()>>>(
             dy_data, ddscale_data, variance_data, epsilon, C, sample_size, num,
             dx_data);
       } else {
         DoubleGradComputeDXWithGlobal<
-            T, DataLayout::kNCHW><<<grid1, block, 0, dev_ctx.stream()>>>(
+            T, DataLayout::kNCHW><<<grid1, block, 0, ctx.stream()>>>(
             dy_data, ddscale_data, variance_data, epsilon, C, sample_size, num,
             dx_data);
       }
     } else {
       if (data_layout == DataLayout::kNHWC) {
         DoubleGradComputeDX<
-            T, block, DataLayout::kNHWC><<<grid, block, 0, dev_ctx.stream()>>>(
+            T, block, DataLayout::kNHWC><<<grid, block, 0, ctx.stream()>>>(
             x_data, mean_data, variance_data, ddx_data, dy_data, scale_data,
             ddscale_data, N, C, sample_size, epsilon, dx_data);
       } else {
         DoubleGradComputeDX<
-            T, block, DataLayout::kNCHW><<<grid, block, 0, dev_ctx.stream()>>>(
+            T, block, DataLayout::kNCHW><<<grid, block, 0, ctx.stream()>>>(
             x_data, mean_data, variance_data, ddx_data, dy_data, scale_data,
             ddscale_data, N, C, sample_size, epsilon, dx_data);
       }
@@ -475,28 +476,28 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx,
   }
   if (dScale) {
     T *dscale_data = dScale->mutable_data<T>(ctx.GetPlace());
-    set_constant(dev_ctx, dScale, static_cast<T>(0));
+    set_constant(ctx, dScale, static_cast<T>(0));
     if (use_global_stats) {
       if (data_layout == DataLayout::kNHWC) {
         DoubleGradComputeDScaleWithGlobal<
-            T, block, DataLayout::kNHWC><<<grid, block, 0, dev_ctx.stream()>>>(
+            T, block, DataLayout::kNHWC><<<grid, block, 0, ctx.stream()>>>(
             ddx_data, variance_data, dy_data, epsilon, N, C, sample_size,
             dscale_data);
       } else {
         DoubleGradComputeDScaleWithGlobal<
-            T, block, DataLayout::kNCHW><<<grid, block, 0, dev_ctx.stream()>>>(
+            T, block, DataLayout::kNCHW><<<grid, block, 0, ctx.stream()>>>(
             ddx_data, variance_data, dy_data, epsilon, N, C, sample_size,
             dscale_data);
       }
     } else {
       if (data_layout == DataLayout::kNHWC) {
         DoubleGradComputeDScale<
-            T, block, DataLayout::kNHWC><<<grid, block, 0, dev_ctx.stream()>>>(
+            T, block, DataLayout::kNHWC><<<grid, block, 0, ctx.stream()>>>(
             x_data, mean_data, variance_data, ddx_data, dy_data, N, C,
             sample_size, epsilon, dscale_data);
       } else {
         DoubleGradComputeDScale<
-            T, block, DataLayout::kNCHW><<<grid, block, 0, dev_ctx.stream()>>>(
+            T, block, DataLayout::kNCHW><<<grid, block, 0, ctx.stream()>>>(
             x_data, mean_data, variance_data, ddx_data, dy_data, N, C,
             sample_size, epsilon, dscale_data);
       }
@@ -504,28 +505,28 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx,
   }
   if (ddY) {
     T *ddy_data = ddY->mutable_data<T>(ctx.GetPlace());
-    set_constant(dev_ctx, ddY, static_cast<T>(0));
+    set_constant(ctx, ddY, static_cast<T>(0));
     if (use_global_stats) {
       if (data_layout == DataLayout::kNHWC) {
         DoubleGradComputeDDYWithGlobal<
-            T, DataLayout::kNHWC><<<grid1, block, 0, dev_ctx.stream()>>>(
+            T, DataLayout::kNHWC><<<grid1, block, 0, ctx.stream()>>>(
             ddx_data, scale_data, mean_data, variance_data, x_data, ddbias_data,
             ddscale_data, epsilon, C, sample_size, num, ddy_data);
       } else {
         DoubleGradComputeDDYWithGlobal<
-            T, DataLayout::kNCHW><<<grid1, block, 0, dev_ctx.stream()>>>(
+            T, DataLayout::kNCHW><<<grid1, block, 0, ctx.stream()>>>(
             ddx_data, scale_data, mean_data, variance_data, x_data, ddbias_data,
             ddscale_data, epsilon, C, sample_size, num, ddy_data);
       }
     } else {
       if (data_layout == DataLayout::kNHWC) {
         DoubleGradComputeDDY<
-            T, block, DataLayout::kNHWC><<<grid, block, 0, dev_ctx.stream()>>>(
+            T, block, DataLayout::kNHWC><<<grid, block, 0, ctx.stream()>>>(
             x_data, mean_data, variance_data, ddscale_data, ddbias_data,
             ddx_data, scale_data, N, C, sample_size, epsilon, ddy_data);
       } else {
         DoubleGradComputeDDY<
-            T, block, DataLayout::kNCHW><<<grid, block, 0, dev_ctx.stream()>>>(
+            T, block, DataLayout::kNCHW><<<grid, block, 0, ctx.stream()>>>(
             x_data, mean_data, variance_data, ddscale_data, ddbias_data,
             ddx_data, scale_data, N, C, sample_size, epsilon, ddy_data);
       }
diff --git a/paddle/fluid/operators/one_hot_v2_op.cc b/paddle/fluid/operators/one_hot_v2_op.cc
index e212f4e7e2b7d1ad7964cc9351f1c4e241d5a79e..122b6a8a80aac95ab98ad95ed3e6339684978d12 100644
--- a/paddle/fluid/operators/one_hot_v2_op.cc
+++ b/paddle/fluid/operators/one_hot_v2_op.cc
@@ -12,9 +12,13 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/operators/one_hot_v2_op.h"
 #include <string>
 #include <vector>
+#include "paddle/fluid/framework/infershape_utils.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/phi/core/infermeta_utils.h"
+#include "paddle/phi/infermeta/backward.h"
+#include "paddle/phi/infermeta/unary.h"
 
 namespace paddle {
 namespace operators {
@@ -22,26 +26,6 @@ namespace operators {
 class OneHotV2Op : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "one_hot_v2");
-    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "one_hot_v2");
-
-    auto x_dims = ctx->GetInputDim("X");
-    PADDLE_ENFORCE_GE(x_dims.size(), 1,
-                      platform::errors::InvalidArgument(
-                          "Rank of Input(X) should be at least 1."));
-
-    int depth = ctx->Attrs().Get<int>("depth");
-    if (ctx->HasInput("depth_tensor")) {
-      depth = -1;
-    }
-
-    auto out_dims_vec = phi::vectorize(x_dims);
-    out_dims_vec.push_back(depth);
-    auto out_dims = phi::make_ddim(out_dims_vec);
-    ctx->SetOutputDim("Out", out_dims);
-    ctx->ShareLoD("X", /* --> */ "Out");
-  }
 
  protected:
   framework::OpKernelType GetExpectedKernelType(
@@ -52,7 +36,7 @@ class OneHotV2Op : public framework::OperatorWithKernel {
   }
 
   framework::OpKernelType GetKernelTypeForVar(
-      const std::string& var_name, const Tensor& tensor,
+      const std::string& var_name, const framework::Tensor& tensor,
       const framework::OpKernelType& expected_kernel_type) const override {
     if (var_name == "depth_tensor") {
       return expected_kernel_type;
@@ -114,10 +98,12 @@ Out is a LoDTensor:
 }  // namespace paddle
 
 namespace ops = paddle::operators;
+
+DECLARE_INFER_SHAPE_FUNCTOR(one_hot_v2, OneHotInferShapeFunctor,
+                            PD_INFER_META(phi::OneHotRawInferMeta));
+
 REGISTER_OPERATOR(
     one_hot_v2, ops::OneHotV2Op, ops::OneHotV2OpMaker,
     paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
-    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
-REGISTER_OP_CPU_KERNEL(
-    one_hot_v2, ops::OneHotV2Kernel<paddle::platform::CPUDeviceContext, int>,
-    ops::OneHotV2Kernel<paddle::platform::CPUDeviceContext, int64_t>);
+    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
+    OneHotInferShapeFunctor);
diff --git a/paddle/fluid/operators/one_hot_v2_op.cu b/paddle/fluid/operators/one_hot_v2_op.cu
deleted file mode 100644
index 77e2a931e50de5b7775463fc7bbf6262e2ad4a53..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/one_hot_v2_op.cu
+++ /dev/null
@@ -1,100 +0,0 @@
-//   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/operators/one_hot_v2_op.h"
-#include "paddle/fluid/platform/device/gpu/gpu_info.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
-
-namespace paddle {
-namespace operators {
-using platform::PADDLE_CUDA_NUM_THREADS;
-
-template <typename InT, typename OutT>
-__global__ void FillOutputKernel(const InT* p_in_data, OutT* p_out_data,
-                                 const int64_t numel, const int depth) {
-  int idx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (idx < numel && p_in_data[idx] >= 0 && p_in_data[idx] < depth) {
-    *(p_out_data + (idx * depth) + p_in_data[idx]) = 1.0;
-  }
-}
-
-template <typename DeviceContext, typename InT>
-struct OneHotV2OpCUDAFunctor {
-  const framework::LoDTensor* in_;
-  framework::LoDTensor* out_;
-  const DeviceContext& ctx_;
-  int depth_;
-
-  OneHotV2OpCUDAFunctor(const framework::LoDTensor* in,
-                        framework::LoDTensor* out, int depth,
-                        const DeviceContext& ctx)
-      : in_(in), out_(out), depth_(depth), ctx_(ctx) {}
-
-  template <typename OutT>
-  void apply() const {
-    auto* p_in_data = in_->data<InT>();
-    auto numel = in_->numel();
-    auto* p_out_data = out_->mutable_data<OutT>(ctx_.GetPlace());
-    auto stream = ctx_.stream();
-    phi::funcs::set_constant(ctx_, out_, 0.0);
-
-    FillOutputKernel<<<(numel + PADDLE_CUDA_NUM_THREADS - 1) /
-                           PADDLE_CUDA_NUM_THREADS,
-                       PADDLE_CUDA_NUM_THREADS, 0, stream>>>(
-        p_in_data, p_out_data, numel, depth_);
-  }
-};
-
-using LoDTensor = framework::LoDTensor;
-template <typename DeviceContext, typename T>
-class OneHotV2CUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* in = context.Input<LoDTensor>("X");
-    auto* out = context.Output<LoDTensor>("Out");
-
-    int depth = -1;
-    if (context.HasInput("depth_tensor")) {
-      auto* depth_tensor = context.Input<framework::Tensor>("depth_tensor");
-      if (platform::is_gpu_place(depth_tensor->place())) {
-        framework::Tensor temp;
-        paddle::framework::TensorCopySync(*depth_tensor, platform::CPUPlace(),
-                                          &temp);
-        depth = *temp.data<int32_t>();
-      } else {
-        depth = *depth_tensor->data<int32_t>();
-      }
-
-      auto out_dims = out->dims();
-      out_dims[out_dims.size() - 1] = depth;
-      out->Resize(out_dims);
-    } else {
-      depth = context.Attr<int>("depth");
-    }
-    framework::VisitDataType(
-        static_cast<framework::proto::VarType::Type>(
-            context.Attr<int>("dtype")),
-        OneHotV2OpCUDAFunctor<DeviceContext, T>(
-            in, out, depth, context.template device_context<DeviceContext>()));
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    one_hot_v2,
-    ops::OneHotV2CUDAKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::OneHotV2CUDAKernel<paddle::platform::CUDADeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/one_hot_v2_op_npu.cc b/paddle/fluid/operators/one_hot_v2_op_npu.cc
index acf6baf50b418ae0fd68d64f52f80f47df1c60c3..e5702a37bb2b4a4180e209bb5e306be64830bd99 100644
--- a/paddle/fluid/operators/one_hot_v2_op_npu.cc
+++ b/paddle/fluid/operators/one_hot_v2_op_npu.cc
@@ -12,13 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/one_hot_v2_op.h"
+#include "paddle/fluid/framework/op_registry.h"
 
 #include "paddle/fluid/platform/device/npu/npu_op_runner.h"
 
 namespace paddle {
 namespace operators {
 using Tensor = framework::Tensor;
+using LoDTensor = framework::LoDTensor;
 
 template <typename T>
 class OneHotV2NPUKernel : public framework::OpKernel<T> {
diff --git a/paddle/fluid/operators/op_debug_string_test.cc b/paddle/fluid/operators/op_debug_string_test.cc
index b96fcaa486cce8099cf1d03c7d948ea74c1923ad..372a71706ab5ec72b6da4cbac1b63333f42cb265 100644
--- a/paddle/fluid/operators/op_debug_string_test.cc
+++ b/paddle/fluid/operators/op_debug_string_test.cc
@@ -17,8 +17,10 @@
 #include "glog/logging.h"
 #include "gtest/gtest.h"
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/phi/core/kernel_registry.h"
 
 USE_OP_ITSELF(elementwise_add_grad);
+PD_DECLARE_KERNEL(add_grad, CPU, ALL_LAYOUT);
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/optimizers/adadelta_op.cc b/paddle/fluid/operators/optimizers/adadelta_op.cc
index ad7f93d73e902bbac684832d3a77ba83b517daf6..315831ddc0f290cc8c7ad1b78ce8625722f91d3b 100644
--- a/paddle/fluid/operators/optimizers/adadelta_op.cc
+++ b/paddle/fluid/operators/optimizers/adadelta_op.cc
@@ -12,7 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/optimizers/adadelta_op.h"
+#include "paddle/fluid/framework/infershape_utils.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/phi/core/infermeta_utils.h"
+#include "paddle/phi/infermeta/multiary.h"
 
 namespace paddle {
 namespace operators {
@@ -23,77 +26,6 @@ class AdadeltaOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE_EQ(ctx->HasInput("Param"), true,
-                      platform::errors::InvalidArgument(
-                          "Input(Param) of AdadeltaOp should not be null."));
-    PADDLE_ENFORCE_EQ(ctx->HasInput("Grad"), true,
-                      platform::errors::InvalidArgument(
-                          "Input(Grad) of AdadeltaOp should not be null."));
-    PADDLE_ENFORCE_EQ(
-        ctx->HasInput("AvgSquaredGrad"), true,
-        platform::errors::InvalidArgument(
-            "Input(AvgSquaredGrad) of AdadeltaOp should not be null."));
-    PADDLE_ENFORCE_EQ(
-        ctx->HasInput("AvgSquaredUpdate"), true,
-        platform::errors::InvalidArgument(
-            "Input(AvgSquaredUpdate) of AdadeltaOp should not be null."));
-    PADDLE_ENFORCE_EQ(
-        ctx->GetInputsVarType("Param").front() ==
-            framework::proto::VarType::LOD_TENSOR,
-        true,
-        platform::errors::InvalidArgument(
-            "The input var's type should be LoDTensor, but the received is %s",
-            ctx->Inputs("Param").front(),
-            ctx->GetInputsVarType("Param").front()));
-    PADDLE_ENFORCE_EQ(
-        ctx->GetInputsVarType("Grad").front() ==
-            framework::proto::VarType::LOD_TENSOR,
-        true,
-        platform::errors::InvalidArgument(
-            "The input var's type should be LoDTensor, but the received is %s",
-            ctx->Inputs("Grad").front(),
-            ctx->GetInputsVarType("Grad").front()));
-
-    PADDLE_ENFORCE_EQ(
-        ctx->HasOutput("ParamOut"), true,
-        platform::errors::InvalidArgument(
-            "Output(ParamOut) of AdadeltaOp should not be null."));
-    PADDLE_ENFORCE_EQ(
-        ctx->HasOutput("AvgSquaredGradOut"), true,
-        platform::errors::InvalidArgument(
-            "Output(AvgSquaredGradOut) of AdadeltaOp should not be null."));
-    PADDLE_ENFORCE_EQ(
-        ctx->HasOutput("AvgSquaredUpdateOut"), true,
-        platform::errors::InvalidArgument(
-            "Output(AvgSquaredUpdateOut) of AdadeltaOp should not be null."));
-
-    auto param_dim = ctx->GetInputDim("Param");
-    PADDLE_ENFORCE_EQ(
-        param_dim, ctx->GetInputDim("Grad"),
-        platform::errors::InvalidArgument(
-            "Param and grad input of AdadeltaOp should have same dimension."));
-    PADDLE_ENFORCE_NE(
-        phi::product(ctx->GetInputDim("AvgSquaredGrad")), 0,
-        platform::errors::InvalidArgument(
-            "Maybe the Input variable AvgSquaredGrad has not "
-            "been initialized. You may need to confirm if you put "
-            "exe.run(startup_program) after optimizer.minimize "
-            "function."));
-    PADDLE_ENFORCE_EQ(param_dim, ctx->GetInputDim("AvgSquaredGrad"),
-                      platform::errors::InvalidArgument(
-                          "Param and AvgSquaredGrad input of AdadeltaOp "
-                          "should have same dimension"));
-    PADDLE_ENFORCE_EQ(param_dim, ctx->GetInputDim("AvgSquaredUpdate"),
-                      platform::errors::InvalidArgument(
-                          "Param and AvgSquaredUpdate input of AdadeltaOp "
-                          "should have same dimension"));
-
-    ctx->SetOutputDim("ParamOut", param_dim);
-    ctx->SetOutputDim("AvgSquaredGradOut", param_dim);
-    ctx->SetOutputDim("AvgSquaredUpdateOut", param_dim);
-  }
-
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
     return framework::OpKernelType(
@@ -149,7 +81,11 @@ $$
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP_WITHOUT_GRADIENT(adadelta, ops::AdadeltaOp, ops::AdadeltaOpMaker);
-REGISTER_OP_CPU_KERNEL(
-    adadelta, ops::AdadeltaOpKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::AdadeltaOpKernel<paddle::platform::CPUDeviceContext, double>);
+namespace ops = paddle::operators;
+DECLARE_INFER_SHAPE_FUNCTOR(adadelta, AdadeltaInferMetaFunctor,
+                            PD_INFER_META(phi::AdadeltaInferMeta));
+REGISTER_OPERATOR(
+    adadelta, ops::AdadeltaOp, ops::AdadeltaOpMaker,
+    paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
+    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
+    AdadeltaInferMetaFunctor);
diff --git a/paddle/fluid/operators/optimizers/adadelta_op.h b/paddle/fluid/operators/optimizers/adadelta_op.h
deleted file mode 100644
index 85cfad35858bbe6b112169f196c0711d981e9446..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/optimizers/adadelta_op.h
+++ /dev/null
@@ -1,84 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename DeviceContext, typename T>
-class AdadeltaOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    const auto* param_var = ctx.InputVar("Param");
-    PADDLE_ENFORCE_EQ(param_var->IsType<framework::LoDTensor>(), true,
-                      platform::errors::InvalidArgument(
-                          "The Var(%s)'s type should be LoDTensor, "
-                          "but the received is %s",
-                          ctx.InputNames("Param").front(),
-                          framework::ToTypeName(param_var->Type())));
-    const auto* grad_var = ctx.InputVar("Grad");
-    PADDLE_ENFORCE_EQ(grad_var->IsType<framework::LoDTensor>(), true,
-                      platform::errors::InvalidArgument(
-                          "The Var(%s)'s type should be LoDTensor, "
-                          "but the received is %s",
-                          ctx.InputNames("Grad").front(),
-                          framework::ToTypeName(grad_var->Type())));
-
-    auto param_out_tensor = ctx.Output<framework::Tensor>("ParamOut");
-    auto avg_squared_grad_out_tensor =
-        ctx.Output<framework::Tensor>("AvgSquaredGradOut");
-    auto avg_squared_update_out_tensor =
-        ctx.Output<framework::Tensor>("AvgSquaredUpdateOut");
-
-    param_out_tensor->mutable_data<T>(ctx.GetPlace());
-    avg_squared_grad_out_tensor->mutable_data<T>(ctx.GetPlace());
-    avg_squared_update_out_tensor->mutable_data<T>(ctx.GetPlace());
-
-    T rho = static_cast<T>(ctx.Attr<float>("rho"));
-    T epsilon = static_cast<T>(ctx.Attr<float>("epsilon"));
-
-    auto param = framework::EigenVector<T>::Flatten(
-        *ctx.Input<framework::Tensor>("Param"));
-    auto grad = framework::EigenVector<T>::Flatten(
-        *ctx.Input<framework::Tensor>("Grad"));
-    // Squared gradient accumulator
-    auto avg_squared_grad = framework::EigenVector<T>::Flatten(
-        *ctx.Input<framework::Tensor>("AvgSquaredGrad"));
-    // Squared updates accumulator
-    auto avg_squared_update = framework::EigenVector<T>::Flatten(
-        *ctx.Input<framework::Tensor>("AvgSquaredUpdate"));
-    auto param_out = framework::EigenVector<T>::Flatten(*param_out_tensor);
-    auto avg_squared_grad_out =
-        framework::EigenVector<T>::Flatten(*avg_squared_grad_out_tensor);
-    auto avg_squared_update_out =
-        framework::EigenVector<T>::Flatten(*avg_squared_update_out_tensor);
-    auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
-
-    avg_squared_grad_out.device(place) =
-        rho * avg_squared_grad + (1 - rho) * grad.square();
-    auto update =
-        -((avg_squared_update + epsilon) / (avg_squared_grad_out + epsilon))
-             .sqrt() *
-        grad;
-    avg_squared_update_out.device(place) =
-        rho * avg_squared_update + (1 - rho) * update.square();
-    param_out.device(place) = param + update;
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/optimizers/adamax_op.cc b/paddle/fluid/operators/optimizers/adamax_op.cc
index a95a37c980c8c9d41dc9fd352e3dace787a7c4e9..036839dd1300feac544a6f1ca661598f4360f745 100644
--- a/paddle/fluid/operators/optimizers/adamax_op.cc
+++ b/paddle/fluid/operators/optimizers/adamax_op.cc
@@ -12,7 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/optimizers/adamax_op.h"
+#include "paddle/fluid/framework/infershape_utils.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/phi/core/infermeta_utils.h"
+#include "paddle/phi/infermeta/multiary.h"
 
 namespace paddle {
 namespace operators {
@@ -22,67 +25,6 @@ class AdamaxOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    OP_INOUT_CHECK(ctx->HasInput("Param"), "Input", "Param", "Adamax");
-    OP_INOUT_CHECK(ctx->HasInput("Grad"), "Input", "Grad", "Adamax");
-    OP_INOUT_CHECK(ctx->HasInput("Moment"), "Input", "Moment", "Adamax");
-    OP_INOUT_CHECK(ctx->HasInput("InfNorm"), "Input", "InfNorm", "Adamax");
-    OP_INOUT_CHECK(ctx->HasInput("LearningRate"), "Input", "LearningRate",
-                   "Adamax");
-    OP_INOUT_CHECK(ctx->HasInput("Beta1Pow"), "Input", "Beta1Pow", "Adamax");
-    PADDLE_ENFORCE_EQ(
-        ctx->GetInputsVarType("Param").front(),
-        framework::proto::VarType::LOD_TENSOR,
-        platform::errors::InvalidArgument(
-            "The input var's type should be LoDTensor, but the received is %s",
-            ctx->Inputs("Param").front(),
-            ctx->GetInputsVarType("Param").front()));
-    PADDLE_ENFORCE_EQ(
-        ctx->GetInputsVarType("Grad").front(),
-        framework::proto::VarType::LOD_TENSOR,
-        platform::errors::InvalidArgument(
-            "The input var's type should be LoDTensor, but the received is %s",
-            ctx->Inputs("Grad").front(),
-            ctx->GetInputsVarType("Grad").front()));
-
-    OP_INOUT_CHECK(ctx->HasOutput("ParamOut"), "Output", "ParamOut", "Adamax");
-    OP_INOUT_CHECK(ctx->HasOutput("MomentOut"), "Output", "MomentOut",
-                   "Adamax");
-    OP_INOUT_CHECK(ctx->HasOutput("InfNormOut"), "Output", "InfNormOut",
-                   "Adamax");
-
-    auto lr_dims = ctx->GetInputDim("LearningRate");
-    PADDLE_ENFORCE_NE(phi::product(lr_dims), 0,
-                      platform::errors::InvalidArgument(
-                          "Maybe the Input variable LearningRate has not "
-                          "been initialized. You may need to confirm "
-                          "if you put exe.run(startup_program) "
-                          "after optimizer.minimize function."));
-    PADDLE_ENFORCE_EQ(phi::product(lr_dims), 1,
-                      platform::errors::InvalidArgument(
-                          "Learning rate should have 1 dimension"));
-    auto beta1_pow_dims = ctx->GetInputDim("Beta1Pow");
-    PADDLE_ENFORCE_EQ(phi::product(beta1_pow_dims), 1,
-                      platform::errors::InvalidArgument(
-                          "Beta1 power accumulator should have 1 dimension"));
-    auto param_dims = ctx->GetInputDim("Param");
-    PADDLE_ENFORCE_EQ(
-        param_dims, ctx->GetInputDim("Grad"),
-        platform::errors::InvalidArgument(
-            "Param and Grad input of AdamaxOp should have same dimension"));
-    PADDLE_ENFORCE_EQ(
-        param_dims, ctx->GetInputDim("Moment"),
-        platform::errors::InvalidArgument(
-            "Param and Moment input of AdamaxOp should have same dimension"));
-    PADDLE_ENFORCE_EQ(
-        param_dims, ctx->GetInputDim("InfNorm"),
-        platform::errors::InvalidArgument(
-            "Param and InfNorm input of AdamaxOp should have same dimension"));
-
-    ctx->SetOutputDim("ParamOut", param_dims);
-    ctx->SetOutputDim("MomentOut", param_dims);
-    ctx->SetOutputDim("InfNormOut", param_dims);
-  }
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
     return framework::OpKernelType(
@@ -150,7 +92,11 @@ division by 0 error.
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP_WITHOUT_GRADIENT(adamax, ops::AdamaxOp, ops::AdamaxOpMaker);
-REGISTER_OP_CPU_KERNEL(
-    adamax, ops::AdamaxOpKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::AdamaxOpKernel<paddle::platform::CPUDeviceContext, double>);
+DECLARE_INFER_SHAPE_FUNCTOR(adamax, AdamaxInferMetaFunctor,
+                            PD_INFER_META(phi::AdamaxInferMeta));
+
+REGISTER_OPERATOR(
+    adamax, ops::AdamaxOp, ops::AdamaxOpMaker,
+    paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
+    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
+    AdamaxInferMetaFunctor);
diff --git a/paddle/fluid/operators/optimizers/adamax_op.h b/paddle/fluid/operators/optimizers/adamax_op.h
deleted file mode 100644
index df0112448b1cbc82d699dc1ee6f3444bda3b142b..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/optimizers/adamax_op.h
+++ /dev/null
@@ -1,82 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename DeviceContext, typename T>
-class AdamaxOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    const auto* param_var = ctx.InputVar("Param");
-    PADDLE_ENFORCE_EQ(param_var->IsType<framework::LoDTensor>(), true,
-                      platform::errors::InvalidArgument(
-                          "The Var(%s)'s type should be LoDTensor, "
-                          "but the received is %s",
-                          ctx.InputNames("Param").front(),
-                          framework::ToTypeName(param_var->Type())));
-    const auto* grad_var = ctx.InputVar("Grad");
-    PADDLE_ENFORCE_EQ(grad_var->IsType<framework::LoDTensor>(), true,
-                      platform::errors::InvalidArgument(
-                          "The Var(%s)'s type should be LoDTensor, "
-                          "but the received is %s",
-                          ctx.InputNames("Grad").front(),
-                          framework::ToTypeName(grad_var->Type())));
-
-    auto param_out_tensor = ctx.Output<framework::Tensor>("ParamOut");
-    auto moment_out_tensor = ctx.Output<framework::Tensor>("MomentOut");
-    auto inf_norm_out_tensor = ctx.Output<framework::Tensor>("InfNormOut");
-
-    param_out_tensor->mutable_data<T>(ctx.GetPlace());
-    moment_out_tensor->mutable_data<T>(ctx.GetPlace());
-    inf_norm_out_tensor->mutable_data<T>(ctx.GetPlace());
-
-    T beta1 = static_cast<T>(ctx.Attr<float>("beta1"));
-    T beta2 = static_cast<T>(ctx.Attr<float>("beta2"));
-    T epsilon = static_cast<T>(ctx.Attr<float>("epsilon"));
-
-    auto param = framework::EigenVector<T>::Flatten(
-        *ctx.Input<framework::Tensor>("Param"));
-    auto grad = framework::EigenVector<T>::Flatten(
-        *ctx.Input<framework::Tensor>("Grad"));
-    auto moment = framework::EigenVector<T>::Flatten(
-        *ctx.Input<framework::Tensor>("Moment"));
-    auto inf_norm = framework::EigenVector<T>::Flatten(
-        *ctx.Input<framework::Tensor>("InfNorm"));
-    auto lr = framework::EigenVector<T>::Flatten(
-        *ctx.Input<framework::Tensor>("LearningRate"));
-    auto beta1_pow = framework::EigenVector<T>::Flatten(
-        *ctx.Input<framework::Tensor>("Beta1Pow"));
-    auto param_out = framework::EigenVector<T>::Flatten(*param_out_tensor);
-    auto moment_out = framework::EigenVector<T>::Flatten(*moment_out_tensor);
-    auto inf_norm_out =
-        framework::EigenVector<T>::Flatten(*inf_norm_out_tensor);
-    auto* place = ctx.template device_context<DeviceContext>().eigen_device();
-
-    moment_out.device(*place) = beta1 * moment + (1 - beta1) * grad;
-    inf_norm_out.device(*place) =
-        grad.abs().cwiseMax((beta2 * inf_norm) + epsilon);
-    auto lr_t = lr / (1 - beta1_pow);
-    Eigen::DSizes<int, 1> m_dsize(moment_out_tensor->numel());
-    param_out.device(*place) =
-        param - lr_t.broadcast(m_dsize) * (moment_out / inf_norm_out);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/optimizers/cast_with_ptr.h b/paddle/fluid/operators/optimizers/cast_with_ptr.h
index ab8b4f2b8f4d37d4be62c5e1dd040a1461d0bdee..a3fbb0e59e24e9be67da5048ebc644f08b385bbf 100644
--- a/paddle/fluid/operators/optimizers/cast_with_ptr.h
+++ b/paddle/fluid/operators/optimizers/cast_with_ptr.h
@@ -57,8 +57,7 @@ static void LaunchCastKernel(const platform::CUDADeviceContext &ctx,
   PADDLE_ENFORCE_NE(
       static_cast<const void *>(x), static_cast<void *>(y),
       platform::errors::InvalidArgument("Inplace cast is not supported yet."));
-  int vec_size =
-      std::min(platform::GetVectorizedSize(x), platform::GetVectorizedSize(y));
+  int vec_size = std::min(phi::GetVectorizedSize(x), phi::GetVectorizedSize(y));
   switch (vec_size) {
     case 4:
       return details::VecCastKernel<InT, OutT, 4>(ctx, x, y, n);
diff --git a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu
index 8bb4606ffff151c6f65606d8dce156f98589a6b4..5b60f65442b55dc89a845859f153048e89704f70 100644
--- a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu
+++ b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu
@@ -19,11 +19,11 @@
 #include "paddle/fluid/operators/optimizers/distributed_fused_lamb_op.h"
 #include "paddle/fluid/operators/optimizers/multi_tensor_apply.h"
 #include "paddle/fluid/operators/tensor_to_string.h"
-#include "paddle/fluid/platform/aligned_vector.h"
 #include "paddle/fluid/platform/collective_helper.h"
 #include "paddle/fluid/platform/for_range.h"
 #include "paddle/fluid/string/string_helper.h"
 #include "paddle/phi/core/utils/data_type.h"
+#include "paddle/phi/kernels/funcs/aligned_vector.h"
 
 #ifdef __NVCC__
 #include "cub/cub.cuh"
@@ -66,8 +66,8 @@ struct L2NormFunctor {
     int i;
     for (i = threadIdx.x * VecSize; i + VecSize <= size;
          i += (BlockDim * VecSize)) {
-      platform::AlignedVector<T, VecSize> tmp_vec;
-      platform::Load(ptr + i, &tmp_vec);
+      phi::AlignedVector<T, VecSize> tmp_vec;
+      phi::Load(ptr + i, &tmp_vec);
 #pragma unroll
       for (int j = 0; j < VecSize; ++j) {
         auto tmp = static_cast<MT>(tmp_vec[j]);
@@ -111,9 +111,9 @@ static int GetChunkedVecSize(const T *ptr, int chunk_size) {
   constexpr int max_load_bits = 128;
   int valid_vec_size = max_load_bits / CHAR_BIT / sizeof(T);
   auto address = reinterpret_cast<uintptr_t>(ptr);
-  constexpr int vec8 = alignof(platform::AlignedVector<T, 8>);
-  constexpr int vec4 = alignof(platform::AlignedVector<T, 4>);
-  constexpr int vec2 = alignof(platform::AlignedVector<T, 2>);
+  constexpr int vec8 = alignof(phi::AlignedVector<T, 8>);
+  constexpr int vec4 = alignof(phi::AlignedVector<T, 4>);
+  constexpr int vec2 = alignof(phi::AlignedVector<T, 2>);
   chunk_size *= sizeof(T);
   if (address % vec8 == 0 && chunk_size % vec8 == 0) {
     return std::min(8, valid_vec_size);
@@ -316,15 +316,15 @@ static __global__ void ScaleCUDAKernel(const T1 *__restrict__ x,
   int stride = blockDim.x * gridDim.x * VecSize;
 
   for (; i + VecSize <= num; i += stride) {
-    platform::AlignedVector<T1, VecSize> x_vec;
-    platform::AlignedVector<T1, VecSize> y_vec;
+    phi::AlignedVector<T1, VecSize> x_vec;
+    phi::AlignedVector<T1, VecSize> y_vec;
 
-    platform::Load(x + i, &x_vec);
+    phi::Load(x + i, &x_vec);
 #pragma unroll
     for (int j = 0; j < VecSize; ++j) {
       y_vec[j] = static_cast<T1>(static_cast<T2>(x_vec[j]) * s);
     }
-    platform::Store(y_vec, y + i);
+    phi::Store(y_vec, y + i);
   }
 
   for (; i < num; ++i) {
@@ -410,24 +410,24 @@ static __global__ void UpdateLambMomentAndTrustRatioDivCUDAKernel(
   int stride = blockDim.x * gridDim.x * VecSize;
 
   for (; i + VecSize <= num; i += stride) {
-    platform::AlignedVector<T, VecSize> param_vec;
-    platform::AlignedVector<GradT, VecSize> grad_vec;
-    platform::AlignedVector<T, VecSize> mom1_vec;
-    platform::AlignedVector<T, VecSize> mom2_vec;
-    platform::AlignedVector<T, VecSize> trust_ratio_div_vec;
+    phi::AlignedVector<T, VecSize> param_vec;
+    phi::AlignedVector<GradT, VecSize> grad_vec;
+    phi::AlignedVector<T, VecSize> mom1_vec;
+    phi::AlignedVector<T, VecSize> mom2_vec;
+    phi::AlignedVector<T, VecSize> trust_ratio_div_vec;
 
     T cur_weight_decay = (i < weight_decay_end_numel) * weight_decay;
     if (cur_weight_decay != static_cast<T>(0.0)) {
-      platform::Load(param_p + i, &param_vec);
+      phi::Load(param_p + i, &param_vec);
     } else {
 #pragma unroll
       for (int j = 0; j < VecSize; ++j) {
         param_vec[j] = static_cast<T>(0);
       }
     }
-    platform::Load(grad_p + i, &grad_vec);
-    platform::Load(mom1_p + i, &mom1_vec);
-    platform::Load(mom2_p + i, &mom2_vec);
+    phi::Load(grad_p + i, &grad_vec);
+    phi::Load(mom1_p + i, &mom1_vec);
+    phi::Load(mom2_p + i, &mom2_vec);
 
 #define PD_LAMB_MOM_TRUST_RATIO_DIV_UPDATE(__param, __grad, __mom1, __mom2,    \
                                            __trust_ratio_div, __idx)           \
@@ -450,9 +450,9 @@ static __global__ void UpdateLambMomentAndTrustRatioDivCUDAKernel(
                                          mom2_vec, trust_ratio_div_vec, j);
     }
 
-    platform::Store(mom1_vec, mom1_p + i);
-    platform::Store(mom2_vec, mom2_p + i);
-    platform::Store(trust_ratio_div_vec, trust_ratio_div_p + i);
+    phi::Store(mom1_vec, mom1_p + i);
+    phi::Store(mom2_vec, mom2_p + i);
+    phi::Store(trust_ratio_div_vec, trust_ratio_div_p + i);
   }
 
   for (; i < num; ++i) {
@@ -632,29 +632,29 @@ struct LambUpdateParamAndBetaPowsFunctor {
     trust_ratio_div += offset;
 
     for (i = threadIdx.x * VecSize; i + VecSize <= size; i += stride) {
-      platform::AlignedVector<MT, VecSize> trust_ratio_div_vec;
-      platform::Load(trust_ratio_div + i, &trust_ratio_div_vec);
+      phi::AlignedVector<MT, VecSize> trust_ratio_div_vec;
+      phi::Load(trust_ratio_div + i, &trust_ratio_div_vec);
       if (HasMasterParam) {
-        platform::AlignedVector<MT, VecSize> master_param_vec;
-        platform::Load(master_param + i, &master_param_vec);
-        platform::AlignedVector<ParamT, VecSize> param_vec;
+        phi::AlignedVector<MT, VecSize> master_param_vec;
+        phi::Load(master_param + i, &master_param_vec);
+        phi::AlignedVector<ParamT, VecSize> param_vec;
 #pragma unroll
         for (int j = 0; j < VecSize; ++j) {
           MT p = master_param_vec[j] - ratio * trust_ratio_div_vec[j];
           master_param_vec[j] = p;
           param_vec[j] = static_cast<ParamT>(p);
         }
-        platform::Store(master_param_vec, master_param + i);
-        platform::Store(param_vec, param + i);
+        phi::Store(master_param_vec, master_param + i);
+        phi::Store(param_vec, param + i);
       } else {
-        platform::AlignedVector<ParamT, VecSize> param_vec;
-        platform::Load(param + i, &param_vec);
+        phi::AlignedVector<ParamT, VecSize> param_vec;
+        phi::Load(param + i, &param_vec);
 #pragma unroll
         for (int j = 0; j < VecSize; ++j) {
           MT p = static_cast<MT>(param_vec[j]) - ratio * trust_ratio_div_vec[j];
           param_vec[j] = static_cast<ParamT>(p);
         }
-        platform::Store(param_vec, param + i);
+        phi::Store(param_vec, param + i);
       }
     }
 
diff --git a/paddle/fluid/operators/optimizers/lars_momentum_op.cu b/paddle/fluid/operators/optimizers/lars_momentum_op.cu
index df5da1b79535cc6f5e4a638e9d32c367ea7cdb9f..fe5cd066864b82c734614e33869dff1734bee6d0 100644
--- a/paddle/fluid/operators/optimizers/lars_momentum_op.cu
+++ b/paddle/fluid/operators/optimizers/lars_momentum_op.cu
@@ -88,8 +88,8 @@ __device__ inline void VectorizeLarsUpdate(
     T* param_out, MT* velocity_out, const MT mu, MT local_lr,
     const MT lars_weight_decay, const MT rescale_grad, const int tid,
     const int grid_stride, const int numel, MT* master_param_out = nullptr) {
-  using VecType = paddle::platform::AlignedVector<T, VecSize>;
-  using VecMType = paddle::platform::AlignedVector<MT, VecSize>;
+  using VecType = phi::AlignedVector<T, VecSize>;
+  using VecMType = phi::AlignedVector<MT, VecSize>;
   int main = numel >> (VecSize >> 1);
   int tail_offset = main * VecSize;
 
diff --git a/paddle/fluid/operators/optimizers/merged_momentum_op_mlu.cc b/paddle/fluid/operators/optimizers/merged_momentum_op_mlu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e5399ee36ba7ff4a983448d607c108db8870138c
--- /dev/null
+++ b/paddle/fluid/operators/optimizers/merged_momentum_op_mlu.cc
@@ -0,0 +1,163 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/optimizers/merged_momentum_op.h"
+#include "paddle/fluid/operators/mlu/mlu_baseop.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+class MLUMergedMomentumOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto params = ctx.MultiInput<framework::Tensor>("Param");
+    auto params_out = ctx.MultiOutput<framework::Tensor>("ParamOut");
+    size_t n = params.size();
+    PADDLE_ENFORCE_EQ(n, params_out.size(),
+                      platform::errors::InvalidArgument(
+                          "The size of Output(ParamOut) must be equal to "
+                          "Input(Param), but got the size of Output(ParamOut) "
+                          "is %d, the size of Input(Param) is %d.",
+                          params_out.size(), n));
+    for (size_t i = 0; i < n; ++i) {
+      PADDLE_ENFORCE_EQ(params[i], params_out[i],
+                        platform::errors::InvalidArgument(
+                            "The size of Input(Param) and Output(ParamOut) "
+                            "must be the same Tensors."));
+    }
+
+    auto grads = ctx.MultiInput<framework::Tensor>("Grad");
+    PADDLE_ENFORCE_EQ(
+        n, grads.size(),
+        platform::errors::InvalidArgument(
+            "The size of Input(Grad) must be equal to Input(Param), but got "
+            "the size of Input(Grad) is %d, the size of Input(Param) is %d.",
+            grads.size(), n));
+
+    auto velocitys = ctx.MultiInput<framework::Tensor>("Velocity");
+    PADDLE_ENFORCE_EQ(n, velocitys.size(),
+                      platform::errors::InvalidArgument(
+                          "The size of Input(Velocity) must be equal to "
+                          "Input(Param), but got the size of Input(Velocity) "
+                          "is %d, the size of Input(Param) is %d.",
+                          velocitys.size(), n));
+
+    auto velocitys_out = ctx.MultiOutput<framework::Tensor>("VelocityOut");
+    PADDLE_ENFORCE_EQ(
+        n, velocitys_out.size(),
+        platform::errors::InvalidArgument(
+            "The size of Output(VelocityOut) must be "
+            "equal to Input(Param), but got the size of Output(VelocityOut) is "
+            "%d, the size of Input(Param) is %d.",
+            velocitys_out.size(), n));
+    for (size_t i = 0; i < n; ++i) {
+      PADDLE_ENFORCE_EQ(velocitys[i], velocitys_out[i],
+                        platform::errors::InvalidArgument(
+                            "Input(Velocity) and Output(VelocityOut) must be "
+                            "the same Tensors."));
+    }
+
+    auto mu = ctx.Attr<float>("mu");
+    auto lrs = ctx.MultiInput<framework::Tensor>("LearningRate");
+    if (lrs.size() != 1) {
+      PADDLE_ENFORCE_EQ(
+          n, lrs.size(),
+          platform::errors::InvalidArgument(
+              "If the size of Input(LearningRate) is not 1, the size of "
+              "Input(LearningRate) must be "
+              "equal to Input(Param), but got the size of Input(LearningRate) "
+              "is %d, the size of Input(Param) is %d.",
+              lrs.size(), n));
+    }
+    auto use_nesterov = ctx.Attr<bool>("use_nesterov");
+    auto regularization_methods =
+        ctx.Attr<std::vector<std::string>>("regularization_method");
+    auto regularization_coeffs =
+        ctx.Attr<std::vector<float>>("regularization_coeff");
+    if (regularization_methods.size() != 0) {
+      PADDLE_ENFORCE_EQ(
+          n, regularization_methods.size(),
+          platform::errors::InvalidArgument(
+              "The size of Attr(regularization_method) must be equal "
+              "to Input(Param), but got the size of "
+              "Attr(regularization_method) is %d, the size of Input(Param) is "
+              "%d.",
+              regularization_methods.size(), n));
+      PADDLE_ENFORCE_EQ(
+          n, regularization_coeffs.size(),
+          platform::errors::InvalidArgument(
+              "The size of Attr(regularization_coeff) must be equal "
+              "to Input(Param), but got the size of Attr(regularization_coeff) "
+              "is %d, the size of Input(Param) is %d.",
+              regularization_coeffs.size(), n));
+    }
+
+    VLOG(5) << "use_nesterov: " << use_nesterov
+            << ",  regularization_methods.size(): "
+            << regularization_methods.size()
+            << ",  regularization_coeffs.size(): "
+            << regularization_coeffs.size();
+
+    auto& dev_ctx = ctx.template device_context<platform::MLUDeviceContext>();
+
+    Tensor mu_tensor = ctx.AllocateTmpTensor<T, MLUDeviceContext>({1}, dev_ctx);
+    MLUCnnlTensorDesc mu_tensor_desc(mu_tensor);
+    MLUCnnl::Fill(ctx, mu, mu_tensor_desc.get(), GetBasePtr(&mu_tensor));
+
+    for (size_t idx = 0; idx < n; ++idx) {
+      RegularizationType regularization_flag =
+          regularization_methods.size() > 0 &&
+                  regularization_methods[idx] == "l2_decay"
+              ? RegularizationType::kL2DECAY
+              : RegularizationType::kNONE;
+      T regularization_coeff = static_cast<T>(0.0);
+      if (regularization_coeffs.size() != 0) {
+        regularization_coeff = static_cast<T>(regularization_coeffs[idx]);
+      }
+
+      auto learning_rate = lrs.size() > 1 ? lrs[idx] : lrs[0];
+      auto param_out = params_out[idx];
+      auto velocity_out = velocitys_out[idx];
+
+      auto grad = grads[idx];
+      Tensor regularized_grad;
+      MLUCnnlTensorDesc param_desc(*param_out);
+      if (regularization_flag == RegularizationType::kL2DECAY) {
+        regularized_grad = ctx.AllocateTmpTensor<T, MLUDeviceContext>(
+            param_out->dims(), dev_ctx);
+        MLUCnnlOpTensorDesc op_tensor_desc(
+            CNNL_OP_TENSOR_ADD, ToCnnlDataType<T>(), CNNL_NOT_PROPAGATE_NAN);
+        MLUCnnl::OpTensor(ctx, op_tensor_desc.get(), param_desc.get(),
+                          GetBasePtr(param_out), param_desc.get(),
+                          GetBasePtr(grad), param_desc.get(),
+                          GetBasePtr(&regularized_grad), ToCnnlDataType<T>(),
+                          regularization_coeff);
+      } else {
+        regularized_grad = *grad;
+      }
+      MLUCnnl::ApplyMomentum(ctx, param_desc.get(),
+                             GetBasePtr(&regularized_grad), use_nesterov,
+                             GetBasePtr(learning_rate), GetBasePtr(&mu_tensor),
+                             GetBasePtr(param_out), GetBasePtr(velocity_out));
+    }
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+REGISTER_OP_MLU_KERNEL(merged_momentum, ops::MLUMergedMomentumOpKernel<float>,
+                       ops::MLUMergedMomentumOpKernel<plat::float16>);
diff --git a/paddle/fluid/operators/pad_constant_like_op.h b/paddle/fluid/operators/pad_constant_like_op.h
index 5df167fdf726345074cdc40afd0c5b394467578f..0aedd800e1a237d4baf0092eef9bac9f7dbe862d 100644
--- a/paddle/fluid/operators/pad_constant_like_op.h
+++ b/paddle/fluid/operators/pad_constant_like_op.h
@@ -20,7 +20,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/framework/tensor_util.h"
-#include "paddle/fluid/operators/math/padding.h"
+#include "paddle/phi/kernels/funcs/padding.h"
 
 namespace paddle {
 namespace operators {
@@ -50,8 +50,9 @@ class PadConstantLikeKernel : public framework::OpKernel<T> {
       pads[j * 2 + 1] = static_cast<int>(in_x->dims()[j] - in_y->dims()[j]);
     }
 
-    math::PaddingFunctor<DeviceContext, T>(rank, context, pads, pad_value,
-                                           *in_y, out);
+    phi::funcs::PaddingFunctor<DeviceContext, T>(
+        rank, context.template device_context<DeviceContext>(), pads, pad_value,
+        *in_y, out);
   }
 };
 
@@ -82,8 +83,9 @@ class PadConstantLikeGradKernel : public framework::OpKernel<T> {
       pads[j * 2 + 1] = static_cast<int>(in_dout->dims()[j] - in_y->dims()[j]);
     }
 
-    math::PaddingGradFunctor<DeviceContext, T>(rank, context, pads, *in_dout,
-                                               d_y);
+    phi::funcs::PaddingGradFunctor<DeviceContext, T>(
+        rank, context.template device_context<DeviceContext>(), pads, *in_dout,
+        d_y);
   }
 };
 
diff --git a/paddle/fluid/operators/pad_op.cc b/paddle/fluid/operators/pad_op.cc
index 39acba7e58aba51942d7d8de2d89e2783fd591f9..dc162ae5782f2690fcf6378603268369e4aeb9ca 100644
--- a/paddle/fluid/operators/pad_op.cc
+++ b/paddle/fluid/operators/pad_op.cc
@@ -12,9 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/pad_op.h"
 #include <memory>
+#include "paddle/fluid/framework/infershape_utils.h"
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/complex.h"
+#include "paddle/phi/infermeta/unary.h"
 
 namespace paddle {
 namespace operators {
@@ -28,37 +30,6 @@ class PadOp : public framework::OperatorWithKernel {
   void InferShape(framework::InferShapeContext* ctx) const override {
     OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Pad");
     OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Pad");
-
-    auto x_dim = ctx->GetInputDim("X");
-    auto& paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
-    PADDLE_ENFORCE_EQ(
-        static_cast<int>(paddings.size()), x_dim.size() * 2,
-        platform::errors::InvalidArgument(
-            "Size of 'paddings' dimension should be equal to 2 * size of "
-            "Input(X)'s dimension, but received (size of 'paddings' dimension "
-            "is) %d vs (2 * size of Input(X)'s dimension is) %d.",
-            static_cast<int>(paddings.size()), x_dim.size() * 2));
-    for (size_t i = 0; i < paddings.size(); ++i) {
-      PADDLE_ENFORCE_GE(paddings[i], 0,
-                        platform::errors::InvalidArgument(
-                            "The element of 'paddings' should >= 0, but "
-                            "received %d for index %d.",
-                            paddings[i], static_cast<int>(i)));
-    }
-    std::vector<int64_t> out_dims(x_dim.size());
-    for (int i = 0; i < x_dim.size(); ++i) {
-      if ((!ctx->IsRuntime()) && (x_dim[i] == -1)) {
-        out_dims[i] = -1;
-      } else {
-        out_dims[i] = x_dim[i] + paddings[i * 2] + paddings[i * 2 + 1];
-      }
-    }
-    ctx->SetOutputDim("Out", phi::make_ddim(out_dims));
-    if (out_dims[0] == x_dim[0]) {
-      // Only pass LoD when the first dimension is equal between
-      // output and input.
-      ctx->ShareLoD("X", /*->*/ "Out");
-    }
   }
 };
 
@@ -160,47 +131,13 @@ class PadOpDoubleGradMaker : public framework::SingleGradOpMaker<T> {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
+DECLARE_INFER_SHAPE_FUNCTOR(pad, PadInferShapeFunctor,
+                            PD_INFER_META(phi::PadInferMeta));
 
 REGISTER_OPERATOR(pad, ops::PadOp, ops::PadOpMaker,
                   ops::PadOpGradMaker<paddle::framework::OpDesc>,
-                  ops::PadOpGradMaker<paddle::imperative::OpBase>);
+                  ops::PadOpGradMaker<paddle::imperative::OpBase>,
+                  PadInferShapeFunctor);
 REGISTER_OPERATOR(pad_grad, ops::PadOpGrad,
                   ops::PadOpDoubleGradMaker<paddle::framework::OpDesc>,
                   ops::PadOpDoubleGradMaker<paddle::imperative::OpBase>);
-REGISTER_OP_CPU_KERNEL(
-    pad, ops::PadKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::PadKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::PadKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::PadKernel<paddle::platform::CPUDeviceContext, int64_t>,
-    ops::PadKernel<paddle::platform::CPUDeviceContext,
-                   paddle::platform::complex<float>>,
-    ops::PadKernel<paddle::platform::CPUDeviceContext,
-                   paddle::platform::complex<double>>);
-REGISTER_OP_CPU_KERNEL(
-    pad_grad, ops::PadGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::PadGradKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::PadGradKernel<paddle::platform::CPUDeviceContext,
-                       paddle::platform::complex<float>>,
-    ops::PadGradKernel<paddle::platform::CPUDeviceContext,
-                       paddle::platform::complex<double>>);
-
-REGISTER_OP_CUDA_KERNEL(
-    pad, ops::PadKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::PadKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::PadKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::PadKernel<paddle::platform::CUDADeviceContext, int64_t>,
-    ops::PadKernel<paddle::platform::CUDADeviceContext,
-                   paddle::platform::float16>,
-    ops::PadKernel<paddle::platform::CUDADeviceContext,
-                   paddle::platform::complex<float>>,
-    ops::PadKernel<paddle::platform::CUDADeviceContext,
-                   paddle::platform::complex<double>>);
-REGISTER_OP_CUDA_KERNEL(
-    pad_grad, ops::PadGradKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::PadGradKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::PadGradKernel<paddle::platform::CUDADeviceContext,
-                       paddle::platform::float16>,
-    ops::PadGradKernel<paddle::platform::CUDADeviceContext,
-                       paddle::platform::complex<float>>,
-    ops::PadGradKernel<paddle::platform::CUDADeviceContext,
-                       paddle::platform::complex<double>>);
diff --git a/paddle/fluid/operators/pad_op.h b/paddle/fluid/operators/pad_op.h
deleted file mode 100644
index d494c954e1ef73b585761acf7490a5e35beccac4..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/pad_op.h
+++ /dev/null
@@ -1,63 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <utility>
-#include <vector>
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/padding.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-
-template <typename DeviceContext, typename T>
-class PadKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto pads = context.Attr<std::vector<int>>("paddings");
-    float pad_value = context.Attr<float>("pad_value");
-    auto* x = context.Input<Tensor>("X");
-    auto* out = context.Output<Tensor>("Out");
-    out->mutable_data<T>(context.GetPlace());
-
-    int rank = x->dims().size();
-    math::PaddingFunctor<DeviceContext, T>(rank, context, pads,
-                                           static_cast<T>(pad_value), *x, out);
-  }
-};
-
-template <typename DeviceContext, typename T>
-class PadGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto pads = context.Attr<std::vector<int>>("paddings");
-    auto* d_out = context.Input<Tensor>(framework::GradVarName("Out"));
-    auto* d_x = context.Output<Tensor>(framework::GradVarName("X"));
-    if (d_x == nullptr) {
-      return;
-    }
-
-    d_x->mutable_data<T>(context.GetPlace());
-    int rank = d_out->dims().size();
-    math::PaddingGradFunctor<DeviceContext, T>(rank, context, pads, *d_out,
-                                               d_x);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/pixel_shuffle_op.cc b/paddle/fluid/operators/pixel_shuffle_op.cc
index 2a127d9ad1db0c1e169fdd1e20a1568b99d228a0..21ca26f49f653d03e2710937d360091e0c4536df 100644
--- a/paddle/fluid/operators/pixel_shuffle_op.cc
+++ b/paddle/fluid/operators/pixel_shuffle_op.cc
@@ -124,8 +124,8 @@ class PixelShuffleGradOp : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-DELCARE_INFER_SHAPE_FUNCTOR(pixel_shuffle, PixelShuffleInferShapeFunctor,
-                            PT_INFER_META(phi::PixelShuffleInferMeta));
+DECLARE_INFER_SHAPE_FUNCTOR(pixel_shuffle, PixelShuffleInferShapeFunctor,
+                            PD_INFER_META(phi::PixelShuffleInferMeta));
 
 REGISTER_OPERATOR(pixel_shuffle, ops::PixelShuffleOp, ops::PixelShuffleOpMaker,
                   ops::PixelShuffleGradMaker<paddle::framework::OpDesc>,
diff --git a/paddle/fluid/operators/poisson_op.cc b/paddle/fluid/operators/poisson_op.cc
index 0cecbf0b9cb027f7032b7b20fb10ef06a79503df..d5896c4105932ef7327d7093a15cf50e87308ae5 100644
--- a/paddle/fluid/operators/poisson_op.cc
+++ b/paddle/fluid/operators/poisson_op.cc
@@ -87,8 +87,8 @@ class PoissonGradOpMaker : public framework::SingleGradOpMaker<T> {
 namespace ops = paddle::operators;
 namespace plat = paddle::platform;
 
-DELCARE_INFER_SHAPE_FUNCTOR(poisson, PoissonInferShapeFunctor,
-                            PT_INFER_META(phi::UnchangedInferMeta));
+DECLARE_INFER_SHAPE_FUNCTOR(poisson, PoissonInferShapeFunctor,
+                            PD_INFER_META(phi::UnchangedInferMeta));
 
 REGISTER_OPERATOR(poisson, ops::PoissonOp, ops::PoissonOpMaker,
                   ops::PoissonOpInferVarType,
diff --git a/paddle/fluid/operators/pool_cudnn_op.cu.cc b/paddle/fluid/operators/pool_cudnn_op.cu.cc
deleted file mode 100644
index 6335004e69a37109664940e4d3445e3694be9cc9..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/pool_cudnn_op.cu.cc
+++ /dev/null
@@ -1,567 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <string>
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/pool_op.h"
-#include "paddle/phi/kernels/funcs/math_function.h"
-#ifdef PADDLE_WITH_HIP
-#include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/framework/operator.h"
-#endif
-#include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-using ScopedTensorDescriptor = platform::ScopedTensorDescriptor;
-using ScopedPoolingDescriptor = platform::ScopedPoolingDescriptor;
-using DataLayout = platform::DataLayout;
-using PoolingMode = platform::PoolingMode;
-template <typename T>
-using ScalingParamType = typename platform::CudnnDataType<T>::ScalingParamType;
-
-DataLayout getLayoutFromStr(std::string data_format) {
-  if (data_format == "NHWC") {
-    return DataLayout::kNHWC;
-  } else if (data_format == "NCHW") {
-    return DataLayout::kNCHW;
-  } else if (data_format == "NCDHW") {
-    return DataLayout::kNCDHW;
-  } else {
-    return DataLayout::kNCDHW;
-  }
-}
-
-template <typename T>
-class PoolCUDNNOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    PADDLE_ENFORCE_EQ(
-        platform::is_gpu_place(ctx.GetPlace()), true,
-        platform::errors::InvalidArgument("Pool operator CUDA kernel must use "
-                                          "CUDAPlace rather than CPUPlace."));
-
-    const Tensor *input = ctx.Input<Tensor>("X");
-    Tensor *output = ctx.Output<Tensor>("Out");
-    output->mutable_data<T>(ctx.GetPlace());
-    std::string pooling_type = ctx.Attr<std::string>("pooling_type");
-    bool exclusive = ctx.Attr<bool>("exclusive");
-    bool adaptive = ctx.Attr<bool>("adaptive");
-    std::vector<int> ksize = ctx.Attr<std::vector<int>>("ksize");
-    std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
-    std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
-    std::string data_format = ctx.Attr<std::string>("data_format");
-    bool global_pooling = ctx.Attr<bool>("global_pooling");
-    std::string padding_algorithm = ctx.Attr<std::string>("padding_algorithm");
-    const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC");
-
-    // update paddings
-    auto in_x_dims = input->dims();
-    framework::DDim data_dims;
-    if (channel_last) {
-      data_dims = phi::slice_ddim(in_x_dims, 1, in_x_dims.size() - 1);
-    } else {
-      data_dims = phi::slice_ddim(in_x_dims, 2, in_x_dims.size());
-    }
-    UpdatePadding(&paddings, global_pooling, adaptive, padding_algorithm,
-                  data_dims, strides, ksize);
-    if (data_dims.size() * 2 == static_cast<int>(paddings.size())) {
-      for (int i = 0; i < data_dims.size(); ++i) {
-        paddings.erase(paddings.begin() + i + 1);
-      }
-    }
-
-    if (global_pooling) {
-      UpdateKsize(&ksize, data_dims);
-    }
-
-    const std::string str_NCHW = "NCHW", str_NHWC = "NHWC";
-    const std::string str_NCDHW = "NCDHW", str_NDHWC = "NDHWC";
-
-    // -----------------transformed tensor ------------------------
-
-    Tensor transformed_input(input->type());
-    Tensor transformed_output(output->type());
-    DataLayout layout;
-
-    if (data_format == str_NDHWC) {
-      layout = DataLayout::kNCDHW;
-      auto &dev_ctx =
-          ctx.template device_context<paddle::platform::CUDADeviceContext>();
-      std::vector<int> axis{0, 4, 1, 2, 3};
-
-      // input
-      transformed_input.Resize(input->dims());
-
-      auto in_dims_vec = phi::vectorize(input->dims());
-      in_dims_vec[1] = input->dims()[4];
-      in_dims_vec[2] = input->dims()[1];
-      in_dims_vec[3] = input->dims()[2];
-      in_dims_vec[4] = input->dims()[3];
-      transformed_input.Resize(phi::make_ddim(in_dims_vec));
-      transformed_input.mutable_data(ctx.GetPlace(), input->type());
-
-      phi::funcs::Transpose<paddle::platform::CUDADeviceContext, T, 5> trans5;
-      trans5(dev_ctx, *input, &transformed_input, axis);
-
-      // output
-      transformed_output.Resize(output->dims());
-
-      auto out_dims_vec = phi::vectorize(output->dims());
-      out_dims_vec[1] = output->dims()[4];
-      out_dims_vec[2] = output->dims()[1];
-      out_dims_vec[3] = output->dims()[2];
-      out_dims_vec[4] = output->dims()[3];
-      transformed_output.Resize(phi::make_ddim(out_dims_vec));
-#ifdef PADDLE_WITH_HIP
-      // MIOPEN not support NHWC data layout
-    } else if (data_format == str_NHWC) {
-      layout = DataLayout::kNCHW;
-      auto &dev_ctx =
-          ctx.template device_context<paddle::platform::CUDADeviceContext>();
-      std::vector<int> axis{0, 3, 1, 2};
-
-      transformed_input.Resize(input->dims());
-      auto in_dims_vec = phi::vectorize(input->dims());
-      in_dims_vec[1] = input->dims()[3];
-      in_dims_vec[2] = input->dims()[1];
-      in_dims_vec[3] = input->dims()[2];
-      transformed_input.Resize(phi::make_ddim(in_dims_vec));
-      transformed_input.mutable_data(ctx.GetPlace(), input->type());
-
-      phi::funcs::Transpose<paddle::platform::CUDADeviceContext, T, 4> trans;
-      trans(dev_ctx, *input, &transformed_input, axis);
-
-      transformed_output.Resize(output->dims());
-      auto out_dims_vec = phi::vectorize(output->dims());
-      out_dims_vec[1] = output->dims()[3];
-      out_dims_vec[2] = output->dims()[1];
-      out_dims_vec[3] = output->dims()[2];
-      transformed_output.Resize(phi::make_ddim(out_dims_vec));
-#endif
-    } else {
-      layout = getLayoutFromStr(data_format);
-      transformed_input = *input;
-      transformed_output = *output;
-    }
-
-    const T *tranformed_input_data = transformed_input.data<T>();
-    T *tranformed_output_data = transformed_output.mutable_data<T>(
-        transformed_output.dims(), ctx.GetPlace());
-
-    // ------------------- cudnn descriptors ---------------------
-    ScopedTensorDescriptor input_desc;
-    ScopedTensorDescriptor output_desc;
-    ScopedPoolingDescriptor pool_desc;
-
-#ifdef PADDLE_WITH_HIP
-    miopenTensorDescriptor_t cudnn_input_desc = input_desc.descriptor<T>(
-        layout, phi::vectorize<int>(transformed_input.dims()));
-    miopenTensorDescriptor_t cudnn_output_desc = output_desc.descriptor<T>(
-        layout, phi::vectorize<int>(transformed_output.dims()));
-#else
-    cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor<T>(
-        layout, phi::vectorize<int>(transformed_input.dims()));
-    cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor<T>(
-        layout, phi::vectorize<int>(transformed_output.dims()));
-#endif
-    PoolingMode pooling_mode;
-    if (pooling_type == "max") {
-      pooling_mode = PoolingMode::kMaximum;
-    } else {
-      pooling_mode = exclusive ? PoolingMode::kAverageExclusive
-                               : PoolingMode::kAverageInclusive;
-    }
-
-#ifdef PADDLE_WITH_HIP
-    miopenPoolingDescriptor_t cudnn_pool_desc =
-        pool_desc.descriptor(pooling_mode, ksize, paddings, strides);
-#else
-    cudnnPoolingDescriptor_t cudnn_pool_desc =
-        pool_desc.descriptor(pooling_mode, ksize, paddings, strides);
-#endif
-
-    // ------------------- cudnn pool algorithm ---------------------
-    auto handle = ctx.cuda_device_context().cudnn_handle();
-    ScalingParamType<T> alpha = 1.0f, beta = 0.0f;
-
-#ifdef PADDLE_WITH_HIP
-    char *pool_workspace;
-    size_t pool_worksize = 0;
-    PADDLE_ENFORCE_GPU_SUCCESS(
-        platform::dynload::miopenPoolingGetWorkSpaceSizeV2(
-            cudnn_pool_desc, cudnn_output_desc, &pool_worksize));
-    PADDLE_ENFORCE_GPU_SUCCESS(hipMalloc(&pool_workspace, pool_worksize));
-    PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenPoolingForward(
-        handle, cudnn_pool_desc, &alpha, cudnn_input_desc,
-        tranformed_input_data, &beta, cudnn_output_desc, tranformed_output_data,
-        false, pool_workspace, pool_worksize));
-    PADDLE_ENFORCE_GPU_SUCCESS(hipFree(pool_workspace));
-#else
-    PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnPoolingForward(
-        handle, cudnn_pool_desc, &alpha, cudnn_input_desc,
-        tranformed_input_data, &beta, cudnn_output_desc,
-        tranformed_output_data));
-#endif
-    // add
-    if (data_format == str_NDHWC) {
-      auto &dev_ctx =
-          ctx.template device_context<paddle::platform::CUDADeviceContext>();
-      std::vector<int> axis{0, 2, 3, 4, 1};
-      phi::funcs::Transpose<paddle::platform::CUDADeviceContext, T, 5>
-          trans5_v2;
-      trans5_v2(dev_ctx, transformed_output, output, axis);
-    }
-#ifdef PADDLE_WITH_HIP
-    // MIOPEN not support NHWC data layout
-    if (data_format == str_NHWC) {
-      auto &dev_ctx =
-          ctx.template device_context<paddle::platform::CUDADeviceContext>();
-      std::vector<int> axis{0, 2, 3, 1};
-      phi::funcs::Transpose<paddle::platform::CUDADeviceContext, T, 4> trans;
-      trans(dev_ctx, transformed_output, output, axis);
-    }
-#endif
-  }
-};
-
-template <typename T>
-class PoolCUDNNGradOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    PADDLE_ENFORCE_EQ(
-        platform::is_gpu_place(ctx.GetPlace()), true,
-        platform::errors::InvalidArgument("Pool operator CUDA kernel must use "
-                                          "CUDAPlace rather than CPUPlace."));
-
-    const Tensor *input = ctx.Input<Tensor>("X");
-    const Tensor *output = ctx.Input<Tensor>("Out");
-    const Tensor *output_grad =
-        ctx.Input<Tensor>(framework::GradVarName("Out"));
-    Tensor *input_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
-
-    std::string pooling_type = ctx.Attr<std::string>("pooling_type");
-    bool exclusive = ctx.Attr<bool>("exclusive");
-    bool adaptive = ctx.Attr<bool>("adaptive");
-    std::vector<int> ksize = ctx.Attr<std::vector<int>>("ksize");
-    std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
-    std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
-    std::string data_format = ctx.Attr<std::string>("data_format");
-    bool global_pooling = ctx.Attr<bool>("global_pooling");
-    std::string padding_algorithm = ctx.Attr<std::string>("padding_algorithm");
-    const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC");
-
-#ifdef PADDLE_WITH_HIP
-    if (pooling_type == "max") {
-      using OpKernelMap = paddle::framework::OperatorWithKernel::OpKernelMap;
-      using OpKernelFunc = paddle::framework::OperatorWithKernel::OpKernelFunc;
-      auto &all_op_kernels =
-          paddle::framework::OperatorWithKernel::AllOpKernels();
-      std::string op_type = "pool2d_grad";
-      auto kernels_iter = all_op_kernels.find(op_type);
-      PADDLE_ENFORCE_NE(
-          kernels_iter, all_op_kernels.end(),
-          platform::errors::Unavailable(
-              "There are no kernels which are registered in the %s operator.",
-              op_type));
-      OpKernelMap &kernels = kernels_iter->second;
-      paddle::framework::OpKernelType expected_kernel_key(
-          paddle::framework::ToDataType(typeid(T)), ctx.GetPlace());
-      auto kernel_iter = kernels.find(expected_kernel_key);
-      PADDLE_ENFORCE_NE(kernel_iter, kernels.end(),
-                        platform::errors::NotFound(
-                            "Operator (%s) does not have kernel for %s.",
-                            op_type, KernelTypeToString(expected_kernel_key)));
-      std::unique_ptr<OpKernelFunc> kernel_func_(
-          new OpKernelFunc(kernel_iter->second));
-      (*kernel_func_)(ctx);
-      return;
-    }
-#endif
-
-    // update paddings
-    auto in_x_dims = input->dims();
-    framework::DDim data_dims;
-    if (channel_last) {
-      data_dims = phi::slice_ddim(in_x_dims, 1, in_x_dims.size() - 1);
-    } else {
-      data_dims = phi::slice_ddim(in_x_dims, 2, in_x_dims.size());
-    }
-    UpdatePadding(&paddings, global_pooling, adaptive, padding_algorithm,
-                  data_dims, strides, ksize);
-    if (data_dims.size() * 2 == static_cast<int>(paddings.size())) {
-      for (int i = 0; i < data_dims.size(); ++i) {
-        paddings.erase(paddings.begin() + i + 1);
-      }
-    }
-
-    if (global_pooling) {
-      UpdateKsize(&ksize, data_dims);
-    }
-
-    // ------- tensor grad --------------
-    Tensor transformed_input(input->type());
-    Tensor transformed_output(output->type());
-    Tensor transformed_output_grad(output_grad->type());
-
-    input_grad->mutable_data<T>(ctx.GetPlace());
-    Tensor transformed_input_grad(input_grad->type());
-    DataLayout layout;
-    const std::string str_NCHW = "NCHW", str_NHWC = "NHWC";
-    const std::string str_NCDHW = "NCDHW", str_NDHWC = "NDHWC";
-    if (data_format == str_NDHWC) {
-      layout = DataLayout::kNCDHW;
-      auto &dev_ctx =
-          ctx.template device_context<paddle::platform::CUDADeviceContext>();
-      std::vector<int> axis{0, 4, 1, 2, 3};
-
-      // input
-      transformed_input.Resize(input->dims());
-      auto in_dims_vec = phi::vectorize(input->dims());
-      in_dims_vec[1] = input->dims()[4];
-      in_dims_vec[2] = input->dims()[1];
-      in_dims_vec[3] = input->dims()[2];
-      in_dims_vec[4] = input->dims()[3];
-      transformed_input.Resize(phi::make_ddim(in_dims_vec));
-      transformed_input.mutable_data(ctx.GetPlace(), input->type());
-
-      phi::funcs::Transpose<paddle::platform::CUDADeviceContext, T, 5> trans5;
-      trans5(dev_ctx, *input, &transformed_input, axis);
-
-      // output
-      transformed_output.Resize(output->dims());
-      auto out_dims_vec = phi::vectorize(output->dims());
-      out_dims_vec[1] = output->dims()[4];
-      out_dims_vec[2] = output->dims()[1];
-      out_dims_vec[3] = output->dims()[2];
-      out_dims_vec[4] = output->dims()[3];
-      transformed_output.Resize(phi::make_ddim(out_dims_vec));
-
-      transformed_output.mutable_data(ctx.GetPlace(), output->type());
-
-      phi::funcs::Transpose<paddle::platform::CUDADeviceContext, T, 5>
-          trans5_v2;
-      trans5_v2(dev_ctx, *output, &transformed_output, axis);
-
-      // output grad
-      transformed_output_grad.Resize(phi::make_ddim(out_dims_vec));
-      transformed_output_grad.mutable_data(ctx.GetPlace(), output_grad->type());
-
-      phi::funcs::Transpose<paddle::platform::CUDADeviceContext, T, 5>
-          trans5_v3;
-      trans5_v3(dev_ctx, *output_grad, &transformed_output_grad, axis);
-
-      // input grad
-      transformed_input_grad.Resize(phi::make_ddim(in_dims_vec));
-
-#ifdef PADDLE_WITH_HIP
-      // MIOPEN not support NHWC data layout
-    } else if (data_format == str_NHWC) {
-      layout = DataLayout::kNCHW;
-      auto &dev_ctx =
-          ctx.template device_context<paddle::platform::CUDADeviceContext>();
-      std::vector<int> axis{0, 3, 1, 2};
-
-      // input
-      transformed_input.Resize(input->dims());
-      auto in_dims_vec = phi::vectorize(input->dims());
-      in_dims_vec[1] = input->dims()[3];
-      in_dims_vec[2] = input->dims()[1];
-      in_dims_vec[3] = input->dims()[2];
-      transformed_input.Resize(phi::make_ddim(in_dims_vec));
-      transformed_input.mutable_data(ctx.GetPlace(), input->type());
-
-      phi::funcs::Transpose<paddle::platform::CUDADeviceContext, T, 4> trans4;
-      trans4(dev_ctx, *input, &transformed_input, axis);
-
-      // output
-      transformed_output.Resize(output->dims());
-      auto out_dims_vec = phi::vectorize(output->dims());
-      out_dims_vec[1] = output->dims()[3];
-      out_dims_vec[2] = output->dims()[1];
-      out_dims_vec[3] = output->dims()[2];
-      transformed_output.Resize(phi::make_ddim(out_dims_vec));
-
-      transformed_output.mutable_data(ctx.GetPlace(), output->type());
-
-      phi::funcs::Transpose<paddle::platform::CUDADeviceContext, T, 4>
-          trans4_v2;
-      trans4_v2(dev_ctx, *output, &transformed_output, axis);
-
-      // output grad
-      transformed_output_grad.Resize(phi::make_ddim(out_dims_vec));
-      transformed_output_grad.mutable_data(ctx.GetPlace(), output_grad->type());
-
-      phi::funcs::Transpose<paddle::platform::CUDADeviceContext, T, 4>
-          trans4_v3;
-      trans4_v3(dev_ctx, *output_grad, &transformed_output_grad, axis);
-
-      // input grad
-      transformed_input_grad.Resize(phi::make_ddim(in_dims_vec));
-#endif
-    } else {
-      layout = getLayoutFromStr(data_format);
-      transformed_input = *input;
-      transformed_output = *output;
-      transformed_output_grad = *output_grad;
-      transformed_input_grad = *input_grad;
-    }
-
-    const T *input_data = transformed_input.data<T>();
-    const T *output_data = transformed_output.data<T>();
-    const T *output_grad_data = transformed_output_grad.data<T>();
-
-    // ------------------- cudnn descriptors ---------------------
-    ScopedTensorDescriptor input_desc;
-    ScopedTensorDescriptor output_desc;
-    ScopedPoolingDescriptor pool_desc;
-
-#ifdef PADDLE_WITH_HIP
-    miopenTensorDescriptor_t cudnn_input_desc = input_desc.descriptor<T>(
-        layout, phi::vectorize<int>(transformed_input.dims()));
-    miopenTensorDescriptor_t cudnn_output_desc = output_desc.descriptor<T>(
-        layout, phi::vectorize<int>(transformed_output.dims()));
-#else
-    cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor<T>(
-        layout, phi::vectorize<int>(transformed_input.dims()));
-    cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor<T>(
-        layout, phi::vectorize<int>(transformed_output.dims()));
-#endif
-    PoolingMode pooling_mode;
-    if (pooling_type == "max") {
-      if (FLAGS_cudnn_deterministic) {
-        pooling_mode = PoolingMode::kMaximumDeterministic;
-      } else {
-        pooling_mode = PoolingMode::kMaximum;
-      }
-    } else {
-      pooling_mode = exclusive ? PoolingMode::kAverageExclusive
-                               : PoolingMode::kAverageInclusive;
-    }
-
-#ifdef PADDLE_WITH_HIP
-    miopenPoolingDescriptor_t cudnn_pool_desc =
-        pool_desc.descriptor(pooling_mode, ksize, paddings, strides);
-#else
-    cudnnPoolingDescriptor_t cudnn_pool_desc =
-        pool_desc.descriptor(pooling_mode, ksize, paddings, strides);
-#endif
-
-    // ------------------- cudnn pool algorithm ---------------------
-    auto handle = ctx.cuda_device_context().cudnn_handle();
-    ScalingParamType<T> alpha = 1.0f, beta = 0.0f;
-    if (input_grad) {
-      T *input_grad_data = transformed_input_grad.mutable_data<T>(
-          transformed_input_grad.dims(), ctx.GetPlace());
-// Because beta is zero, it is unnecessary to reset input_grad.
-#ifdef PADDLE_WITH_HIP
-      char *pool_workspace;
-      size_t pool_worksize = 0;
-      PADDLE_ENFORCE_GPU_SUCCESS(
-          platform::dynload::miopenPoolingGetWorkSpaceSizeV2(
-              cudnn_pool_desc, cudnn_output_desc, &pool_worksize));
-      PADDLE_ENFORCE_GPU_SUCCESS(hipMalloc(&pool_workspace, pool_worksize));
-      PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenPoolingBackward(
-          handle, cudnn_pool_desc, &alpha, cudnn_output_desc, output_data,
-          cudnn_output_desc, output_grad_data, cudnn_input_desc, input_data,
-          &beta, cudnn_input_desc, input_grad_data, pool_workspace));
-      PADDLE_ENFORCE_GPU_SUCCESS(hipFree(pool_workspace));
-#else
-      PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnPoolingBackward(
-          handle, cudnn_pool_desc, &alpha, cudnn_output_desc, output_data,
-          cudnn_output_desc, output_grad_data, cudnn_input_desc, input_data,
-          &beta, cudnn_input_desc, input_grad_data));
-#endif
-
-      if (data_format == str_NDHWC) {
-        auto &dev_ctx =
-            ctx.template device_context<paddle::platform::CUDADeviceContext>();
-        std::vector<int> axis{0, 2, 3, 4, 1};
-        phi::funcs::Transpose<paddle::platform::CUDADeviceContext, T, 5>
-            trans5_v4;
-        trans5_v4(dev_ctx, transformed_input_grad, input_grad, axis);
-      }
-#ifdef PADDLE_WITH_HIP
-      // MIOPEN not support NHWC data layout
-      if (data_format == str_NHWC) {
-        auto &dev_ctx =
-            ctx.template device_context<paddle::platform::CUDADeviceContext>();
-        std::vector<int> axis{0, 2, 3, 1};
-        phi::funcs::Transpose<paddle::platform::CUDADeviceContext, T, 4>
-            trans4_v4;
-        trans4_v4(dev_ctx, transformed_input_grad, input_grad, axis);
-      }
-#endif
-    }
-  }
-};
-
-template <typename T>
-class PoolCUDNNGradGradOpKernel : public PoolCUDNNOpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    std::string pooling_type = ctx.Attr<std::string>("pooling_type");
-    if (pooling_type == "max") {
-      PADDLE_THROW(platform::errors::InvalidArgument(
-          "Pool op grad grad only supports avgpool."));
-    } else {
-      PoolCUDNNOpKernel<T>::Compute(ctx);
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-
-#ifdef PADDLE_WITH_HIP
-// MIOPEN do not support double
-REGISTER_OP_KERNEL(pool2d, CUDNN, plat::CUDAPlace,
-                   ops::PoolCUDNNOpKernel<float>,
-                   ops::PoolCUDNNOpKernel<plat::float16>);
-REGISTER_OP_KERNEL(pool2d_grad, CUDNN, plat::CUDAPlace,
-                   ops::PoolCUDNNGradOpKernel<float>,
-                   ops::PoolCUDNNGradOpKernel<plat::float16>);
-
-REGISTER_OP_KERNEL(pool3d, CUDNN, plat::CUDAPlace,
-                   ops::PoolCUDNNOpKernel<float>,
-                   ops::PoolCUDNNOpKernel<plat::float16>);
-REGISTER_OP_KERNEL(pool3d_grad, CUDNN, plat::CUDAPlace,
-                   ops::PoolCUDNNGradOpKernel<float>);
-#else
-REGISTER_OP_KERNEL(pool2d, CUDNN, plat::CUDAPlace,
-                   ops::PoolCUDNNOpKernel<float>,
-                   ops::PoolCUDNNOpKernel<double>,
-                   ops::PoolCUDNNOpKernel<plat::float16>);
-REGISTER_OP_KERNEL(pool2d_grad, CUDNN, plat::CUDAPlace,
-                   ops::PoolCUDNNGradOpKernel<float>,
-                   ops::PoolCUDNNGradOpKernel<double>,
-                   ops::PoolCUDNNGradOpKernel<plat::float16>);
-REGISTER_OP_KERNEL(pool2d_grad_grad, CUDNN, plat::CUDAPlace,
-                   ops::PoolCUDNNGradGradOpKernel<float>,
-                   ops::PoolCUDNNGradGradOpKernel<double>,
-                   ops::PoolCUDNNGradGradOpKernel<plat::float16>);
-
-REGISTER_OP_KERNEL(pool3d, CUDNN, plat::CUDAPlace,
-                   ops::PoolCUDNNOpKernel<float>,
-                   ops::PoolCUDNNOpKernel<double>,
-                   ops::PoolCUDNNOpKernel<plat::float16>);
-REGISTER_OP_KERNEL(pool3d_grad, CUDNN, plat::CUDAPlace,
-                   ops::PoolCUDNNGradOpKernel<float>,
-                   ops::PoolCUDNNGradOpKernel<double>);
-#endif
diff --git a/paddle/fluid/operators/pool_op.cc b/paddle/fluid/operators/pool_op.cc
index ae095c2fa7aaa95cf667898b63a90988eb83caf0..44f3d8090e565c1581a49387db4b834b1abf8b62 100644
--- a/paddle/fluid/operators/pool_op.cc
+++ b/paddle/fluid/operators/pool_op.cc
@@ -15,6 +15,12 @@ limitations under the License. */
 #include "paddle/fluid/operators/pool_op.h"
 
 #include <unordered_map>
+#include "paddle/fluid/framework/infershape_utils.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/phi/core/infermeta_utils.h"
+#include "paddle/phi/infermeta/backward.h"
+#include "paddle/phi/infermeta/unary.h"
+
 #include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
 #ifdef PADDLE_WITH_MKLDNN
 #include "paddle/fluid/platform/mkldnn_helper.h"
@@ -23,125 +29,6 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-int PoolOutputSize(int input_size, int filter_size, int padding_1,
-                   int padding_2, int stride, bool ceil_mode) {
-  int output_size;
-  if (!ceil_mode) {
-    output_size =
-        (input_size - filter_size + padding_1 + padding_2) / stride + 1;
-  } else {
-    output_size =
-        (input_size - filter_size + padding_1 + padding_2 + stride - 1) /
-            stride +
-        1;
-  }
-  PADDLE_ENFORCE_GT(
-      output_size, 0,
-      platform::errors::InvalidArgument(
-          "the output size must be greater than 0. But received: "
-          "output_size = %d due to the settings of input_size(%d), "
-          "padding(%d,%d), "
-          "k_size(%d) and stride(%d). Please check again!",
-          output_size, input_size, padding_1, padding_2, filter_size, stride));
-  return output_size;
-}
-
-void PoolOp::InferShape(framework::InferShapeContext* ctx) const {
-  PADDLE_ENFORCE_EQ(
-      ctx->HasInput("X"), true,
-      platform::errors::NotFound("Input(X) of Pool operator is not found."));
-  PADDLE_ENFORCE_EQ(
-      ctx->HasOutput("Out"), true,
-      platform::errors::NotFound("Output(Out) of Pool operator is not found."));
-
-  std::string pooling_type = ctx->Attrs().Get<std::string>("pooling_type");
-  std::vector<int> ksize = ctx->Attrs().Get<std::vector<int>>("ksize");
-  std::vector<int> strides = ctx->Attrs().Get<std::vector<int>>("strides");
-  std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
-  bool ceil_mode = ctx->Attrs().Get<bool>("ceil_mode");
-  bool adaptive = ctx->Attrs().Get<bool>("adaptive");
-  bool global_pooling = ctx->Attrs().Get<bool>("global_pooling");
-  std::string data_format = ctx->Attrs().Get<std::string>("data_format");
-  std::string padding_algorithm =
-      ctx->Attrs().Get<std::string>("padding_algorithm");
-
-  auto in_x_dims = ctx->GetInputDim("X");
-  PADDLE_ENFORCE_EQ(
-      in_x_dims.size() == 4 || in_x_dims.size() == 5, true,
-      platform::errors::InvalidArgument(
-          "the input of Op(pool) should be 4-D or 5-D Tensor. But "
-          "received: %u-D Tensor and it's shape is [%s].",
-          in_x_dims.size(), in_x_dims));
-
-  PADDLE_ENFORCE_EQ(
-      in_x_dims.size() - ksize.size(), 2U,
-      platform::errors::InvalidArgument(
-          "the dimension of input minus the size of "
-          "Attr(ksize) must be euqal to 2 in Op(pool). "
-          "But received: the dimension of input minus the size "
-          "of Attr(ksize) is %d, the "
-          "input's dimension is %d, the shape of input "
-          "is [%s], the Attr(ksize)'s size is %d, the Attr(ksize) is [%s].",
-          in_x_dims.size() - ksize.size(), in_x_dims.size(), in_x_dims,
-          ksize.size(), phi::make_ddim(ksize)));
-
-  PADDLE_ENFORCE_EQ(
-      ksize.size(), strides.size(),
-      platform::errors::InvalidArgument(
-          "the size of Attr(ksize) and Attr(strides) in "
-          "Op(pool) must be equal. "
-          "But received: Attr(ksize)'s size is %d, Attr(strides)'s "
-          "size is %d, Attr(ksize) is [%s], Attr(strides)is [%s].",
-          ksize.size(), strides.size(), phi::make_ddim(ksize),
-          phi::make_ddim(strides)));
-
-  // MKL-DNN Kernels are using NCHW order of dims description
-  // so we ignore data_format consideration for MKL-DNN kernel
-  const bool channel_last = (ctx->IsRunMKLDNNKernel() == false) &&
-                            (data_format == "NHWC" || data_format == "NDHWC");
-
-  // update paddings if "SAME" or global_pooling
-  framework::DDim data_dims;
-  if (channel_last) {
-    data_dims = phi::slice_ddim(in_x_dims, 1, in_x_dims.size() - 1);
-  } else {
-    data_dims = phi::slice_ddim(in_x_dims, 2, in_x_dims.size());
-  }
-  UpdatePadding(&paddings, global_pooling, adaptive, padding_algorithm,
-                data_dims, strides, ksize);
-
-  if (global_pooling) {
-    UpdateKsize(&ksize, data_dims);
-  }
-
-  std::vector<int64_t> output_shape;
-  if (adaptive) {
-    output_shape.insert(output_shape.end(), ksize.begin(), ksize.end());
-  } else {
-    for (int i = 0; i < data_dims.size(); ++i) {
-      if ((!ctx->IsRuntime()) && (data_dims[i] < 0)) {
-        output_shape.push_back(data_dims[i]);
-      } else {
-        output_shape.push_back(
-            PoolOutputSize(data_dims[i], ksize[i], paddings[2 * i],
-                           paddings[2 * i + 1], strides[i], ceil_mode));
-      }
-    }
-  }
-
-  // output_N = input_N
-  output_shape.insert(output_shape.begin(), in_x_dims[0]);
-  // output_C = input_C
-  if (channel_last) {
-    output_shape.push_back(in_x_dims[in_x_dims.size() - 1]);
-  } else {
-    output_shape.insert(output_shape.begin() + 1, in_x_dims[1]);
-  }
-
-  ctx->SetOutputDim("Out", phi::make_ddim(output_shape));
-  ctx->ShareLoD("X", "Out");
-}
-
 bool CanMKLDNNSupportPool(const framework::ExecutionContext& ctx) {
   if (ctx.Attr<bool>("adaptive") == false) return true;
   // (jczaja): oneDNN is supporting only unchangable in size pool window
@@ -216,16 +103,6 @@ framework::OpKernelType PoolOp::GetKernelTypeForVar(
                                  tensor.place(), tensor.layout());
 }
 
-void PoolOpGrad::InferShape(framework::InferShapeContext* ctx) const {
-  PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true,
-                    platform::errors::NotFound(
-                        "Input(X) of Pool Gradoperator is not found."));
-  PADDLE_ENFORCE_EQ(ctx->HasOutput(framework::GradVarName("X")), true,
-                    platform::errors::NotFound(
-                        "Input(X@GRAD) of Pool Gradoperator is not found."));
-  ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
-}
-
 framework::OpKernelType PoolOpGrad::GetExpectedKernelType(
     const framework::ExecutionContext& ctx) const {
   framework::LibraryType library_{framework::LibraryType::kPlain};
@@ -471,7 +348,7 @@ class Pool2dOpGradGradMaker : public framework::SingleGradOpMaker<T> {
 
  protected:
   void Apply(GradOpPtr<T> grad_op) const override {
-    grad_op->SetType("pool2d_grad_grad");
+    grad_op->SetType("pool2d_double_grad");
     grad_op->SetInput("X", this->OutputGrad(framework::GradVarName("X")));
     grad_op->SetOutput("Out", this->InputGrad(framework::GradVarName("Out")));
     grad_op->SetAttrMap(this->Attrs());
@@ -692,35 +569,34 @@ Example:
 
 namespace ops = paddle::operators;
 
+DECLARE_INFER_SHAPE_FUNCTOR(pool2d, Pool2dInferShapeFunctor,
+                            PD_INFER_META(phi::PoolInferMeta));
+DECLARE_INFER_SHAPE_FUNCTOR(pool2d_grad, Pool2dGradInferShapeFunctor,
+                            PD_INFER_META(phi::PoolGradInferMeta));
+DECLARE_INFER_SHAPE_FUNCTOR(pool2d_double_grad,
+                            Pool2dDoubleGradInferShapeFunctor,
+                            PD_INFER_META(phi::PoolInferMeta));
+
 REGISTER_OPERATOR(
     pool2d, ops::PoolOp, ops::Pool2dOpMaker, ops::PoolOpInferVarType,
     paddle::framework::DefaultGradOpMaker<paddle::framework::OpDesc, true>,
-    paddle::framework::DefaultGradOpMaker<paddle::imperative::OpBase, true>);
+    paddle::framework::DefaultGradOpMaker<paddle::imperative::OpBase, true>,
+    Pool2dInferShapeFunctor);
 REGISTER_OPERATOR(pool2d_grad, ops::PoolOpGrad,
                   ops::Pool2dOpGradGradMaker<paddle::framework::OpDesc>,
-                  ops::Pool2dOpGradGradMaker<paddle::imperative::OpBase>);
-REGISTER_OPERATOR(pool2d_grad_grad, ops::PoolOp);
-
-REGISTER_OP_CPU_KERNEL(
-    pool2d, ops::PoolKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::PoolKernel<paddle::platform::CPUDeviceContext, double>);
-REGISTER_OP_CPU_KERNEL(
-    pool2d_grad, ops::PoolGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::PoolGradKernel<paddle::platform::CPUDeviceContext, double>);
-REGISTER_OP_CPU_KERNEL(
-    pool2d_grad_grad,
-    ops::PoolGradGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::PoolGradGradKernel<paddle::platform::CPUDeviceContext, double>);
+                  ops::Pool2dOpGradGradMaker<paddle::imperative::OpBase>,
+                  Pool2dGradInferShapeFunctor);
+REGISTER_OPERATOR(pool2d_double_grad, ops::PoolOp,
+                  Pool2dDoubleGradInferShapeFunctor);
+
+DECLARE_INFER_SHAPE_FUNCTOR(pool3d, Pool3dInferShapeFunctor,
+                            PD_INFER_META(phi::PoolInferMeta));
+DECLARE_INFER_SHAPE_FUNCTOR(pool3d_grad, Pool3dGradInferShapeFunctor,
+                            PD_INFER_META(phi::PoolGradInferMeta));
 
 REGISTER_OPERATOR(
     pool3d, ops::PoolOp, ops::Pool3dOpMaker, ops::PoolOpInferVarType,
     paddle::framework::DefaultGradOpMaker<paddle::framework::OpDesc, true>,
-    paddle::framework::DefaultGradOpMaker<paddle::imperative::OpBase, true>);
-REGISTER_OPERATOR(pool3d_grad, ops::PoolOpGrad);
-
-REGISTER_OP_CPU_KERNEL(
-    pool3d, ops::PoolKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::PoolKernel<paddle::platform::CPUDeviceContext, double>);
-REGISTER_OP_CPU_KERNEL(
-    pool3d_grad, ops::PoolGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::PoolGradKernel<paddle::platform::CPUDeviceContext, double>);
+    paddle::framework::DefaultGradOpMaker<paddle::imperative::OpBase, true>,
+    Pool3dInferShapeFunctor);
+REGISTER_OPERATOR(pool3d_grad, ops::PoolOpGrad, Pool3dGradInferShapeFunctor);
diff --git a/paddle/fluid/operators/pool_op.cu b/paddle/fluid/operators/pool_op.cu
deleted file mode 100644
index 069ce0c1fda853b943a7b414a7a33d9aa6405a89..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/pool_op.cu
+++ /dev/null
@@ -1,48 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/pool_op.h"
-
-namespace ops = paddle::operators;
-
-REGISTER_OP_CUDA_KERNEL(
-    pool2d, ops::PoolKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::PoolKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::PoolKernel<paddle::platform::CUDADeviceContext,
-                    paddle::platform::float16>);
-REGISTER_OP_CUDA_KERNEL(
-    pool2d_grad,
-    ops::PoolGradKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::PoolGradKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::PoolGradKernel<paddle::platform::CUDADeviceContext,
-                        paddle::platform::float16>);
-
-REGISTER_OP_CUDA_KERNEL(
-    pool2d_grad_grad,
-    ops::PoolGradGradKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::PoolGradGradKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::PoolGradGradKernel<paddle::platform::CUDADeviceContext,
-                            paddle::platform::float16>);
-
-REGISTER_OP_CUDA_KERNEL(
-    pool3d, ops::PoolKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::PoolKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::PoolKernel<paddle::platform::CUDADeviceContext,
-                    paddle::platform::float16>);
-REGISTER_OP_CUDA_KERNEL(
-    pool3d_grad,
-    ops::PoolGradKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::PoolGradKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::PoolGradKernel<paddle::platform::CUDADeviceContext,
-                        paddle::platform::float16>);
diff --git a/paddle/fluid/operators/pool_op.h b/paddle/fluid/operators/pool_op.h
index bea6506ee86dbfe3ac606a1e8e883bfbf2500f25..d48ac3bd358ef64271de69df4424399b427cfb82 100644
--- a/paddle/fluid/operators/pool_op.h
+++ b/paddle/fluid/operators/pool_op.h
@@ -12,19 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#pragma once
+// NOTE(Ruibiao): Difficult to remove code from this header file because too
+// many files rely on it through "mkldnn_reuse.h"
 
-#include <algorithm>
-#include <string>
-#include <vector>
+#pragma once
 
-#include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/pooling.h"
-#include "paddle/phi/kernels/funcs/math_function.h"
-#if defined(__HIPCC__) || defined(__NVCC__)
-#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h"
-#endif
 
 namespace paddle {
 namespace operators {
@@ -35,8 +28,6 @@ class PoolOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
-  void InferShape(framework::InferShapeContext* ctx) const override;
-
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override;
@@ -50,8 +41,6 @@ class PoolOpGrad : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
-  void InferShape(framework::InferShapeContext* ctx) const override;
-
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override;
@@ -71,292 +60,5 @@ class Pool3dOpMaker : public framework::OpProtoAndCheckerMaker {
   void Make() override;
 };
 
-template <typename T = int>
-inline void UpdatePadding(std::vector<T>* paddings, const bool global_pooling,
-                          const bool adaptive,
-                          const std::string padding_algorithm,
-                          const framework::DDim data_dims,
-                          const std::vector<T>& strides,
-                          const std::vector<T>& ksize) {
-  // set padding size == data_dims.size() * 2
-  auto data_shape = phi::vectorize<T>(data_dims);
-  if (static_cast<int>(paddings->size()) == data_dims.size()) {
-    for (int i = 0; i < data_dims.size(); ++i) {
-      T copy_pad = *(paddings->begin() + 2 * i);
-      paddings->insert(paddings->begin() + 2 * i + 1, copy_pad);
-    }
-  } else {
-    PADDLE_ENFORCE_EQ(data_dims.size() * 2, paddings->size(),
-                      platform::errors::InvalidArgument(
-                          "Paddings size %d should be the same or twice as the "
-                          "pooling size %d.",
-                          paddings->size(), data_dims.size() * 2));
-  }
-
-  // when padding_algorithm is "VALID" or "SAME"
-  if (padding_algorithm == "SAME") {
-    for (int i = 0; i < data_dims.size(); ++i) {
-      T out_size = (data_dims[i] + strides[i] - 1) / strides[i];
-      T pad_sum =
-          std::max((out_size - 1) * strides[i] + ksize[i] - data_shape[i],
-                   static_cast<T>(0));
-      T pad_0 = pad_sum / 2;
-      T pad_1 = pad_sum - pad_0;
-      *(paddings->begin() + i * 2) = pad_0;
-      *(paddings->begin() + i * 2 + 1) = pad_1;
-    }
-  } else if (padding_algorithm == "VALID") {
-    for (auto it = paddings->begin(); it != paddings->end(); it++) {
-      *it = 0;
-    }
-  }
-
-  // if global_pooling == true or adaptive == true, padding will be ignore
-  if (global_pooling || adaptive) {
-    for (auto it = paddings->begin(); it != paddings->end(); it++) {
-      *it = 0;
-    }
-  }
-}
-
-template <typename T = int>
-inline void UpdateKsize(std::vector<T>* ksize,
-                        const framework::DDim data_dims) {
-  ksize->resize(static_cast<size_t>(data_dims.size()));
-  for (size_t i = 0; i < ksize->size(); ++i) {
-    *(ksize->begin() + i) = static_cast<T>(data_dims[i]);
-  }
-}
-
-inline int getReduceNum(const framework::Tensor& input,
-                        const framework::Tensor* output,
-                        const std::string data_format,
-                        std::vector<int>* reduce_dim) {
-  // data_format only can be NCHW
-  bool channel_last = (data_format == "NHWC");
-  if (channel_last) {
-    return 0;
-  }
-  int reduce_num = 0;
-  const int output_height = output->dims()[2];
-  const int output_width = output->dims()[3];
-  if ((output_height == 1) && (output_width == 1)) {
-    reduce_dim->push_back(2);
-    reduce_dim->push_back(3);
-    reduce_num = input.dims()[2] * input.dims()[3];
-  }
-  return reduce_num;
-}
-
-template <typename DeviceContext, typename T>
-class PoolKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    const Tensor* in_x = context.Input<Tensor>("X");
-    Tensor* out = context.Output<Tensor>("Out");
-
-    std::string pooling_type = context.Attr<std::string>("pooling_type");
-    std::vector<int> ksize = context.Attr<std::vector<int>>("ksize");
-    std::vector<int> strides = context.Attr<std::vector<int>>("strides");
-    std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
-    std::string data_format = context.Attr<std::string>("data_format");
-    bool exclusive = context.Attr<bool>("exclusive");
-    bool adaptive = context.Attr<bool>("adaptive");
-    bool global_pooling = context.Attr<bool>("global_pooling");
-    std::string padding_algorithm =
-        context.Attr<std::string>("padding_algorithm");
-
-    const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC");
-
-    // update paddings
-    auto in_x_dims = in_x->dims();
-    framework::DDim data_dims;
-    if (channel_last) {
-      data_dims = phi::slice_ddim(in_x_dims, 1, in_x_dims.size() - 1);
-    } else {
-      data_dims = phi::slice_ddim(in_x_dims, 2, in_x_dims.size());
-    }
-
-    UpdatePadding(&paddings, global_pooling, adaptive, padding_algorithm,
-                  data_dims, strides, ksize);
-    if (data_dims.size() * 2 == static_cast<int>(paddings.size())) {
-      for (int i = 0; i < data_dims.size(); ++i) {
-        paddings.erase(paddings.begin() + i + 1);
-      }
-    }
-
-    if (global_pooling) {
-      UpdateKsize(&ksize, data_dims);
-    }
-    auto& dev_ctx = context.template device_context<DeviceContext>();
-    switch (ksize.size()) {
-      case 2: {
-        if (pooling_type == "max") {
-          paddle::operators::math::Pool2dFunctor<
-              DeviceContext, paddle::operators::math::MaxPool<T>, T>
-              pool2d_forward;
-          paddle::operators::math::MaxPool<T> pool_process;
-          pool2d_forward(dev_ctx, *in_x, ksize, strides, paddings, data_format,
-                         true, false, out, pool_process);
-
-        } else if (pooling_type == "avg") {
-          std::vector<int> reduce_dim;
-          int reduce_num = getReduceNum(*in_x, out, data_format, &reduce_dim);
-          if (reduce_num > 0 &&
-              adaptive) {  // for adaptive_avg_pool2d && output_size == 1
-#if defined(__HIPCC__) || defined(__NVCC__)
-            auto stream = dev_ctx.stream();
-            TensorReduceImpl<T, T, kps::AddFunctor, kps::DivideFunctor<T>>(
-                dev_ctx, *in_x, out, kps::DivideFunctor<T>(reduce_num),
-                reduce_dim, stream);
-#else  // for cpu
-            paddle::operators::math::Pool2dFunctor<
-                DeviceContext, paddle::operators::math::AvgPool<T>, T>
-                pool2d_forward;
-            paddle::operators::math::AvgPool<T> pool_process;
-            pool2d_forward(dev_ctx, *in_x, ksize, strides, paddings,
-                           data_format, exclusive, adaptive, out, pool_process);
-#endif
-          } else {  // avgpool_2d or  adaptive_avg_pool2d && output_size != 1
-            paddle::operators::math::Pool2dFunctor<
-                DeviceContext, paddle::operators::math::AvgPool<T>, T>
-                pool2d_forward;
-            paddle::operators::math::AvgPool<T> pool_process;
-            pool2d_forward(dev_ctx, *in_x, ksize, strides, paddings,
-                           data_format, exclusive, adaptive, out, pool_process);
-          }
-        }
-      } break;
-      case 3: {
-        if (pooling_type == "max") {
-          paddle::operators::math::Pool3dFunctor<
-              DeviceContext, paddle::operators::math::MaxPool<T>, T>
-              pool3d_forward;
-          paddle::operators::math::MaxPool<T> pool_process;
-          pool3d_forward(dev_ctx, *in_x, ksize, strides, paddings, data_format,
-                         true, false, out, pool_process);
-
-        } else if (pooling_type == "avg") {
-          paddle::operators::math::Pool3dFunctor<
-              DeviceContext, paddle::operators::math::AvgPool<T>, T>
-              pool3d_forward;
-          paddle::operators::math::AvgPool<T> pool_process;
-          pool3d_forward(dev_ctx, *in_x, ksize, strides, paddings, data_format,
-                         exclusive, adaptive, out, pool_process);
-        }
-      } break;
-      default: {
-        PADDLE_THROW(platform::errors::InvalidArgument(
-            "Pool op only supports 2D and 3D input."));
-      }
-    }
-  }
-};
-
-template <typename DeviceContext, typename T>
-class PoolGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    const Tensor* in_x = context.Input<Tensor>("X");
-    const Tensor* out = context.Input<Tensor>("Out");
-    const Tensor* out_grad =
-        context.Input<Tensor>(framework::GradVarName("Out"));
-    Tensor* in_x_grad = context.Output<Tensor>(framework::GradVarName("X"));
-
-    std::string pooling_type = context.Attr<std::string>("pooling_type");
-    std::vector<int> ksize = context.Attr<std::vector<int>>("ksize");
-    std::vector<int> strides = context.Attr<std::vector<int>>("strides");
-    std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
-    bool exclusive = context.Attr<bool>("exclusive");
-    bool adaptive = context.Attr<bool>("adaptive");
-    std::string data_format = context.Attr<std::string>("data_format");
-    bool global_pooling = context.Attr<bool>("global_pooling");
-    std::string padding_algorithm =
-        context.Attr<std::string>("padding_algorithm");
-
-    const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC");
-
-    // update paddings
-    auto in_x_dims = in_x->dims();
-    framework::DDim data_dims;
-    if (channel_last) {
-      data_dims = phi::slice_ddim(in_x_dims, 1, in_x_dims.size() - 1);
-    } else {
-      data_dims = phi::slice_ddim(in_x_dims, 2, in_x_dims.size());
-    }
-    UpdatePadding(&paddings, global_pooling, adaptive, padding_algorithm,
-                  data_dims, strides, ksize);
-    if (data_dims.size() * 2 == static_cast<int>(paddings.size())) {
-      for (int i = 0; i < data_dims.size(); ++i) {
-        paddings.erase(paddings.begin() + i + 1);
-      }
-    }
-
-    if (global_pooling) {
-      UpdateKsize(&ksize, data_dims);
-    }
-
-    auto& dev_ctx = context.template device_context<DeviceContext>();
-    if (in_x_grad) {
-      in_x_grad->mutable_data<T>(context.GetPlace());
-      phi::funcs::SetConstant<DeviceContext, T> set_constant;
-      set_constant(dev_ctx, in_x_grad, static_cast<T>(0.0));
-
-      switch (ksize.size()) {
-        case 2: {
-          if (pooling_type == "max") {
-            paddle::operators::math::MaxPool2dGradFunctor<DeviceContext, T>
-                pool2d_backward;
-            pool2d_backward(dev_ctx, *in_x, *out, *out_grad, ksize, strides,
-                            paddings, data_format, in_x_grad);
-          } else if (pooling_type == "avg") {
-            paddle::operators::math::Pool2dGradFunctor<
-                DeviceContext, paddle::operators::math::AvgPoolGrad<T>, T>
-                pool2d_backward;
-            paddle::operators::math::AvgPoolGrad<T> pool_process;
-            pool2d_backward(dev_ctx, *in_x, *out, *out_grad, ksize, strides,
-                            paddings, data_format, exclusive, adaptive,
-                            in_x_grad, pool_process);
-          }
-        } break;
-        case 3: {
-          if (pooling_type == "max") {
-            paddle::operators::math::MaxPool3dGradFunctor<DeviceContext, T>
-                pool3d_backward;
-            pool3d_backward(dev_ctx, *in_x, *out, *out_grad, ksize, strides,
-                            paddings, data_format, in_x_grad);
-          } else if (pooling_type == "avg") {
-            paddle::operators::math::Pool3dGradFunctor<
-                DeviceContext, paddle::operators::math::AvgPoolGrad<T>, T>
-                pool3d_backward;
-            paddle::operators::math::AvgPoolGrad<T> pool_process;
-            pool3d_backward(dev_ctx, *in_x, *out, *out_grad, ksize, strides,
-                            paddings, data_format, exclusive, adaptive,
-                            in_x_grad, pool_process);
-          }
-        } break;
-        default: {
-          PADDLE_THROW(platform::errors::InvalidArgument(
-              "Pool op only supports 2D and 3D input."));
-        }
-      }
-    }
-  }
-};
-
-template <typename DeviceContext, typename T>
-class PoolGradGradKernel : public PoolKernel<DeviceContext, T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    std::string pooling_type = context.Attr<std::string>("pooling_type");
-    if (pooling_type == "max") {
-      PADDLE_THROW(platform::errors::InvalidArgument(
-          "Pool op grad grad only supports avgpool."));
-    } else {
-      PoolKernel<DeviceContext, T>::Compute(context);
-    }
-  }
-};
-
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/pool_op_mlu.cc b/paddle/fluid/operators/pool_op_mlu.cc
index 08656e64231b61181583cb700f2cc3216e25e516..fa88d128a9a1d572414a6459933a8988cae1fda0 100644
--- a/paddle/fluid/operators/pool_op_mlu.cc
+++ b/paddle/fluid/operators/pool_op_mlu.cc
@@ -12,8 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/pool_op.h"
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/mlu/mlu_baseop.h"
+#include "paddle/phi/kernels/funcs/pooling.h"
 
 namespace paddle {
 namespace operators {
@@ -80,10 +81,10 @@ class MLUPoolOpKernel : public framework::OpKernel<T> {
       data_dims = phi::slice_ddim(in_x_dims, 1, in_x_dims.size() - 1);
     }
 
-    UpdatePadding(&paddings, global_pooling, adaptive, padding_algorithm,
-                  data_dims, strides, ksize);
+    phi::funcs::UpdatePadding(&paddings, global_pooling, adaptive,
+                              padding_algorithm, data_dims, strides, ksize);
     if (global_pooling) {
-      UpdateKsize(&ksize, data_dims);
+      phi::funcs::UpdateKernelSize(&ksize, data_dims);
     }
 
     MLUCnnlTensorDesc in_x_desc(*in_x, cnnl_layout, ToCnnlDataType<T>());
@@ -191,10 +192,10 @@ class MLUPoolGradOpKernel : public framework::OpKernel<T> {
       data_dims = phi::slice_ddim(in_x_dims, 1, in_x_dims.size() - 1);
     }
 
-    UpdatePadding(&paddings, global_pooling, adaptive, padding_algorithm,
-                  data_dims, strides, ksize);
+    phi::funcs::UpdatePadding(&paddings, global_pooling, adaptive,
+                              padding_algorithm, data_dims, strides, ksize);
     if (global_pooling) {
-      UpdateKsize(&ksize, data_dims);
+      phi::funcs::UpdateKernelSize(&ksize, data_dims);
     }
 
     // inputs need with NHWC layout
diff --git a/paddle/fluid/operators/pool_op_npu.cc b/paddle/fluid/operators/pool_op_npu.cc
index bd26d6350d9c300949edb1a90b244a7c747dd7a9..0efcb8b7981c32e9f8d5a04f4fd4122d6725a49e 100644
--- a/paddle/fluid/operators/pool_op_npu.cc
+++ b/paddle/fluid/operators/pool_op_npu.cc
@@ -11,8 +11,10 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include "paddle/fluid/operators/pool_op.h"
+
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/device/npu/npu_op_runner.h"
+#include "paddle/phi/kernels/funcs/pooling.h"
 
 namespace paddle {
 namespace operators {
@@ -68,8 +70,8 @@ class NPUPoolOpKernel : public framework::OpKernel<T> {
       strides_vec[2] = strides[0];
       strides_vec[3] = strides[1];
     }
-    UpdatePadding(&paddings, global_pooling, adaptive, padding_algorithm,
-                  data_dims, strides, ksize);
+    phi::funcs::UpdatePadding(&paddings, global_pooling, adaptive,
+                              padding_algorithm, data_dims, strides, ksize);
     PADDLE_ENFORCE_LT(
         std::max(paddings[0], paddings[1]), ksize[0],
         platform::errors::InvalidArgument(
@@ -201,8 +203,8 @@ class NPUPoolGradOpKernel : public framework::OpKernel<T> {
       strides_vec[2] = strides[0];
       strides_vec[3] = strides[1];
     }
-    UpdatePadding(&paddings, global_pooling, adaptive, padding_algorithm,
-                  data_dims, strides, ksize);
+    phi::funcs::UpdatePadding(&paddings, global_pooling, adaptive,
+                              padding_algorithm, data_dims, strides, ksize);
 
     PADDLE_ENFORCE_LT(
         std::max(paddings[0], paddings[1]), ksize[0],
diff --git a/paddle/fluid/operators/pool_op_xpu.cc b/paddle/fluid/operators/pool_op_xpu.cc
index 402dd6c10803947f73e593d215d28246a81c6706..87c437d8a78e0122b0fc4f5a7dbf51612e40fbf2 100644
--- a/paddle/fluid/operators/pool_op_xpu.cc
+++ b/paddle/fluid/operators/pool_op_xpu.cc
@@ -8,13 +8,17 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include "paddle/fluid/operators/pool_op.h"
+
 #include <unordered_map>
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/tensor.h"
 
 #ifdef PADDLE_WITH_XPU
 namespace paddle {
 namespace operators {
 
+using framework::Tensor;
+
 xpu::Pooling_t XPUPoolingType(const std::string& pooltype, bool exclusive,
                               bool is_test) {
   if (pooltype == "max") {
diff --git a/paddle/fluid/operators/pool_with_index_op.cc b/paddle/fluid/operators/pool_with_index_op.cc
index e0c24935b47509dbe473a963240f4234e168a293..e0341f4a4b4716d0ee82c9437ddc4d8bd1e35fb2 100644
--- a/paddle/fluid/operators/pool_with_index_op.cc
+++ b/paddle/fluid/operators/pool_with_index_op.cc
@@ -12,8 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/pool_with_index_op.h"
 #include <memory>
+#include "paddle/fluid/framework/infershape_utils.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/phi/core/infermeta_utils.h"
+#include "paddle/phi/infermeta/backward.h"
+#include "paddle/phi/infermeta/unary.h"
 
 namespace paddle {
 namespace operators {
@@ -28,67 +32,6 @@ class MaxPoolWithIndexOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true,
-                      platform::errors::InvalidArgument(
-                          "Input(X) of Pooling should not be null."));
-    PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true,
-                      platform::errors::InvalidArgument(
-                          "Output(Out) of Pooling should not be null."));
-    PADDLE_ENFORCE_EQ(ctx->HasOutput("Mask"), true,
-                      platform::errors::InvalidArgument(
-                          "Output(Mask) of Pooling should not be null."));
-
-    auto in_x_dims = ctx->GetInputDim("X");
-
-    std::vector<int> ksize = ctx->Attrs().Get<std::vector<int>>("ksize");
-    std::vector<int> strides = ctx->Attrs().Get<std::vector<int>>("strides");
-    std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
-    bool adaptive = ctx->Attrs().Get<bool>("adaptive");
-
-    PADDLE_ENFORCE(
-        in_x_dims.size() == 4 || in_x_dims.size() == 5,
-        platform::errors::InvalidArgument("Pooling intput should be 4-D or 5-D "
-                                          "tensor but received %dD-Tensor",
-                                          in_x_dims.size()));
-
-    if (ctx->Attrs().Get<bool>("global_pooling")) {
-      ksize.resize(static_cast<size_t>(in_x_dims.size()) - 2);
-      for (size_t i = 0; i < ksize.size(); ++i) {
-        paddings[i] = 0;
-        ksize[i] = static_cast<int>(in_x_dims[i + 2]);
-      }
-    }
-
-    PADDLE_ENFORCE_EQ(
-        in_x_dims.size() - ksize.size(), 2U,
-        platform::errors::InvalidArgument(
-            "The input size %d minus the kernel size %d should equal to 2.",
-            in_x_dims.size(), ksize.size()));
-    PADDLE_ENFORCE_EQ(
-        ksize.size(), strides.size(),
-        platform::errors::InvalidArgument(
-            "Strides size %d and pooling size %d should be the same.",
-            strides.size(), ksize.size()));
-    PADDLE_ENFORCE_EQ(
-        ksize.size(), paddings.size(),
-        platform::errors::InvalidArgument(
-            "Paddings size %d and pooling size %d should be the same.",
-            paddings.size(), ksize.size()));
-
-    std::vector<int64_t> output_shape({in_x_dims[0], in_x_dims[1]});
-    if (adaptive) {
-      output_shape.insert(output_shape.end(), ksize.begin(), ksize.end());
-    } else {
-      for (size_t i = 0; i < ksize.size(); ++i) {
-        output_shape.push_back(MaxPoolOutputSize(in_x_dims[i + 2], ksize[i],
-                                                 paddings[i], strides[i]));
-      }
-    }
-    ctx->SetOutputDim("Out", phi::make_ddim(output_shape));
-    ctx->SetOutputDim("Mask", phi::make_ddim(output_shape));
-  }
-
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
@@ -102,22 +45,6 @@ class MaxPoolWithIndexOpGrad : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE_EQ(
-        ctx->HasInput("Mask"), true,
-        platform::errors::InvalidArgument("Input(Mask) must not be null."));
-    PADDLE_ENFORCE_EQ(
-        ctx->HasInput("X"), true,
-        platform::errors::InvalidArgument("Input(X) must not be null."));
-    PADDLE_ENFORCE_EQ(ctx->HasInput(framework::GradVarName("Out")), true,
-                      platform::errors::InvalidArgument(
-                          "Input(Out@GRAD) should not be null."));
-    PADDLE_ENFORCE_EQ(ctx->HasOutput(framework::GradVarName("X")), true,
-                      platform::errors::InvalidArgument(
-                          "Output(X@GRAD) should not be null."));
-    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
-  }
-
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
@@ -331,40 +258,34 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(
 
 namespace ops = paddle::operators;
 
+DECLARE_INFER_SHAPE_FUNCTOR(max_pool2d_with_index,
+                            MaxPool2dWithIndexInferShapeFunctor,
+                            PD_INFER_META(phi::MaxPoolWithIndexInferMeta));
+DECLARE_INFER_SHAPE_FUNCTOR(max_pool2d_with_index_grad,
+                            MaxPool2dWithIndexGradInferShapeFunctor,
+                            PD_INFER_META(phi::MaxPoolWithIndexGradInferMeta));
+
 REGISTER_OPERATOR(max_pool2d_with_index, ops::MaxPoolWithIndexOp,
                   ops::MaxPool2dWithIndexOpMaker,
                   ops::MaxPoolWithIndexGradOpMaker<paddle::framework::OpDesc>,
-                  ops::MaxPoolWithIndexGradOpMaker<paddle::imperative::OpBase>);
+                  ops::MaxPoolWithIndexGradOpMaker<paddle::imperative::OpBase>,
+                  MaxPool2dWithIndexInferShapeFunctor);
 REGISTER_OPERATOR(max_pool2d_with_index_grad, ops::MaxPoolWithIndexOpGrad,
-                  ops::MaxPoolWithIndexOpGradNoNeedBufferVarsInferer);
+                  ops::MaxPoolWithIndexOpGradNoNeedBufferVarsInferer,
+                  MaxPool2dWithIndexGradInferShapeFunctor);
 
-REGISTER_OP_CPU_KERNEL(
-    max_pool2d_with_index,
-    ops::MaxPoolWithIndexKernel<paddle::platform::CPUDeviceContext, float, int>,
-    ops::MaxPoolWithIndexKernel<paddle::platform::CPUDeviceContext, double,
-                                int>);
-REGISTER_OP_CPU_KERNEL(
-    max_pool2d_with_index_grad,
-    ops::MaxPoolWithIndexGradKernel<paddle::platform::CPUDeviceContext, float,
-                                    int>,
-    ops::MaxPoolWithIndexGradKernel<paddle::platform::CPUDeviceContext, double,
-                                    int>);
+DECLARE_INFER_SHAPE_FUNCTOR(max_pool3d_with_index,
+                            MaxPool3dWithIndexInferShapeFunctor,
+                            PD_INFER_META(phi::MaxPoolWithIndexInferMeta));
+DECLARE_INFER_SHAPE_FUNCTOR(max_pool3d_with_index_grad,
+                            MaxPool3dWithIndexGradInferShapeFunctor,
+                            PD_INFER_META(phi::MaxPoolWithIndexGradInferMeta));
 
 REGISTER_OPERATOR(max_pool3d_with_index, ops::MaxPoolWithIndexOp,
                   ops::MaxPool3dWithIndexOpMaker,
                   ops::MaxPoolWithIndexGradOpMaker<paddle::framework::OpDesc>,
-                  ops::MaxPoolWithIndexGradOpMaker<paddle::imperative::OpBase>);
+                  ops::MaxPoolWithIndexGradOpMaker<paddle::imperative::OpBase>,
+                  MaxPool3dWithIndexInferShapeFunctor);
 REGISTER_OPERATOR(max_pool3d_with_index_grad, ops::MaxPoolWithIndexOpGrad,
-                  ops::MaxPoolWithIndexOpGradNoNeedBufferVarsInferer);
-
-REGISTER_OP_CPU_KERNEL(
-    max_pool3d_with_index,
-    ops::MaxPoolWithIndexKernel<paddle::platform::CPUDeviceContext, float, int>,
-    ops::MaxPoolWithIndexKernel<paddle::platform::CPUDeviceContext, double,
-                                int>);
-REGISTER_OP_CPU_KERNEL(
-    max_pool3d_with_index_grad,
-    ops::MaxPoolWithIndexGradKernel<paddle::platform::CPUDeviceContext, float,
-                                    int>,
-    ops::MaxPoolWithIndexGradKernel<paddle::platform::CPUDeviceContext, double,
-                                    int>);
+                  ops::MaxPoolWithIndexOpGradNoNeedBufferVarsInferer,
+                  MaxPool3dWithIndexGradInferShapeFunctor);
diff --git a/paddle/fluid/operators/pool_with_index_op.cu.cc b/paddle/fluid/operators/pool_with_index_op.cu.cc
deleted file mode 100644
index 5497dcbd9ce255f833df24989d7a76c40bcbca06..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/pool_with_index_op.cu.cc
+++ /dev/null
@@ -1,43 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/pool_with_index_op.h"
-
-namespace ops = paddle::operators;
-
-REGISTER_OP_CUDA_KERNEL(
-    max_pool2d_with_index,
-    ops::MaxPoolWithIndexKernel<paddle::platform::CUDADeviceContext, float,
-                                int>,
-    ops::MaxPoolWithIndexKernel<paddle::platform::CUDADeviceContext, double,
-                                int>);
-REGISTER_OP_CUDA_KERNEL(
-    max_pool2d_with_index_grad,
-    ops::MaxPoolWithIndexGradKernel<paddle::platform::CUDADeviceContext, float,
-                                    int>,
-    ops::MaxPoolWithIndexGradKernel<paddle::platform::CUDADeviceContext, double,
-                                    int>);
-
-REGISTER_OP_CUDA_KERNEL(
-    max_pool3d_with_index,
-    ops::MaxPoolWithIndexKernel<paddle::platform::CUDADeviceContext, float,
-                                int>,
-    ops::MaxPoolWithIndexKernel<paddle::platform::CUDADeviceContext, double,
-                                int>);
-REGISTER_OP_CUDA_KERNEL(
-    max_pool3d_with_index_grad,
-    ops::MaxPoolWithIndexGradKernel<paddle::platform::CUDADeviceContext, float,
-                                    int>,
-    ops::MaxPoolWithIndexGradKernel<paddle::platform::CUDADeviceContext, double,
-                                    int>);
diff --git a/paddle/fluid/operators/pool_with_index_op.h b/paddle/fluid/operators/pool_with_index_op.h
deleted file mode 100644
index 6e51a833f5c89efc2621c0ccc3d08dc42b2733a1..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/pool_with_index_op.h
+++ /dev/null
@@ -1,121 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <vector>
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/pooling.h"
-#include "paddle/phi/kernels/funcs/math_function.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-
-template <typename DeviceContext, typename T1, typename T2>
-class MaxPoolWithIndexKernel : public framework::OpKernel<T1> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    const Tensor* in_x = context.Input<Tensor>("X");
-    Tensor* out = context.Output<Tensor>("Out");
-    Tensor* mask = context.Output<Tensor>("Mask");
-
-    std::vector<int> ksize = context.Attr<std::vector<int>>("ksize");
-    std::vector<int> strides = context.Attr<std::vector<int>>("strides");
-    std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
-    bool adaptive = context.Attr<bool>("adaptive");
-
-    auto& dev_ctx = context.template device_context<DeviceContext>();
-    if (context.Attr<bool>("global_pooling")) {
-      for (size_t i = 0; i < ksize.size(); ++i) {
-        paddings[i] = 0;
-        ksize[i] = static_cast<int>(in_x->dims()[i + 2]);
-      }
-    }
-
-    switch (ksize.size()) {
-      case 2: {
-        paddle::operators::math::MaxPool2dWithIndexFunctor<DeviceContext, T1,
-                                                           T2>
-            pool2d_forward;
-        pool2d_forward(dev_ctx, *in_x, ksize, strides, paddings, adaptive, out,
-                       mask);
-      } break;
-      case 3: {
-        paddle::operators::math::MaxPool3dWithIndexFunctor<DeviceContext, T1,
-                                                           T2>
-            pool3d_forward;
-        pool3d_forward(dev_ctx, *in_x, ksize, strides, paddings, adaptive, out,
-                       mask);
-      } break;
-      default: {
-        PADDLE_THROW(platform::errors::InvalidArgument(
-            "Pool op only supports 2D and 3D input."));
-      }
-    }
-  }
-};
-
-template <typename DeviceContext, typename T1, typename T2>
-class MaxPoolWithIndexGradKernel : public framework::OpKernel<T1> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    const Tensor* mask = context.Input<Tensor>("Mask");
-    const Tensor* out_grad =
-        context.Input<Tensor>(framework::GradVarName("Out"));
-    Tensor* in_x_grad = context.Output<Tensor>(framework::GradVarName("X"));
-
-    std::vector<int> ksize = context.Attr<std::vector<int>>("ksize");
-    std::vector<int> strides = context.Attr<std::vector<int>>("strides");
-    std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
-    bool adaptive = context.Attr<bool>("adaptive");
-    if (context.Attr<bool>("global_pooling")) {
-      for (size_t i = 0; i < ksize.size(); ++i) {
-        paddings[i] = 0;
-        ksize[i] = static_cast<int>(in_x_grad->dims()[i + 2]);
-      }
-    }
-
-    if (in_x_grad) {
-      in_x_grad->mutable_data<T1>(context.GetPlace());
-      auto& device_ctx = context.template device_context<DeviceContext>();
-      phi::funcs::set_constant(device_ctx, in_x_grad, 0);
-
-      switch (ksize.size()) {
-        case 2: {
-          paddle::operators::math::MaxPool2dWithIndexGradFunctor<DeviceContext,
-                                                                 T1, T2>
-              pool2d_backward;
-          pool2d_backward(device_ctx, *out_grad, *mask, ksize, strides,
-                          paddings, adaptive, in_x_grad);
-        } break;
-        case 3: {
-          paddle::operators::math::MaxPool3dWithIndexGradFunctor<DeviceContext,
-                                                                 T1, T2>
-              pool3d_backward;
-          pool3d_backward(device_ctx, *out_grad, *mask, ksize, strides,
-                          paddings, adaptive, in_x_grad);
-        } break;
-        default: {
-          PADDLE_THROW(platform::errors::InvalidArgument(
-              "Pool op only supports 2D and 3D input."));
-        }
-      }
-    }
-  }
-};
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/psroi_pool_op.cc b/paddle/fluid/operators/psroi_pool_op.cc
index da637dfeb237dd4f17816e784882720dc2f2ff64..cfacffff234105ac9c6dc41b86f06594d319dcbb 100644
--- a/paddle/fluid/operators/psroi_pool_op.cc
+++ b/paddle/fluid/operators/psroi_pool_op.cc
@@ -12,15 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/psroi_pool_op.h"
-#include <memory>
+#include "paddle/fluid/framework/infershape_utils.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/phi/core/infermeta_utils.h"
+#include "paddle/phi/infermeta/backward.h"
+#include "paddle/phi/infermeta/multiary.h"
 
 namespace paddle {
 namespace operators {
 
-using Tensor = framework::Tensor;
-using LoDTensor = framework::LoDTensor;
-
 class PSROIPoolOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() override {
@@ -82,75 +82,6 @@ class PSROIPoolOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true,
-                      platform::errors::InvalidArgument(
-                          "Input(X) of PSROIPoolOp should not be null."));
-    PADDLE_ENFORCE_EQ(ctx->HasInput("ROIs"), true,
-                      platform::errors::InvalidArgument(
-                          "Input(ROIs) of PSROIPoolOp should not be null."));
-    PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true,
-                      platform::errors::InvalidArgument(
-                          "Output(Out) of PSROIPoolOp should not be null."));
-    auto input_dims = ctx->GetInputDim("X");
-    auto rois_dims = ctx->GetInputDim("ROIs");
-
-    PADDLE_ENFORCE_EQ(input_dims.size(), 4,
-                      platform::errors::InvalidArgument(
-                          "The format of input tensor is NCHW"));
-    PADDLE_ENFORCE_EQ(
-        rois_dims.size(), 2,
-        platform::errors::InvalidArgument(
-            "ROIs should be a 2-D LoDTensor of shape (num_rois, 4) "
-            "given as [(x1, y1, x2, y2), ...]"));
-    PADDLE_ENFORCE_EQ(
-        rois_dims[1], 4,
-        platform::errors::InvalidArgument(
-            "ROIs should be a 2-D LoDTensor of shape (num_rois, 4) "
-            "given as [(x1, y1, x2, y2), ...]"));
-    if (ctx->HasInput("RoisNum")) {
-      auto rois_num_dims = ctx->GetInputDim("RoisNum");
-      PADDLE_ENFORCE_EQ(rois_num_dims.size(), 1,
-                        platform::errors::InvalidArgument(
-                            "The second dimension of RoisNum should "
-                            "be 1, but received dimension is %d",
-                            rois_num_dims.size()));
-    }
-    int pooled_height = ctx->Attrs().Get<int>("pooled_height");
-    int pooled_width = ctx->Attrs().Get<int>("pooled_width");
-    int output_channels = ctx->Attrs().Get<int>("output_channels");
-    float spatial_scale = ctx->Attrs().Get<float>("spatial_scale");
-
-    PADDLE_ENFORCE_EQ(
-        input_dims[1], output_channels * pooled_height * pooled_width,
-        platform::errors::InvalidArgument(
-            "the channel of X(%d) "
-            "should be equal to the product of "
-            "output_channels(%d), pooled_height(%d) and pooled_width(%d)",
-            input_dims[1], output_channels, pooled_height, pooled_width));
-
-    PADDLE_ENFORCE_GT(pooled_height, 0,
-                      platform::errors::InvalidArgument(
-                          "The pooled output height must be greater than 0"));
-    PADDLE_ENFORCE_GT(pooled_width, 0,
-                      platform::errors::InvalidArgument(
-                          "The pooled output width must be greater than 0"));
-    PADDLE_ENFORCE_GT(output_channels, 1,
-                      platform::errors::InvalidArgument(
-                          "The pooled output channels must greater than 1"));
-    PADDLE_ENFORCE_GT(spatial_scale, 0.0f,
-                      platform::errors::InvalidArgument(
-                          "The spatial scale must greater than 0."));
-
-    auto out_dims = input_dims;
-    out_dims[0] = rois_dims[0];
-    out_dims[1] =
-        output_channels;  // input_dims[1] / (pooled_height * pooled_width);
-    out_dims[2] = pooled_height;
-    out_dims[3] = pooled_width;
-    ctx->SetOutputDim("Out", out_dims);
-  }
-
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
@@ -164,16 +95,6 @@ class PSROIPoolGradOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE_EQ(ctx->HasInput(framework::GradVarName("Out")), true,
-                      platform::errors::InvalidArgument(
-                          "The gradient of Out should not be null."));
-    PADDLE_ENFORCE_EQ(ctx->HasOutput(framework::GradVarName("X")), true,
-                      platform::errors::InvalidArgument(
-                          "The gradient of X should not be null."));
-    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
-  }
-
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
@@ -204,15 +125,13 @@ class PSROIPoolGradMaker : public framework::SingleGradOpMaker<T> {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
+DECLARE_INFER_SHAPE_FUNCTOR(psroi_pool, PsroiPoolInferShapeFunctor,
+                            PD_INFER_META(phi::PsroiPoolInferMeta));
+DECLARE_INFER_SHAPE_FUNCTOR(psroi_pool_grad, PsroiPoolGradInferShapeFunctor,
+                            PD_INFER_META(phi::PsroiPoolGradInferMeta));
 REGISTER_OPERATOR(psroi_pool, ops::PSROIPoolOp, ops::PSROIPoolOpMaker,
                   ops::PSROIPoolGradMaker<paddle::framework::OpDesc>,
-                  ops::PSROIPoolGradMaker<paddle::imperative::OpBase>);
-REGISTER_OPERATOR(psroi_pool_grad, ops::PSROIPoolGradOp);
-REGISTER_OP_CPU_KERNEL(
-    psroi_pool,
-    ops::CPUPSROIPoolOpKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::CPUPSROIPoolOpKernel<paddle::platform::CPUDeviceContext, double>);
-REGISTER_OP_CPU_KERNEL(
-    psroi_pool_grad,
-    ops::CPUPSROIPoolGradOpKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::CPUPSROIPoolGradOpKernel<paddle::platform::CPUDeviceContext, double>);
+                  ops::PSROIPoolGradMaker<paddle::imperative::OpBase>,
+                  PsroiPoolInferShapeFunctor);
+REGISTER_OPERATOR(psroi_pool_grad, ops::PSROIPoolGradOp,
+                  PsroiPoolGradInferShapeFunctor);
diff --git a/paddle/fluid/operators/psroi_pool_op.cu b/paddle/fluid/operators/psroi_pool_op.cu
deleted file mode 100644
index c1917501db8b5afebf4b7951b0f04de69758b49d..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/psroi_pool_op.cu
+++ /dev/null
@@ -1,350 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/psroi_pool_op.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-using LoDTensor = framework::LoDTensor;
-
-static constexpr int kNumCUDAThreads = 512;
-static constexpr int kNumMaximumNumBlocks = 4096;
-
-static inline int NumBlocks(const int N) {
-  return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads,
-                  kNumMaximumNumBlocks);
-}
-
-template <typename T>
-__global__ void GPUPSROIPoolForward(
-    const int nthreads, const T* input_data, const T* input_rois,
-    const float spatial_scale, const int input_channels, const int height,
-    const int width, const int output_channels, const int pooled_height,
-    const int pooled_width, const int* rois_batch_id_data, T* output_data) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int offset = blockDim.x * gridDim.x;
-  for (size_t i = index; i < nthreads; i += offset) {
-    // The output is in order (n, c, ph, pw)
-    int pw = i % pooled_width;
-    int ph = (i / pooled_width) % pooled_height;
-    int c = (i / pooled_width / pooled_height) % output_channels;
-    int n = i / pooled_width / pooled_height / output_channels;
-
-    // set roi_batch_id
-    int roi_batch_id = rois_batch_id_data[n];
-
-    // [start, end) interval for spatial sampling
-    const T* offset_input_rois = input_rois + n * 4;
-    T roi_start_w = static_cast<T>(round(offset_input_rois[0])) * spatial_scale;
-    T roi_start_h = static_cast<T>(round(offset_input_rois[1])) * spatial_scale;
-    T roi_end_w =
-        static_cast<T>(round(offset_input_rois[2]) + 1.) * spatial_scale;
-    T roi_end_h =
-        static_cast<T>(round(offset_input_rois[3]) + 1.) * spatial_scale;
-
-    // Force too small ROIs to be 1x1
-    T roi_height = max(roi_end_h - roi_start_h, (T)0.1);  // avoid 0
-    T roi_width = max(roi_end_w - roi_start_w, (T)0.1);
-
-    // Compute w and h at input feature map
-    T bin_size_h = roi_height / static_cast<T>(pooled_height);
-    T bin_size_w = roi_width / static_cast<T>(pooled_width);
-
-    int hstart = floor(bin_size_h * static_cast<T>(ph) + roi_start_h);
-    int wstart = floor(bin_size_w * static_cast<T>(pw) + roi_start_w);
-    int hend = ceil(bin_size_h * static_cast<T>(ph + 1) + roi_start_h);
-    int wend = ceil(bin_size_w * static_cast<T>(pw + 1) + roi_start_w);
-
-    // Add roi offsets and clip to input boundaries
-    hstart = min(max(hstart, 0), height);
-    hend = min(max(hend, 0), height);
-    wstart = min(max(wstart, 0), width);
-    wend = min(max(wend, 0), width);
-    bool is_empty = (hend <= hstart) || (wend <= wstart);
-
-    int input_channel = (c * pooled_height + ph) * pooled_width + pw;
-    const T* offset_input_data =
-        input_data +
-        (roi_batch_id * input_channels + input_channel) * height * width;
-    T outsum = 0;
-
-    for (int ih = hstart; ih < hend; ++ih) {
-      for (int iw = wstart; iw < wend; ++iw) {
-        int input_index = ih * width + iw;
-        outsum += offset_input_data[input_index];
-      }
-    }
-
-    T bin_area = static_cast<T>((hend - hstart) * (wend - wstart));
-    output_data[i] = is_empty ? 0. : outsum / bin_area;
-  }
-}
-
-template <typename T>
-__global__ void GPUPSROIPoolBackward(
-    const int nthreads, const T* input_rois, const T* output_grad_data,
-    const float spatial_scale, const int input_channels, const int height,
-    const int width, const int output_channels, const int pooled_height,
-    const int pooled_width, const int* rois_batch_id_data, T* input_grad_data) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int offset = blockDim.x * gridDim.x;
-  for (int i = index; i < nthreads; i += offset) {
-    // The output is in order (n, c, ph, pw)
-    int pw = i % pooled_width;
-    int ph = (i / pooled_width) % pooled_height;
-    int c = (i / pooled_width / pooled_height) % output_channels;
-    int n = i / pooled_width / pooled_height / output_channels;
-
-    // set roi_batch_id
-    int roi_batch_id = rois_batch_id_data[n];
-    int input_channel = (c * pooled_height + ph) * pooled_width + pw;
-    int input_offset =
-        (roi_batch_id * input_channels + input_channel) * height * width;
-    T* offset_input_grad_data = input_grad_data + input_offset;
-
-    // [start, end) interval for spatial sampling
-    const T* offset_input_rois = input_rois + n * 4;
-    T roi_start_w = static_cast<T>(round(offset_input_rois[0])) * spatial_scale;
-    T roi_start_h = static_cast<T>(round(offset_input_rois[1])) * spatial_scale;
-    T roi_end_w =
-        static_cast<T>(round(offset_input_rois[2]) + 1.) * spatial_scale;
-    T roi_end_h =
-        static_cast<T>(round(offset_input_rois[3]) + 1.) * spatial_scale;
-
-    // Force too small ROIs to be 1x1
-    T roi_height = max(roi_end_h - roi_start_h, (T)0.1);  // avoid 0
-    T roi_width = max(roi_end_w - roi_start_w, (T)0.1);
-
-    // Compute w and h at input feature map
-    T bin_size_h = roi_height / static_cast<T>(pooled_height);
-    T bin_size_w = roi_width / static_cast<T>(pooled_width);
-
-    int hstart = floor(bin_size_h * static_cast<T>(ph) + roi_start_h);
-    int wstart = floor(bin_size_w * static_cast<T>(pw) + roi_start_w);
-    int hend = ceil(bin_size_h * static_cast<T>(ph + 1) + roi_start_h);
-    int wend = ceil(bin_size_w * static_cast<T>(pw + 1) + roi_start_w);
-
-    // Add roi offsets and clip to input boundaries
-    hstart = min(max(hstart, 0), height);
-    hend = min(max(hend, 0), height);
-    wstart = min(max(wstart, 0), width);
-    wend = min(max(wend, 0), width);
-    bool is_empty = (hend <= hstart) || (wend <= wstart);
-
-    // Accumulate diff_val into input data
-    T bin_area = static_cast<T>((hend - hstart) * (wend - wstart));
-    T diff_val = is_empty ? 0. : output_grad_data[i] / bin_area;
-    for (int ih = hstart; ih < hend; ++ih) {
-      for (int iw = wstart; iw < wend; ++iw) {
-        int input_index = ih * width + iw;
-        platform::CudaAtomicAdd(offset_input_grad_data + input_index, diff_val);
-      }
-    }
-  }
-}
-
-template <typename Place, typename T>
-class GPUPSROIPoolOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* in = ctx.Input<Tensor>("X");
-    auto* rois = ctx.Input<LoDTensor>("ROIs");
-    auto* out = ctx.Output<Tensor>("Out");
-
-    auto pooled_height = ctx.Attr<int>("pooled_height");
-    auto pooled_width = ctx.Attr<int>("pooled_width");
-    auto output_channels = ctx.Attr<int>("output_channels");
-    auto spatial_scale = ctx.Attr<float>("spatial_scale");
-
-    auto in_dims = in->dims();
-    int batch_size = in_dims[0];
-    int input_channels = in_dims[1];
-    int height = in_dims[2];
-    int width = in_dims[3];
-
-    PADDLE_ENFORCE_EQ(
-        input_channels, output_channels * pooled_height * pooled_width,
-        platform::errors::InvalidArgument(
-            "The channels %d of input X should equal the product of "
-            "output_channels %d x pooled_height %d x pooled_width %d.",
-            input_channels, output_channels, pooled_height, pooled_width));
-
-    int rois_num = rois->dims()[0];
-    if (rois_num == 0) return;
-    int rois_batch_size;
-    framework::Tensor rois_batch_id_list;
-    rois_batch_id_list.Resize({rois_num});
-    int* rois_batch_id_data =
-        rois_batch_id_list.mutable_data<int>(platform::CPUPlace());
-
-    if (ctx.HasInput("RoisNum")) {
-      auto* rois_num_t = ctx.Input<Tensor>("RoisNum");
-      rois_batch_size = rois_num_t->numel();
-      auto* rois_num_data = rois_num_t->data<int>();
-      PADDLE_ENFORCE_EQ(
-          rois_batch_size, batch_size,
-          platform::errors::InvalidArgument(
-              "The batch size of input(ROIs) and input(X) must be "
-              "the same but received batch size of input(ROIs) and "
-              "input(X) is %d and %d respectively.",
-              rois_batch_size, batch_size));
-      std::vector<int> rois_num_list(rois_batch_size);
-      memory::Copy(platform::CPUPlace(), rois_num_list.data(), ctx.GetPlace(),
-                   rois_num_data, sizeof(int) * rois_batch_size, 0);
-      int rois_num_count = 0;
-      for (int i = 0; i < rois_batch_size; ++i) {
-        rois_num_count += rois_num_list[i];
-      }
-      PADDLE_ENFORCE_EQ(
-          rois_num_count, rois_num,
-          platform::errors::InvalidArgument(
-              "the rois_num from input and RoisNum must be the same"));
-      int start = 0;
-      for (int n = 0; n < rois_batch_size; ++n) {
-        for (int i = start; i < start + rois_num_list[n]; ++i) {
-          rois_batch_id_data[i] = n;
-        }
-        start += rois_num_list[n];
-      }
-    } else {
-      auto rois_lod = rois->lod().back();
-      rois_batch_size = rois_lod.size() - 1;
-      PADDLE_ENFORCE_EQ(
-          rois_batch_size, batch_size,
-          platform::errors::InvalidArgument(
-              "The batch size of input(ROIs) and input(X) must be "
-              "the same but received batch size of input(ROIs) and "
-              "input(X) is %d and %d respectively.",
-              rois_batch_size, batch_size));
-      int rois_num_with_lod = rois_lod[rois_batch_size];
-      PADDLE_ENFORCE_EQ(rois_num, rois_num_with_lod,
-                        platform::errors::InvalidArgument(
-                            "The number of rois from input(ROIs) and its LOD "
-                            "must be the same. Received rois %d of input(ROIs) "
-                            "but the number of rois %d from its LOD is %d",
-                            rois_num, rois_num_with_lod));
-
-      // set rois batch id
-      for (int n = 0; n < rois_batch_size; ++n) {
-        for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
-          rois_batch_id_data[i] = n;
-        }
-      }
-    }
-    framework::Tensor rois_batch_id_list_gpu;
-    framework::TensorCopy(rois_batch_id_list, ctx.GetPlace(),
-                          ctx.device_context(), &rois_batch_id_list_gpu);
-
-    int output_size = out->numel();
-    int blocks = NumBlocks(output_size);
-    int threads = kNumCUDAThreads;
-
-    // call cuda kernel function
-    GPUPSROIPoolForward<
-        T><<<blocks, threads, 0, ctx.cuda_device_context().stream()>>>(
-        output_size, in->data<T>(), rois->data<T>(), spatial_scale,
-        input_channels, height, width, output_channels, pooled_height,
-        pooled_width, rois_batch_id_list_gpu.data<int>(),
-        out->mutable_data<T>(ctx.GetPlace()));
-  }
-};
-
-template <typename Place, typename T>
-class GPUPSROIPoolGradOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* in = ctx.Input<Tensor>("X");
-    auto* rois = ctx.Input<LoDTensor>("ROIs");
-
-    auto* output_grad = ctx.Input<Tensor>(framework::GradVarName("Out"));
-    auto* input_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
-
-    auto pooled_height = ctx.Attr<int>("pooled_height");
-    auto pooled_width = ctx.Attr<int>("pooled_width");
-    auto output_channels = ctx.Attr<int>("output_channels");
-    auto spatial_scale = ctx.Attr<float>("spatial_scale");
-
-    int rois_num = rois->dims()[0];
-    int input_channels = in->dims()[1];
-    int height = in->dims()[2];
-    int width = in->dims()[3];
-
-    if (input_grad) {
-      // set roi batch id
-      framework::Tensor rois_batch_id_list;
-      rois_batch_id_list.Resize({rois_num});
-      int* rois_batch_id_data =
-          rois_batch_id_list.mutable_data<int>(platform::CPUPlace());
-      int rois_batch_size;
-      if (ctx.HasInput("RoisNum")) {
-        auto* rois_num_t = ctx.Input<Tensor>("RoisNum");
-        rois_batch_size = rois_num_t->numel();
-        std::vector<int> rois_num_list(rois_batch_size);
-        memory::Copy(platform::CPUPlace(), rois_num_list.data(), ctx.GetPlace(),
-                     rois_num_t->data<int>(), sizeof(int) * rois_batch_size, 0);
-        int start = 0;
-        for (int n = 0; n < rois_batch_size; ++n) {
-          for (int i = start; i < start + rois_num_list[n]; ++i) {
-            rois_batch_id_data[i] = n;
-          }
-          start += rois_num_list[n];
-        }
-      } else {
-        auto rois_lod = rois->lod().back();
-        rois_batch_size = rois_lod.size() - 1;
-        for (int n = 0; n < rois_batch_size; ++n) {
-          for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
-            rois_batch_id_data[i] = n;
-          }
-        }
-      }
-      framework::Tensor rois_batch_id_list_gpu;
-      framework::TensorCopy(rois_batch_id_list, ctx.GetPlace(),
-                            ctx.device_context(), &rois_batch_id_list_gpu);
-
-      input_grad->mutable_data<T>(ctx.GetPlace());
-      phi::funcs::SetConstant<Place, T> set_zero;
-      set_zero(ctx.cuda_device_context(), input_grad, static_cast<T>(0));
-
-      int output_grad_size = output_grad->numel();
-      int blocks = NumBlocks(output_grad_size);
-      int threads = kNumCUDAThreads;
-
-      if (output_grad_size > 0) {
-        GPUPSROIPoolBackward<
-            T><<<blocks, threads, 0, ctx.cuda_device_context().stream()>>>(
-            output_grad_size, rois->data<T>(), output_grad->data<T>(),
-            spatial_scale, input_channels, height, width, output_channels,
-            pooled_height, pooled_width, rois_batch_id_list_gpu.data<int>(),
-            input_grad->mutable_data<T>(ctx.GetPlace()));
-      }
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    psroi_pool,
-    ops::GPUPSROIPoolOpKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::GPUPSROIPoolOpKernel<paddle::platform::CUDADeviceContext, double>);
-REGISTER_OP_CUDA_KERNEL(
-    psroi_pool_grad,
-    ops::GPUPSROIPoolGradOpKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::GPUPSROIPoolGradOpKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/psroi_pool_op.h b/paddle/fluid/operators/psroi_pool_op.h
deleted file mode 100644
index 3f020d93391b0e648898c1b83858a7bd9809aa03..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/psroi_pool_op.h
+++ /dev/null
@@ -1,295 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <algorithm>
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/phi/kernels/funcs/math_function.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename DeviceContext, typename T>
-class CPUPSROIPoolOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* in = ctx.Input<framework::Tensor>("X");
-    auto* rois = ctx.Input<framework::LoDTensor>("ROIs");
-    auto* out = ctx.Output<framework::Tensor>("Out");
-
-    auto pooled_height = ctx.Attr<int>("pooled_height");
-    auto pooled_width = ctx.Attr<int>("pooled_width");
-    auto spatial_scale = ctx.Attr<float>("spatial_scale");
-    auto output_channels = ctx.Attr<int>("output_channels");
-
-    auto in_dims = in->dims();
-    int batch_size = in_dims[0];
-    int input_channels = in_dims[1];
-    int height = in_dims[2];
-    int width = in_dims[3];
-    int rois_num = rois->dims()[0];
-
-    PADDLE_ENFORCE_EQ(input_channels,
-                      output_channels * pooled_height * pooled_width,
-                      platform::errors::InvalidArgument(
-                          "the channels of input "
-                          "X should equal the product of "
-                          "output_channels x pooled_height x pooled_width"));
-
-    auto in_stride = phi::stride(in_dims);
-    auto out_stride = phi::stride(out->dims());
-
-    const T* input_data = in->data<T>();
-
-    framework::Tensor rois_batch_id_list;
-    rois_batch_id_list.Resize({rois_num});
-    int* rois_batch_id_data =
-        rois_batch_id_list.mutable_data<int>(ctx.GetPlace());
-    int rois_batch_size;
-    if (ctx.HasInput("RoisNum")) {
-      auto* rois_num_t = ctx.Input<framework::Tensor>("RoisNum");
-      rois_batch_size = rois_num_t->numel();
-      auto* rois_num_data = rois_num_t->data<int>();
-      PADDLE_ENFORCE_EQ(
-          rois_batch_size, batch_size,
-          platform::errors::InvalidArgument(
-              "The batch size of rois and the batch size of images "
-              " must be the same. But received the batch size of rois is %d, "
-              "and the batch size of images is %d",
-              rois_batch_size, batch_size));
-      int rois_num_count = 0;
-      for (int i = 0; i < rois_batch_size; ++i) {
-        rois_num_count += rois_num_data[i];
-      }
-      PADDLE_ENFORCE_EQ(
-          rois_num_count, rois_num,
-          platform::errors::InvalidArgument(
-              "the rois_num from input and RoisNum must be the same"));
-      int start = 0;
-      for (int n = 0; n < rois_batch_size; ++n) {
-        for (int i = start; i < start + rois_num_data[n]; ++i) {
-          rois_batch_id_data[i] = n;
-        }
-        start += rois_num_data[n];
-      }
-    } else {
-      auto rois_lod = rois->lod().back();
-      rois_batch_size = rois_lod.size() - 1;
-      PADDLE_ENFORCE_EQ(
-          rois_batch_size, batch_size,
-          platform::errors::InvalidArgument("the rois_batch_size and input(X) "
-                                            "batch_size should be the same."));
-      int rois_num_with_lod = rois_lod[rois_batch_size];
-      PADDLE_ENFORCE_EQ(
-          rois_num_with_lod, rois_num,
-          platform::errors::InvalidArgument(
-              "the rois_num from input and lod must be the same"));
-      // calculate batch id index for each roi according to LoD
-      for (int n = 0; n < rois_batch_size; ++n) {
-        for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
-          rois_batch_id_data[i] = n;
-        }
-      }
-    }
-    T* output_data = out->mutable_data<T>(ctx.GetPlace());
-    const T* input_rois = rois->data<T>();
-
-    // calculate psroipooling, parallel processing can be implemented per ROI
-    for (int n = 0; n < rois_num; ++n) {
-      // set roi batch id
-      int roi_batch_id = rois_batch_id_data[n];
-
-      // [start, end) interval for spatial sampling
-      const T* offset_input_rois = input_rois + n * 4;
-      T roi_start_w =
-          static_cast<T>(round(offset_input_rois[0])) * spatial_scale;
-      T roi_start_h =
-          static_cast<T>(round(offset_input_rois[1])) * spatial_scale;
-      T roi_end_w =
-          static_cast<T>(round(offset_input_rois[2]) + 1.) * spatial_scale;
-      T roi_end_h =
-          static_cast<T>(round(offset_input_rois[3]) + 1.) * spatial_scale;
-      // Force too small rois to be 1 x 1
-      T roi_height = std::max(roi_end_h - roi_start_h, (T)0.1);  // avoid 0
-      T roi_width = std::max(roi_end_w - roi_start_w, (T)0.1);
-
-      // Compute bin size w and h at input feature map
-      T bin_size_h = roi_height / static_cast<T>(pooled_height);
-      T bin_size_w = roi_width / static_cast<T>(pooled_width);
-
-      // calculate each pixel of the output feature map.
-      int out_roi_offset = n * out_stride[0];
-      for (int c = 0; c < output_channels; ++c) {
-        // per category
-        int out_plane_offset = out_roi_offset + c * out_stride[1];
-        for (int ph = 0; ph < pooled_height; ++ph) {
-          int out_row_offset = out_plane_offset + ph * out_stride[2];
-          for (int pw = 0; pw < pooled_width; ++pw) {
-            // calculate w and h at input feature map
-            int hstart = floor(static_cast<T>(ph) * bin_size_h + roi_start_h);
-            int wstart = floor(static_cast<T>(pw) * bin_size_w + roi_start_w);
-            int hend = ceil(static_cast<T>(ph + 1) * bin_size_h + roi_start_h);
-            int wend = ceil(static_cast<T>(pw + 1) * bin_size_w + roi_start_w);
-            //  Add roi offsets and clip to input boundaries
-            hstart = std::min(std::max(hstart, 0), height);
-            wstart = std::min(std::max(wstart, 0), width);
-            hend = std::min(std::max(hend, 0), height);
-            wend = std::min(std::max(wend, 0), width);
-
-            int output_index = out_row_offset + pw;
-            int input_channel = (c * pooled_height + ph) * pooled_width + pw;
-            int input_plane_offset =
-                roi_batch_id * in_stride[0] + input_channel * in_stride[1];
-            const T* offset_input_data = input_data + input_plane_offset;
-            T out_sum = 0.;
-            bool is_empty = (hend <= hstart) || (wend <= wstart);
-            for (int ih = hstart; ih < hend; ++ih) {
-              for (int iw = wstart; iw < wend; ++iw) {
-                int input_index = ih * in_stride[2] + iw;
-                out_sum += offset_input_data[input_index];
-              }
-            }
-            T bin_area = (hend - hstart) * (wend - wstart);
-            output_data[output_index] = is_empty ? 0. : out_sum / bin_area;
-          }
-        }
-      }
-    }
-    return;
-  }
-};
-
-template <typename DeviceContext, typename T>
-class CPUPSROIPoolGradOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* in = ctx.Input<framework::Tensor>("X");
-    auto* rois = ctx.Input<framework::LoDTensor>("ROIs");
-    auto* output_grad =
-        ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
-    auto* input_grad =
-        ctx.Output<framework::Tensor>(framework::GradVarName("X"));
-
-    auto pooled_height = ctx.Attr<int>("pooled_height");
-    auto pooled_width = ctx.Attr<int>("pooled_width");
-    auto output_channels = ctx.Attr<int>("output_channels");
-    auto spatial_scale = ctx.Attr<float>("spatial_scale");
-
-    if (input_grad) {
-      auto in_dims = in->dims();
-      int input_channels = in_dims[1];
-      int height = in_dims[2];
-      int width = in_dims[3];
-      int rois_num = rois->dims()[0];
-
-      // set roi batch id
-      framework::Tensor rois_batch_id_list;
-      rois_batch_id_list.Resize({rois_num});
-      int* rois_batch_id_data =
-          rois_batch_id_list.mutable_data<int>(ctx.GetPlace());
-      int rois_batch_size;
-      if (ctx.HasInput("RoisNum")) {
-        auto* rois_num_t = ctx.Input<framework::Tensor>("RoisNum");
-        rois_batch_size = rois_num_t->numel();
-        auto* rois_num_data = rois_num_t->data<int>();
-        int start = 0;
-        for (int n = 0; n < rois_batch_size; ++n) {
-          for (int i = start; i < start + rois_num_data[n]; ++i) {
-            rois_batch_id_data[i] = n;
-          }
-          start += rois_num_data[n];
-        }
-      } else {
-        auto rois_lod = rois->lod().back();
-        rois_batch_size = rois_lod.size() - 1;
-        // calculate batch id index for each roi according to LoD
-        for (int n = 0; n < rois_batch_size; ++n) {
-          for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
-            rois_batch_id_data[i] = n;
-          }
-        }
-      }
-      const T* input_rois = rois->data<T>();
-      const T* output_grad_data = output_grad->data<T>();
-      T* input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
-
-      // set gradient of X to be 0. before backpropagate.
-      phi::funcs::SetConstant<DeviceContext, T> set_zero;
-      set_zero(ctx.template device_context<DeviceContext>(), input_grad,
-               static_cast<T>(0));
-
-      // backpropagate gradient per output pixel
-      int output_grad_size = output_grad->numel();
-      for (int i = 0; i < output_grad_size; ++i) {
-        // The output is in order (n, c, ph, pw)
-        int pw = i % pooled_width;
-        int ph = (i / pooled_width) % pooled_height;
-        int c = (i / pooled_width / pooled_height) % output_channels;
-        int n = i / pooled_width / pooled_height / output_channels;
-
-        // set roi_batch_id
-        int roi_batch_id = rois_batch_id_data[n];
-        int input_channel = (c * pooled_height + ph) * pooled_width + pw;
-        int input_offset =
-            (roi_batch_id * input_channels + input_channel) * height * width;
-        T* offset_input_grad_data = input_grad_data + input_offset;
-
-        // [start, end) interval for spatial sampling
-        const T* offset_input_rois = input_rois + n * 4;
-        T roi_start_w =
-            static_cast<T>(round(offset_input_rois[0])) * spatial_scale;
-        T roi_start_h =
-            static_cast<T>(round(offset_input_rois[1])) * spatial_scale;
-        T roi_end_w =
-            static_cast<T>(round(offset_input_rois[2]) + 1.) * spatial_scale;
-        T roi_end_h =
-            static_cast<T>(round(offset_input_rois[3]) + 1.) * spatial_scale;
-
-        // Force too small ROIs to be 1x1
-        T roi_height = std::max(roi_end_h - roi_start_h, (T)0.1);  // avoid 0
-        T roi_width = std::max(roi_end_w - roi_start_w, (T)0.1);
-
-        // Compute w and h at input feature map
-        T bin_size_h = roi_height / static_cast<T>(pooled_height);
-        T bin_size_w = roi_width / static_cast<T>(pooled_width);
-
-        int hstart = floor(bin_size_h * static_cast<T>(ph) + roi_start_h);
-        int wstart = floor(bin_size_w * static_cast<T>(pw) + roi_start_w);
-        int hend = ceil(bin_size_h * static_cast<T>(ph + 1) + roi_start_h);
-        int wend = ceil(bin_size_w * static_cast<T>(pw + 1) + roi_start_w);
-
-        // Add roi offsets and clip to input boundaries
-        hstart = std::min(std::max(hstart, 0), height);
-        hend = std::min(std::max(hend, 0), height);
-        wstart = std::min(std::max(wstart, 0), width);
-        wend = std::min(std::max(wend, 0), width);
-        bool is_empty = (hend <= hstart) || (wend <= wstart);
-
-        // Accumulate diff_val into input data
-        T bin_area = static_cast<T>((hend - hstart) * (wend - wstart));
-        T diff_val = is_empty ? 0. : output_grad_data[i] / bin_area;
-        for (int ih = hstart; ih < hend; ++ih) {
-          for (int iw = wstart; iw < wend; ++iw) {
-            int input_index = ih * width + iw;
-            offset_input_grad_data[input_index] += diff_val;
-          }
-        }
-      }
-    }
-    return;
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/put_along_axis_op.cc b/paddle/fluid/operators/put_along_axis_op.cc
index 6b0d6f332bcae8890cdfaccb1244886daa63ae42..54e31845ad4bd5ddfa81bc90a10391f027dffc11 100644
--- a/paddle/fluid/operators/put_along_axis_op.cc
+++ b/paddle/fluid/operators/put_along_axis_op.cc
@@ -12,10 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/put_along_axis_op.h"
 #include <memory>
 #include <string>
 #include <vector>
+
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/phi/core/ddim.h"
 
@@ -123,16 +124,3 @@ REGISTER_OPERATOR(put_along_axis, ops::PutAlongAxisOp, ops::PutAlongAxisOpMaker,
                   paddle::operators::PutAlongAxisInplaceInferer);
 
 REGISTER_OPERATOR(put_along_axis_grad, ops::PutAlongAxisGradOp);
-
-REGISTER_OP_CPU_KERNEL(put_along_axis, ops::PutAlongAxisOpKernel<float>,
-                       ops::PutAlongAxisOpKernel<double>,
-                       ops::PutAlongAxisOpKernel<int>,
-                       ops::PutAlongAxisOpKernel<uint8_t>,
-                       ops::PutAlongAxisOpKernel<int64_t>);
-
-REGISTER_OP_CPU_KERNEL(put_along_axis_grad,
-                       ops::PutAlongAxisGradOpKernel<float>,
-                       ops::PutAlongAxisGradOpKernel<double>,
-                       ops::PutAlongAxisGradOpKernel<int>,
-                       ops::PutAlongAxisGradOpKernel<uint8_t>,
-                       ops::PutAlongAxisGradOpKernel<int64_t>);
diff --git a/paddle/fluid/operators/put_along_axis_op.cu b/paddle/fluid/operators/put_along_axis_op.cu
deleted file mode 100644
index 5508023efad2c60a00f5ea3a8d1b853c6e5ba1fb..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/put_along_axis_op.cu
+++ /dev/null
@@ -1,134 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <memory>
-#include <string>
-#include <vector>
-#include "paddle/fluid/framework/op_version_registry.h"
-#include "paddle/fluid/operators/put_along_axis_op.h"
-#include "paddle/phi/core/ddim.h"
-#include "paddle/phi/kernels/funcs/math_function.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T>
-class PutAlongAxisCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true,
-                      platform::errors::PreconditionNotMet(
-                          "PutAlongAxisCUDAKernel only runs on GPU device."));
-    auto input = ctx.Input<Tensor>("Input");
-    auto axis = ctx.Attr<int>("Axis");
-    auto value = ctx.Input<Tensor>("Value");
-    auto index = ctx.Input<Tensor>("Index");
-    auto reduce_op = ctx.Attr<std::string>("Reduce");
-    auto result = ctx.Output<Tensor>("Result");
-    const platform::DeviceContext &device_ctx = ctx.device_context();
-
-    const auto &index_type = framework::TransToProtoVarType(index->dtype());
-
-    framework::TensorCopy(*input, ctx.GetPlace(), result);
-    if (reduce_op == "add") {
-      if (index_type == framework::proto::VarType::INT32) {
-        gpu_scatter_add_kernel<T, int32_t>(*result, axis, *index, *value,
-                                           device_ctx);
-      } else if (index_type == framework::proto::VarType::INT64) {
-        gpu_scatter_add_kernel<T, int64_t>(*result, axis, *index, *value,
-                                           device_ctx);
-      }
-    } else if (reduce_op == "multiply" || reduce_op == "mul") {
-      if (index_type == framework::proto::VarType::INT32) {
-        gpu_scatter_mul_kernel<T, int32_t>(*result, axis, *index, *value,
-                                           device_ctx);
-      } else if (index_type == framework::proto::VarType::INT64) {
-        gpu_scatter_mul_kernel<T, int64_t>(*result, axis, *index, *value,
-                                           device_ctx);
-      }
-    } else if (reduce_op == "assign") {
-      if (index_type == framework::proto::VarType::INT32) {
-        gpu_scatter_assign_kernel<T, int32_t>(*result, axis, *index, *value,
-                                              device_ctx);
-      } else if (index_type == framework::proto::VarType::INT64) {
-        gpu_scatter_assign_kernel<T, int64_t>(*result, axis, *index, *value,
-                                              device_ctx);
-      }
-    } else {
-      PADDLE_THROW(platform::errors::InvalidArgument(
-          "can not support reduce_op: '%s' for scatter kernel, only "
-          "support reduce op: 'add‘, 'assign', 'mul' and 'multiply', the "
-          "defalut reduce op is 'assign' ",
-          reduce_op));
-      return;
-    }
-  }
-};
-
-template <typename T>
-class PutAlongAxisGradOpCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true,
-                      platform::errors::PreconditionNotMet(
-                          "PutAlongAxisGradOpCUDAKernel only runs on GPU."));
-
-    auto input_grad = ctx.Output<Tensor>(framework::GradVarName("Input"));
-    auto value_grad = ctx.Output<Tensor>(framework::GradVarName("Value"));
-    auto index = ctx.Input<Tensor>("Index");
-    auto result_grad = ctx.Input<Tensor>(framework::GradVarName("Result"));
-    auto axis = ctx.Attr<int>("Axis");
-
-    const auto &index_type = framework::TransToProtoVarType(index->dtype());
-    if (input_grad) {
-      framework::TensorCopy(*result_grad, ctx.GetPlace(), input_grad);
-      if (index_type == framework::proto::VarType::INT32) {
-        gpu_scatter_input_grad_kernel<T, int32_t>(
-            *result_grad, axis, *index, *input_grad, ctx.device_context());
-      } else {
-        gpu_scatter_input_grad_kernel<T, int64_t>(
-            *result_grad, axis, *index, *input_grad, ctx.device_context());
-      }
-    }
-    if (value_grad) {
-      value_grad->Resize(index->dims());
-      value_grad->mutable_data<T>(ctx.GetPlace());
-      if (index_type == framework::proto::VarType::INT32) {
-        gpu_gather_kernel<T, int32_t>(
-            *result_grad, axis, *index, *value_grad,
-            ctx.device_context());  // the gradient of scatter is gather
-      } else if (index_type == framework::proto::VarType::INT64) {
-        gpu_gather_kernel<T, int64_t>(*result_grad, axis, *index, *value_grad,
-                                      ctx.device_context());
-      }
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-REGISTER_OP_CUDA_KERNEL(put_along_axis, ops::PutAlongAxisCUDAKernel<float>,
-                        ops::PutAlongAxisCUDAKernel<double>,
-                        ops::PutAlongAxisCUDAKernel<int64_t>,
-                        ops::PutAlongAxisCUDAKernel<int>,
-                        ops::PutAlongAxisCUDAKernel<plat::float16>);
-REGISTER_OP_CUDA_KERNEL(put_along_axis_grad,
-                        ops::PutAlongAxisGradOpCUDAKernel<float>,
-                        ops::PutAlongAxisGradOpCUDAKernel<double>,
-                        ops::PutAlongAxisGradOpCUDAKernel<int64_t>,
-                        ops::PutAlongAxisGradOpCUDAKernel<int>,
-                        ops::PutAlongAxisGradOpCUDAKernel<plat::float16>);
diff --git a/paddle/fluid/operators/put_along_axis_op.h b/paddle/fluid/operators/put_along_axis_op.h
deleted file mode 100644
index 38487f5ce28c9e35dd6e84403b88dbc0fdfa07b3..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/put_along_axis_op.h
+++ /dev/null
@@ -1,124 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/gather_scatter_kernel.h"
-#include "paddle/phi/kernels/funcs/math_function.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-
-template <typename T>
-class PutAlongAxisOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    PADDLE_ENFORCE_EQ(platform::is_cpu_place(ctx.GetPlace()), true,
-                      platform::errors::PreconditionNotMet(
-                          "PutAlongAxisOpKernel only runs on CPU."));
-
-    auto input = ctx.Input<Tensor>("Input");
-    auto axis = ctx.Attr<int>("Axis");
-    auto value = ctx.Input<Tensor>("Value");
-    auto index = ctx.Input<Tensor>("Index");
-    auto reduce_op = ctx.Attr<std::string>("Reduce");
-    auto result = ctx.Output<Tensor>("Result");
-
-    framework::TensorCopy(*input, ctx.GetPlace(), result);
-    const platform::DeviceContext &device_ctx = ctx.device_context();
-    const auto &index_type = framework::TransToProtoVarType(index->dtype());
-    if (reduce_op == "add") {
-      if (index_type == framework::proto::VarType::INT32) {
-        cpu_scatter_add_kernel<T, int32_t>(*result, axis, *index, *value,
-                                           device_ctx);
-      } else if (index_type == framework::proto::VarType::INT64) {
-        cpu_scatter_add_kernel<T, int64_t>(*result, axis, *index, *value,
-                                           device_ctx);
-      }
-    } else if (reduce_op == "multiply" || reduce_op == "mul") {
-      if (index_type == framework::proto::VarType::INT32) {
-        cpu_scatter_mul_kernel<T, int32_t>(*result, axis, *index, *value,
-                                           device_ctx);
-      } else if (index_type == framework::proto::VarType::INT64) {
-        cpu_scatter_mul_kernel<T, int64_t>(*result, axis, *index, *value,
-                                           device_ctx);
-      }
-    } else if (reduce_op == "assign") {
-      if (index_type == framework::proto::VarType::INT32) {
-        cpu_scatter_assign_kernel<T, int32_t>(*result, axis, *index, *value,
-                                              device_ctx);
-      } else if (index_type == framework::proto::VarType::INT64) {
-        cpu_scatter_assign_kernel<T, int64_t>(*result, axis, *index, *value,
-                                              device_ctx);
-      }
-    } else {
-      PADDLE_THROW(platform::errors::InvalidArgument(
-          "can not support reduce_op: '%s' for scatter kernel, only "
-          "support reduce op: 'add‘, 'assign', 'mul' and 'multiply', the "
-          "defalut reduce "
-          "op is 'assign' ",
-          reduce_op));
-      return;
-    }
-  }
-};
-
-template <typename T>
-class PutAlongAxisGradOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    PADDLE_ENFORCE_EQ(platform::is_cpu_place(ctx.GetPlace()), true,
-                      platform::errors::PreconditionNotMet(
-                          "PutAlongAxisGradOpKernel only runs on CPU."));
-
-    auto input_grad = ctx.Output<Tensor>(framework::GradVarName("Input"));
-    auto value_grad = ctx.Output<Tensor>(framework::GradVarName("Value"));
-    auto index = ctx.Input<Tensor>("Index");
-    auto result_grad = ctx.Input<Tensor>(framework::GradVarName("Result"));
-    auto axis = ctx.Attr<int>("Axis");
-    const auto &index_type = framework::TransToProtoVarType(index->dtype());
-
-    if (input_grad) {
-      framework::TensorCopy(*result_grad, ctx.GetPlace(), input_grad);
-      if (index_type == framework::proto::VarType::INT32) {
-        cpu_scatter_input_grad_kernel<T, int32_t>(
-            // Here passing an unused argument *result_grad, because it's
-            // convenient to instantiate a bunch of template function with the
-            // same arguments list.
-            *result_grad, axis, *index, *input_grad, ctx.device_context());
-      } else {
-        cpu_scatter_input_grad_kernel<T, int64_t>(
-            *result_grad, axis, *index, *input_grad, ctx.device_context());
-      }
-    }
-
-    if (value_grad) {
-      value_grad->Resize(index->dims());
-      value_grad->mutable_data<T>(ctx.GetPlace());
-      if (index_type == framework::proto::VarType::INT32) {
-        cpu_gather_kernel<T, int32_t>(*result_grad, axis, *index, *value_grad,
-                                      ctx.device_context());
-      } else if (index_type == framework::proto::VarType::INT64) {
-        cpu_gather_kernel<T, int64_t>(*result_grad, axis, *index, *value_grad,
-                                      ctx.device_context());
-      }
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/range_op_npu_test.cc b/paddle/fluid/operators/range_op_npu_test.cc
index 24741efe426b18b7cecae9332c522d67aee98d63..c7e91ba35dee1356ddd71ade0fe9892f8032c77b 100644
--- a/paddle/fluid/operators/range_op_npu_test.cc
+++ b/paddle/fluid/operators/range_op_npu_test.cc
@@ -24,7 +24,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/operators/dropout_op.h"
 #include "paddle/fluid/string/printf.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
diff --git a/paddle/fluid/operators/reader/buffered_reader.cc b/paddle/fluid/operators/reader/buffered_reader.cc
index 21c23a7f602a35acf676e97a9134c2c43a73126c..4b6759ea165edf29add66ee44461fdd4d9f84d00 100644
--- a/paddle/fluid/operators/reader/buffered_reader.cc
+++ b/paddle/fluid/operators/reader/buffered_reader.cc
@@ -70,9 +70,25 @@ BufferedReader::BufferedReader(
     stream_ = platform::NpuStreamResourcePool::Instance().New(dev_idx);
   }
 #endif
+
+#ifdef PADDLE_WITH_MLU
+  if (platform::is_mlu_place(place_)) {
+    int dev_idx = place_.device;
+    compute_stream_ =
+        ((platform::MLUDeviceContext *)(platform::DeviceContextPool::Instance()
+                                            .Get(place_)))
+            ->stream();
+    events_.resize(buffer_size);
+    for (auto &event : events_) {
+      event = platform::MluEventResourcePool::Instance().New(dev_idx);
+    }
+    stream_ = platform::MluStreamResourcePool::Instance().New(dev_idx);
+  }
+#endif
   cpu_buffer_.resize(buffer_size);
   cuda_buffer_.resize(buffer_size);
   npu_buffer_.resize(buffer_size);
+  mlu_buffer_.resize(buffer_size);
   ReadTillBufferFullAsync();
 }
 
@@ -256,6 +272,56 @@ void BufferedReader::ReadAsync(size_t i) {
       platform::NPUStreamSync(stream_.get());
     }
 #endif
+
+#ifdef PADDLE_WITH_MLU
+    if (platform::is_mlu_place(place_)) {
+      TensorVec &mlu = mlu_buffer_[i];
+      if (mlu.empty()) {
+        mlu.resize(cpu.size());
+      } else {
+        PADDLE_ENFORCE_EQ(
+            mlu.size(), cpu.size(),
+            platform::errors::InvalidArgument(
+                "Input tensor number on MLU and CPU devices are not matched. "
+                "The number on MLU is %d, on CPU is %d",
+                mlu.size(), cpu.size()));
+      }
+
+      std::vector<void *> mlu_ptrs;
+      mlu_ptrs.reserve(cpu.size());
+      for (size_t i = 0; i < cpu.size(); ++i) {
+        mlu[i].Resize(cpu[i].dims());
+        mlu[i].set_layout(cpu[i].layout());
+        mlu_ptrs.emplace_back(mlu[i].mutable_data(place_, cpu[i].type()));
+      }
+
+      platform::SetMLUDeviceId(place_.device);
+      PADDLE_ENFORCE_MLU_SUCCESS(
+          cnPlaceNotifier(events_[i].get(), compute_stream_));
+      PADDLE_ENFORCE_MLU_SUCCESS(cnWaitNotifier(events_[i].get()));
+
+      platform::RecordEvent record_event("BufferedReader:MemoryCopy",
+                                         platform::TracerEventType::UserDefined,
+                                         1);
+      for (size_t i = 0; i < cpu.size(); ++i) {
+        auto cpu_place = cpu[i].place();
+        auto cpu_ptr = cpu[i].data();
+        auto mlu_ptr = mlu_ptrs[i];
+        auto size =
+            cpu[i].numel() * paddle::framework::DataTypeSize(cpu[i].dtype());
+        if ((platform::is_mlu_place(cpu_place))) {
+          memory::Copy(place_, mlu_ptr, cpu_place, cpu_ptr, size,
+                       stream_.get());
+        } else {
+          memory::Copy(place_, mlu_ptr, cpu_place, cpu_ptr, size,
+                       stream_.get());
+          platform::MLUStreamSync(stream_.get());
+        }
+        mlu[i].set_lod(cpu[i].lod());
+      }
+      platform::MLUStreamSync(stream_.get());
+    }
+#endif
     return i;
   }));
 }
@@ -291,6 +357,8 @@ void BufferedReader::ReadNextImpl(std::vector<framework::LoDTensor> *out) {
     *out = std::move(cuda_buffer_[i]);
   } else if (platform::is_npu_place(place_)) {
     *out = std::move(npu_buffer_[i]);
+  } else if (platform::is_mlu_place(place_)) {
+    *out = std::move(mlu_buffer_[i]);
   } else {
     *out = std::move(cpu_buffer_[i]);
   }
diff --git a/paddle/fluid/operators/reader/buffered_reader.h b/paddle/fluid/operators/reader/buffered_reader.h
index 3d42486c6df8815aaab8e55e29898700bb74d953..f0f3b6b7f9fdfeb69c46e7122fae5c6cfbf3a169 100644
--- a/paddle/fluid/operators/reader/buffered_reader.h
+++ b/paddle/fluid/operators/reader/buffered_reader.h
@@ -29,6 +29,11 @@
 #include "paddle/fluid/platform/device/npu/npu_info.h"
 #include "paddle/fluid/platform/device/npu/npu_resource_pool.h"
 #endif
+#ifdef PADDLE_WITH_MLU
+#include "paddle/fluid/platform/device/mlu/mlu_info.h"
+#include "paddle/fluid/platform/device/mlu/mlu_resource_pool.h"
+#endif
+
 namespace paddle {
 namespace operators {
 namespace reader {
@@ -70,6 +75,7 @@ class BufferedReader : public framework::DecoratedReader {
   std::vector<TensorVec> cpu_buffer_;
   std::vector<TensorVec> cuda_buffer_;
   std::vector<TensorVec> npu_buffer_;
+  std::vector<TensorVec> mlu_buffer_;
   size_t prev_pos_{-1UL};
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   gpuStream_t compute_stream_;
@@ -82,6 +88,12 @@ class BufferedReader : public framework::DecoratedReader {
   std::shared_ptr<platform::NpuStreamObject> stream_;
   std::vector<std::shared_ptr<platform::NpuEventObject>> events_;
 #endif
+
+#ifdef PADDLE_WITH_MLU
+  mluStream compute_stream_;
+  std::shared_ptr<platform::MluStreamObject> stream_;
+  std::vector<std::shared_ptr<platform::MluEventObject>> events_;
+#endif
 };
 
 }  // namespace reader
diff --git a/paddle/fluid/operators/real_op.cc b/paddle/fluid/operators/real_op.cc
index 28a8484f539fc94d055aaf9fcbc0a420747d0964..18e444702fbb2cc19912a32587f96330e6e8632d 100644
--- a/paddle/fluid/operators/real_op.cc
+++ b/paddle/fluid/operators/real_op.cc
@@ -82,8 +82,8 @@ DECLARE_INPLACE_OP_INFERER(RealGradOpInplaceInferer,
 }  // namespace operators
 }  // namespace paddle
 
-DELCARE_INFER_SHAPE_FUNCTOR(real, RealInferShapeFunctor,
-                            PT_INFER_META(phi::RealAndImagInferMeta));
+DECLARE_INFER_SHAPE_FUNCTOR(real, RealInferShapeFunctor,
+                            PD_INFER_META(phi::RealAndImagInferMeta));
 
 namespace ops = paddle::operators;
 
diff --git a/paddle/fluid/operators/reduce_ops/check_reduce_rank_test.cu b/paddle/fluid/operators/reduce_ops/check_reduce_rank_test.cu
index e8e4ff7010d3df01cda514d51796b789ef5e1da6..a724524716be39e554c6046ca809624b7fbb053a 100644
--- a/paddle/fluid/operators/reduce_ops/check_reduce_rank_test.cu
+++ b/paddle/fluid/operators/reduce_ops/check_reduce_rank_test.cu
@@ -39,9 +39,9 @@ TEST(test_reduce_rank_check, all) {
       }
 
       if (is_valid) {
-        phi::kernels::details::CheckReduceRank(reduce_rank, rank);
+        phi::funcs::details::CheckReduceRank(reduce_rank, rank);
       } else {
-        ASSERT_THROW(phi::kernels::details::CheckReduceRank(reduce_rank, rank),
+        ASSERT_THROW(phi::funcs::details::CheckReduceRank(reduce_rank, rank),
                      paddle::platform::EnforceNotMet);
       }
     }
diff --git a/paddle/fluid/operators/reduce_ops/reduce_all_op.cc b/paddle/fluid/operators/reduce_ops/reduce_all_op.cc
index 955cf8d4448c1b23319fa3e0c10dbd12ae3bf49c..9115d21b195e1b615f43b01af61bbdebd1e70294 100644
--- a/paddle/fluid/operators/reduce_ops/reduce_all_op.cc
+++ b/paddle/fluid/operators/reduce_ops/reduce_all_op.cc
@@ -14,6 +14,10 @@
 
 #include "paddle/fluid/operators/reduce_ops/reduce_all_op.h"
 
+#include "paddle/fluid/framework/infershape_utils.h"
+#include "paddle/phi/core/infermeta_utils.h"
+#include "paddle/phi/infermeta/unary.h"
+
 namespace paddle {
 namespace framework {
 class OpDesc;
@@ -28,9 +32,17 @@ class CPUDeviceContext;
 }  // namespace platform
 }  // namespace paddle
 
+DECLARE_INFER_SHAPE_FUNCTOR(reduce_all, ReduceAllInferShapeFunctor,
+                            PD_INFER_META(phi::ReduceInferMetaBase));
+class ReduceAllOpMaker : public ops::ReduceOpMaker {
+ protected:
+  virtual std::string GetName() const { return "reduce_all"; }
+  virtual std::string GetOpType() const { return "Reduce reduce_all"; }
+};
 // kernel's device type is decided by input tensor place, to be consistent with
 // compare and logical ops
-REGISTER_REDUCE_OP_WITHOUT_GRAD(reduce_all, UseInputPlace);
-REGISTER_OP_CPU_KERNEL(reduce_all,
-                       ops::BoolReduceKernel<paddle::platform::CPUDeviceContext,
-                                             bool, ops::AllFunctor>);
+REGISTER_OPERATOR(
+    reduce_all, ops::ReduceOpUseInputPlace, ReduceAllOpMaker,
+    paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
+    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
+    ReduceAllInferShapeFunctor);
diff --git a/paddle/fluid/operators/reduce_ops/reduce_any_op.cc b/paddle/fluid/operators/reduce_ops/reduce_any_op.cc
index fa3800dd3c9e46c20df54d748a61166a75be492b..69561b93498883bdf2adcfa3982d24bc1e727be0 100644
--- a/paddle/fluid/operators/reduce_ops/reduce_any_op.cc
+++ b/paddle/fluid/operators/reduce_ops/reduce_any_op.cc
@@ -14,6 +14,9 @@
 
 #include "paddle/fluid/operators/reduce_ops/reduce_any_op.h"
 
+#include "paddle/fluid/framework/infershape_utils.h"
+#include "paddle/phi/core/infermeta_utils.h"
+#include "paddle/phi/infermeta/unary.h"
 namespace paddle {
 namespace framework {
 class OpDesc;
@@ -28,9 +31,18 @@ class CPUDeviceContext;
 }  // namespace platform
 }  // namespace paddle
 
+DECLARE_INFER_SHAPE_FUNCTOR(reduce_any, ReduceAnyInferShapeFunctor,
+                            PD_INFER_META(phi::ReduceInferMetaBase));
+
+class ReduceAnyOpMaker : public ops::ReduceOpMaker {
+ protected:
+  virtual std::string GetName() const { return "reduce_any"; }
+  virtual std::string GetOpType() const { return "Reduce reduce_any"; }
+};
 // kernel's device type is decided by input tensor place, to be consistent with
 // compare and logical ops
-REGISTER_REDUCE_OP_WITHOUT_GRAD(reduce_any, UseInputPlace);
-REGISTER_OP_CPU_KERNEL(reduce_any,
-                       ops::BoolReduceKernel<paddle::platform::CPUDeviceContext,
-                                             bool, ops::AnyFunctor>);
+REGISTER_OPERATOR(
+    reduce_any, ops::ReduceOpUseInputPlace, ReduceAnyOpMaker,
+    paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
+    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
+    ReduceAnyInferShapeFunctor);
diff --git a/paddle/fluid/operators/reduce_ops/reduce_any_op_npu_test.cc b/paddle/fluid/operators/reduce_ops/reduce_any_op_npu_test.cc
index d057ee8f5d798f61c13d5c5c166c9d71b6716d6f..e327d19ab3be8daff08b4e358081d2792fd30835 100644
--- a/paddle/fluid/operators/reduce_ops/reduce_any_op_npu_test.cc
+++ b/paddle/fluid/operators/reduce_ops/reduce_any_op_npu_test.cc
@@ -35,7 +35,7 @@ namespace p = paddle::platform;
 
 using Tensor = paddle::framework::Tensor;
 
-USE_OP(reduce_any);
+USE_OP_ITSELF(reduce_any);
 USE_OP_DEVICE_KERNEL(reduce_any, NPU);
 
 template <typename T>
diff --git a/paddle/fluid/operators/reduce_ops/reduce_max_op.cc b/paddle/fluid/operators/reduce_ops/reduce_max_op.cc
index cb438b4a8057267015c8b3c15dd8468fca5a4b44..41df8e4a15f093a40a31c70eea98dfb7e575f4cd 100644
--- a/paddle/fluid/operators/reduce_ops/reduce_max_op.cc
+++ b/paddle/fluid/operators/reduce_ops/reduce_max_op.cc
@@ -14,15 +14,28 @@
 
 #include "paddle/fluid/operators/reduce_ops/reduce_min_max_op.h"
 
-REGISTER_REDUCE_OP(reduce_max);
-REGISTER_OP_CPU_KERNEL(
-    reduce_max, ops::ReduceKernel<paddle::platform::CPUDeviceContext, float,
-                                  ops::MaxFunctor>,
-    ops::ReduceKernel<paddle::platform::CPUDeviceContext, double,
-                      ops::MaxFunctor>,
-    ops::ReduceKernel<paddle::platform::CPUDeviceContext, int, ops::MaxFunctor>,
-    ops::ReduceKernel<paddle::platform::CPUDeviceContext, int64_t,
-                      ops::MaxFunctor>);
+#include "paddle/fluid/framework/infershape_utils.h"
+#include "paddle/phi/core/infermeta_utils.h"
+#include "paddle/phi/infermeta/unary.h"
+
+namespace ops = paddle::operators;
+
+class ReduceMaxOpMaker : public ops::ReduceOpMaker {
+ protected:
+  virtual std::string GetName() const { return "reduce_max"; }
+  virtual std::string GetOpType() const { return "Reduce reduce_max"; }
+};
+
+DECLARE_INFER_SHAPE_FUNCTOR(reduce_max, ReduceMaxInferShapeFunctor,
+                            PD_INFER_META(phi::ReduceInferMetaBase));
+
+REGISTER_OPERATOR(
+    reduce_max, ops::ReduceOp, ReduceMaxOpMaker,
+    paddle::framework::DefaultGradOpMaker<paddle::framework::OpDesc, true>,
+    paddle::framework::DefaultGradOpMaker<paddle::imperative::OpBase, true>,
+    ReduceMaxInferShapeFunctor);
+REGISTER_OPERATOR(reduce_max_grad, ops::ReduceGradOp)
+
 REGISTER_OP_CPU_KERNEL(
     reduce_max_grad, ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
                                            float, ops::MaxOrMinGradFunctor>,
diff --git a/paddle/fluid/operators/reduce_ops/reduce_max_op.cu b/paddle/fluid/operators/reduce_ops/reduce_max_op.cu
deleted file mode 100644
index 8194805ddc3736b365667883447cc13d7b729494..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/reduce_ops/reduce_max_op.cu
+++ /dev/null
@@ -1,23 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h"
-#include "paddle/fluid/operators/reduce_ops/reduce_op.h"
-
-// reduce_max
-REGISTER_OP_CUDA_KERNEL(
-    reduce_max,
-    ops::ReduceCudaKernel<float, kps::MaxFunctor, kps::IdentityFunctor>,
-    ops::ReduceCudaKernel<double, kps::MaxFunctor, kps::IdentityFunctor>,
-    ops::ReduceCudaKernel<int, kps::MaxFunctor, kps::IdentityFunctor>,
-    ops::ReduceCudaKernel<int64_t, kps::MaxFunctor, kps::IdentityFunctor>);
diff --git a/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc b/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc
index 6157a3a925de51a9b65efbb2df9d5178132b1baf..4a18330913803f822436118a35fb957b7e31b391 100644
--- a/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc
+++ b/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc
@@ -96,8 +96,8 @@ class __reduce_meanMaker__ : public ops::ReduceOpMaker {
   virtual std::string GetOpType() const { return "Reduce reduce_mean"; }
 };
 
-DELCARE_INFER_SHAPE_FUNCTOR(reduce_mean, ReduceMeanInferShapeFunctor,
-                            PT_INFER_META(phi::MeanRawInferMeta));
+DECLARE_INFER_SHAPE_FUNCTOR(reduce_mean, ReduceMeanInferShapeFunctor,
+                            PD_INFER_META(phi::ReduceInferMetaBase));
 
 REGISTER_OPERATOR(reduce_mean, ops::ReduceOp, __reduce_meanMaker__,
                   ops::ReduceMeanOpGradMaker<paddle::framework::OpDesc>,
diff --git a/paddle/fluid/operators/reduce_ops/reduce_min_op.cc b/paddle/fluid/operators/reduce_ops/reduce_min_op.cc
index 11aa78382e319331dc65ec22927f0d5762adfb43..b9915f2b484f140bfd776b64459a19c6788a55c9 100644
--- a/paddle/fluid/operators/reduce_ops/reduce_min_op.cc
+++ b/paddle/fluid/operators/reduce_ops/reduce_min_op.cc
@@ -14,15 +14,28 @@
 
 #include "paddle/fluid/operators/reduce_ops/reduce_min_max_op.h"
 
-REGISTER_REDUCE_OP(reduce_min);
-REGISTER_OP_CPU_KERNEL(
-    reduce_min, ops::ReduceKernel<paddle::platform::CPUDeviceContext, float,
-                                  ops::MinFunctor>,
-    ops::ReduceKernel<paddle::platform::CPUDeviceContext, double,
-                      ops::MinFunctor>,
-    ops::ReduceKernel<paddle::platform::CPUDeviceContext, int, ops::MinFunctor>,
-    ops::ReduceKernel<paddle::platform::CPUDeviceContext, int64_t,
-                      ops::MinFunctor>);
+#include "paddle/fluid/framework/infershape_utils.h"
+#include "paddle/phi/core/infermeta_utils.h"
+#include "paddle/phi/infermeta/unary.h"
+
+namespace ops = paddle::operators;
+
+class ReduceMinOpMaker : public ops::ReduceOpMaker {
+ protected:
+  virtual std::string GetName() const { return "reduce_min"; }
+  virtual std::string GetOpType() const { return "Reduce reduce_min"; }
+};
+
+DECLARE_INFER_SHAPE_FUNCTOR(reduce_min, ReduceMinInferShapeFunctor,
+                            PD_INFER_META(phi::ReduceInferMetaBase));
+
+REGISTER_OPERATOR(
+    reduce_min, ops::ReduceOp, ReduceMinOpMaker,
+    paddle::framework::DefaultGradOpMaker<paddle::framework::OpDesc, true>,
+    paddle::framework::DefaultGradOpMaker<paddle::imperative::OpBase, true>,
+    ReduceMinInferShapeFunctor);
+REGISTER_OPERATOR(reduce_min_grad, ops::ReduceGradOp)
+
 REGISTER_OP_CPU_KERNEL(
     reduce_min_grad, ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
                                            float, ops::MaxOrMinGradFunctor>,
diff --git a/paddle/fluid/operators/reduce_ops/reduce_min_op.cu b/paddle/fluid/operators/reduce_ops/reduce_min_op.cu
deleted file mode 100644
index 44548b8d2e778e4a570d085be6f2538b64ab7824..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/reduce_ops/reduce_min_op.cu
+++ /dev/null
@@ -1,23 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h"
-#include "paddle/fluid/operators/reduce_ops/reduce_op.h"
-
-// reduce_min
-REGISTER_OP_CUDA_KERNEL(
-    reduce_min,
-    ops::ReduceCudaKernel<float, kps::MinFunctor, kps::IdentityFunctor>,
-    ops::ReduceCudaKernel<double, kps::MinFunctor, kps::IdentityFunctor>,
-    ops::ReduceCudaKernel<int, kps::MinFunctor, kps::IdentityFunctor>,
-    ops::ReduceCudaKernel<int64_t, kps::MinFunctor, kps::IdentityFunctor>);
diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h
index 3aab906804f7adb95f80aa2675f01217b0b48d39..160617695338a9f2e140b7b418c93ef0d7c57e17 100644
--- a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h
+++ b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h
@@ -23,8 +23,7 @@
 #include "paddle/fluid/framework/tensor.h"
 
 #include "paddle/phi/core/dense_tensor.h"
-#include "paddle/phi/kernels/gpu/reduce.h"
-
+#include "paddle/phi/kernels/funcs/reduce_function.h"
 namespace paddle {
 namespace operators {
 
@@ -37,9 +36,9 @@ void TensorReduceImpl(const platform::CUDADeviceContext& dev_ctx,
                       gpuStream_t stream) {
   y->mutable_data<Ty>(x.place());
 
-  phi::kernels::TensorReduceImpl<Tx, Ty, ReduceOp, TransformOp>(
+  phi::funcs::ReduceKernel<Tx, Ty, ReduceOp, TransformOp>(
       static_cast<const phi::GPUContext&>(dev_ctx), x, y, transform,
-      origin_reduce_dims, stream);
+      origin_reduce_dims);
 }
 
 }  // namespace operators
diff --git a/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc b/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc
index 8ef0712dc7a757dfe91e48e7b0bb32f24840e02e..2a78774f3706e73bd8931e80fe020faac58d7ff5 100644
--- a/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc
+++ b/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc
@@ -102,8 +102,8 @@ class ReduceSumOpMaker : public ops::ReduceOpMaker {
   virtual std::string GetOpType() const { return "Reduce reduce_sum"; }
 };
 
-DELCARE_INFER_SHAPE_FUNCTOR(reduce_sum, ReduceSumInferShapeFunctor,
-                            PT_INFER_META(phi::ReduceInferMetaBase));
+DECLARE_INFER_SHAPE_FUNCTOR(reduce_sum, ReduceSumInferShapeFunctor,
+                            PD_INFER_META(phi::SumRawInferMeta));
 
 REGISTER_OPERATOR(reduce_sum, ops::ReduceOp, ReduceSumOpMaker,
                   ops::ReduceSumVarTypeInference,
@@ -114,16 +114,3 @@ REGISTER_OPERATOR(reduce_sum_grad, ops::ReduceGradOp,
                   ops::ReduceSumDoubleOpGradMaker<paddle::framework::OpDesc>,
                   ops::ReduceSumDoubleOpGradMaker<paddle::imperative::OpBase>,
                   ops::ReduceSumGradNoNeedBufferVarInferer);
-
-template <typename T>
-using CPUReduceSumGradKernel =
-    ops::ReduceSumGradKernel<paddle::platform::CPUDeviceContext, T,
-                             ops::SumGradFunctor, true>;
-
-REGISTER_OP_CPU_KERNEL(
-    reduce_sum_grad, CPUReduceSumGradKernel<bool>,
-    CPUReduceSumGradKernel<float>, CPUReduceSumGradKernel<double>,
-    CPUReduceSumGradKernel<paddle::platform::float16>,
-    CPUReduceSumGradKernel<int>, CPUReduceSumGradKernel<int64_t>,
-    CPUReduceSumGradKernel<paddle::platform::complex<float>>,
-    CPUReduceSumGradKernel<paddle::platform::complex<double>>);
diff --git a/paddle/fluid/operators/reduce_ops/reduce_sum_op.part.cu b/paddle/fluid/operators/reduce_ops/reduce_sum_op.part.cu
deleted file mode 100644
index 2f6bf127518090916c4b947daf1d1f202fdd5960..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/reduce_ops/reduce_sum_op.part.cu
+++ /dev/null
@@ -1,29 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h"
-#include "paddle/fluid/operators/reduce_ops/reduce_sum_op.h"
-
-template <typename T>
-using CUDAReduceSumGradKernel =
-    ops::ReduceCudaGradKernel<T, kps::IdentityFunctor>;
-
-REGISTER_OP_CUDA_KERNEL(
-    reduce_sum_grad, CUDAReduceSumGradKernel<bool>,
-    CUDAReduceSumGradKernel<float>, CUDAReduceSumGradKernel<double>,
-    CUDAReduceSumGradKernel<paddle::platform::float16>,
-    CUDAReduceSumGradKernel<paddle::platform::bfloat16>,
-    CUDAReduceSumGradKernel<int>, CUDAReduceSumGradKernel<int64_t>,
-    CUDAReduceSumGradKernel<paddle::platform::complex<float>>,
-    CUDAReduceSumGradKernel<paddle::platform::complex<double>>);
diff --git a/paddle/fluid/operators/rnn_op.h b/paddle/fluid/operators/rnn_op.h
index b636184ae457edf5c8028fecfb92a3ea96f5a0d9..a473b54c1f855945a5f3f0ac8d0826b15494ba1a 100644
--- a/paddle/fluid/operators/rnn_op.h
+++ b/paddle/fluid/operators/rnn_op.h
@@ -16,9 +16,9 @@ limitations under the License. */
 #include <type_traits>
 #include <vector>
 
+#include "paddle/fluid/framework/generator.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/activation_op.h"
-#include "paddle/fluid/operators/dropout_op.h"
 #include "paddle/fluid/operators/math/concat_and_split.h"
 #include "paddle/fluid/operators/math/fc.h"
 #include "paddle/fluid/operators/unique_op.h"
@@ -36,6 +36,14 @@ using LoDTensor = framework::LoDTensor;
 using Tensor = framework::Tensor;
 using TensorList = std::vector<framework::Tensor>;
 
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
+
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
+
 #define DEFINE_MODE_DETECTOR(MODE_NAME, MODE_STR)                      \
   inline bool is_##MODE_NAME(const framework::ExecutionContext& ctx) { \
     const std::string& mode = ctx.Attr<std::string>("mode");           \
diff --git a/paddle/fluid/operators/roi_align_op.cc b/paddle/fluid/operators/roi_align_op.cc
index 5627b4f229e100d9979663e8688b8694188bab0f..ac0cd75237baf5e8b860f197d42cd27bae65270e 100644
--- a/paddle/fluid/operators/roi_align_op.cc
+++ b/paddle/fluid/operators/roi_align_op.cc
@@ -226,11 +226,7 @@ REGISTER_OPERATOR(roi_align, ops::ROIAlignOp, ops::ROIAlignOpMaker,
                   ops::ROIAlignGradMaker<paddle::imperative::OpBase>);
 REGISTER_OPERATOR(roi_align_grad, ops::ROIAlignGradOp,
                   ops::RoiAlignGradNoNeedBufVarsInferer);
-REGISTER_OP_CPU_KERNEL(
-    roi_align,
-    ops::CPUROIAlignOpKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::CPUROIAlignOpKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::CPUROIAlignOpKernel<paddle::platform::CPUDeviceContext, int>);
+
 REGISTER_OP_CPU_KERNEL(
     roi_align_grad,
     ops::CPUROIAlignGradOpKernel<paddle::platform::CPUDeviceContext, float>,
diff --git a/paddle/fluid/operators/roi_align_op.cu b/paddle/fluid/operators/roi_align_op.cu
index 18941d10e937d3c28e5793384f00d9d97225a128..1a2e64cd45ca401f5fb8ca6b6975a029ba735280 100644
--- a/paddle/fluid/operators/roi_align_op.cu
+++ b/paddle/fluid/operators/roi_align_op.cu
@@ -33,43 +33,6 @@ static inline int NumBlocks(const int N) {
                   kNumMaxinumNumBlocks);
 }
 
-template <class T>
-__device__ T BilinearInterpolate(const T* input_data, const int height,
-                                 const int width, T y, T x) {
-  if (y < -1.0 || y > height || x < -1.0 || x > width) {
-    return 0;
-  }
-  y = y <= 0 ? 0 : y;
-  x = x <= 0 ? 0 : x;
-  int y_low = static_cast<int>(y);
-  int x_low = static_cast<int>(x);
-  int y_high;
-  int x_high;
-  if (y_low >= height - 1) {
-    y_high = y_low = height - 1;
-    y = static_cast<T>(y_low);
-  } else {
-    y_high = y_low + 1;
-  }
-  if (x_low >= width - 1) {
-    x_high = x_low = width - 1;
-    x = static_cast<T>(x_low);
-  } else {
-    x_high = x_low + 1;
-  }
-  T ly = y - y_low, lx = x - x_low;
-  T hy = 1. - ly, hx = 1. - lx;
-
-  T v1 = input_data[y_low * width + x_low];
-  T v2 = input_data[y_low * width + x_high];
-  T v3 = input_data[y_high * width + x_low];
-  T v4 = input_data[y_high * width + x_high];
-  T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
-
-  T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
-  return val;
-}
-
 template <class T>
 __device__ void BilinearInterpolateGradient(const int height, const int width,
                                             T y, T x, T* w1, T* w2, T* w3,
@@ -102,65 +65,6 @@ __device__ void BilinearInterpolateGradient(const int height, const int width,
   return;
 }
 
-template <class T>
-__global__ void GPUROIAlignForward(
-    const int nthreads, const T* input_data, const T* input_rois,
-    const float spatial_scale, const int channels, const int height,
-    const int width, const int pooled_height, const int pooled_width,
-    const int sampling_ratio, int* roi_batch_id_data, T* output_data,
-    const bool continuous_coordinate) {
-  CUDA_KERNEL_LOOP(i, nthreads) {
-    int pw = i % pooled_width;
-    int ph = (i / pooled_width) % pooled_height;
-    int c = (i / pooled_width / pooled_height) % channels;
-    int n = i / pooled_width / pooled_height / channels;
-
-    const T* offset_input_rois = input_rois + n * kROISize;
-    int roi_batch_ind = roi_batch_id_data[n];
-
-    T roi_offset = continuous_coordinate ? static_cast<T>(0.5) : 0;
-    T roi_xmin = offset_input_rois[0] * spatial_scale - roi_offset;
-    T roi_ymin = offset_input_rois[1] * spatial_scale - roi_offset;
-    T roi_xmax = offset_input_rois[2] * spatial_scale - roi_offset;
-    T roi_ymax = offset_input_rois[3] * spatial_scale - roi_offset;
-
-    T roi_width = roi_xmax - roi_xmin;
-    T roi_height = roi_ymax - roi_ymin;
-    if (!continuous_coordinate) {
-      roi_width = max(roi_width, static_cast<T>(1.));
-      roi_height = max(roi_height, static_cast<T>(1.));
-    }
-
-    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
-    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
-
-    const T* offset_input_data =
-        input_data + (roi_batch_ind * channels + c) * height * width;
-
-    int roi_bin_grid_h = (sampling_ratio > 0)
-                             ? sampling_ratio
-                             : ceil(roi_height / pooled_height);
-    int roi_bin_grid_w =
-        (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
-    const T count = max(roi_bin_grid_h * roi_bin_grid_w, 1);
-    T output_val = 0;
-    for (int iy = 0; iy < roi_bin_grid_h; iy++) {
-      const T y = roi_ymin + ph * bin_size_h +
-                  static_cast<T>(iy + .5f) * bin_size_h /
-                      static_cast<T>(roi_bin_grid_h);
-      for (int ix = 0; ix < roi_bin_grid_w; ix++) {
-        const T x = roi_xmin + pw * bin_size_w +
-                    static_cast<T>(ix + .5f) * bin_size_w /
-                        static_cast<T>(roi_bin_grid_w);
-        T val = BilinearInterpolate(offset_input_data, height, width, y, x);
-        output_val += val;
-      }
-    }
-    output_val /= count;
-    output_data[i] = output_val;
-  }
-}
-
 template <typename T>
 __global__ void GPUROIAlignBackward(
     const int nthreads, const T* input_rois, const T* out_grad,
@@ -236,105 +140,6 @@ __global__ void GPUROIAlignBackward(
   }
 }
 
-template <typename Place, typename T>
-class GPUROIAlignOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* in = ctx.Input<Tensor>("X");
-    auto* rois = ctx.Input<LoDTensor>("ROIs");
-    auto* out = ctx.Output<Tensor>("Out");
-
-    auto pooled_height = ctx.Attr<int>("pooled_height");
-    auto pooled_width = ctx.Attr<int>("pooled_width");
-    auto spatial_scale = ctx.Attr<float>("spatial_scale");
-    auto sampling_ratio = ctx.Attr<int>("sampling_ratio");
-    auto aligned = ctx.Attr<bool>("aligned");
-
-    auto in_dims = in->dims();
-    int batch_size = in_dims[0];
-    int channels = in_dims[1];
-    int height = in_dims[2];
-    int width = in_dims[3];
-
-    int rois_num = rois->dims()[0];
-
-    if (rois_num == 0) return;
-
-    int output_size = out->numel();
-    int blocks = NumBlocks(output_size);
-    int threads = kNumCUDAThreads;
-#ifdef WITH_NV_JETSON
-    platform::ChangeThreadNum(ctx.cuda_device_context(), &threads, 256);
-#endif
-    Tensor roi_batch_id_list;
-    roi_batch_id_list.Resize({rois_num});
-    auto cplace = platform::CPUPlace();
-    int* roi_batch_id_data = roi_batch_id_list.mutable_data<int>(cplace);
-    auto& dev_ctx = ctx.cuda_device_context();
-    auto gplace = ctx.GetPlace();
-    if (ctx.HasInput("RoisNum")) {
-      auto* rois_num_t = ctx.Input<Tensor>("RoisNum");
-      int rois_batch_size = rois_num_t->numel();
-      PADDLE_ENFORCE_EQ(
-          rois_batch_size, batch_size,
-          platform::errors::InvalidArgument(
-              "The rois_batch_size and imgs "
-              "batch_size must be the same. But received rois_batch_size = %d, "
-              "batch_size = %d",
-              rois_batch_size, batch_size));
-
-      std::vector<int> rois_num_list(rois_batch_size);
-      memory::Copy(cplace, rois_num_list.data(), gplace,
-                   rois_num_t->data<int>(), sizeof(int) * rois_batch_size, 0);
-      int start = 0;
-      for (int n = 0; n < rois_batch_size; ++n) {
-        for (int i = start; i < start + rois_num_list[n]; ++i) {
-          roi_batch_id_data[i] = n;
-        }
-        start += rois_num_list[n];
-      }
-    } else {
-      auto lod = rois->lod();
-      PADDLE_ENFORCE_EQ(
-          lod.empty(), false,
-          platform::errors::InvalidArgument("Input(ROIs) in ROIAlignOp does "
-                                            "not contain LoD information."));
-      auto rois_lod = lod.back();
-      int rois_batch_size = rois_lod.size() - 1;
-      PADDLE_ENFORCE_EQ(
-          rois_batch_size, batch_size,
-          platform::errors::InvalidArgument(
-              "The batch size of rois and batch size "
-              "of images must be the same. But received rois batch size = %d, "
-              "and images batch size = %d",
-              rois_batch_size, batch_size));
-      int rois_num_with_lod = rois_lod[rois_batch_size];
-      PADDLE_ENFORCE_EQ(
-          rois_num, rois_num_with_lod,
-          platform::errors::InvalidArgument(
-              "The actual number of rois and the number of rois "
-              "provided from Input(RoIsLoD) in RoIAlign must be the same."
-              " But received actual number of rois is %d, and the number "
-              "of rois from RoIsLoD is %d",
-              rois_num, rois_num_with_lod));
-      for (int n = 0; n < rois_batch_size; ++n) {
-        for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
-          roi_batch_id_data[i] = n;
-        }
-      }
-    }
-    int bytes = roi_batch_id_list.numel() * sizeof(int);
-    auto roi_ptr = memory::Alloc(dev_ctx, bytes);
-    int* roi_id_data = reinterpret_cast<int*>(roi_ptr->ptr());
-    memory::Copy(gplace, roi_id_data, cplace, roi_batch_id_data, bytes,
-                 dev_ctx.stream());
-    GPUROIAlignForward<T><<<blocks, threads, 0, dev_ctx.stream()>>>(
-        output_size, in->data<T>(), rois->data<T>(), spatial_scale, channels,
-        height, width, pooled_height, pooled_width, sampling_ratio, roi_id_data,
-        out->mutable_data<T>(ctx.GetPlace()), aligned);
-  }
-};
-
 template <typename Place, typename T>
 class GPUROIAlignGradOpKernel : public framework::OpKernel<T> {
  public:
@@ -416,10 +221,6 @@ class GPUROIAlignGradOpKernel : public framework::OpKernel<T> {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    roi_align,
-    ops::GPUROIAlignOpKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::GPUROIAlignOpKernel<paddle::platform::CUDADeviceContext, double>);
 REGISTER_OP_CUDA_KERNEL(
     roi_align_grad,
     ops::GPUROIAlignGradOpKernel<paddle::platform::CUDADeviceContext, float>,
diff --git a/paddle/fluid/operators/roi_align_op.h b/paddle/fluid/operators/roi_align_op.h
index e71099ed99f00f5846e6e23d5d39b3b2f8997531..589e35e4ab7ae4caf5efd3fb4d93a26b2ca86b26 100644
--- a/paddle/fluid/operators/roi_align_op.h
+++ b/paddle/fluid/operators/roi_align_op.h
@@ -23,152 +23,6 @@ namespace operators {
 using Tensor = framework::Tensor;
 using LoDTensor = framework::LoDTensor;
 
-namespace {  // NOLINT
-constexpr size_t get_offset(size_t x, size_t y, size_t width) {
-  return y * width + x;
-}
-
-template <class T>
-struct offsets_and_ratios {
-  offsets_and_ratios() = default;
-  offsets_and_ratios(std::size_t xy, std::size_t xY, std::size_t Xy,
-                     std::size_t XY, T xy_ratio, T xY_ratio, T Xy_ratio,
-                     T XY_ratio)
-      : xy(xy),
-        xY(xY),
-        Xy(Xy),
-        XY(XY),
-        xy_ratio(xy_ratio),
-        xY_ratio(xY_ratio),
-        Xy_ratio(Xy_ratio),
-        XY_ratio(XY_ratio) {}
-
-  std::size_t xy = 0;
-  std::size_t xY = 0;
-  std::size_t Xy = 0;
-  std::size_t XY = 0;
-  T xy_ratio = 0.0f;
-  T xY_ratio = 0.0f;
-  T Xy_ratio = 0.0f;
-  T XY_ratio = 0.0f;
-};
-
-template <typename T>
-std::vector<offsets_and_ratios<T>> get_indexes_and_ratios(
-    std::size_t width, std::size_t height, const T roi_width,
-    const T roi_height, const T roi_xmin, const T roi_ymin,
-    std::size_t pooled_width, std::size_t roi_bin_grid_w,
-    std::size_t pooled_height, std::size_t roi_bin_grid_h) {
-  const auto ind_num =
-      pooled_width * roi_bin_grid_w * pooled_height * roi_bin_grid_h;
-
-  std::vector<offsets_and_ratios<T>> interpolation_cords;
-  interpolation_cords.reserve(ind_num);
-
-  const auto bin_w = roi_width / pooled_width;
-  const auto bin_h = roi_height / pooled_height;
-
-  for (std::size_t py = 0; py < pooled_height; py++) {
-    for (std::size_t px = 0; px < pooled_width; px++) {
-      for (std::size_t iy = 0; iy < roi_bin_grid_h; iy++) {
-        // calculate x of sample points
-        auto y =
-            roi_ymin +
-            bin_h * (py +
-                     static_cast<T>(iy + .5f) / static_cast<T>(roi_bin_grid_h));
-        for (std::size_t ix = 0; ix < roi_bin_grid_w; ix++) {
-          // calculate x of sample points
-          auto x = roi_xmin +
-                   bin_w * (px +
-                            static_cast<T>(ix + .5f) /
-                                static_cast<T>(roi_bin_grid_w));
-
-          // deal with elements out of map
-          if (y < -1.0 || y > height || x < -1.0 || x > width) {
-            interpolation_cords.emplace_back();
-            continue;
-          }
-          y = y <= 0 ? 0 : y;
-          x = x <= 0 ? 0 : x;
-
-          std::size_t x_low_index = static_cast<std::size_t>(x);
-          std::size_t x_high_index;
-          if (x_low_index >= width - 1) {
-            x_high_index = x_low_index = width - 1;
-            x = static_cast<T>(x_low_index);
-          } else {
-            x_high_index = x_low_index + 1;
-          }
-          T x_ratio = x_high_index - x;
-
-          std::size_t y_low_index = static_cast<std::size_t>(y);
-          std::size_t y_high_index;
-          if (y_low_index >= height - 1) {
-            y_high_index = y_low_index = height - 1;
-            y = static_cast<T>(y_low_index);
-          } else {
-            y_high_index = y_low_index + 1;
-          }
-          T y_ratio = y_high_index - y;
-
-          auto xy = get_offset(x_low_index, y_low_index, width);
-          auto xY = get_offset(x_low_index, y_high_index, width);
-          auto Xy = get_offset(x_high_index, y_low_index, width);
-          auto XY = get_offset(x_high_index, y_high_index, width);
-
-          auto xy_ratio = x_ratio * y_ratio;
-          auto xY_ratio = x_ratio * (1 - y_ratio);
-          auto Xy_ratio = (1 - x_ratio) * y_ratio;
-          auto XY_ratio = (1 - x_ratio) * (1 - y_ratio);
-
-          interpolation_cords.emplace_back(xy, xY, Xy, XY, xy_ratio, xY_ratio,
-                                           Xy_ratio, XY_ratio);
-        }
-      }
-    }
-  }
-  return interpolation_cords;
-}  // namespace
-
-template <typename T>
-void interpolate(std::vector<T>& interpolated_values,  // NOLINT
-                 const std::vector<offsets_and_ratios<T>>& interpolation_cords,
-                 const T* data) {
-  for (auto& ic : interpolation_cords) {
-    auto xlyl_offset = ic.xy;
-    auto xhyl_offset = ic.Xy;
-    auto xlyh_offset = ic.xY;
-    auto xhyh_offset = ic.XY;
-
-    auto xlyl_ratio = ic.xy_ratio;
-    auto xhyl_ratio = ic.Xy_ratio;
-    auto xlyh_ratio = ic.xY_ratio;
-    auto xhyh_ratio = ic.XY_ratio;
-
-    interpolated_values.emplace_back(
-        xlyl_ratio * data[xlyl_offset] + xhyl_ratio * data[xhyl_offset] +
-        xlyh_ratio * data[xlyh_offset] + xhyh_ratio * data[xhyh_offset]);
-  }
-}
-
-template <typename T>
-void avg_pool(const std::vector<T>& interpolated_values, T* output_data,
-              int roi_bin_grid_w, int roi_bin_grid_h, int pooled_width,
-              int pooled_height) {
-  const auto data_amount = pooled_width * pooled_height;
-  const auto grid_points = roi_bin_grid_w * roi_bin_grid_h;
-  const T count = 1.0 / grid_points;
-  auto val_begin = interpolated_values.cbegin();
-  for (auto i = 0; i < data_amount; ++i) {
-    T sum = 0.0;
-    auto val_end = val_begin + grid_points;
-    sum = std::accumulate(val_begin, val_end, sum);
-    val_begin = val_end;
-    output_data[i] = sum * count;
-  }
-}
-}  // NOLINT
-
 template <class T>
 void bilinear_interpolate_gradient(const int height, const int width, T y, T x,
                                    const T out_grad_this_bin, const T count,
@@ -213,129 +67,6 @@ void bilinear_interpolate_gradient(const int height, const int width, T y, T x,
   }
 }
 
-template <typename DeviceContext, typename T>
-class CPUROIAlignOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* in = ctx.Input<framework::Tensor>("X");
-    auto* rois = ctx.Input<framework::LoDTensor>("ROIs");
-    auto* out = ctx.Output<framework::Tensor>("Out");
-    auto pooled_height = ctx.Attr<int>("pooled_height");
-    auto pooled_width = ctx.Attr<int>("pooled_width");
-    auto spatial_scale = ctx.Attr<float>("spatial_scale");
-    auto sampling_ratio = ctx.Attr<int>("sampling_ratio");
-    auto aligned = ctx.Attr<bool>("aligned");
-
-    auto in_dims = in->dims();
-    int batch_size = in_dims[0];
-    int channels = in_dims[1];
-    int height = in_dims[2];
-    int width = in_dims[3];
-    int rois_num = rois->dims()[0];
-
-    auto in_stride = phi::stride(in_dims);
-    auto roi_stride = phi::stride(rois->dims());
-    auto out_stride = phi::stride(out->dims());
-
-    const T* input_data = in->data<T>();
-    framework::Tensor roi_batch_id_list;
-    roi_batch_id_list.Resize({rois_num});
-    int* roi_batch_id_data =
-        roi_batch_id_list.mutable_data<int>(ctx.GetPlace());
-    int rois_batch_size;
-    if (ctx.HasInput("RoisNum")) {
-      auto* rois_num_t = ctx.Input<framework::Tensor>("RoisNum");
-      rois_batch_size = rois_num_t->numel();
-      PADDLE_ENFORCE_EQ(
-          rois_batch_size, batch_size,
-          platform::errors::InvalidArgument(
-              "The batch size of rois and the batch size of images "
-              " must be the same. But received the batch size of rois is %d, "
-              "and the batch size of images is %d",
-              rois_batch_size, batch_size));
-      auto* rois_num_data = rois_num_t->data<int>();
-      int start = 0;
-      for (int n = 0; n < rois_batch_size; ++n) {
-        for (int i = start; i < start + rois_num_data[n]; ++i) {
-          roi_batch_id_data[i] = n;
-        }
-        start += rois_num_data[n];
-      }
-    } else {
-      auto lod = rois->lod();
-      PADDLE_ENFORCE_EQ(lod.empty(), false,
-                        platform::errors::InvalidArgument(
-                            "Input(ROIs) Tensor of ROIAlignOp "
-                            "does not contain LoD information."));
-      auto rois_lod = lod.back();
-      int rois_batch_size = rois_lod.size() - 1;
-      PADDLE_ENFORCE_EQ(
-          rois_batch_size, batch_size,
-          platform::errors::InvalidArgument(
-              "The rois_batch_size and imgs "
-              "batch_size must be the same. But received rois_batch_size = %d, "
-              "batch_size = %d",
-              rois_batch_size, batch_size));
-      int rois_num_with_lod = rois_lod[rois_batch_size];
-      PADDLE_ENFORCE_EQ(
-          rois_num, rois_num_with_lod,
-          platform::errors::InvalidArgument(
-              "The actual number of rois and the number of rois "
-              "provided from Input(RoIsLoD) in RoIAlign must be the same."
-              " But received actual number of rois is %d, and the number "
-              "of rois from RoIsLoD is %d",
-              rois_num, rois_num_with_lod));
-      for (int n = 0; n < rois_batch_size; ++n) {
-        for (std::size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
-          roi_batch_id_data[i] = n;
-        }
-      }
-    }
-    T* output_data = out->mutable_data<T>(ctx.GetPlace());
-    const T* rois_data = rois->data<T>();
-    T roi_offset = aligned ? T(0.5) : 0;
-    for (int n = 0; n < rois_num; ++n) {
-      int roi_batch_id = roi_batch_id_data[n];
-      T roi_xmin = rois_data[0] * spatial_scale - roi_offset;
-      T roi_ymin = rois_data[1] * spatial_scale - roi_offset;
-      T roi_xmax = rois_data[2] * spatial_scale - roi_offset;
-      T roi_ymax = rois_data[3] * spatial_scale - roi_offset;
-
-      T roi_width = roi_xmax - roi_xmin;
-      T roi_height = roi_ymax - roi_ymin;
-      if (!aligned) {
-        roi_width = std::max(roi_width, static_cast<T>(1.));
-        roi_height = std::max(roi_height, static_cast<T>(1.));
-      }
-
-      const T* batch_data = input_data + roi_batch_id * in_stride[0];
-
-      int roi_bin_grid_h = (sampling_ratio > 0)
-                               ? sampling_ratio
-                               : ceil(roi_height / pooled_height);
-      int roi_bin_grid_w = (sampling_ratio > 0)
-                               ? sampling_ratio
-                               : ceil(roi_width / pooled_width);
-
-      auto interpolation_cords = get_indexes_and_ratios(
-          width, height, roi_width, roi_height, roi_xmin, roi_ymin,
-          pooled_width, roi_bin_grid_w, pooled_height, roi_bin_grid_h);
-
-      std::vector<T> interpolated_values;
-      interpolated_values.reserve(interpolation_cords.size());
-      for (auto channel = 0; channel < channels; ++channel) {
-        interpolate(interpolated_values, interpolation_cords, batch_data);
-        avg_pool(interpolated_values, output_data, roi_bin_grid_w,
-                 roi_bin_grid_h, pooled_width, pooled_height);
-        batch_data += in_stride[1];
-        output_data += out_stride[1];
-        interpolated_values.clear();
-      }
-      rois_data += roi_stride[0];
-    }
-  }
-};
-
 template <typename DeviceContext, typename T>
 class CPUROIAlignGradOpKernel : public framework::OpKernel<T> {
  public:
diff --git a/paddle/fluid/operators/roi_align_op_npu.cc b/paddle/fluid/operators/roi_align_op_npu.cc
index d5b63854d99053ac0620a32cfaba267c7262d515..78509e4299b80ee44610ce3d10f9c57afa0cde18 100644
--- a/paddle/fluid/operators/roi_align_op_npu.cc
+++ b/paddle/fluid/operators/roi_align_op_npu.cc
@@ -9,7 +9,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/roi_align_op.h"
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/device/npu/npu_op_runner.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
diff --git a/paddle/fluid/operators/roi_align_op_xpu.cc b/paddle/fluid/operators/roi_align_op_xpu.cc
index 09d2d906653e8c71ddeca7fa606cf5adac8cc596..13490d6fcde3a22e7299db21969d7de6f9a6582c 100644
--- a/paddle/fluid/operators/roi_align_op_xpu.cc
+++ b/paddle/fluid/operators/roi_align_op_xpu.cc
@@ -13,13 +13,16 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #ifdef PADDLE_WITH_XPU
-#include "paddle/fluid/operators/roi_align_op.h"
 #include <memory>
 #include <string>
+#include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
 namespace operators {
 
+using Tensor = framework::Tensor;
+using LoDTensor = framework::LoDTensor;
+
 template <typename DeviceContext, typename T>
 class XPUROIAlignOpKernel : public framework::OpKernel<T> {
  public:
diff --git a/paddle/fluid/operators/save_combine_op.cc b/paddle/fluid/operators/save_combine_op.cc
index 6da73c99068bc0e0453dfdd1b5eca8e1add1954b..7fe6623dcca14afc8fafc4875ccfb7546e4456f0 100644
--- a/paddle/fluid/operators/save_combine_op.cc
+++ b/paddle/fluid/operators/save_combine_op.cc
@@ -38,7 +38,8 @@ class SaveCombineOp : public framework::OperatorWithKernel {
   framework::OpKernelType GetKernelTypeForVar(
       const std::string& var_name, const Tensor& tensor,
       const framework::OpKernelType& expected_kernel_type) const override {
-    return expected_kernel_type;
+    return framework::OpKernelType(expected_kernel_type.data_type_,
+                                   tensor.place());
   }
 };
 
diff --git a/paddle/fluid/operators/scale_op.cc b/paddle/fluid/operators/scale_op.cc
index e4410b21b541320c1d39c3ad155dfce6f74b7dc2..cbf2b9152079e13acd4a221ece402b946b844999 100644
--- a/paddle/fluid/operators/scale_op.cc
+++ b/paddle/fluid/operators/scale_op.cc
@@ -121,8 +121,8 @@ DECLARE_INPLACE_OP_INFERER(ScaleOpInplaceInferer, {"X", "Out"});
 
 namespace ops = paddle::operators;
 
-DELCARE_INFER_SHAPE_FUNCTOR(scale, ScaleInferShapeFunctor,
-                            PT_INFER_META(phi::UnchangedInferMeta));
+DECLARE_INFER_SHAPE_FUNCTOR(scale, ScaleInferShapeFunctor,
+                            PD_INFER_META(phi::UnchangedInferMeta));
 REGISTER_OPERATOR(scale, ops::ScaleOp, ops::ScaleOpMaker,
                   ops::ScaleGradMaker<paddle::framework::OpDesc>,
                   ops::ScaleGradMaker<paddle::imperative::OpBase>,
diff --git a/paddle/fluid/operators/scatter_nd_add_op.cc b/paddle/fluid/operators/scatter_nd_add_op.cc
index bb02bb541e14f551bb749c890877e4753d225c3c..0ae0e1500c16627fc269b31c57b25c47055d7d34 100644
--- a/paddle/fluid/operators/scatter_nd_add_op.cc
+++ b/paddle/fluid/operators/scatter_nd_add_op.cc
@@ -12,10 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/scatter_nd_add_op.h"
 #include <memory>
 #include <vector>
+#include "paddle/fluid/framework/infershape_utils.h"
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/phi/core/ddim.h"
+#include "paddle/phi/infermeta/backward.h"
+#include "paddle/phi/infermeta/ternary.h"
 
 namespace paddle {
 namespace operators {
@@ -24,73 +27,6 @@ class ScatterNdAddOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true,
-                      platform::errors::InvalidArgument(
-                          "Input(X) of ScatterNdAddOp should not be null."));
-    PADDLE_ENFORCE_EQ(
-        ctx->HasInput("Index"), true,
-        platform::errors::InvalidArgument(
-            "Input(Index) of ScatterNdAddOp should not be null."));
-    PADDLE_ENFORCE_EQ(
-        ctx->HasInput("Updates"), true,
-        platform::errors::InvalidArgument(
-            "Input(Updates) of ScatterNdAddOp should not be null."));
-    PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true,
-                      platform::errors::InvalidArgument(
-                          "Output(Out) of ScatterNdAddOp should not be null."));
-
-    auto ref_dims = ctx->GetInputDim("X");
-    auto ref_dims_size = ref_dims.size();
-    auto index_dims = ctx->GetInputDim("Index");
-    auto index_dims_size = index_dims.size();
-    auto updates_dims = ctx->GetInputDim("Updates");
-    auto updates_dims_size = updates_dims.size();
-
-    PADDLE_ENFORCE_LE(
-        index_dims[index_dims_size - 1], ref_dims_size,
-        platform::errors::InvalidArgument(
-            "The last dimension of Input(Index)'s shape should be no greater "
-            "than the rank of Input(X), but received the last dimension of "
-            "Input(Index)'s shape is %d, the rank of Input(X) is %d.",
-            index_dims[index_dims_size - 1], ref_dims_size));
-    PADDLE_ENFORCE_GE(index_dims_size, 2UL,
-                      platform::errors::InvalidArgument(
-                          "The rank of Input(Index) should be greater than 1, "
-                          "but received the rank of Input(Index) is %d.",
-                          index_dims_size));
-
-    // update.shape = index.shape[:-1] + output.shape[index.shape[-1]:]
-    std::vector<int64_t> r_updates_dims;
-    for (int64_t i = 0; i < index_dims_size - 1; ++i) {
-      r_updates_dims.emplace_back(index_dims[i]);
-    }
-    for (int64_t i = index_dims[index_dims_size - 1]; i < ref_dims_size; ++i) {
-      r_updates_dims.emplace_back(ref_dims[i]);
-    }
-
-    PADDLE_ENFORCE_EQ(
-        r_updates_dims.size(), updates_dims_size,
-        platform::errors::InvalidArgument(
-            "Updates has wrong shape. The shape of Updates and Input(Updates) "
-            "should be same, but received the shape of Updates is %d, "
-            "the shape of Input(Updates) is %d.",
-            r_updates_dims.size(), updates_dims_size));
-
-    for (int64_t i = 0; i < updates_dims_size; ++i) {
-      PADDLE_ENFORCE_EQ(
-          r_updates_dims[i], updates_dims[i],
-          platform::errors::InvalidArgument(
-              "Updates has wrong shape. The dimensions of Updates and "
-              "Input(Updates) should match, but received Updates's"
-              "%d-th dimension is %d, Input(Updates)'s %d-th "
-              "dimension is %d.",
-              i, r_updates_dims[i], i, updates_dims[i]));
-    }
-    ctx->SetOutputDim("Out", ref_dims);
-    ctx->ShareLoD("X", /*->*/ "Out");
-  }
-
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
@@ -99,7 +35,8 @@ class ScatterNdAddOp : public framework::OperatorWithKernel {
                       platform::errors::InvalidArgument(
                           "Ref and Updates must have same type"));
     return framework::OpKernelType(
-        framework::TransToProtoVarType(ctx.Input<Tensor>("X")->type()),
+        framework::TransToProtoVarType(
+            ctx.Input<framework::Tensor>("X")->type()),
         ctx.device_context());
   }
 };
@@ -108,17 +45,6 @@ class ScatterNdAddGradOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    if (ctx->HasOutput(framework::GradVarName("Updates"))) {
-      ctx->SetOutputDim(framework::GradVarName("Updates"),
-                        ctx->GetInputDim("Updates"));
-    }
-    if (ctx->HasOutput(framework::GradVarName("X"))) {
-      ctx->SetOutputDim(framework::GradVarName("X"),
-                        ctx->GetInputDim(framework::GradVarName("Out")));
-    }
-  }
-
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
@@ -193,22 +119,18 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(ScatterNdAddGradNoNeedBufferVarsInferer,
 
 namespace ops = paddle::operators;
 
+DECLARE_INFER_SHAPE_FUNCTOR(scatter_nd_add, ScatterNdAddInferShapeFunctor,
+                            PD_INFER_META(phi::ScatterNdAddInferMeta));
+
+DECLARE_INFER_SHAPE_FUNCTOR(scatter_nd_add_grad,
+                            ScatterNdAddGradInferShapeFunctor,
+                            PD_INFER_META(phi::ScatterNdAddGradInferMeta));
+
 REGISTER_OPERATOR(scatter_nd_add, ops::ScatterNdAddOp, ops::ScatterNdAddOpMaker,
                   ops::ScatterNdAddGradMaker<paddle::framework::OpDesc>,
-                  ops::ScatterNdAddGradMaker<paddle::imperative::OpBase>);
+                  ops::ScatterNdAddGradMaker<paddle::imperative::OpBase>,
+                  ScatterNdAddInferShapeFunctor);
 
 REGISTER_OPERATOR(scatter_nd_add_grad, ops::ScatterNdAddGradOp,
-                  ops::ScatterNdAddGradNoNeedBufferVarsInferer);
-
-REGISTER_OP_CPU_KERNEL(scatter_nd_add, ops::ScatterNdAddOpKernel<float>,
-                       ops::ScatterNdAddOpKernel<double>,
-                       ops::ScatterNdAddOpKernel<int64_t>,
-                       ops::ScatterNdAddOpKernel<int>,
-                       ops::ScatterNdAddOpKernel<uint8_t>);
-
-REGISTER_OP_CPU_KERNEL(scatter_nd_add_grad,
-                       ops::ScatterNdAddGradientOpKernel<float>,
-                       ops::ScatterNdAddGradientOpKernel<double>,
-                       ops::ScatterNdAddGradientOpKernel<int64_t>,
-                       ops::ScatterNdAddGradientOpKernel<int>,
-                       ops::ScatterNdAddGradientOpKernel<uint8_t>);
+                  ops::ScatterNdAddGradNoNeedBufferVarsInferer,
+                  ScatterNdAddGradInferShapeFunctor);
diff --git a/paddle/fluid/operators/scatter_nd_add_op.cu b/paddle/fluid/operators/scatter_nd_add_op.cu
deleted file mode 100644
index 2fe3fcb759d348b36cd6a7a2609bea210d24705f..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/scatter_nd_add_op.cu
+++ /dev/null
@@ -1,101 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/gather_op.h"
-#include "paddle/fluid/operators/scatter_nd_add_op.h"
-#include "paddle/phi/kernels/funcs/gather.cu.h"
-#include "paddle/phi/kernels/funcs/scatter.cu.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename DeviceContext, typename T>
-class ScatterNdAddOpCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true,
-                      platform::errors::PreconditionNotMet(
-                          "This kernel only runs on GPU device."));
-    auto *X = ctx.Input<Tensor>("X");
-    auto *Ids = ctx.Input<Tensor>("Index");
-    auto *Updates = ctx.Input<Tensor>("Updates");
-    auto *Out = ctx.Output<Tensor>("Out");
-
-    framework::TensorCopySync(*X, ctx.GetPlace(), Out);
-    const auto &index_type = Ids->dtype();
-    bool index_type_match = index_type == phi::DataType::INT32 ||
-                            index_type == phi::DataType::INT64;
-    PADDLE_ENFORCE_EQ(
-        index_type_match, true,
-        platform::errors::InvalidArgument(
-            "Index holds the wrong type, it holds [%s], but "
-            "desires to be [%s] or [%s].",
-            index_type, phi::DataType::INT32, phi::DataType::INT64));
-    auto &dev_ctx = ctx.cuda_device_context();
-    if (index_type == phi::DataType::INT32) {
-      phi::funcs::GPUScatterNdAdd<T, int32_t>(dev_ctx, *Updates, *Ids, Out);
-    } else {
-      phi::funcs::GPUScatterNdAdd<T, int64_t>(dev_ctx, *Updates, *Ids, Out);
-    }
-  }
-};
-
-template <typename DeviceContext, typename T>
-class ScatterNdAddGradOpCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true,
-                      platform::errors::PreconditionNotMet(
-                          "This kernel only runs on GPU device."));
-    auto *dX = ctx.Output<Tensor>(framework::GradVarName("X"));
-    auto *dUpdates = ctx.Output<Tensor>(framework::GradVarName("Updates"));
-    auto *Ids = ctx.Input<Tensor>("Index");
-    auto *dOut = ctx.Input<Tensor>(framework::GradVarName("Out"));
-    if (dX) {
-      framework::TensorCopy(*dOut, ctx.GetPlace(), dX);
-    }
-    if (dUpdates) {
-      dUpdates->mutable_data<T>(ctx.GetPlace());
-      auto &dev_ctx = ctx.cuda_device_context();
-      // Gradient by Gather
-      const auto &index_type = Ids->dtype();
-      if (index_type == phi::DataType::INT32) {
-        phi::funcs::GPUGatherNd<T, int32_t>(dev_ctx, *dOut, *Ids, dUpdates);
-      } else {
-        phi::funcs::GPUGatherNd<T, int64_t>(dev_ctx, *dOut, *Ids, dUpdates);
-      }
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-using CUDA = paddle::platform::CUDADeviceContext;
-namespace plat = paddle::platform;
-
-REGISTER_OP_CUDA_KERNEL(scatter_nd_add,
-                        ops::ScatterNdAddOpCUDAKernel<CUDA, float>,
-                        ops::ScatterNdAddOpCUDAKernel<CUDA, double>,
-                        ops::ScatterNdAddOpCUDAKernel<CUDA, int64_t>,
-                        ops::ScatterNdAddOpCUDAKernel<CUDA, int>,
-                        ops::ScatterNdAddOpCUDAKernel<CUDA, plat::float16>);
-
-REGISTER_OP_CUDA_KERNEL(scatter_nd_add_grad,
-                        ops::ScatterNdAddGradOpCUDAKernel<CUDA, float>,
-                        ops::ScatterNdAddGradOpCUDAKernel<CUDA, double>,
-                        ops::ScatterNdAddGradOpCUDAKernel<CUDA, int64_t>,
-                        ops::ScatterNdAddGradOpCUDAKernel<CUDA, int>,
-                        ops::ScatterNdAddGradOpCUDAKernel<CUDA, plat::float16>);
diff --git a/paddle/fluid/operators/scatter_nd_add_op.h b/paddle/fluid/operators/scatter_nd_add_op.h
deleted file mode 100644
index 81c95fe55abaad2e126a52ac7ab97dea24fe67f0..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/scatter_nd_add_op.h
+++ /dev/null
@@ -1,89 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/phi/kernels/funcs/gather.h"
-#include "paddle/phi/kernels/funcs/scatter.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-
-template <typename T>
-class ScatterNdAddOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    PADDLE_ENFORCE_EQ(
-        platform::is_cpu_place(ctx.GetPlace()), true,
-        platform::errors::PreconditionNotMet("This kernel only runs on CPU."));
-    auto *X = ctx.Input<Tensor>("X");
-    auto *Ids = ctx.Input<Tensor>("Index");
-    auto *Updates = ctx.Input<Tensor>("Updates");
-    auto *Out = ctx.Output<Tensor>("Out");
-
-    // In place output: Out = X
-    framework::TensorCopySync(*X, ctx.GetPlace(), Out);
-    const auto &index_type = Ids->dtype();
-    bool index_type_match = index_type == phi::DataType::INT32 ||
-                            index_type == phi::DataType::INT64;
-    PADDLE_ENFORCE_EQ(
-        index_type_match, true,
-        platform::errors::InvalidArgument(
-            "Index holds the wrong type, it holds [%s], but "
-            "desires to be [%s] or [%s].",
-            index_type, phi::DataType::INT32, phi::DataType::INT64));
-
-    auto &dev_ctx = ctx.template device_context<phi::CPUContext>();
-    if (index_type == phi::DataType::INT32) {
-      phi::funcs::ScatterNdAdd<T, int32_t>(dev_ctx, *Updates, *Ids, Out);
-    } else {
-      phi::funcs::ScatterNdAdd<T, int64_t>(dev_ctx, *Updates, *Ids, Out);
-    }
-  }
-};
-
-template <typename T>
-class ScatterNdAddGradientOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    PADDLE_ENFORCE_EQ(
-        platform::is_cpu_place(ctx.GetPlace()), true,
-        platform::errors::PreconditionNotMet("This kernel only runs on CPU."));
-    auto *dX = ctx.Output<Tensor>(framework::GradVarName("X"));
-    auto *dUpdates = ctx.Output<Tensor>(framework::GradVarName("Updates"));
-    auto *Ids = ctx.Input<Tensor>("Index");
-    auto *dOut = ctx.Input<Tensor>(framework::GradVarName("Out"));
-
-    if (dX) {
-      framework::TensorCopy(*dOut, ctx.GetPlace(), dX);
-    }
-    if (dUpdates) {
-      dUpdates->mutable_data<T>(ctx.GetPlace());
-      // Gradient by Gather: dUpdates = dO[Ids]
-      const auto &index_type = Ids->dtype();
-      auto &dev_ctx = ctx.template device_context<phi::CPUContext>();
-      if (index_type == phi::DataType::INT32) {
-        phi::funcs::CPUGatherNd<T, int32_t>(dev_ctx, *dOut, *Ids, dUpdates);
-      } else {
-        phi::funcs::CPUGatherNd<T, int64_t>(dev_ctx, *dOut, *Ids, dUpdates);
-      }
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/scatter_op.cc b/paddle/fluid/operators/scatter_op.cc
index 3174f07e96e227c8a2f1103d3d6664673c7a2d56..5f6b04cf59e0e3c8c05d44ad6c4a3321ff2516e4 100644
--- a/paddle/fluid/operators/scatter_op.cc
+++ b/paddle/fluid/operators/scatter_op.cc
@@ -12,9 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/scatter_op.h"
 #include <memory>
+#include "paddle/fluid/framework/infershape_utils.h"
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/phi/core/ddim.h"
+#include "paddle/phi/infermeta/backward.h"
+#include "paddle/phi/infermeta/ternary.h"
 
 namespace paddle {
 namespace operators {
@@ -23,46 +26,6 @@ class ScatterOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true,
-                      platform::errors::InvalidArgument(
-                          "Input(X) of ScatterOp should not be null."));
-    PADDLE_ENFORCE_EQ(ctx->HasInput("Ids"), true,
-                      platform::errors::InvalidArgument(
-                          "Input(Ids) of ScatterOp should not be null."));
-    PADDLE_ENFORCE_EQ(ctx->HasInput("Updates"), true,
-                      platform::errors::InvalidArgument(
-                          "Input(Updates) of ScatterOp should not be null."));
-    PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true,
-                      platform::errors::InvalidArgument(
-                          "Output(Out) of ScatterOp should not be null."));
-
-    auto updates_dims = ctx->GetInputDim("Updates");
-    auto ref_dims = ctx->GetInputDim("X");
-    PADDLE_ENFORCE_EQ(
-        ctx->GetInputDim("Ids").size(), 1,
-        platform::errors::InvalidArgument(
-            "The size of Input(Ids)'s shape should be equal to 1, but "
-            "received the rank of Input(Ids) is %d.",
-            ctx->GetInputDim("Ids").size()));
-    PADDLE_ENFORCE_EQ(
-        ref_dims.size(), updates_dims.size(),
-        platform::errors::InvalidArgument(
-            "Input(X) and Input(Updates) should have the same shape size, "
-            "but received the size of Input(x)'s shape is %d, the size of "
-            "Input(Updates)'s shape is %d.",
-            ref_dims.size(), updates_dims.size()));
-    PADDLE_ENFORCE_EQ(
-        ctx->GetInputDim("Updates")[0], ctx->GetInputDim("Ids")[0],
-        platform::errors::InvalidArgument(
-            "Input(Updates) and Input(Ids) should have same batch-size, but"
-            " received Input(Updates)'s batch-size is %d, Input(Ids)'s "
-            "batch-size is %d.",
-            ctx->GetInputDim("Updates")[0], ctx->GetInputDim("Ids")[0]));
-    ctx->SetOutputDim("Out", ref_dims);
-    ctx->ShareLoD("X", /*->*/ "Out");
-  }
-
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
@@ -76,17 +39,6 @@ class ScatterGradOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    if (ctx->HasOutput(framework::GradVarName("Updates"))) {
-      ctx->SetOutputDim(framework::GradVarName("Updates"),
-                        ctx->GetInputDim("Updates"));
-    }
-    if (ctx->HasOutput(framework::GradVarName("X"))) {
-      ctx->SetOutputDim(framework::GradVarName("X"),
-                        ctx->GetInputDim(framework::GradVarName("Out")));
-    }
-  }
-
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
@@ -151,17 +103,17 @@ DECLARE_INPLACE_OP_INFERER(ScatterInplaceInferer, {"X", "Out"});
 }  // namespace operators
 }  // namespace paddle
 
+DECLARE_INFER_SHAPE_FUNCTOR(scatter, ScatterInferShapeFunctor,
+                            PD_INFER_META(phi::ScatterInferMeta));
+
+DECLARE_INFER_SHAPE_FUNCTOR(scatter_grad, ScatterGradInferShapeFunctor,
+                            PD_INFER_META(phi::ScatterGradInferMeta));
+
 namespace ops = paddle::operators;
 REGISTER_OPERATOR(scatter, ops::ScatterOp, ops::ScatterOpMaker,
                   ops::ScatterGradMaker<paddle::framework::OpDesc>,
                   ops::ScatterGradMaker<paddle::imperative::OpBase>,
-                  ops::ScatterInplaceInferer);
+                  ops::ScatterInplaceInferer, ScatterInferShapeFunctor);
 REGISTER_OPERATOR(scatter_grad, ops::ScatterGradOp,
-                  ops::ScatterGradNoNeedBufferVarsInferer);
-REGISTER_OP_CPU_KERNEL(scatter, ops::ScatterOpKernel<float>,
-                       ops::ScatterOpKernel<double>, ops::ScatterOpKernel<int>,
-                       ops::ScatterOpKernel<int64_t>);
-REGISTER_OP_CPU_KERNEL(scatter_grad, ops::ScatterGradientOpKernel<float>,
-                       ops::ScatterGradientOpKernel<double>,
-                       ops::ScatterGradientOpKernel<int>,
-                       ops::ScatterGradientOpKernel<int64_t>);
+                  ops::ScatterGradNoNeedBufferVarsInferer,
+                  ScatterGradInferShapeFunctor);
diff --git a/paddle/fluid/operators/scatter_op.cu b/paddle/fluid/operators/scatter_op.cu
deleted file mode 100644
index 7755e376bc1956a1f9e09dc2eb8aead9fa083157..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/scatter_op.cu
+++ /dev/null
@@ -1,116 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/gather_op.h"
-#include "paddle/fluid/operators/scatter_op.h"
-#include "paddle/phi/kernels/funcs/gather.cu.h"
-#include "paddle/phi/kernels/funcs/scatter.cu.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T>
-class ScatterOpCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true,
-                      platform::errors::PreconditionNotMet(
-                          "This kernel only runs on GPU device."));
-    auto *X = ctx.Input<Tensor>("X");
-    auto *Ids = ctx.Input<Tensor>("Ids");
-    auto *Updates = ctx.Input<Tensor>("Updates");
-    auto *Out = ctx.Output<Tensor>("Out");
-    bool overwrite = ctx.Attr<bool>("overwrite");
-
-    framework::TensorCopy(*X, ctx.GetPlace(), Out);
-    // use template class to support int32_t and int64_t
-    auto index_type = Ids->dtype();
-    bool index_type_match = index_type == phi::DataType::INT32 ||
-                            index_type == phi::DataType::INT64;
-    PADDLE_ENFORCE_EQ(
-        index_type_match, true,
-        platform::errors::InvalidArgument(
-            "scatter_op Index holds the wrong type, it holds [%s],"
-            "but desires to be [%s] or [%s].",
-            index_type, phi::DataType::INT32, phi::DataType::INT64));
-    auto &dev_ctx = ctx.cuda_device_context();
-    if (index_type == phi::DataType::INT32) {
-      phi::funcs::GPUScatterAssign<T, int32_t>(dev_ctx, *Updates, *Ids, Out,
-                                               overwrite);
-    } else {
-      phi::funcs::GPUScatterAssign<T, int64_t>(dev_ctx, *Updates, *Ids, Out,
-                                               overwrite);
-    }
-  }
-};
-
-template <typename T>
-class ScatterGradOpCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true,
-                      platform::errors::PreconditionNotMet(
-                          "This kernel only runs on GPU device."));
-    auto *dX = ctx.Output<Tensor>(framework::GradVarName("X"));
-    auto *dUpdates = ctx.Output<Tensor>(framework::GradVarName("Updates"));
-    auto *Ids = ctx.Input<Tensor>("Ids");
-    auto *dOut = ctx.Input<Tensor>(framework::GradVarName("Out"));
-
-    auto index_type = Ids->dtype();
-    bool index_type_match = index_type == phi::DataType::INT32 ||
-                            index_type == phi::DataType::INT64;
-    PADDLE_ENFORCE_EQ(
-        index_type_match, true,
-        platform::errors::InvalidArgument(
-            "scatter_op index holds the wrong type, it holds [%s],"
-            "but desires to be [%s] or [%s]",
-            index_type, phi::DataType::INT32, phi::DataType::INT64));
-
-    auto &dev_ctx = ctx.cuda_device_context();
-    if (dX) {
-      framework::TensorCopy(*dOut, ctx.GetPlace(), dX);
-      if (index_type == phi::DataType::INT32) {
-        phi::funcs::GPUScatterGradForX<T, int32_t>(dev_ctx, *Ids, dX);
-      } else {
-        phi::funcs::GPUScatterGradForX<T, int64_t>(dev_ctx, *Ids, dX);
-      }
-    }
-
-    if (dUpdates) {
-      dUpdates->mutable_data<T>(ctx.GetPlace());
-      // Gradient by Gather: dUpdates = dO[Ids]
-      if (index_type == phi::DataType::INT32) {
-        phi::funcs::GPUGather<T, int32_t>(dev_ctx, *dOut, *Ids, dUpdates);
-      } else {
-        phi::funcs::GPUGather<T, int64_t>(dev_ctx, *dOut, *Ids, dUpdates);
-      }
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(scatter, ops::ScatterOpCUDAKernel<float>,
-                        ops::ScatterOpCUDAKernel<double>,
-                        ops::ScatterOpCUDAKernel<int>,
-                        ops::ScatterOpCUDAKernel<int64_t>,
-                        ops::ScatterOpCUDAKernel<paddle::platform::float16>);
-
-REGISTER_OP_CUDA_KERNEL(
-    scatter_grad, ops::ScatterGradOpCUDAKernel<float>,
-    ops::ScatterGradOpCUDAKernel<double>, ops::ScatterOpCUDAKernel<int>,
-    ops::ScatterOpCUDAKernel<int64_t>,
-    ops::ScatterGradOpCUDAKernel<paddle::platform::float16>);
diff --git a/paddle/fluid/operators/scatter_op.h b/paddle/fluid/operators/scatter_op.h
deleted file mode 100644
index 7733181a93fb60c116ff3da964336b0a85d9a84c..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/scatter_op.h
+++ /dev/null
@@ -1,113 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/phi/kernels/funcs/gather.h"
-#include "paddle/phi/kernels/funcs/scatter.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-
-template <typename T>
-class ScatterOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    PADDLE_ENFORCE_EQ(
-        platform::is_cpu_place(ctx.GetPlace()), true,
-        platform::errors::PreconditionNotMet("This kernel only runs on CPU."));
-    auto *X = ctx.Input<Tensor>("X");
-    auto *Ids = ctx.Input<Tensor>("Ids");
-    auto *Updates = ctx.Input<Tensor>("Updates");
-    auto *Out = ctx.Output<Tensor>("Out");
-    double overwrite = ctx.Attr<bool>("overwrite");
-
-    // In place output: Out = X, Out[Ids] = Updates
-    framework::TensorCopy(*X, ctx.GetPlace(), Out);
-    // Apply ScatterUpdate: Out[index] = Updates[:]
-    const auto &index_type = Ids->dtype();
-    bool index_type_match = index_type == phi::DataType::INT32 ||
-                            index_type == phi::DataType::INT64;
-    PADDLE_ENFORCE_EQ(
-        index_type_match, true,
-        platform::errors::InvalidArgument(
-            "Index holds the wrong type, it holds [%s],"
-            "but desires to be [%s] or [%s].",
-            index_type, phi::DataType::INT32, phi::DataType::INT64));
-    auto &dev_ctx = ctx.template device_context<phi::CPUContext>();
-    if (overwrite) {
-      if (index_type == phi::DataType::INT32) {
-        phi::funcs::ScatterAssign<T, int32_t>(dev_ctx, *Updates, *Ids, Out);
-      } else {
-        phi::funcs::ScatterAssign<T, int64_t>(dev_ctx, *Updates, *Ids, Out);
-      }
-    } else {
-      if (index_type == phi::DataType::INT32) {
-        phi::funcs::ScatterAssignAdd<T, int32_t>(dev_ctx, *Updates, *Ids, Out);
-      } else {
-        phi::funcs::ScatterAssignAdd<T, int64_t>(dev_ctx, *Updates, *Ids, Out);
-      }
-    }
-  }
-};
-
-template <typename T>
-class ScatterGradientOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    PADDLE_ENFORCE_EQ(
-        platform::is_cpu_place(ctx.GetPlace()), true,
-        platform::errors::PreconditionNotMet("This kernel only runs on CPU."));
-    auto *dX = ctx.Output<Tensor>(framework::GradVarName("X"));
-    auto *dUpdates = ctx.Output<Tensor>(framework::GradVarName("Updates"));
-    auto *Ids = ctx.Input<Tensor>("Ids");
-    auto *dOut = ctx.Input<Tensor>(framework::GradVarName("Out"));
-
-    const auto &index_type = Ids->dtype();
-    bool index_type_match = index_type == phi::DataType::INT32 ||
-                            index_type == phi::DataType::INT64;
-    PADDLE_ENFORCE_EQ(
-        index_type_match, true,
-        platform::errors::InvalidArgument(
-            "scatter_op index holds the wrong type, it holds [%s],"
-            "but desires to be [%s] or [%s]",
-            index_type, phi::DataType::INT32, phi::DataType::INT64));
-
-    auto &dev_ctx = ctx.template device_context<phi::CPUContext>();
-    if (dX) {
-      framework::TensorCopy(*dOut, ctx.GetPlace(), dX);
-      if (index_type == phi::DataType::INT32) {
-        phi::funcs::CPUScatterGradForX<T, int32_t>(dev_ctx, *Ids, dX);
-      } else {
-        phi::funcs::CPUScatterGradForX<T, int64_t>(dev_ctx, *Ids, dX);
-      }
-    }
-
-    if (dUpdates) {
-      dUpdates->mutable_data<T>(ctx.GetPlace());
-      // Gradient by Gather: dUpdates = dO[Ids]
-      if (index_type == phi::DataType::INT32) {
-        phi::funcs::CPUGather<T, int32_t>(dev_ctx, *dOut, *Ids, dUpdates);
-      } else {
-        phi::funcs::CPUGather<T, int64_t>(dev_ctx, *dOut, *Ids, dUpdates);
-      }
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/scatter_op_npu.cc b/paddle/fluid/operators/scatter_op_npu.cc
index fa5f03a092882ec1f63e9556bc38d94ed40c9a7f..d5ef95269b48a1a7e7b9c3e75af4f9b595580ad3 100644
--- a/paddle/fluid/operators/scatter_op_npu.cc
+++ b/paddle/fluid/operators/scatter_op_npu.cc
@@ -16,8 +16,7 @@ limitations under the License. */
 #include <memory>
 #include <string>
 
-#include "paddle/fluid/operators/kron_op.h"
-#include "paddle/fluid/operators/scatter_op.h"
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/device/npu/npu_op_runner.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/scatter_op_xpu.cc b/paddle/fluid/operators/scatter_op_xpu.cc
index 9f0b74e8a3f80c5c8a22c2db109f75e6ee316be1..07dd2f2d85fe9ac330be1f85d283c85207b1b78c 100644
--- a/paddle/fluid/operators/scatter_op_xpu.cc
+++ b/paddle/fluid/operators/scatter_op_xpu.cc
@@ -16,7 +16,10 @@ limitations under the License. */
 #include <memory>
 #include <string>
 
-#include "paddle/fluid/operators/scatter_op.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/platform/device_context.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/searchsorted_op.cc b/paddle/fluid/operators/searchsorted_op.cc
index bbd5b9c4e7db914d63c9c803c52d44f9350c1d41..d0290795455db1546afbda80e71e79de3f1020ac 100644
--- a/paddle/fluid/operators/searchsorted_op.cc
+++ b/paddle/fluid/operators/searchsorted_op.cc
@@ -12,8 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/operators/searchsorted_op.h"
-
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/enforce.h"
 
 namespace paddle {
@@ -117,10 +116,3 @@ class SearchSortedOpMaker : public framework::OpProtoAndCheckerMaker {
 namespace ops = paddle::operators;
 
 REGISTER_OPERATOR(searchsorted, ops::SearchSortedOp, ops::SearchSortedOpMaker);
-
-REGISTER_OP_CPU_KERNEL(
-    searchsorted,
-    ops::SearchSortedKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::SearchSortedKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::SearchSortedKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::SearchSortedKernel<paddle::platform::CPUDeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/segment_pool_op.cc b/paddle/fluid/operators/segment_pool_op.cc
index 322cd97f01c3ad97ba74f049696fdec592ee524e..9d4c8532a82c064b1b7aef759934ad8dad894ec5 100644
--- a/paddle/fluid/operators/segment_pool_op.cc
+++ b/paddle/fluid/operators/segment_pool_op.cc
@@ -12,9 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/segment_pool_op.h"
 #include <memory>
 #include <string>
+#include "paddle/fluid/framework/infershape_utils.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/phi/core/infermeta_utils.h"
+#include "paddle/phi/infermeta/binary.h"
 
 namespace paddle {
 namespace operators {
@@ -23,22 +26,6 @@ class SegmentPoolOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "SegmentPool");
-    OP_INOUT_CHECK(ctx->HasInput("SegmentIds"), "Input", "SegmentIds",
-                   "SegmentPool");
-    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "SegmentPool");
-    auto dims = ctx->GetInputDim("X");
-    dims[0] = -1;
-    ctx->SetOutputDim("Out", dims);
-
-    if (ctx->Attrs().Get<std::string>("pooltype") == "MEAN") {
-      OP_INOUT_CHECK(ctx->HasOutput("SummedIds"), "Output", "SummedIds",
-                     "SegmentPool");
-      ctx->SetOutputDim("SummedIds", {-1, 1});
-    }
-  }
-
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
@@ -150,17 +137,11 @@ class SegmentPoolGradOpMaker : public framework::SingleGradOpMaker<T> {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
+DECLARE_INFER_SHAPE_FUNCTOR(segment_pool, SegmentPoolInferShapeFunctor,
+                            PD_INFER_META(phi::SegmentPoolInferMeta));
+
 REGISTER_OPERATOR(segment_pool, ops::SegmentPoolOp, ops::SegmentPoolOpMaker,
                   ops::SegmentPoolGradOpMaker<paddle::framework::OpDesc>,
-                  ops::SegmentPoolGradOpMaker<paddle::imperative::OpBase>);
+                  ops::SegmentPoolGradOpMaker<paddle::imperative::OpBase>,
+                  SegmentPoolInferShapeFunctor);
 REGISTER_OPERATOR(segment_pool_grad, ops::SegmentPoolGradOp);
-
-REGISTER_OP_CPU_KERNEL(
-    segment_pool,
-    ops::SegmentPoolKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::SegmentPoolKernel<paddle::platform::CPUDeviceContext, double>);
-
-REGISTER_OP_CPU_KERNEL(
-    segment_pool_grad,
-    ops::SegmentPoolGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::SegmentPoolGradKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/segment_pool_op.cu b/paddle/fluid/operators/segment_pool_op.cu
deleted file mode 100644
index e147e62a98354087121ca1443b20d9163ef00f73..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/segment_pool_op.cu
+++ /dev/null
@@ -1,27 +0,0 @@
-/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/segment_pool_op.h"
-#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    segment_pool,
-    ops::SegmentPoolKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::SegmentPoolKernel<paddle::platform::CUDADeviceContext, double>);
-REGISTER_OP_CUDA_KERNEL(
-    segment_pool_grad,
-    ops::SegmentPoolGradKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::SegmentPoolGradKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/segment_pool_op.h b/paddle/fluid/operators/segment_pool_op.h
deleted file mode 100644
index 2f5ef7f54f988884a25feba4665283d3ce260988..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/segment_pool_op.h
+++ /dev/null
@@ -1,176 +0,0 @@
-/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <string>
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/segment_pooling.h"
-#include "paddle/fluid/platform/macros.h"
-#include "paddle/phi/common/place.h"
-#include "paddle/phi/kernels/funcs/math_function.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-
-template <typename DeviceContext, typename T, typename IndexT>
-void SegmentKernelLaunchHelper(const framework::ExecutionContext& context) {
-  auto* input = context.Input<Tensor>("X");
-  auto* segment = context.Input<Tensor>("SegmentIds");
-  auto* output = context.Output<Tensor>("Out");
-  std::string pooltype = context.Attr<std::string>("pooltype");
-  Tensor* summed_ids = nullptr;
-
-  int64_t num_indices = segment->numel();
-  PADDLE_ENFORCE_EQ(
-      num_indices, input->dims()[0],
-      platform::errors::InvalidArgument(
-          "Segment_ids should be the same size as dimension 0 of input X."));
-  PADDLE_ENFORCE_EQ(num_indices, segment->dims()[0],
-                    platform::errors::InvalidArgument(
-                        "Segment_ids should be 1-D tensor, or it's other "
-                        "dimension size is 1. Segment_ids's shape is: [%s].",
-                        segment->dims()));
-
-  if (input->numel() == 0 || segment->numel() == 0) {
-    return;
-  }
-
-  bool cpu_place = context.GetPlace().GetType() == phi::AllocationType::CPU;
-  if (cpu_place) {
-    auto dims = input->dims();
-    auto* segment_ids = segment->data<IndexT>();
-    dims[0] = static_cast<int64_t>(segment_ids[segment->numel() - 1] + 1);
-    PADDLE_ENFORCE_GT(
-        dims[0], 0,
-        platform::errors::InvalidArgument(
-            "Segment ids must be >= 0, but got last id %d", dims[0]));
-    output->Resize({dims});
-    output->mutable_data<T>(context.GetPlace());
-    phi::funcs::SetConstant<DeviceContext, T> set_zero;
-    auto& dev_ctx = context.template device_context<DeviceContext>();
-    set_zero(dev_ctx, output, static_cast<T>(0));
-  }
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-  if (!cpu_place) {
-    Tensor length;
-    length.mutable_data<IndexT>(phi::make_ddim({1}), platform::CPUPlace());
-    IndexT* length_data = length.data<IndexT>();
-    const IndexT* segment_ids = segment->data<IndexT>();
-
-#ifdef PADDLE_WITH_HIP
-    PADDLE_ENFORCE_GPU_SUCCESS(
-        hipMemcpy(length_data, segment_ids + num_indices - 1, sizeof(IndexT),
-                  hipMemcpyDeviceToHost));
-#else
-    PADDLE_ENFORCE_GPU_SUCCESS(
-        cudaMemcpy(length_data, segment_ids + num_indices - 1, sizeof(IndexT),
-                   cudaMemcpyDeviceToHost));
-#endif
-
-    IndexT length_host = length_data[0];
-    length_host++;
-    PADDLE_ENFORCE_GT(
-        length_host, 0,
-        platform::errors::InvalidArgument(
-            "Segment ids must be >= 0, but got last id %d", length_data[0]));
-    auto dims = input->dims();
-    dims[0] = static_cast<int64_t>(length_host);
-    output->Resize({dims});
-    output->mutable_data<T>(context.GetPlace());
-    T init_value = 0;
-    if (pooltype == "MAX") {
-      init_value = static_cast<T>(-FLT_MAX);
-    } else if (pooltype == "MIN") {
-      init_value = static_cast<T>(FLT_MAX);
-    }
-    phi::funcs::SetConstant<DeviceContext, T> setconst;
-    auto& dev_ctx = context.template device_context<DeviceContext>();
-    setconst(dev_ctx, output, static_cast<T>(init_value));
-    // the gpu kernel of mean pool record the counts of segment_ids
-    if (pooltype == "MEAN") {
-      summed_ids = context.Output<Tensor>("SummedIds");
-      summed_ids->Resize({dims[0], 1});
-      summed_ids->mutable_data<T>(context.GetPlace());
-      setconst(dev_ctx, summed_ids, static_cast<T>(1e-12));
-    }
-  }
-#endif
-
-  SegmentPoolFunctor<DeviceContext, T, IndexT> pool;
-
-  pool(context.template device_context<DeviceContext>(), *input, *segment,
-       output, summed_ids, pooltype);
-}
-
-template <typename DeviceContext, typename T>
-class SegmentPoolKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* segment = context.Input<Tensor>("SegmentIds");
-    auto index_type = framework::TransToProtoVarType(segment->dtype());
-    if (index_type == framework::proto::VarType::INT32) {
-      SegmentKernelLaunchHelper<DeviceContext, T, int>(context);
-    } else if (index_type == framework::proto::VarType::INT64) {
-      SegmentKernelLaunchHelper<DeviceContext, T, int64_t>(context);
-    } else {
-      PADDLE_THROW(platform::errors::InvalidArgument(
-          "Unsupported index type, Expected int, int64, but got %s.",
-          index_type));
-    }
-  }
-};
-
-template <typename DeviceContext, typename T>
-class SegmentPoolGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* input = context.Input<Tensor>("X");
-    auto* output = context.Input<Tensor>("Out");
-    auto* segment = context.Input<Tensor>("SegmentIds");
-    auto* out_g = context.Input<Tensor>(framework::GradVarName("Out"));
-    auto* in_g = context.Output<Tensor>(framework::GradVarName("X"));
-    std::string pooltype = context.Attr<std::string>("pooltype");
-
-    const Tensor* summed_ids = nullptr;
-    if (pooltype == "MEAN") {
-      summed_ids = context.Input<Tensor>("SummedIds");
-    }
-
-    in_g->mutable_data<T>(context.GetPlace());
-    phi::funcs::SetConstant<DeviceContext, T> set_zero;
-    auto& dev_ctx = context.template device_context<DeviceContext>();
-    set_zero(dev_ctx, in_g, static_cast<T>(0));
-
-    auto index_type = framework::TransToProtoVarType(segment->dtype());
-    if (index_type == framework::proto::VarType::INT32) {
-      SegmentPoolGradFunctor<DeviceContext, T, int> pool;
-      pool(context.template device_context<DeviceContext>(), *input, *output,
-           *out_g, *segment, in_g, summed_ids, pooltype);
-    } else if (index_type == framework::proto::VarType::INT64) {
-      SegmentPoolGradFunctor<DeviceContext, T, int64_t> pool;
-      pool(context.template device_context<DeviceContext>(), *input, *output,
-           *out_g, *segment, in_g, summed_ids, pooltype);
-    } else {
-      PADDLE_THROW(platform::errors::InvalidArgument(
-          "Unsupported index type, Expected int, int64, but got %s.",
-          index_type));
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/selu_op.cc b/paddle/fluid/operators/selu_op.cc
index 88ef1f3ea4aa4d8d827a810026575c20e596b4e7..59c6e16535738ba6cbb3224dd4ff5c2987618cdf 100644
--- a/paddle/fluid/operators/selu_op.cc
+++ b/paddle/fluid/operators/selu_op.cc
@@ -16,7 +16,10 @@ limitations under the License. */
 #include <string>
 #include <unordered_map>
 
-#include "paddle/fluid/operators/common_infer_shape_functions.h"
+#include "paddle/fluid/framework/infershape_utils.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/phi/infermeta/unary.h"
 
 namespace paddle {
 namespace operators {
@@ -28,10 +31,6 @@ class SeluOp : public framework::OperatorWithKernel {
          const framework::AttributeMap &attrs)
       : OperatorWithKernel(type, inputs, outputs, attrs) {}
 
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    return UnaryOpUnchangedInferShape(ctx);
-  }
-
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
@@ -121,7 +120,12 @@ class SeluGradOp : public framework::OperatorWithKernel {
 
 namespace ops = paddle::operators;
 
+DECLARE_INFER_SHAPE_FUNCTOR(selu, SeluInferShapeFunctor,
+                            PD_INFER_META(phi::UnchangedInferMeta));
+
 REGISTER_OPERATOR(selu, ops::SeluOp, ops::SeluOpMaker, ops::SeluOpInferVarType,
                   ops::SeluGradMaker<paddle::framework::OpDesc>,
-                  ops::SeluGradMaker<paddle::imperative::OpBase>);
+                  ops::SeluGradMaker<paddle::imperative::OpBase>,
+                  SeluInferShapeFunctor);
+
 REGISTER_OPERATOR(selu_grad, ops::SeluGradOp);
diff --git a/paddle/fluid/operators/sequence_ops/sequence_conv_op_xpu.cc b/paddle/fluid/operators/sequence_ops/sequence_conv_op_xpu.cc
index 6c33ff52044b26b598f835ee40462a01077c1ff8..23c6a0133e1edafba5621825db78a52b88e6947a 100644
--- a/paddle/fluid/operators/sequence_ops/sequence_conv_op_xpu.cc
+++ b/paddle/fluid/operators/sequence_ops/sequence_conv_op_xpu.cc
@@ -184,9 +184,6 @@ class SequenceConvGradXPUKernel : public framework::OpKernel<T> {
         col_data, paddle::platform::errors::Fatal("XPU memory is not enough"));
 
     if (in_g || filter_g) {
-      int r = xpu::constant<T>(xpu_context, col_data, col_numel, T(0));
-      PADDLE_ENFORCE_XDNN_SUCCESS(r, "constant");
-
       bool trans_a = false;
       bool trans_b = true;
       int m = out_g->dims()[0];
@@ -208,7 +205,7 @@ class SequenceConvGradXPUKernel : public framework::OpKernel<T> {
       const T* data_b = filter->data<T>();
       T* data_c = col_data;
 
-      r = xpu::fc_fusion<T, T, T, int32_t>(
+      int r = xpu::fc_fusion<T, T, T, int32_t>(
           xpu_context, data_a, data_b, data_c, m, n, k, trans_a, trans_b,
           nullptr, nullptr, nullptr, lda, ldb, ldc, alpha, beta, nullptr,
           xpu::Activation_t::LINEAR);
@@ -222,7 +219,6 @@ class SequenceConvGradXPUKernel : public framework::OpKernel<T> {
 
       in_g->mutable_data<T>(context.GetPlace());
       in_g->set_lod(in->lod());
-      xpu::constant<T>(xpu_context, in_g->data<T>(), in_g->numel(), T(0));
 
       int r = xpu::sequence_context_projection_grad<T, int>(
           xpu_context, in_g->data<T>(), col_data, nullptr, lodx, sequence_width,
@@ -232,8 +228,6 @@ class SequenceConvGradXPUKernel : public framework::OpKernel<T> {
 
     if (filter_g) {
       filter_g->mutable_data<T>(context.GetPlace());
-      xpu::constant<T>(xpu_context, filter_g->data<T>(), filter_g->numel(),
-                       T(0));
 
       int r = xpu::sequence_context_projection<T, int>(
           xpu_context, in->data<T>(), col_data, nullptr, lodx, sequence_width,
diff --git a/paddle/fluid/operators/set_value_op.cc b/paddle/fluid/operators/set_value_op.cc
index ec3e04e71faf0b20950d87de1a7f066e2e49310a..513ab46e9b5eebdb39faf4401d9d8b2fc387a82f 100644
--- a/paddle/fluid/operators/set_value_op.cc
+++ b/paddle/fluid/operators/set_value_op.cc
@@ -241,23 +241,8 @@ REGISTER_OPERATOR(set_value, ops::SetValue, ops::SetValueMaker,
                   ops::SetValueGradMaker<paddle::imperative::OpBase>,
                   ops::SetValueOpInplaceInferer);
 
-REGISTER_OP_CPU_KERNEL(
-    set_value, ops::SetValueKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::SetValueKernel<plat::CPUDeviceContext, int64_t>,
-    ops::SetValueKernel<plat::CPUDeviceContext, float>,
-    ops::SetValueKernel<plat::CPUDeviceContext, double>,
-    ops::SetValueKernel<plat::CPUDeviceContext, bool>);
-
 REGISTER_OPERATOR(set_value_grad, ops::SetValueGrad);
 
-REGISTER_OP_CPU_KERNEL(
-    set_value_grad,
-    ops::SetValueGradKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::SetValueGradKernel<plat::CPUDeviceContext, int64_t>,
-    ops::SetValueGradKernel<plat::CPUDeviceContext, float>,
-    ops::SetValueGradKernel<plat::CPUDeviceContext, double>,
-    ops::SetValueGradKernel<plat::CPUDeviceContext, bool>);
-
 REGISTER_OP_VERSION(set_value)
     .AddCheckpoint(
         R"ROC(
diff --git a/paddle/fluid/operators/set_value_op.cu b/paddle/fluid/operators/set_value_op.cu
deleted file mode 100644
index f9701b0acaac769bd91bbba156a010c2e05e42c3..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/set_value_op.cu
+++ /dev/null
@@ -1,32 +0,0 @@
-//   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/operators/set_value_op.h"
-
-namespace ops = paddle::operators;
-
-REGISTER_OP_CUDA_KERNEL(
-    set_value, ops::SetValueKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::SetValueKernel<paddle::platform::CUDADeviceContext, int64_t>,
-    ops::SetValueKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::SetValueKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::SetValueKernel<paddle::platform::CUDADeviceContext, bool>);
-
-REGISTER_OP_CUDA_KERNEL(
-    set_value_grad,
-    ops::SetValueGradKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::SetValueGradKernel<paddle::platform::CUDADeviceContext, int64_t>,
-    ops::SetValueGradKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::SetValueGradKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::SetValueGradKernel<paddle::platform::CUDADeviceContext, bool>);
diff --git a/paddle/fluid/operators/set_value_op.h b/paddle/fluid/operators/set_value_op.h
index 9dd727959202c6b09bad0f07aa242a8897583342..4696907f32e6d323c31a27cc6959e26f20168503 100644
--- a/paddle/fluid/operators/set_value_op.h
+++ b/paddle/fluid/operators/set_value_op.h
@@ -19,14 +19,10 @@
 #include <vector>
 
 #include "paddle/fluid/framework/convert_utils.h"
-#include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/operators/assign_value_op.h"
-#include "paddle/fluid/operators/eigen/eigen_function.h"
-#include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
 #include "paddle/fluid/operators/slice_utils.h"
-#include "paddle/fluid/operators/strided_slice_op.h"
 #include "paddle/fluid/operators/utils.h"
 #include "paddle/fluid/platform/enforce.h"
 
@@ -36,23 +32,6 @@ namespace operators {
 using Tensor = framework::Tensor;
 using DDim = framework::DDim;
 
-inline void GetOffsets(const DDim& big_dim, const DDim& small_dim,
-                       DDim start_offset, int cur_dim,
-                       std::vector<DDim>* offsets) {
-  if (cur_dim == big_dim.size()) {
-    offsets->push_back(start_offset);
-    return;
-  }
-  if (small_dim[cur_dim] == big_dim[cur_dim]) {
-    GetOffsets(big_dim, small_dim, start_offset, cur_dim + 1, offsets);
-  } else {
-    for (int i = 0; i < big_dim[cur_dim]; i++) {
-      GetOffsets(big_dim, small_dim, start_offset, cur_dim + 1, offsets);
-      start_offset[cur_dim] += 1;
-    }
-  }
-}
-
 inline std::string GetValueName(framework::proto::VarType::Type data_type) {
   std::string value_name;
   switch (data_type) {
@@ -122,447 +101,5 @@ inline void CheckIsDimsMatch(const framework::DDim first,
       second.to_str(), first.to_str()));
 }
 
-template <typename DeviceContext, typename T>
-class SetValueKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const {
-    const int rank = ctx.Input<framework::LoDTensor>("Input")->dims().size();
-
-    // TODO(liym27): A more elegent code to do this. C++ has to make template
-    //  integer as constant, but we had better have alternative writing in the
-    //  future.
-    switch (rank) {
-      case 1:
-        SetValueCompute<1>(ctx);
-        break;
-      case 2:
-        SetValueCompute<2>(ctx);
-        break;
-      case 3:
-        SetValueCompute<3>(ctx);
-        break;
-      case 4:
-        SetValueCompute<4>(ctx);
-        break;
-      case 5:
-        SetValueCompute<5>(ctx);
-        break;
-      case 6:
-        SetValueCompute<6>(ctx);
-        break;
-      default:
-        PADDLE_THROW(platform::errors::InvalidArgument(
-            "The rank of input should be less than 7, but received %d.", rank));
-    }
-  }
-
- private:
-  template <size_t D>
-  void SetValueCompute(const framework::ExecutionContext& ctx) const {
-    auto* in = ctx.Input<framework::LoDTensor>("Input");
-    auto* value_tensor = ctx.Input<framework::LoDTensor>("ValueTensor");
-    auto* out = ctx.Output<framework::LoDTensor>("Out");
-
-    auto starts_tensor_list =
-        ctx.MultiInput<framework::Tensor>("StartsTensorList");
-    auto ends_tensor_list = ctx.MultiInput<framework::Tensor>("EndsTensorList");
-    auto steps_tensor_list =
-        ctx.MultiInput<framework::Tensor>("StepsTensorList");
-
-    auto axes = ctx.Attr<std::vector<int64_t>>("axes");
-    auto starts = ctx.Attr<std::vector<int64_t>>("starts");
-    auto ends = ctx.Attr<std::vector<int64_t>>("ends");
-    auto steps = ctx.Attr<std::vector<int64_t>>("steps");
-    auto shape = ctx.Attr<std::vector<int64_t>>("shape");
-    auto decrease_axes = ctx.Attr<std::vector<int64_t>>("decrease_axes");
-    auto none_axes = ctx.Attr<std::vector<int64_t>>("none_axes");
-
-    if (!starts_tensor_list.empty()) {
-      starts = GetDataFromTensorList<int64_t>(starts_tensor_list);
-    }
-    if (!ends_tensor_list.empty()) {
-      ends = GetDataFromTensorList<int64_t>(ends_tensor_list);
-    }
-    if (!steps_tensor_list.empty()) {
-      steps = GetDataFromTensorList<int64_t>(steps_tensor_list);
-    }
-
-    auto in_dims = in->dims();
-    CheckAndUpdateSliceAttrs(in_dims, axes, &starts, &ends, &steps);
-    auto slice_dims = GetSliceDims(in_dims, axes, starts, ends, &steps);
-    auto decrease_slice_dims = GetDecreasedDims(slice_dims, decrease_axes);
-
-    auto slice_dims_for_assign = decrease_slice_dims;
-    if (!none_axes.empty()) {
-      std::vector<int64_t> slice_dims_with_none;
-
-      size_t none_axes_cur = 0, decrease_axes_cur = 0;
-      for (int i = 0; i < slice_dims.size(); ++i) {
-        while (none_axes_cur < none_axes.size() &&
-               none_axes[none_axes_cur] <= i) {
-          slice_dims_with_none.push_back(1);
-          none_axes_cur++;
-        }
-        if (decrease_axes_cur < decrease_axes.size() &&
-            decrease_axes[decrease_axes_cur] == i) {
-          decrease_axes_cur++;
-        } else {
-          slice_dims_with_none.push_back(slice_dims[i]);
-        }
-      }
-      while (none_axes_cur < none_axes.size()) {
-        slice_dims_with_none.push_back(1);
-        none_axes_cur++;
-      }
-
-      slice_dims_for_assign = phi::make_ddim(slice_dims_with_none);
-    }
-
-    auto place = ctx.GetPlace();
-    auto& eigen_place =
-        *ctx.template device_context<DeviceContext>().eigen_device();
-
-    // Here copy data from input to avoid data loss at PE and Graph level.
-    // TODO(liym27): Speed up in the future version.
-    // - Q: Why don't call ShareDataWith to speed up?
-    // - A: Because it's not supported to ShareDataWith on OP's input and output
-    // https://github.com/PaddlePaddle/Paddle/wiki/ShareDataWith-and-ShareBufferWith-are-prohibited-in-OP
-    // - Q: Why don't delete Input, after all, the input and output are the same
-    // Tensor at program level?
-    // - A: If deleting Input, the graph will be complex, such as there will
-    // be two ops points to the output in graph: op1 -> output <- set_value.
-    // In this case, we have to find a way to handle the running order of
-    // set_value is what we want.
-    paddle::framework::TensorCopy(*in, place, out);
-
-    Tensor slice_tensor(in->dtype()), pad_tensor(in->dtype());
-    slice_tensor.mutable_data<T>(slice_dims, place);
-    pad_tensor.mutable_data<T>(in_dims, place);
-
-    auto pad_e = framework::EigenTensor<T, D>::From(pad_tensor, in_dims);
-    auto out_e = framework::EigenTensor<T, D>::From(*out);
-    auto slice_e = framework::EigenTensor<T, D>::From(slice_tensor, slice_dims);
-
-    // Step 1: Set the value of out at `_index` to zero
-    slice_e.device(eigen_place) = slice_e.constant(T(0));
-
-    auto starts_indices = Eigen::DSizes<Eigen::DenseIndex, D>();
-    auto ends_indices = Eigen::DSizes<Eigen::DenseIndex, D>();
-    auto strides_indices = Eigen::DSizes<Eigen::DenseIndex, D>();
-
-    for (size_t i = 0; i < D; ++i) {
-      starts_indices[i] = 0;
-      ends_indices[i] = slice_dims[i];
-      strides_indices[i] = 1;
-    }
-    for (size_t i = 0; i < axes.size(); i++) {
-      int axis_index = axes[i];
-      starts_indices[axis_index] = starts[i];
-      ends_indices[axis_index] = ends[i];
-      strides_indices[axis_index] = steps[i];
-      if (starts[i] == ends[i]) {  // slice is empty, data will not be changed
-        return;
-      }
-    }
-
-    out_e.stridedSlice(starts_indices, ends_indices, strides_indices)
-        .device(eigen_place) = slice_e;
-
-    // Step 2: Set a tensor with the same shape as out tensor. And its data at
-    // '_index' is the same as value_tensor, and data out of '_index' to zero
-
-    // - Step 2.1 Set slice tensor with value
-
-    // NOTE(liym27): [ Why resize slice_tensor here? ]
-    // A: When do broadcasting on slice_tensor and value_tensor, the shape of
-    // slice_tensor should be decreased dims.
-    // e.g.
-    //  x[:,0] = value_tensor
-    // x's shape = [3, 4], value_tensor's shape = [3]
-    // We get slice_dims = [3, 1],  decrease_slice_dims = [3]
-    // If do broadcasting on Tensor with shape [3, 1] and [3], the result's
-    // shape is [3, 3], which cross the border;
-    // If do broadcasting on Tensor with shape [3] and [3], the result's shape
-    // is [3], which is right.
-
-    slice_tensor.Resize(slice_dims_for_assign);
-    if (value_tensor != nullptr) {
-      CheckIsDimsMatch(slice_dims_for_assign, value_tensor->dims());
-      // ElementwiseComputeEx can do broadcasting
-      ElementwiseComputeEx<SubFunctor<T>, DeviceContext, T>(
-          ctx, &slice_tensor, value_tensor, -1, SubFunctor<T>(), &slice_tensor);
-    } else {
-      Tensor value_t(in->dtype());
-      auto value_dims = phi::make_ddim(shape);
-      CheckIsDimsMatch(slice_dims_for_assign, value_dims);
-
-      value_t.mutable_data<T>(value_dims, place);
-      auto value_name =
-          GetValueName(framework::TransToProtoVarType(in->dtype()));
-      CopyVecotorToTensor<T>(value_name.c_str(), &value_t, ctx);
-      value_t.Resize(value_dims);
-      ElementwiseComputeEx<SubFunctor<T>, DeviceContext, T>(
-          ctx, &slice_tensor, &value_t, -1, SubFunctor<T>(), &slice_tensor);
-    }
-    slice_tensor.Resize(slice_dims);
-
-    // - Step 2.2 Pad slice tensor with 0
-    pad_e.device(eigen_place) = pad_e.constant(T(0));
-    pad_e.stridedSlice(starts_indices, ends_indices, strides_indices)
-        .device(eigen_place) = slice_e;
-
-    // Step 3: Set out tensor with value_tensor
-    out_e.device(eigen_place) = out_e - pad_e;
-  }
-};
-
-template <typename DeviceContext, typename T>
-class SetValueGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    int rank = ctx.Input<Tensor>(framework::GradVarName("Out"))->dims().size();
-
-    switch (rank) {
-      case 1:
-        SetValueGradCompute<1>(ctx);
-        break;
-      case 2:
-        SetValueGradCompute<2>(ctx);
-        break;
-      case 3:
-        SetValueGradCompute<3>(ctx);
-        break;
-      case 4:
-        SetValueGradCompute<4>(ctx);
-        break;
-      case 5:
-        SetValueGradCompute<5>(ctx);
-        break;
-      case 6:
-        SetValueGradCompute<6>(ctx);
-        break;
-      default:
-        PADDLE_THROW(platform::errors::InvalidArgument(
-            "The rank of set_value_grad's input should be less than 7, but "
-            "received %d.",
-            rank));
-    }
-  }
-
- private:
-  template <size_t D>
-  void SetValueGradCompute(const framework::ExecutionContext& context) const {
-    auto starts = context.Attr<std::vector<int64_t>>("starts");
-    auto ends = context.Attr<std::vector<int64_t>>("ends");
-    auto steps = context.Attr<std::vector<int64_t>>("steps");
-
-    auto axes_int64 = context.Attr<std::vector<int64_t>>("axes");
-    std::vector<int> axes(axes_int64.begin(), axes_int64.end());
-
-    auto starts_indices = Eigen::DSizes<Eigen::DenseIndex, D>();
-    auto ends_indices = Eigen::DSizes<Eigen::DenseIndex, D>();
-    auto steps_indices = Eigen::DSizes<Eigen::DenseIndex, D>();
-    auto reverse_axis = Eigen::array<bool, D>();
-
-    auto list_new_ends_tensor =
-        context.MultiInput<framework::Tensor>("EndsTensorList");
-    auto list_new_starts_tensor =
-        context.MultiInput<framework::Tensor>("StartsTensorList");
-    auto list_new_steps_tensor =
-        context.MultiInput<framework::Tensor>("StepsTensorList");
-
-    if (list_new_starts_tensor.size() > 0) {
-      starts = GetDataFromTensorList<int64_t>(list_new_starts_tensor);
-    }
-
-    if (list_new_ends_tensor.size() > 0) {
-      ends = GetDataFromTensorList<int64_t>(list_new_ends_tensor);
-    }
-
-    if (list_new_steps_tensor.size() > 0) {
-      steps = GetDataFromTensorList<int64_t>(list_new_steps_tensor);
-    }
-
-    auto in = context.Input<framework::Tensor>(framework::GradVarName("Out"));
-    PADDLE_ENFORCE_EQ(
-        in->IsInitialized(), true,
-        platform::errors::PermissionDenied(
-            "The input of `set_value_grad`(%s) has not been initialized",
-            framework::GradVarName("Out")));
-    auto grad_value = context.Output<framework::Tensor>(
-        framework::GradVarName("ValueTensor"));
-    auto grad_input =
-        context.Output<framework::Tensor>(framework::GradVarName("Input"));
-    auto in_dims = in->dims();
-
-    auto decrease_axis_int64 =
-        context.Attr<std::vector<int64_t>>("decrease_axes");
-    std::vector<int> decrease_axis(decrease_axis_int64.begin(),
-                                   decrease_axis_int64.end());
-    std::vector<int> infer_flags(axes.size(), 1);
-    std::vector<int64_t> out_dims_vector(in_dims.size(), -1);
-    StridedSliceOutDims(starts, ends, steps, axes, infer_flags, in_dims,
-                        decrease_axis, out_dims_vector.data(), axes.size(),
-                        false);
-
-    framework::DDim out_dims(phi::make_ddim(out_dims_vector));
-
-    std::vector<int> reverse_vector(starts.size(), 0);
-    StridedSliceFunctor(starts.data(), ends.data(), steps.data(), axes.data(),
-                        reverse_vector.data(), in_dims, infer_flags,
-                        decrease_axis, starts.size());
-
-    for (size_t axis = 0; axis < D; axis++) {
-      starts_indices[axis] = 0;
-      ends_indices[axis] = out_dims[axis];
-      steps_indices[axis] = 1;
-      reverse_axis[axis] = false;
-    }
-
-    for (size_t axis = 0; axis < axes.size(); axis++) {
-      int axis_index = axes[axis];
-      starts_indices[axis_index] = starts[axis];
-      ends_indices[axis_index] = ends[axis];
-      steps_indices[axis_index] = steps[axis];
-      reverse_axis[axis_index] = (reverse_vector[axis] == 1) ? true : false;
-    }
-
-    bool need_reverse = false;
-    for (size_t axis = 0; axis < axes.size(); axis++) {
-      if (reverse_vector[axis] == 1) {
-        need_reverse = true;
-        break;
-      }
-    }
-
-    auto& dev_ctx = context.template device_context<DeviceContext>();
-    auto& place =
-        *context.template device_context<DeviceContext>().eigen_device();
-    phi::funcs::SetConstant<DeviceContext, T> set_zero;
-
-    if (grad_input) {
-      // Set gradient of `Input`
-      paddle::framework::TensorCopy(*in, context.GetPlace(), grad_input);
-
-      auto grad_input_t =
-          framework::EigenTensor<T, D, Eigen::RowMajor,
-                                 Eigen::DenseIndex>::From(*grad_input);
-
-      framework::Tensor tmp(grad_input->dtype());
-      tmp.mutable_data<T>(out_dims, context.GetPlace());
-      set_zero(dev_ctx, &tmp, static_cast<T>(0));
-      auto tmp_t = framework::EigenTensor<T, D, Eigen::RowMajor,
-                                          Eigen::DenseIndex>::From(tmp);
-
-      grad_input_t.stridedSlice(starts_indices, ends_indices, steps_indices)
-          .device(place) = tmp_t;
-    }
-    if (grad_value) {
-      grad_value->mutable_data<T>(context.GetPlace());
-      set_zero(dev_ctx, grad_value, static_cast<T>(0));
-
-      auto in_t = framework::EigenTensor<T, D, Eigen::RowMajor,
-                                         Eigen::DenseIndex>::From(*in);
-
-      if (grad_value->dims() == out_dims) {
-        auto grad_value_t =
-            framework::EigenTensor<T, D, Eigen::RowMajor,
-                                   Eigen::DenseIndex>::From(*grad_value);
-        if (need_reverse) {
-          framework::Tensor tmp(grad_value->dtype());
-          tmp.mutable_data<T>(out_dims, context.GetPlace());
-          set_zero(dev_ctx, &tmp, static_cast<T>(0));
-          auto tmp_t = framework::EigenTensor<T, D, Eigen::RowMajor,
-                                              Eigen::DenseIndex>::From(tmp);
-
-          tmp_t.device(place) =
-              in_t.stridedSlice(starts_indices, ends_indices, steps_indices);
-          grad_value_t.device(place) = tmp_t.reverse(reverse_axis);
-        } else {
-          grad_value_t.device(place) =
-              in_t.stridedSlice(starts_indices, ends_indices, steps_indices);
-        }
-      } else {
-        int out_dims_size = out_dims.size();
-        auto grad_value_dims = grad_value->dims();
-        auto fake_grad_value_dims = out_dims;
-
-        // Create an extented shape according to the rules of broadcast.
-        auto grad_value_dims_size = grad_value_dims.size();
-
-        int num_decrease = 0;
-
-        int decrease_axis_size = decrease_axis.size();
-        for (int i = 0; i < out_dims_size; i++) {
-          if (decrease_axis.end() !=
-              std::find(decrease_axis.begin(), decrease_axis.end(), i)) {
-            fake_grad_value_dims[i] = 1;
-            num_decrease++;
-          } else if (i < out_dims_size - (grad_value_dims_size +
-                                          decrease_axis_size - num_decrease)) {
-            fake_grad_value_dims[i] = 1;
-          } else {
-            auto index_grad =
-                i - (out_dims_size - (grad_value_dims_size +
-                                      decrease_axis_size - num_decrease));
-            fake_grad_value_dims[i] = grad_value_dims[index_grad];
-
-            PADDLE_ENFORCE_EQ((out_dims[i] == grad_value_dims[index_grad]) ||
-                                  (grad_value_dims[index_grad] == 1),
-                              true,
-                              platform::errors::InvalidArgument(
-                                  "An error occurred while calculating %s: "
-                                  "[%s] can not be accumulated into [%s].",
-                                  framework::GradVarName("ValueTensor"),
-                                  out_dims, grad_value_dims));
-          }
-        }
-
-        VLOG(3) << "Dimensions of " << framework::GradVarName("ValueTensor")
-                << "([" << grad_value_dims << "])is broadcasted into ["
-                << fake_grad_value_dims << "].";
-
-        auto extent = Eigen::DSizes<Eigen::DenseIndex, D>();
-        auto offset = out_dims;
-        for (int i = 0; i < out_dims_size; i++) {
-          offset[i] = 0;
-          extent[i] = fake_grad_value_dims[i];
-        }
-        std::vector<DDim> offsets;
-        GetOffsets(out_dims, fake_grad_value_dims, offset, 0, &offsets);
-
-        auto grad_value_t =
-            framework::EigenTensor<T, D, Eigen::RowMajor, Eigen::DenseIndex>::
-                From(*grad_value, fake_grad_value_dims);
-
-        framework::Tensor tmp(grad_value->dtype());
-        tmp.mutable_data<T>(out_dims, context.GetPlace());
-        set_zero(dev_ctx, &tmp, static_cast<T>(0));
-        auto tmp_t = framework::EigenTensor<T, D, Eigen::RowMajor,
-                                            Eigen::DenseIndex>::From(tmp);
-
-        tmp_t.device(place) =
-            in_t.stridedSlice(starts_indices, ends_indices, steps_indices);
-
-        // accumulate gradient
-        for (auto offset : offsets) {
-          grad_value_t.device(place) =
-              grad_value_t +
-              tmp_t.slice(framework::EigenDim<D>::From(offset), extent);
-        }
-        if (need_reverse) {
-          framework::Tensor tmp_value(grad_value->dtype());
-          tmp_value.mutable_data<T>(fake_grad_value_dims, context.GetPlace());
-          auto tmp_value_t =
-              framework::EigenTensor<T, D, Eigen::RowMajor,
-                                     Eigen::DenseIndex>::From(tmp_value);
-          tmp_value_t.device(place) = grad_value_t.reverse(reverse_axis);
-          grad_value_t.device(place) = tmp_value_t;
-        }
-      }
-    }
-  }
-};
-
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/set_value_op_npu.cc b/paddle/fluid/operators/set_value_op_npu.cc
index 599697059c4dcfa54fa728a8ebf88ad95f387774..46d64333b608b7f3e7b3d83664978d162b6d6e52 100644
--- a/paddle/fluid/operators/set_value_op_npu.cc
+++ b/paddle/fluid/operators/set_value_op_npu.cc
@@ -174,6 +174,9 @@ class SetValueNPUKernel : public framework::OpKernel<T> {
         .AddInput(std::move(index_indices))
         .AddInput(val_temp)
         .AddOutput(out_temp)
+#if (CANN_VERSION_CODE >= 504001)
+        .AddAttrs({{"use_locking", false}})
+#endif
         .Run(stream);
   }
 };
diff --git a/paddle/fluid/operators/shape_op.cc b/paddle/fluid/operators/shape_op.cc
index 5b7ccdde81097a2cfd74c3d65c0679d277b766a3..e2c8359beb1290f7b1b592c1ff24b15986f41f73 100644
--- a/paddle/fluid/operators/shape_op.cc
+++ b/paddle/fluid/operators/shape_op.cc
@@ -12,10 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/shape_op.h"
 #include <string>
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/platform/complex.h"
 
 namespace paddle {
 namespace operators {
@@ -95,9 +93,3 @@ REGISTER_OPERATOR(
     shape, ops::ShapeOp, ops::ShapeOpMaker,
     paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
     paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
-REGISTER_OP_CPU_KERNEL(shape, ops::ShapeKernel<bool>, ops::ShapeKernel<int>,
-                       ops::ShapeKernel<int8_t>, ops::ShapeKernel<uint8_t>,
-                       ops::ShapeKernel<int64_t>, ops::ShapeKernel<float>,
-                       ops::ShapeKernel<double>,
-                       ops::ShapeKernel<plat::complex<float>>,
-                       ops::ShapeKernel<plat::complex<double>>);
diff --git a/paddle/fluid/operators/shape_op.cu b/paddle/fluid/operators/shape_op.cu
deleted file mode 100644
index c6e380a94f84db7de53d0c218682813fcad0128d..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/shape_op.cu
+++ /dev/null
@@ -1,27 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/shape_op.h"
-#include "paddle/fluid/platform/complex.h"
-
-REGISTER_OP_CUDA_KERNEL(
-    shape, paddle::operators::ShapeKernel<bool>,
-    paddle::operators::ShapeKernel<int>, paddle::operators::ShapeKernel<int8_t>,
-    paddle::operators::ShapeKernel<uint8_t>,
-    paddle::operators::ShapeKernel<int64_t>,
-    paddle::operators::ShapeKernel<float>,
-    paddle::operators::ShapeKernel<double>,
-    paddle::operators::ShapeKernel<paddle::platform::float16>,
-    paddle::operators::ShapeKernel<paddle::platform::complex<float>>,
-    paddle::operators::ShapeKernel<paddle::platform::complex<double>>);
diff --git a/paddle/fluid/operators/shape_op.h b/paddle/fluid/operators/shape_op.h
deleted file mode 100644
index 39ebcca46a710e0b817792105046af70b6298fc1..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/shape_op.h
+++ /dev/null
@@ -1,46 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <algorithm>
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-using LoDTensor = framework::LoDTensor;
-using SelectedRows = phi::SelectedRows;
-
-template <typename T>
-class ShapeKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* in_var = ctx.InputVar("Input");
-    framework::DDim in_dims;
-    if (in_var->IsType<phi::SelectedRows>()) {
-      in_dims = in_var->Get<phi::SelectedRows>().value().dims();
-    } else {
-      in_dims = in_var->Get<LoDTensor>().dims();
-    }
-    auto* out_t = ctx.Output<Tensor>("Out");
-    out_t->Resize({in_dims.size()});
-    auto out_data = out_t->mutable_data<int32_t>(platform::CPUPlace());
-    for (int i = 0; i < in_dims.size(); ++i) {
-      out_data[i] = in_dims[i];
-    }
-  }
-};
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/shape_op_npu.cc b/paddle/fluid/operators/shape_op_npu.cc
index 7bff7b2d668347692309d3695eb46b1fbdb6c7dd..f751ab41014c21fda2403bd69bcd20ad549e40c7 100644
--- a/paddle/fluid/operators/shape_op_npu.cc
+++ b/paddle/fluid/operators/shape_op_npu.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/shape_op.h"
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/device/npu/npu_op_runner.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/shape_op_xpu.cc b/paddle/fluid/operators/shape_op_xpu.cc
index 2e9092a643253843ed09ab7475ec3ed723d5e3b8..a62d1b434e76434c3710e45e723060d3f452c91c 100644
--- a/paddle/fluid/operators/shape_op_xpu.cc
+++ b/paddle/fluid/operators/shape_op_xpu.cc
@@ -10,12 +10,41 @@
  *     limitations under the License. */
 
 #ifdef PADDLE_WITH_XPU
+#include <algorithm>
+#include "paddle/fluid/framework/op_registry.h"
 
-#include "paddle/fluid/operators/shape_op.h"
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+using LoDTensor = framework::LoDTensor;
+using SelectedRows = phi::SelectedRows;
+
+template <typename T>
+class ShapeXPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* in_var = ctx.InputVar("Input");
+    framework::DDim in_dims;
+    if (in_var->IsType<phi::SelectedRows>()) {
+      in_dims = in_var->Get<phi::SelectedRows>().value().dims();
+    } else {
+      in_dims = in_var->Get<LoDTensor>().dims();
+    }
+    auto* out_t = ctx.Output<Tensor>("Out");
+    out_t->Resize({in_dims.size()});
+    auto out_data = out_t->mutable_data<int32_t>(platform::CPUPlace());
+    for (int i = 0; i < in_dims.size(); ++i) {
+      out_data[i] = in_dims[i];
+    }
+  }
+};
+}  // namespace operators
+}  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP_XPU_KERNEL(shape, ops::ShapeKernel<bool>, ops::ShapeKernel<int>,
-                       ops::ShapeKernel<int64_t>, ops::ShapeKernel<float>,
-                       ops::ShapeKernel<double>);
+REGISTER_OP_XPU_KERNEL(shape, ops::ShapeXPUKernel<bool>,
+                       ops::ShapeXPUKernel<int>, ops::ShapeXPUKernel<int64_t>,
+                       ops::ShapeXPUKernel<float>, ops::ShapeXPUKernel<double>);
 
 #endif
diff --git a/paddle/fluid/operators/shard_index_op.cc b/paddle/fluid/operators/shard_index_op.cc
index 54555e494ffe5f2c226c7aabd47b4ce991dab2ec..053a90f2fc9fa2f93c2647c420a046401198bc28 100644
--- a/paddle/fluid/operators/shard_index_op.cc
+++ b/paddle/fluid/operators/shard_index_op.cc
@@ -12,7 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/operators/shard_index_op.h"
+#include "paddle/fluid/framework/infershape_utils.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/phi/core/infermeta_utils.h"
+#include "paddle/phi/infermeta/unary.h"
 
 namespace paddle {
 namespace operators {
@@ -20,27 +23,6 @@ namespace operators {
 class ShardIndexOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "ShardIndex");
-    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "ShardIndex");
-
-    auto x_dims = ctx->GetInputDim("X");
-    PADDLE_ENFORCE_GE(x_dims.size(), 2,
-                      platform::errors::InvalidArgument(
-                          "Rank of Input(X) should be at least 2, "
-                          "but the value given is %d.",
-                          x_dims.size()));
-    if (ctx->IsRuntime() || x_dims[x_dims.size() - 1] > 0) {
-      PADDLE_ENFORCE_EQ(x_dims[x_dims.size() - 1], 1U,
-                        platform::errors::InvalidArgument(
-                            "The last dimension of Input(X) should be 1, "
-                            "but the value given is %d.",
-                            x_dims[x_dims.size() - 1]));
-    }
-
-    ctx->SetOutputDim("Out", x_dims);
-    ctx->ShareLoD("X", /* --> */ "Out");
-  }
 
  protected:
   framework::OpKernelType GetExpectedKernelType(
@@ -114,7 +96,10 @@ Examples:
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP_WITHOUT_GRADIENT(shard_index, ops::ShardIndexOp,
-                             ops::ShardIndexOpMaker);
-REGISTER_OP_CPU_KERNEL(shard_index, ops::ShardIndexCPUKernel<int>,
-                       ops::ShardIndexCPUKernel<int64_t>);
+DECLARE_INFER_SHAPE_FUNCTOR(shard_index, ShardIndexInferShapeFunctor,
+                            PD_INFER_META(phi::ShardIndexInferMeta));
+REGISTER_OPERATOR(
+    shard_index, ops::ShardIndexOp, ops::ShardIndexOpMaker,
+    paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
+    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
+    ShardIndexInferShapeFunctor);
diff --git a/paddle/fluid/operators/shard_index_op.cu b/paddle/fluid/operators/shard_index_op.cu
deleted file mode 100644
index 115b3f47d664ba00228343d221d5be70d13a7ff1..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/shard_index_op.cu
+++ /dev/null
@@ -1,96 +0,0 @@
-//   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/operators/shard_index_op.h"
-#include "paddle/fluid/platform/device/gpu/gpu_info.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
-
-namespace paddle {
-namespace operators {
-
-using platform::PADDLE_CUDA_NUM_THREADS;
-
-template <typename T>
-__global__ void ShardIndexInner(const T* in_data, T* out_data,
-                                const int64_t numel, const int index_num,
-                                const int nshards, const int shard_id,
-                                const int ignore_value) {
-  int shard_size = (index_num + nshards - 1) / nshards;
-  int idx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (idx < numel) {
-    assert(in_data[idx] >= 0 && in_data[idx] < index_num);
-    if (in_data[idx] / shard_size == shard_id) {
-      out_data[idx] = in_data[idx] % shard_size;
-    } else {
-      out_data[idx] = ignore_value;
-    }
-  }
-}
-
-using LoDTensor = framework::LoDTensor;
-
-template <typename T>
-class ShardIndexCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* in = context.Input<LoDTensor>("X");
-    auto* out = context.Output<LoDTensor>("Out");
-    int index_num = context.Attr<int>("index_num");
-    int nshards = context.Attr<int>("nshards");
-    int shard_id = context.Attr<int>("shard_id");
-    int ignore_value = context.Attr<int>("ignore_value");
-    PADDLE_ENFORCE_GT(
-        index_num, 0,
-        platform::errors::InvalidArgument(
-            "The value 'index_num' for Op(shard_index) must be greater than 0, "
-            "but the value given is %d.",
-            index_num));
-    PADDLE_ENFORCE_GT(nshards, 0,
-                      platform::errors::InvalidArgument(
-                          "The value 'nshard' for Op(shard_index) must be "
-                          "greater than 0, but the value given is %d.",
-                          nshards));
-    PADDLE_ENFORCE_GE(
-        shard_id, 0,
-        platform::errors::InvalidArgument(
-            "The value 'shard_id' for Op(shard_index) must be greater or "
-            "equal to 0, but the value given is %d.",
-            shard_id));
-    PADDLE_ENFORCE_LT(
-        shard_id, nshards,
-        platform::errors::InvalidArgument(
-            "The value 'shard_id' for Op(shard_index) must be less than "
-            "nshards (%d), but the value given is %d.",
-            nshards, shard_id));
-
-    out->Resize(in->dims());
-    out->set_lod(in->lod());
-    auto* in_data = in->data<T>();
-    auto* out_data = out->mutable_data<T>(context.GetPlace());
-    int64_t numel = in->numel();
-    auto stream =
-        context.template device_context<platform::CUDADeviceContext>().stream();
-    ShardIndexInner<<<(numel + PADDLE_CUDA_NUM_THREADS - 1) /
-                          PADDLE_CUDA_NUM_THREADS,
-                      PADDLE_CUDA_NUM_THREADS, 0, stream>>>(
-        in_data, out_data, numel, index_num, nshards, shard_id, ignore_value);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(shard_index, ops::ShardIndexCUDAKernel<int>,
-                        ops::ShardIndexCUDAKernel<int64_t>);
diff --git a/paddle/fluid/operators/shard_index_op.h b/paddle/fluid/operators/shard_index_op.h
deleted file mode 100644
index c2fe3711686d4c4c802fadd66d4bc994232ef5ec..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/shard_index_op.h
+++ /dev/null
@@ -1,84 +0,0 @@
-//   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-using LoDTensor = framework::LoDTensor;
-template <typename T>
-class ShardIndexCPUKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* in = context.Input<LoDTensor>("X");
-    auto* out = context.Output<LoDTensor>("Out");
-    int index_num = context.Attr<int>("index_num");
-    int nshards = context.Attr<int>("nshards");
-    int shard_id = context.Attr<int>("shard_id");
-    int ignore_value = context.Attr<int>("ignore_value");
-    PADDLE_ENFORCE_GT(
-        index_num, 0,
-        platform::errors::InvalidArgument(
-            "The value 'index_num' for Op(shard_index) must be greater than 0, "
-            "but the value given is %d.",
-            index_num));
-    PADDLE_ENFORCE_GT(nshards, 0,
-                      platform::errors::InvalidArgument(
-                          "The value 'nshard' for Op(shard_index) must be "
-                          "greater than 0, but the value given is %d.",
-                          nshards));
-    PADDLE_ENFORCE_GE(
-        shard_id, 0,
-        platform::errors::InvalidArgument(
-            "The value 'shard_id' for Op(shard_index) must be greater or "
-            "equal to 0, but the value given is %d.",
-            shard_id));
-    PADDLE_ENFORCE_LT(
-        shard_id, nshards,
-        platform::errors::InvalidArgument(
-            "The value 'shard_id' for Op(shard_index) must be less than "
-            "nshards (%d), but the value given is %d.",
-            nshards, shard_id));
-
-    int shard_size = (index_num + nshards - 1) / nshards;
-
-    out->Resize(in->dims());
-    out->set_lod(in->lod());
-    auto* in_data = in->data<T>();
-    auto* out_data = out->mutable_data<T>(context.GetPlace());
-    int64_t numel = in->numel();
-    for (int64_t i = 0; i < numel; ++i) {
-      PADDLE_ENFORCE_GE(in_data[i], 0,
-                        platform::errors::InvalidArgument(
-                            "The input_index for Op(shard_index) must be "
-                            "greater or equal to 0, but the value given is %d.",
-                            in_data[i]));
-      PADDLE_ENFORCE_LT(in_data[i], index_num,
-                        platform::errors::InvalidArgument(
-                            "The input_index for Op(shard_index) must be less "
-                            "than index_num (%d), but the value given is %d.",
-                            index_num, in_data[i]));
-      if (in_data[i] / shard_size == shard_id) {
-        out_data[i] = in_data[i] % shard_size;
-      } else {
-        out_data[i] = ignore_value;
-      }
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/shard_index_op_npu.cc b/paddle/fluid/operators/shard_index_op_npu.cc
index dc2e8ad58f31ce8fe845ecb1f368544704e1d9ad..c875448424a24e686b9a6285725f801d604abc46 100644
--- a/paddle/fluid/operators/shard_index_op_npu.cc
+++ b/paddle/fluid/operators/shard_index_op_npu.cc
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/operators/shard_index_op.h"
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/device/npu/npu_op_runner.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc
index a4e80343903d5a48dda584dc1f203782adb36787..016ff54645b02e9b3ddfb67595d830ccf5dcfd94 100644
--- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc
+++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc
@@ -12,59 +12,23 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h"
 #include <memory>
 #include <string>
 #include <vector>
+#include "paddle/fluid/framework/infershape_utils.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/phi/core/infermeta_utils.h"
+#include "paddle/phi/infermeta/binary.h"
 
 namespace paddle {
 namespace operators {
 
 using framework::Tensor;
+const int kIgnoreIndex = -100;
 
 class SigmoidCrossEntropyWithLogitsOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X",
-                   "SigmoidCrossEntropyWithLogitsOp");
-    OP_INOUT_CHECK(ctx->HasInput("Label"), "Input", "Label",
-                   "SigmoidCrossEntropyWithLogitsOp");
-    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out",
-                   "SigmoidCrossEntropyWithLogitsOp");
-
-    auto x_dims = ctx->GetInputDim("X");
-    auto labels_dims = ctx->GetInputDim("Label");
-
-    int rank = x_dims.size();
-    PADDLE_ENFORCE_EQ(rank, labels_dims.size(),
-                      platform::errors::InvalidArgument(
-                          "Input(X) and Input(Label) shall have the same rank."
-                          "But received: the rank of Input(X) is [%d], "
-                          "the rank of Input(Label) is [%d].",
-                          rank, labels_dims.size()));
-
-    bool check = true;
-    if ((!ctx->IsRuntime()) &&
-        (phi::product(x_dims) <= 0 || phi::product(labels_dims) <= 0)) {
-      check = false;
-    }
-
-    if (check) {
-      PADDLE_ENFORCE_EQ(
-          phi::slice_ddim(x_dims, 0, rank),
-          phi::slice_ddim(labels_dims, 0, rank),
-          platform::errors::InvalidArgument(
-              "Input(X) and Input(Label) shall have the same shape "
-              "except the last dimension. But received: the shape of "
-              "Input(X) is [%s], the shape of Input(Label) is [%s].",
-              x_dims, labels_dims));
-    }
-
-    ctx->ShareDim("X", /*->*/ "Out");
-    ctx->ShareLoD("X", /*->*/ "Out");
-  }
 };
 
 class SigmoidCrossEntropyWithLogitsGradOp
@@ -200,23 +164,17 @@ DECLARE_INPLACE_OP_INFERER(SigmoidCrossEntropyWithLogitsGradInplaceInferer,
 }  // namespace paddle
 
 namespace ops = paddle::operators;
+DECLARE_INFER_SHAPE_FUNCTOR(
+    sigmoid_cross_entropy_with_logits,
+    SigmoidCrossEntropyWithLogitsInferShapeFunctor,
+    PD_INFER_META(phi::SigmoidCrossEntropyWithLogitsInferMeta));
 REGISTER_OPERATOR(
     sigmoid_cross_entropy_with_logits, ops::SigmoidCrossEntropyWithLogitsOp,
     ops::SigmoidCrossEntropyWithLogitsOpMaker,
     ops::SigmoidCrossEntropyWithLogitsGradOpMaker<paddle::framework::OpDesc>,
     ops::SigmoidCrossEntropyWithLogitsGradOpMaker<paddle::imperative::OpBase>,
-    ops::SigmoidCrossEntropyWithLogitsInplaceInferer);
+    ops::SigmoidCrossEntropyWithLogitsInplaceInferer,
+    SigmoidCrossEntropyWithLogitsInferShapeFunctor);
 REGISTER_OPERATOR(sigmoid_cross_entropy_with_logits_grad,
                   ops::SigmoidCrossEntropyWithLogitsGradOp,
                   ops::SigmoidCrossEntropyWithLogitsGradInplaceInferer);
-REGISTER_OP_CPU_KERNEL(
-    sigmoid_cross_entropy_with_logits,
-    ops::SigmoidCrossEntropyWithLogitsKernel<paddle::platform::CPUDeviceContext,
-                                             float>,
-    ops::SigmoidCrossEntropyWithLogitsKernel<paddle::platform::CPUDeviceContext,
-                                             double>);
-REGISTER_OP_CPU_KERNEL(sigmoid_cross_entropy_with_logits_grad,
-                       ops::SigmoidCrossEntropyWithLogitsGradKernel<
-                           paddle::platform::CPUDeviceContext, float>,
-                       ops::SigmoidCrossEntropyWithLogitsGradKernel<
-                           paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cu b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cu
deleted file mode 100644
index 40476d5e11f6a3b0cad21038a3f342d824f3575c..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cu
+++ /dev/null
@@ -1,193 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#ifdef __NVCC__
-#include "cub/cub.cuh"
-#endif
-#ifdef __HIPCC__
-#include <hipcub/hipcub.hpp>
-namespace cub = hipcub;
-#endif
-#include "paddle/fluid/memory/malloc.h"
-#include "paddle/fluid/operators/math.h"
-#include "paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
-#include "paddle/phi/core/hostdevice.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-
-#ifdef __HIPCC__
-static constexpr int kNumCUDAThreads = 256;
-#else
-static constexpr int kNumCUDAThreads = 512;
-#endif
-static constexpr int kNumMaxinumNumBlocks = 4096;
-
-static inline int NumBlocks(const int N) {
-  return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads,
-                  kNumMaxinumNumBlocks);
-}
-
-template <typename T>
-__global__ void GPUSigmoidForward(const T *x_data, const T *label_data,
-                                  const int ignore_index, const int limit,
-                                  T *out_data, T *counts) {
-  CUDA_KERNEL_LOOP(i, limit) {
-    T x = x_data[i];
-    T label = label_data[i];
-    T eps = static_cast<T>(1e-5);
-    T diff = label - static_cast<T>(ignore_index);
-    if ((diff > -eps) && (diff < eps)) {
-      out_data[i] = static_cast<T>(0.);
-      counts[i] = 0;
-    } else {
-      T term1 = (x > 0) ? x : 0;
-      T term2 = x * label;
-      T term3 = real_log(static_cast<T>(1) + real_exp(static_cast<T>(-abs(x))));
-      out_data[i] = term1 - term2 + term3;
-      counts[i] = 1;
-    }
-  }
-}
-
-template <typename T, int BlockDim>
-__global__ void Sum(const T *counts, int num, const T eps, T *sum) {
-  typedef cub::BlockReduce<double, BlockDim> BlockReduce;
-  __shared__ typename BlockReduce::TempStorage temp_storage;
-  T in = 0;
-  for (int i = threadIdx.x; i < num; i += BlockDim) {
-    in += counts[i];
-  }
-  __syncthreads();
-  auto out =
-      BlockReduce(temp_storage).Reduce(static_cast<double>(in), cub::Sum());
-  __syncthreads();
-  if (threadIdx.x == 0) {
-    T a = out > eps ? out : eps;
-    sum[0] = a;
-  }
-}
-
-template <typename T>
-__global__ void Div(T *loss, const int num, const T *norm) {
-  CUDA_KERNEL_LOOP(i, num) { loss[i] /= norm[0]; }
-}
-
-template <typename T>
-__global__ void GPUSigmoidBackward(const T *x_data, const T *label_data,
-                                   const int ignore_index, const T *dout_data,
-                                   const int limit, T *dx_data, T *counts) {
-  CUDA_KERNEL_LOOP(i, limit) {
-    T x = x_data[i];
-    T label = label_data[i];
-    T dout = dout_data[i];
-    T eps = static_cast<T>(1e-5);
-    T diff = label - static_cast<T>(ignore_index);
-    if ((diff > -eps) && (diff < eps)) {
-      dx_data[i] = static_cast<T>(0.);
-      counts[i] = 0;
-    } else {
-      T simoid_x = static_cast<T>(1) / (static_cast<T>(1) + real_exp(-x));
-      T diff = simoid_x - label;
-      dx_data[i] = dout * diff;
-      counts[i] = 1;
-    }
-  }
-}
-
-// Out = max(X, 0) - X * Labels + log(1 + exp(-abs(X)))
-template <typename DeviceContext, typename T>
-class GPUSigmoidCrossEntropyWithLogitsKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &context) const override {
-    const Tensor *X = context.Input<Tensor>("X");
-    const Tensor *Labels = context.Input<Tensor>("Label");
-    Tensor *Out = context.Output<Tensor>("Out");
-    int ignore_index = context.Attr<int>("ignore_index");
-    auto out_data = Out->mutable_data<T>(context.GetPlace());
-
-    auto &dev_ctx = context.cuda_device_context();
-    bool normalize = context.Attr<bool>("normalize");
-
-    // Temporary memory
-    auto cnt_ptr = memory::Alloc(dev_ctx, Labels->numel() * sizeof(T));
-    T *counts = reinterpret_cast<T *>(cnt_ptr->ptr());
-
-    int limit = Out->numel();
-    int blocks = NumBlocks(limit);
-    int threads = kNumCUDAThreads;
-    GPUSigmoidForward<T><<<blocks, threads, 0, dev_ctx.stream()>>>(
-        X->data<T>(), Labels->data<T>(), ignore_index, limit, out_data, counts);
-    if (normalize) {
-      auto norm_ptr = memory::Alloc(dev_ctx, sizeof(T));
-      T *norm = reinterpret_cast<T *>(norm_ptr->ptr());
-      Sum<T, kNumCUDAThreads><<<1, kNumCUDAThreads, 0, dev_ctx.stream()>>>(
-          counts, limit, static_cast<T>(1e-5), norm);
-      Div<T><<<blocks, threads, 0, dev_ctx.stream()>>>(out_data, limit, norm);
-    }
-  }
-};
-
-// dX = sigmoid(X) - labels
-template <typename DeviceContext, typename T>
-class GPUSigmoidCrossEntropyWithLogitsGradKernel
-    : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &context) const override {
-    const Tensor *X = context.Input<Tensor>("X");
-    const Tensor *Labels = context.Input<Tensor>("Label");
-    const Tensor *dOut = context.Input<Tensor>(framework::GradVarName("Out"));
-    Tensor *dX = context.Output<Tensor>(framework::GradVarName("X"));
-    auto dx_data = dX->mutable_data<T>(context.GetPlace());
-
-    int ignore_index = context.Attr<int>("ignore_index");
-
-    auto &dev_ctx = context.cuda_device_context();
-    // Temporary memory
-    auto cnt_ptr = memory::Alloc(dev_ctx, X->numel() * sizeof(T));
-    T *counts = reinterpret_cast<T *>(cnt_ptr->ptr());
-
-    int limit = dX->numel();
-    int blocks = NumBlocks(limit);
-    int threads = kNumCUDAThreads;
-    GPUSigmoidBackward<T><<<blocks, threads, 0, dev_ctx.stream()>>>(
-        X->data<T>(), Labels->data<T>(), ignore_index, dOut->data<T>(), limit,
-        dx_data, counts);
-    bool normalize = context.Attr<bool>("normalize");
-    if (normalize) {
-      auto norm_ptr = memory::Alloc(dev_ctx, sizeof(T));
-      T *norm = reinterpret_cast<T *>(norm_ptr->ptr());
-      Sum<T, kNumCUDAThreads><<<1, kNumCUDAThreads, 0, dev_ctx.stream()>>>(
-          counts, limit, static_cast<T>(1e-5), norm);
-      Div<T><<<blocks, threads, 0, dev_ctx.stream()>>>(dx_data, limit, norm);
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(sigmoid_cross_entropy_with_logits,
-                        ops::GPUSigmoidCrossEntropyWithLogitsKernel<
-                            paddle::platform::CUDADeviceContext, float>,
-                        ops::GPUSigmoidCrossEntropyWithLogitsKernel<
-                            paddle::platform::CUDADeviceContext, double>);
-REGISTER_OP_CUDA_KERNEL(sigmoid_cross_entropy_with_logits_grad,
-                        ops::GPUSigmoidCrossEntropyWithLogitsGradKernel<
-                            paddle::platform::CUDADeviceContext, float>,
-                        ops::GPUSigmoidCrossEntropyWithLogitsGradKernel<
-                            paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h
deleted file mode 100644
index d2ced490ceff474e1e7624c591a9d142b4199c2f..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h
+++ /dev/null
@@ -1,114 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <algorithm>
-#include <limits>
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-const int kIgnoreIndex = -100;
-
-// Out = max(X, 0) - X * Labels + log(1 + exp(-abs(X)))
-template <typename DeviceContext, typename T>
-class SigmoidCrossEntropyWithLogitsKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &context) const override {
-    const Tensor *X = context.Input<Tensor>("X");
-    const Tensor *Labels = context.Input<Tensor>("Label");
-    Tensor *Out = context.Output<Tensor>("Out");
-    int ignore_index = context.Attr<int>("ignore_index");
-    auto out_data = Out->mutable_data<T>(context.GetPlace());
-    int limit = Out->numel();
-    auto x_data = X->data<T>();
-    auto label_data = Labels->data<T>();
-    for (int idx = 0; idx < limit; ++idx) {
-      T x = x_data[idx];
-      T label = label_data[idx];
-      if (static_cast<int>(label) == ignore_index) {
-        out_data[idx] = static_cast<T>(0.);
-      } else {
-        T term1 = (x > 0) ? x : 0;
-        T term2 = x * label;
-        T term3 = std::log(static_cast<T>(1) + std::exp(-std::abs(x)));
-        out_data[idx] = term1 - term2 + term3;
-      }
-    }
-    bool normalize = context.Attr<bool>("normalize");
-    if (normalize) {
-      int norm = 0;
-      T eps = static_cast<T>(1e-6);
-      for (int idx = 0; idx < limit; ++idx) {
-        T diff = label_data[idx] - static_cast<T>(ignore_index);
-        if ((diff < -eps) || (diff > eps)) {
-          norm += 1;
-        }
-      }
-      eps = static_cast<T>(1e-5);
-      norm = norm > eps ? norm : eps;
-      std::for_each(out_data, out_data + limit, [norm](T &v) { v = v / norm; });
-    }
-  }
-};
-
-// dX = sigmoid(X) - labels
-template <typename DeviceContext, typename T>
-class SigmoidCrossEntropyWithLogitsGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &context) const override {
-    const Tensor *X = context.Input<Tensor>("X");
-    const Tensor *Labels = context.Input<Tensor>("Label");
-    const Tensor *dOut = context.Input<Tensor>(framework::GradVarName("Out"));
-    Tensor *dX = context.Output<Tensor>(framework::GradVarName("X"));
-    auto dx_data = dX->mutable_data<T>(context.GetPlace());
-
-    int ignore_index = context.Attr<int>("ignore_index");
-    int limit = dX->numel();
-    auto x_data = X->data<T>();
-    auto label_data = Labels->data<T>();
-    auto dout_data = dOut->data<T>();
-    for (int idx = 0; idx < limit; ++idx) {
-      T x = x_data[idx];
-      T label = label_data[idx];
-      T dout = dout_data[idx];
-      if (static_cast<int>(label) == ignore_index) {
-        dx_data[idx] = static_cast<T>(0.);
-      } else {
-        T simoid_x = static_cast<T>(1) / (static_cast<T>(1) + std::exp(-x));
-        T diff = simoid_x - label;
-        dx_data[idx] = dout * diff;
-      }
-    }
-    bool normalize = context.Attr<bool>("normalize");
-    if (normalize) {
-      int norm = 0;
-      T eps = static_cast<T>(1e-6);
-      for (int idx = 0; idx < limit; ++idx) {
-        T diff = label_data[idx] - static_cast<T>(ignore_index);
-        if ((diff < -eps) || (diff > eps)) {
-          norm += 1;
-        }
-      }
-      eps = static_cast<T>(1e-5);
-      norm = norm > eps ? norm : eps;
-      std::for_each(dx_data, dx_data + limit, [norm](T &v) { v = v / norm; });
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_npu.cc b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_npu.cc
index 40852425997f0b1a9cfa0c86180f2f2254efceec..f186f95a2b96117fa56fc17f70d4d0884214af87 100644
--- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_npu.cc
+++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_npu.cc
@@ -12,13 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h"
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/device/npu/npu_op_runner.h"
 
 namespace paddle {
 namespace operators {
 
 using Tensor = framework::Tensor;
+const int kIgnoreIndex = -100;
 
 void CheckAttrs(const framework::ExecutionContext& ctx) {
   // Add this check is is due to Ascend SigmoidCrossEntropyWithLogits
diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_xpu.cc b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_xpu.cc
index 6395aa1caa01b9578d55e1155b0d6cd0d2295e36..c37731580d1212cb47c9e7f18aa4a9ba20af19d8 100644
--- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_xpu.cc
+++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_xpu.cc
@@ -17,13 +17,15 @@
 #include <memory>
 #include <vector>
 
-#include "paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h"
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/device/device_wrapper.h"
 #include "paddle/fluid/platform/device/xpu/xpu_header.h"
 
 namespace paddle {
 namespace operators {
 
+using Tensor = framework::Tensor;
+
 template <typename DeviceContext, typename T>
 class SigmoidCrossEntropyWithLogitsXPUKernel : public framework::OpKernel<T> {
   using XPUType = typename XPUTypeTrait<T>::Type;
diff --git a/paddle/fluid/operators/sign_op.cc b/paddle/fluid/operators/sign_op.cc
index e2381c76f7e45a962fcacff079ca67df9610b6f1..ceb42dcf3e592182867a890bdfe73e237913ee53 100644
--- a/paddle/fluid/operators/sign_op.cc
+++ b/paddle/fluid/operators/sign_op.cc
@@ -60,8 +60,8 @@ class SignGradMaker : public framework::SingleGradOpMaker<T> {
 
 namespace ops = paddle::operators;
 
-DELCARE_INFER_SHAPE_FUNCTOR(sign, SignInferShapeFunctor,
-                            PT_INFER_META(phi::UnchangedInferMeta));
+DECLARE_INFER_SHAPE_FUNCTOR(sign, SignInferShapeFunctor,
+                            PD_INFER_META(phi::UnchangedInferMeta));
 REGISTER_OPERATOR(sign, ops::SignOp, ops::SignOpMaker<float>,
                   ops::SignGradMaker<paddle::framework::OpDesc>,
                   ops::SignGradMaker<paddle::imperative::OpBase>,
diff --git a/paddle/fluid/operators/size_op.cc b/paddle/fluid/operators/size_op.cc
index e584c1a4cce1e85344c574526098b034723c3059..84b0f403be03893810ef592db9b2c993cc6b9644 100644
--- a/paddle/fluid/operators/size_op.cc
+++ b/paddle/fluid/operators/size_op.cc
@@ -44,8 +44,8 @@ Return the number of elements in the input.
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-DELCARE_INFER_SHAPE_FUNCTOR(size, SizeInferShapeFunctor,
-                            PT_INFER_META(phi::SizeInferMeta));
+DECLARE_INFER_SHAPE_FUNCTOR(size, SizeInferShapeFunctor,
+                            PD_INFER_META(phi::SizeInferMeta));
 REGISTER_OPERATOR(
     size, ops::SizeOp, ops::SizeOpMaker,
     paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
diff --git a/paddle/fluid/operators/softmax_op.cc b/paddle/fluid/operators/softmax_op.cc
index 374992096605bfef0433992193e54306c3a12858..3840b99dd176d5b348533f3e50f7f90fc3250ea1 100644
--- a/paddle/fluid/operators/softmax_op.cc
+++ b/paddle/fluid/operators/softmax_op.cc
@@ -16,6 +16,7 @@ limitations under the License. */
 #include <string>
 #include <unordered_map>
 
+#include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
 
@@ -23,6 +24,10 @@ limitations under the License. */
 #include "paddle/fluid/platform/mkldnn_helper.h"
 #endif
 
+#include "paddle/phi/core/infermeta_utils.h"
+#include "paddle/phi/infermeta/backward.h"
+#include "paddle/phi/infermeta/unary.h"
+
 namespace paddle {
 namespace operators {
 
@@ -30,30 +35,6 @@ class SoftmaxOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE_EQ(
-        ctx->HasInput("X"), true,
-        platform::errors::NotFound("Input(X) of SoftmaxOp is not found."));
-    PADDLE_ENFORCE_EQ(
-        ctx->HasOutput("Out"), true,
-        platform::errors::NotFound("Output(Out) of SoftmaxOp is not found."));
-
-    auto dim_x = ctx->GetInputDim("X");
-    auto rank_x = dim_x.size();
-    auto axis = ctx->Attrs().Get<int>("axis");
-    PADDLE_ENFORCE_GE(axis, -rank_x,
-                      platform::errors::InvalidArgument(
-                          "Attr(axis) value should be in range [-R, R-1], "
-                          "R is the rank of Input(X)."));
-    PADDLE_ENFORCE_LT(axis, rank_x,
-                      platform::errors::InvalidArgument(
-                          "Attr(axis) value should be in range [-R, R-1], "
-                          "R is the rank of Input(X)."));
-
-    ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
-    ctx->ShareLoD("X", /*->*/ "Out");
-  }
-
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
@@ -168,23 +149,6 @@ class SoftmaxOpGrad : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE_EQ(
-        ctx->HasInput("Out"), true,
-        platform::errors::InvalidArgument("Input(Out) is not found."));
-    PADDLE_ENFORCE_EQ(
-        ctx->HasInput(framework::GradVarName("Out")), true,
-        platform::errors::InvalidArgument("Input(Out@GRAD) is not found."));
-    PADDLE_ENFORCE_EQ(
-        ctx->GetInputDim("Out"),
-        ctx->GetInputDim(framework::GradVarName("Out")),
-        platform::errors::InvalidArgument("Input(Out) and its gradients "
-                                          "should have a same shape."));
-
-    ctx->SetOutputDim(framework::GradVarName("X"),
-                      ctx->GetInputDim(framework::GradVarName("Out")));
-  }
-
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
@@ -244,9 +208,14 @@ DECLARE_INPLACE_OP_INFERER(SoftmaxInplaceInferer, {"X", "Out"});
 
 namespace ops = paddle::operators;
 
+DECLARE_INFER_SHAPE_FUNCTOR(softmax, SoftmaxInferShapeFunctor,
+                            PD_INFER_META(phi::SoftmaxInferMeta));
 REGISTER_OPERATOR(softmax, ops::SoftmaxOp, ops::SoftmaxOpMaker,
                   ops::SoftmaxOpInferVarType,
                   ops::SoftmaxOpGradMaker<paddle::framework::OpDesc>,
                   ops::SoftmaxOpGradMaker<paddle::imperative::OpBase>,
-                  ops::SoftmaxInplaceInferer);
-REGISTER_OPERATOR(softmax_grad, ops::SoftmaxOpGrad);
+                  ops::SoftmaxInplaceInferer, SoftmaxInferShapeFunctor);
+DECLARE_INFER_SHAPE_FUNCTOR(softmax_grad, SoftmaxGradInferShapeFunctor,
+                            PD_INFER_META(phi::GeneralUnaryGradInferMeta));
+REGISTER_OPERATOR(softmax_grad, ops::SoftmaxOpGrad,
+                  SoftmaxGradInferShapeFunctor);
diff --git a/paddle/fluid/operators/softmax_op_npu_test.cc b/paddle/fluid/operators/softmax_op_npu_test.cc
index 3bc55fafd81e18d0a986268ff4692129c6515edc..3148b31a8322e2bab39ad7f723ee59a6db64c204 100644
--- a/paddle/fluid/operators/softmax_op_npu_test.cc
+++ b/paddle/fluid/operators/softmax_op_npu_test.cc
@@ -22,7 +22,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/framework/tensor_util.h"
-#include "paddle/fluid/operators/dropout_op.h"
 #include "paddle/fluid/string/printf.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
diff --git a/paddle/fluid/operators/spectral_op.h b/paddle/fluid/operators/spectral_op.h
index 2bc5124843c38152d2f5d3ffcef5a5ca24534bfd..a60ec5a4df52b8275a17185a63c8a7d27dd8132b 100644
--- a/paddle/fluid/operators/spectral_op.h
+++ b/paddle/fluid/operators/spectral_op.h
@@ -23,9 +23,9 @@
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/operators/conj_op.h"
 #include "paddle/fluid/operators/eigen/eigen_function.h"
-#include "paddle/fluid/operators/math/padding.h"
 #include "paddle/fluid/platform/complex.h"
 #include "paddle/fluid/platform/for_range.h"
+#include "paddle/phi/kernels/funcs/padding.h"
 #if defined(__NVCC__) || defined(__HIPCC__)
 #include "thrust/device_vector.h"
 #endif
@@ -389,8 +389,9 @@ class FFTR2CGradKernel : public framework::OpKernel<T> {
       std::vector<int> pads(rank * 2, 0);
       pads[axes.back() * 2 + 1] = zero_length;
 
-      paddle::operators::math::PaddingFunctor<DeviceContext, C>(
-          rank, ctx, pads, static_cast<C>(0), *dy, &full_dy);
+      phi::funcs::PaddingFunctor<DeviceContext, C>(
+          rank, ctx.template device_context<DeviceContext>(), pads,
+          static_cast<C>(0), *dy, &full_dy);
       fft_c2c_func(dev_ctx, &full_dy, &complex_dx, axes, normalization,
                    !forward);
     }
diff --git a/paddle/fluid/operators/split_op.cc b/paddle/fluid/operators/split_op.cc
index 6678320f9ffa61e3e6c51fd806569c2571d63d69..5b8922505cc089d66f0b444fc65ccec8ed051876 100644
--- a/paddle/fluid/operators/split_op.cc
+++ b/paddle/fluid/operators/split_op.cc
@@ -26,6 +26,52 @@ class SplitOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true,
+                      platform::errors::InvalidArgument(
+                          "Input(X) of SplitOp should not be null."));
+    PADDLE_ENFORCE_GE(ctx->Outputs("Out").size(), 1UL,
+                      platform::errors::InvalidArgument(
+                          "Outputs(Out) of SplitOp should not be empty."));
+    auto in_dims = ctx->GetInputDim("X");
+    auto outs_names = ctx->Outputs("Out");
+    size_t axis = static_cast<size_t>(ctx->Attrs().Get<int>("axis"));
+    size_t num = static_cast<size_t>(ctx->Attrs().Get<int>("num"));
+    std::vector<int> sections = static_cast<std::vector<int>>(
+        ctx->Attrs().Get<std::vector<int>>("sections"));
+    const size_t outs_number = outs_names.size();
+
+    if (sections.size() > 0) {
+      PADDLE_ENFORCE_EQ(
+          sections.size(), outs_number,
+          platform::errors::InvalidArgument("tensor split sections size "
+                                            "should be equal to output size."));
+    }
+
+    if (ctx->HasInput("AxisTensor")) {
+      auto out_dims = phi::make_ddim(std::vector<int>(in_dims.size(), -1));
+      std::vector<framework::DDim> outs_dims(outs_number, out_dims);
+      ctx->SetOutputsDim("Out", outs_dims);
+      for (size_t i = 0; i < outs_number; ++i) {
+        ctx->ShareLoD("X", "Out", 0, i);
+      }
+      return;
+    }
+
+    bool each_section_is_known =
+        (sections.size() > 0 && !ctx->HasInputs("SectionsTensorList"));
+
+    auto outs_dims = UpdateOutsDims(ctx->IsRuntime(), each_section_is_known,
+                                    in_dims, num, sections, axis, outs_number);
+    ctx->SetOutputsDim("Out", outs_dims);
+    if (axis != 0) {
+      // Only pass LoD when not spliting along the first dim.
+      for (size_t i = 0; i < outs_number; ++i) {
+        ctx->ShareLoD("X", "Out", 0, i);
+      }
+    }
+  }
+
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
@@ -125,10 +171,6 @@ Example:
 
 namespace ops = paddle::operators;
 
-DELCARE_INFER_SHAPE_FUNCTOR(split, SplitInferShapeFunctor,
-                            PT_INFER_META(phi::SplitInferMeta));
-
 REGISTER_OPERATOR(split, ops::SplitOp, ops::SplitOpMaker,
                   ops::SplitGradMaker<paddle::framework::OpDesc>,
-                  ops::SplitGradMaker<paddle::imperative::OpBase>,
-                  SplitInferShapeFunctor);
+                  ops::SplitGradMaker<paddle::imperative::OpBase>);
diff --git a/paddle/fluid/operators/spp_op.h b/paddle/fluid/operators/spp_op.h
index bff8061814ae66f243ca9d863cf866821ede4a32..aa944cfcfbb1713aeb27b501083853abb4ffed40 100644
--- a/paddle/fluid/operators/spp_op.h
+++ b/paddle/fluid/operators/spp_op.h
@@ -16,9 +16,10 @@ limitations under the License. */
 #include <string>
 #include <vector>
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/pooling.h"
+#include "paddle/fluid/framework/phi_utils.h"
 #include "paddle/fluid/operators/strided_memcpy.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
+#include "paddle/phi/kernels/funcs/pooling.h"
 
 namespace paddle {
 namespace operators {
@@ -53,14 +54,20 @@ class SppKernel : public framework::OpKernel<T> {
       out_level.mutable_data<T>(output_shape, context.GetPlace());
       // pooling
       if (pooling_type == "max") {
-        math::Pool2dFunctor<DeviceContext, math::MaxPool<T>, T> pool_forward;
-        math::MaxPool<T> max_process;
+        phi::funcs::Pool2dFunctor<
+            typename framework::ConvertToPhiContext<DeviceContext>::TYPE,
+            phi::funcs::MaxPool<T>, T>
+            pool_forward;
+        phi::funcs::MaxPool<T> max_process;
         pool_forward(context.template device_context<DeviceContext>(), *in_x,
                      kernel_size, strides, paddings, true, false, &out_level,
                      max_process);
       } else if (pooling_type == "avg") {
-        math::Pool2dFunctor<DeviceContext, math::AvgPool<T>, T> pool_forward;
-        math::AvgPool<T> avg_process;
+        phi::funcs::Pool2dFunctor<
+            typename framework::ConvertToPhiContext<DeviceContext>::TYPE,
+            phi::funcs::AvgPool<T>, T>
+            pool_forward;
+        phi::funcs::AvgPool<T> avg_process;
         pool_forward(context.template device_context<DeviceContext>(), *in_x,
                      kernel_size, strides, paddings, true, false, &out_level,
                      avg_process);
@@ -95,7 +102,9 @@ class SppGradKernel : public framework::OpKernel<T> {
     std::string pooling_type =
         context.template Attr<std::string>("pooling_type");
     auto& device_ctx = context.template device_context<DeviceContext>();
-    phi::funcs::SetConstant<DeviceContext, T> zero;
+    phi::funcs::SetConstant<
+        typename framework::ConvertToPhiContext<DeviceContext>::TYPE, T>
+        zero;
     in_x_grad->mutable_data<T>(context.GetPlace());
     zero(device_ctx, in_x_grad, static_cast<T>(0));
     auto out_stride = phi::stride(out->dims());
@@ -145,14 +154,18 @@ class SppGradKernel : public framework::OpKernel<T> {
       outgrad_level.Resize(out_shape);
       // pooling backward
       if (pooling_type == "max") {
-        math::MaxPool2dGradFunctor<DeviceContext, T> pool2d_backward;
+        phi::funcs::MaxPool2dGradFunctor<
+            typename framework::ConvertToPhiContext<DeviceContext>::TYPE, T>
+            pool2d_backward;
         pool2d_backward(context.template device_context<DeviceContext>(), *in_x,
                         *&out_level, *&outgrad_level, kernel_size, strides,
                         paddings, in_x_grad);
       } else if (pooling_type == "avg") {
-        math::Pool2dGradFunctor<DeviceContext, math::AvgPoolGrad<T>, T>
+        phi::funcs::Pool2dGradFunctor<
+            typename framework::ConvertToPhiContext<DeviceContext>::TYPE,
+            phi::funcs::AvgPoolGrad<T>, T>
             pool_backward;
-        math::AvgPoolGrad<T> avg_process;
+        phi::funcs::AvgPoolGrad<T> avg_process;
         pool_backward(context.template device_context<DeviceContext>(), *in_x,
                       *&out_level, *&outgrad_level, kernel_size, strides,
                       paddings, true, false, in_x_grad, avg_process);
diff --git a/paddle/fluid/operators/squeeze_op.h b/paddle/fluid/operators/squeeze_op.h
index 58e5440689926497705624a0c64e6cc3d43dbab1..a776a78616b8d6dbac66ccab0d59433b98ae65e4 100644
--- a/paddle/fluid/operators/squeeze_op.h
+++ b/paddle/fluid/operators/squeeze_op.h
@@ -17,7 +17,6 @@ limitations under the License. */
 #include <vector>
 
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/pooling.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/phi/kernels/funcs/blas/blas.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
diff --git a/paddle/fluid/operators/squeeze_op_npu_test.cc b/paddle/fluid/operators/squeeze_op_npu_test.cc
index 956544c53609eb29326dc5cf295d978d767ac176..d61f5aa3f634cd2aee1e5c2f34f4467b1697e455 100644
--- a/paddle/fluid/operators/squeeze_op_npu_test.cc
+++ b/paddle/fluid/operators/squeeze_op_npu_test.cc
@@ -24,7 +24,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/operators/dropout_op.h"
 #include "paddle/fluid/string/printf.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
diff --git a/paddle/fluid/operators/strided_memcpy.h b/paddle/fluid/operators/strided_memcpy.h
index c92d468f3462c92cd0631383996012afb6edb46b..af29aac6b9052877283271abc12f4dc1da6b8a3e 100644
--- a/paddle/fluid/operators/strided_memcpy.h
+++ b/paddle/fluid/operators/strided_memcpy.h
@@ -109,6 +109,11 @@ inline void StridedNumelCopyWithAxis(const platform::DeviceContext& ctx,
       auto& npu_ctx = reinterpret_cast<const platform::NPUDeviceContext&>(ctx);
       memory::Copy(npu_place, dst + i * dst_after, npu_place,
                    src + i * src_after, sizeof(T) * size, npu_ctx.stream());
+#elif defined(PADDLE_WITH_MLU)
+      auto& mlu_place = place;
+      auto& mlu_ctx = reinterpret_cast<const platform::MLUDeviceContext&>(ctx);
+      memory::Copy(mlu_place, dst + i * dst_after, mlu_place,
+                   src + i * src_after, sizeof(T) * size, mlu_ctx.stream());
 #else
       PADDLE_THROW(platform::errors::PreconditionNotMet(
           "Paddle is not compiled with GPU."));
diff --git a/paddle/fluid/operators/take_along_axis_op.cc b/paddle/fluid/operators/take_along_axis_op.cc
index 664f1031915e4661769d9b2844c5388f0efa91c0..fa8a5e92712ec86a01ca01b7eb644e289c03000a 100644
--- a/paddle/fluid/operators/take_along_axis_op.cc
+++ b/paddle/fluid/operators/take_along_axis_op.cc
@@ -12,10 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/take_along_axis_op.h"
 #include <memory>
 #include <string>
 #include <vector>
+
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/phi/core/ddim.h"
 
@@ -139,16 +140,3 @@ REGISTER_OPERATOR(take_along_axis, ops::TakeAlongAxisOp,
                   ops::TakeAlongAxisGradOpMaker<paddle::imperative::OpBase>);
 
 REGISTER_OPERATOR(take_along_axis_grad, ops::TakeAlongAxisGradOp);
-
-REGISTER_OP_CPU_KERNEL(take_along_axis, ops::TakeAlongAxisOpKernel<float>,
-                       ops::TakeAlongAxisOpKernel<double>,
-                       ops::TakeAlongAxisOpKernel<int>,
-                       ops::TakeAlongAxisOpKernel<uint8_t>,
-                       ops::TakeAlongAxisOpKernel<int64_t>);
-
-REGISTER_OP_CPU_KERNEL(take_along_axis_grad,
-                       ops::TakeAlongAxisGradOpKernel<float>,
-                       ops::TakeAlongAxisGradOpKernel<double>,
-                       ops::TakeAlongAxisGradOpKernel<int>,
-                       ops::TakeAlongAxisGradOpKernel<uint8_t>,
-                       ops::TakeAlongAxisGradOpKernel<int64_t>);
diff --git a/paddle/fluid/operators/take_along_axis_op.cu b/paddle/fluid/operators/take_along_axis_op.cu
deleted file mode 100644
index b6c62d497b379dda568f661b31366914e6870a7c..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/take_along_axis_op.cu
+++ /dev/null
@@ -1,97 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/op_version_registry.h"
-#include "paddle/fluid/operators/take_along_axis_op.h"
-#include "paddle/phi/core/ddim.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T>
-class TakeAlongAxisCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true,
-                      platform::errors::PreconditionNotMet(
-                          "This kernel only runs on GPU device."));
-    auto input = ctx.Input<Tensor>("Input");
-    auto axis = ctx.Attr<int>("Axis");
-    auto index = ctx.Input<Tensor>("Index");
-    auto result = ctx.Output<Tensor>("Result");
-    result->Resize(index->dims());
-    result->mutable_data<T>(ctx.GetPlace());
-    const auto &index_type = framework::TransToProtoVarType(index->dtype());
-    if (index_type == framework::proto::VarType::INT32) {
-      gpu_gather_kernel<T, int32_t>(*input, axis, *index, *result,
-                                    ctx.device_context());
-    } else if (index_type == framework::proto::VarType::INT64) {
-      gpu_gather_kernel<T, int64_t>(*input, axis, *index, *result,
-                                    ctx.device_context());
-    }
-  }
-};
-
-template <typename T>
-class TakeAlongAxisGradOpCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    PADDLE_ENFORCE_EQ(
-        platform::is_gpu_place(ctx.GetPlace()), true,
-        platform::errors::PreconditionNotMet("This kernel only runs on GPU."));
-
-    auto input_grad = ctx.Output<Tensor>(framework::GradVarName("Input"));
-    auto index = ctx.Input<Tensor>("Index");
-    auto result_grad = ctx.Input<Tensor>(framework::GradVarName("Result"));
-    auto axis = ctx.Attr<int>("Axis");
-    // We need to know the shape of input matrix to determine the shape of grad
-    // matrix of input.
-    auto input = ctx.Input<Tensor>("Input");
-    input_grad->Resize(input->dims());
-    input_grad->mutable_data<T>(ctx.GetPlace());
-
-    // Set to zero tensor.
-    auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
-    phi::funcs::SetConstant<platform::CUDADeviceContext, T> functor;
-    functor(reinterpret_cast<const platform::CUDADeviceContext &>(dev_ctx),
-            input_grad, static_cast<T>(0));
-    const auto &index_type = framework::TransToProtoVarType(index->dtype());
-
-    if (index_type == framework::proto::VarType::INT32) {
-      gpu_scatter_add_kernel<T, int32_t>(
-          *input_grad, axis, *index, *result_grad,
-          ctx.device_context());  // the gradient of gather is scatter
-    } else if (index_type == framework::proto::VarType::INT64) {
-      gpu_scatter_add_kernel<T, int64_t>(*input_grad, axis, *index,
-                                         *result_grad, ctx.device_context());
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-REGISTER_OP_CUDA_KERNEL(take_along_axis, ops::TakeAlongAxisCUDAKernel<float>,
-                        ops::TakeAlongAxisCUDAKernel<double>,
-                        ops::TakeAlongAxisCUDAKernel<int64_t>,
-                        ops::TakeAlongAxisCUDAKernel<int>,
-                        ops::TakeAlongAxisCUDAKernel<plat::float16>);
-REGISTER_OP_CUDA_KERNEL(take_along_axis_grad,
-                        ops::TakeAlongAxisGradOpCUDAKernel<float>,
-                        ops::TakeAlongAxisGradOpCUDAKernel<double>,
-                        ops::TakeAlongAxisGradOpCUDAKernel<int64_t>,
-                        ops::TakeAlongAxisGradOpCUDAKernel<int>,
-                        ops::TakeAlongAxisGradOpCUDAKernel<plat::float16>);
diff --git a/paddle/fluid/operators/take_along_axis_op.h b/paddle/fluid/operators/take_along_axis_op.h
deleted file mode 100644
index fc781dbddf2ad25de3728e76d231d0164d46c08e..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/take_along_axis_op.h
+++ /dev/null
@@ -1,92 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <memory>
-#include <string>
-#include <vector>
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/gather_scatter_kernel.h"
-#include "paddle/phi/kernels/funcs/math_function.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-
-template <typename T>
-class TakeAlongAxisOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    PADDLE_ENFORCE_EQ(
-        platform::is_cpu_place(ctx.GetPlace()), true,
-        platform::errors::PreconditionNotMet("This kernel only runs on CPU."));
-
-    auto input = ctx.Input<Tensor>("Input");
-    auto axis = ctx.Attr<int>("Axis");
-    auto index = ctx.Input<Tensor>("Index");
-    auto result = ctx.Output<Tensor>("Result");
-    result->Resize(index->dims());
-    result->mutable_data<T>(ctx.GetPlace());
-
-    const auto &index_type = framework::TransToProtoVarType(index->dtype());
-    if (index_type == framework::proto::VarType::INT32) {
-      cpu_gather_kernel<T, int32_t>(*input, axis, *index, *result,
-                                    ctx.device_context());
-    } else if (index_type == framework::proto::VarType::INT64) {
-      cpu_gather_kernel<T, int64_t>(*input, axis, *index, *result,
-                                    ctx.device_context());
-    }
-  }
-};
-
-template <typename T>
-class TakeAlongAxisGradOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    PADDLE_ENFORCE_EQ(
-        platform::is_cpu_place(ctx.GetPlace()), true,
-        platform::errors::PreconditionNotMet("This kernel only runs on CPU."));
-
-    auto input_grad = ctx.Output<Tensor>(framework::GradVarName("Input"));
-    auto index = ctx.Input<Tensor>("Index");
-    auto result_grad = ctx.Input<Tensor>(framework::GradVarName("Result"));
-    auto axis = ctx.Attr<int>("Axis");
-    // We need to know the shape of input matrix to determine the shape of grad
-    // matrix of input.
-    auto input = ctx.Input<Tensor>("Input");
-    input_grad->Resize(input->dims());
-    input_grad->mutable_data<T>(ctx.GetPlace());
-
-    // Set to zero tensor.
-    auto &dev_ctx = ctx.template device_context<platform::CPUDeviceContext>();
-    phi::funcs::SetConstant<platform::CPUDeviceContext, T> functor;
-    functor(reinterpret_cast<const platform::CPUDeviceContext &>(dev_ctx),
-            input_grad, static_cast<T>(0));
-
-    const auto &index_type = framework::TransToProtoVarType(index->dtype());
-    if (index_type == framework::proto::VarType::INT32) {
-      cpu_scatter_add_kernel<T, int32_t>(
-          *input_grad, axis, *index, *result_grad,
-          ctx.device_context());  // the gradient of gather is scatter
-    } else if (index_type == framework::proto::VarType::INT64) {
-      cpu_scatter_add_kernel<T, int64_t>(*input_grad, axis, *index,
-                                         *result_grad, ctx.device_context());
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
index e05b4de65214c8cf55d099fccc7c18370b2312b7..0a71875d8931ef80846aa7e0c95ce1beab86fd7c 100644
--- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
+++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
@@ -79,6 +79,28 @@ static void RuntimeStaticShapeCheck(std::vector<int64_t> runtime_input_shape,
           model_input_shape_str, runtime_input_shape_str));
 }
 
+static paddle::experimental::DataType TRT2FluidDataType(
+    nvinfer1::DataType type) {
+  switch (type) {
+    case nvinfer1::DataType::kFLOAT:
+      return paddle::experimental::DataType::FLOAT32;
+    case nvinfer1::DataType::kINT32:
+      return paddle::experimental::DataType::INT32;
+    case nvinfer1::DataType::kHALF:
+      return paddle::experimental::DataType::FLOAT16;
+    case nvinfer1::DataType::kINT8:
+      return paddle::experimental::DataType::INT8;
+#if IS_TRT_VERSION_GE(7000)
+    case nvinfer1::DataType::kBOOL:
+      return paddle::experimental::DataType::BOOL;
+#endif
+    default:
+      PADDLE_THROW(platform::errors::InvalidArgument(
+          "unknown fluid datatype in Fluid op converter"));
+      return paddle::experimental::DataType::FLOAT32;
+  }
+}
+
 static void RuntimeDynamicShapeCheck(
     const std::string &x, const std::vector<int32_t> &runtime_input_shape,
     const std::vector<int32_t> &min_input_shape,
@@ -520,9 +542,12 @@ class TensorRTEngineOp : public framework::OperatorBase {
         buffers[bind_index] = static_cast<void *>(t.data<int64_t>());
       } else if (type == framework::proto::VarType::INT32) {
         buffers[bind_index] = static_cast<void *>(t.data<int32_t>());
+      } else if (type == framework::proto::VarType::FP16) {
+        buffers[bind_index] = static_cast<void *>(t.data<float16>());
       } else {
-        PADDLE_THROW(platform::errors::Fatal(
-            "The TRT Engine OP only support float/int32_t/int64_t input."));
+        PADDLE_THROW(
+            platform::errors::Fatal("The TRT Engine OP only support "
+                                    "float/int32_t/int64_t/float16 input."));
       }
     }
 
@@ -570,9 +595,10 @@ class TensorRTEngineOp : public framework::OperatorBase {
                             "than the number of bindings, but got binding "
                             "index = %d, number of bindings = %d.",
                             bind_index, num_bindings));
-      buffers[bind_index] =
-          static_cast<void *>(fluid_t->mutable_data<float>(dev_place));
-
+      auto trt_type = engine->engine()->getBindingDataType(bind_index);
+      // get adr and set type
+      buffers[bind_index] = static_cast<void *>(
+          fluid_t->mutable_data(dev_place, TRT2FluidDataType(trt_type)));
       output_index += 1;
     }
 
diff --git a/paddle/fluid/operators/test_common_infer_shape_functions.cc b/paddle/fluid/operators/test_common_infer_shape_functions.cc
index a7c7e33f58af6ce8f59a301d1fc5ccdf511b608f..1de1b590a1311b81f16ba05e746402e1fc14c556 100644
--- a/paddle/fluid/operators/test_common_infer_shape_functions.cc
+++ b/paddle/fluid/operators/test_common_infer_shape_functions.cc
@@ -20,7 +20,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/common_infer_shape_functions.h"
 #include "paddle/phi/core/ddim.h"
 
-USE_OP(relu);
+USE_OP_ITSELF(relu);
 USE_OP_ITSELF(elementwise_add);
 USE_OP_ITSELF(softmax);
 
diff --git a/paddle/fluid/operators/tile_op.cc b/paddle/fluid/operators/tile_op.cc
index dc12f8e8892a022c6f55f4fe3a6237a7a01fa290..e179149c5bb77bd642f744be48109a941c66febf 100644
--- a/paddle/fluid/operators/tile_op.cc
+++ b/paddle/fluid/operators/tile_op.cc
@@ -12,11 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/tile_op.h"
 #include <memory>
 #include <string>
 #include <vector>
 
+#include "paddle/fluid/framework/infershape_utils.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/phi/core/infermeta_utils.h"
+#include "paddle/phi/infermeta/unary.h"
+
 namespace paddle {
 namespace operators {
 
@@ -26,66 +30,6 @@ class TileOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
- protected:
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Tile");
-    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Tile");
-    auto x_dims = ctx->GetInputDim("X");
-    auto repeat_times = ctx->Attrs().Get<std::vector<int>>("repeat_times");
-    if (repeat_times.size() == 0) {
-      repeat_times = std::vector<int>(x_dims.size(), -1);
-    }
-
-    PADDLE_ENFORCE_LE(
-        x_dims.size(), MAX_RANK_SUPPORTED,
-        platform::errors::InvalidArgument(
-            "The rank of the input 'x' for tile op "
-            "must not be greater than %d, but the value received is %d.",
-            MAX_RANK_SUPPORTED, x_dims.size()));
-    PADDLE_ENFORCE_LE(
-        repeat_times.size(), MAX_RANK_SUPPORTED,
-        platform::errors::InvalidArgument(
-            "The size of the shape of input 'repeat_times' for tile op "
-            "must not be greater than %d, but the value received is %d.",
-            MAX_RANK_SUPPORTED, repeat_times.size()));
-    PADDLE_ENFORCE_GE(
-        repeat_times.size(), 1,
-        platform::errors::InvalidArgument(
-            "The size of the shape of input 'repeat_times' for tile op "
-            "must be positive integers, but the value received is %d.",
-            repeat_times.size()));
-
-    auto out_rank =
-        std::max(static_cast<size_t>(x_dims.size()), repeat_times.size());
-    std::vector<int64_t> out_shape(out_rank);
-    auto x_dim_vec = phi::vectorize<int>(x_dims);
-    if (x_dim_vec.size() > repeat_times.size()) {
-      auto diff = x_dim_vec.size() - repeat_times.size();
-      repeat_times.insert(repeat_times.begin(), diff, -1);
-    } else {
-      auto diff = repeat_times.size() - x_dim_vec.size();
-      x_dim_vec.insert(x_dim_vec.begin(), diff, -1);
-    }
-    for (size_t i = 0; i < repeat_times.size(); ++i) {
-      if (x_dim_vec[i] == -1 || repeat_times[i] == -1) {
-        out_shape[i] = -1;
-      } else {
-        PADDLE_ENFORCE_GT(
-            repeat_times[i], 0,
-            platform::errors::InvalidArgument(
-                "Every element of the input 'repeat_times' for tile op must be "
-                "greater than 0, but the value given is %d.",
-                repeat_times[i]));
-        out_shape[i] = x_dim_vec[i] * repeat_times[i];
-      }
-    }
-
-    ctx->SetOutputDim("Out", phi::make_ddim(out_shape));
-    if (out_shape[0] == x_dims[0]) {
-      ctx->ShareLoD("X", "Out");
-    }
-  }
-
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
@@ -268,38 +212,15 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(TileGradNoNeedBufVarsInferer, "X");
 }  // namespace paddle
 
 namespace ops = paddle::operators;
+
+DECLARE_INFER_SHAPE_FUNCTOR(tile, TileInferMetaFunctor,
+                            PD_INFER_META(phi::TileInferMeta));
+
 REGISTER_OPERATOR(tile, ops::TileOp, ops::TileOpMaker,
                   ops::TileGradOpMaker<paddle::framework::OpDesc>,
-                  ops::TileGradOpMaker<paddle::imperative::OpBase>);
+                  ops::TileGradOpMaker<paddle::imperative::OpBase>,
+                  TileInferMetaFunctor);
 REGISTER_OPERATOR(tile_grad, ops::TileGradOp,
                   ops::TileDoubleGradOpMaker<paddle::framework::OpDesc>,
                   ops::TileDoubleGradOpMaker<paddle::imperative::OpBase>,
                   ops::TileGradNoNeedBufVarsInferer);
-REGISTER_OP_CPU_KERNEL(
-    tile, ops::TileKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::TileKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::TileKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::TileKernel<paddle::platform::CPUDeviceContext, int64_t>,
-    ops::TileKernel<paddle::platform::CPUDeviceContext, bool>);
-REGISTER_OP_CPU_KERNEL(
-    tile_grad, ops::TileGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::TileGradKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::TileGradKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::TileGradKernel<paddle::platform::CPUDeviceContext, int64_t>);
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-REGISTER_OP_CUDA_KERNEL(
-    tile, ops::TileKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::TileKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::TileKernel<paddle::platform::CUDADeviceContext,
-                    paddle::platform::float16>,
-    ops::TileKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::TileKernel<paddle::platform::CUDADeviceContext, int64_t>,
-    ops::TileKernel<paddle::platform::CUDADeviceContext, bool>);
-REGISTER_OP_CUDA_KERNEL(
-    tile_grad, ops::TileGradKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::TileGradKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::TileGradKernel<paddle::platform::CUDADeviceContext,
-                        paddle::platform::float16>,
-    ops::TileGradKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::TileGradKernel<paddle::platform::CUDADeviceContext, int64_t>);
-#endif
diff --git a/paddle/fluid/operators/tile_op.h b/paddle/fluid/operators/tile_op.h
deleted file mode 100644
index 1698b5e3c6322e2cd9cbe7cf4839e2fc08627b32..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/tile_op.h
+++ /dev/null
@@ -1,306 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <algorithm>
-#include <vector>
-
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/operators/eigen/eigen_function.h"
-
-#define MAX_RANK_SUPPORTED 6
-
-namespace paddle {
-namespace operators {
-inline std::vector<int> get_repeat_times(
-    const framework::ExecutionContext& ctx) {
-  if (ctx.HasInput("RepeatTimes")) {
-    auto* repeat_tensor = ctx.Input<framework::LoDTensor>("RepeatTimes");
-    auto* repeat_data = repeat_tensor->data<int>();
-    framework::Tensor cpu_repeat_tensor;
-    if (platform::is_gpu_place(repeat_tensor->place()) ||
-        platform::is_xpu_place(repeat_tensor->place()) ||
-        platform::is_npu_place(repeat_tensor->place())) {
-      paddle::framework::TensorCopySync(*repeat_tensor, platform::CPUPlace(),
-                                        &cpu_repeat_tensor);
-      repeat_data = cpu_repeat_tensor.data<int>();
-    }
-    auto vec_repeat_times =
-        std::vector<int>(repeat_data, repeat_data + repeat_tensor->numel());
-    return vec_repeat_times;
-  }
-
-  auto list_repeat_times_tensor =
-      ctx.MultiInput<framework::Tensor>("repeat_times_tensor");
-  if (list_repeat_times_tensor.size() > 0) {
-    // get tensor from
-    std::vector<int> vec_repeat_times;
-    for (size_t i = 0; i < list_repeat_times_tensor.size(); ++i) {
-      auto tensor = list_repeat_times_tensor[i];
-      if (platform::is_gpu_place(tensor->place()) ||
-          platform::is_xpu_place(tensor->place()) ||
-          platform::is_npu_place(tensor->place())) {
-        framework::Tensor temp;
-        paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(), &temp);
-        vec_repeat_times.push_back(*temp.data<int32_t>());
-      } else {
-        vec_repeat_times.push_back(*tensor->data<int32_t>());
-      }
-    }
-    return vec_repeat_times;
-  } else {
-    return ctx.Attr<std::vector<int>>("repeat_times");
-  }
-}
-
-using Tensor = framework::Tensor;
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
-template <typename T, size_t D, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;
-using framework::To32BitIndex;
-
-template <typename DeviceContext, typename T>
-class TileKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto rank = context.Input<Tensor>("X")->dims().size();
-    PADDLE_ENFORCE_GE(
-        rank, 1, platform::errors::InvalidArgument(
-                     "The rank of the input 'x' for tile op must be a positive "
-                     "integer, but the value received is %d.",
-                     rank));
-    PADDLE_ENFORCE_LE(
-        rank, MAX_RANK_SUPPORTED,
-        platform::errors::InvalidArgument(
-            "The rank of the input 'x' for tile op "
-            "must be less than or equal to %d, but the value received is %d.",
-            MAX_RANK_SUPPORTED, rank));
-    auto repeat_times = get_repeat_times(context);
-    int repeat_times_size = repeat_times.size();
-    PADDLE_ENFORCE_GE(
-        repeat_times_size, 1,
-        platform::errors::InvalidArgument(
-            "The number of elements of the input 'repeat_times' for tile "
-            "op must be positive, but the value received is %d.",
-            repeat_times_size));
-    PADDLE_ENFORCE_LE(
-        repeat_times_size, MAX_RANK_SUPPORTED,
-        platform::errors::InvalidArgument(
-            "The number of elements of the input 'repeat_times' for tile op "
-            "must be less than or equal to %d, but the value received is %d.",
-            MAX_RANK_SUPPORTED, repeat_times_size));
-    rank = std::max(rank, repeat_times_size);
-    switch (rank) {
-      case 1:
-        Tile<1>(context);
-        break;
-      case 2:
-        Tile<2>(context);
-        break;
-      case 3:
-        Tile<3>(context);
-        break;
-      case 4:
-        Tile<4>(context);
-        break;
-      case 5:
-        Tile<5>(context);
-        break;
-      case 6:
-        Tile<6>(context);
-        break;
-    }
-  }
-
- protected:
-  template <int Rank>
-  void Tile(const framework::ExecutionContext& context) const {
-    auto* in0 = context.Input<Tensor>("X");
-
-    auto in_dims = in0->dims();
-    auto repeat_times = get_repeat_times(context);
-    for (size_t i = 0; i < repeat_times.size(); ++i) {
-      PADDLE_ENFORCE_GT(
-          repeat_times[i], 0,
-          platform::errors::InvalidArgument(
-              "All elements of the input 'repeat_times' for tile op must "
-              "be positive integers, but the value received is %d.",
-              repeat_times[i]));
-    }
-    auto vec_in_dims = phi::vectorize<int>(in_dims);
-    if (repeat_times.size() < vec_in_dims.size()) {
-      int diff = vec_in_dims.size() - repeat_times.size();
-      repeat_times.insert(repeat_times.begin(), diff, 1);
-    } else {
-      int diff = repeat_times.size() - vec_in_dims.size();
-      vec_in_dims.insert(vec_in_dims.begin(), diff, 1);
-    }
-    PADDLE_ENFORCE_EQ(
-        repeat_times.size(), vec_in_dims.size(),
-        platform::errors::InvalidArgument(
-            "The rank (%d) of the input 'x' and the rank (%d) of the input "
-            "'repeat_times' for tile op must match after promotion.",
-            vec_in_dims.size(), repeat_times.size()));
-    auto* out0 = context.Output<Tensor>("Out");
-    Eigen::DSizes<Eigen::DenseIndex, Rank> bcast_dims;
-    for (size_t i = 0; i < repeat_times.size(); ++i) {
-      bcast_dims[i] = repeat_times[i];
-    }
-
-    framework::DDim new_in_dims = phi::make_ddim(vec_in_dims);
-    framework::DDim out_dims(new_in_dims);
-    for (size_t i = 0; i < repeat_times.size(); ++i) {
-      out_dims[i] *= repeat_times[i];
-    }
-
-    out0->Resize(out_dims);
-    auto x = EigenTensor<T, Rank>::From(*in0, new_in_dims);
-    out0->mutable_data<T>(context.GetPlace());
-    auto y = EigenTensor<T, Rank>::From(*out0, out_dims);
-    auto& place =
-        *context.template device_context<DeviceContext>().eigen_device();
-    // use 32-bit index to speed up
-    bool use_32bit_index = y.size() < Eigen::NumTraits<int>::highest();
-    if (use_32bit_index) {
-      EigenBroadcast<std::decay_t<decltype(place)>, T, Rank>::Eval(
-          place, To32BitIndex(y), To32BitIndex(x), bcast_dims);
-    } else {
-      EigenBroadcast<std::decay_t<decltype(place)>, T, Rank>::Eval(place, y, x,
-                                                                   bcast_dims);
-    }
-  }
-};
-
-template <typename DeviceContext, typename T>
-class TileGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* x = context.Input<Tensor>("X");
-    auto repeat_times = get_repeat_times(context);
-    auto x_dims = x->dims();
-    auto vec_in_dims = phi::vectorize<int>(x_dims);
-    if (repeat_times.size() < vec_in_dims.size()) {
-      int diff = vec_in_dims.size() - repeat_times.size();
-      repeat_times.insert(repeat_times.begin(), diff, 1);
-    } else {
-      int diff = repeat_times.size() - vec_in_dims.size();
-      vec_in_dims.insert(vec_in_dims.begin(), diff, 1);
-    }
-    // 1. reshape_dims_vec is the broadcast parameter.
-    // 2. reduce_dims_vec is the dimension parameter to compute gradients. For
-    //    each dimension expanded, the gradients should be summed to original
-    //    size.
-    std::vector<int> reshape_dims_vec;
-    std::vector<int> reduce_dims_vec;
-    for (size_t i = 0; i < repeat_times.size(); ++i) {
-      reduce_dims_vec.push_back(reshape_dims_vec.size());
-      reshape_dims_vec.push_back(repeat_times[i]);
-      reshape_dims_vec.push_back(vec_in_dims[i]);
-    }
-
-    int dims = reduce_dims_vec.size();
-
-    bool just_copy = true;
-    for (size_t i = 0; i < repeat_times.size(); i++) {
-      if (repeat_times[i] != 1) {
-        just_copy = false;
-        break;
-      }
-    }
-    // no need reduce, just copy
-    if (just_copy) {
-      auto* dout = context.Input<Tensor>(framework::GradVarName("Out"));
-      auto* dx = context.Output<Tensor>(framework::GradVarName("X"));
-      dx->mutable_data<T>(context.GetPlace());
-      framework::TensorCopy(*dout, context.GetPlace(), context.device_context(),
-                            dx);
-      // TensorCopy may change the dims of dx
-      dx->Resize(x_dims);
-    } else {
-      PADDLE_ENFORCE_GE(dims, 1,
-                        platform::errors::InvalidArgument(
-                            "Th rank of the input 'Out@GRAD' for tile_grad op "
-                            " must be greater than or equal to 1, but "
-                            "the value received is %d.",
-                            dims));
-      PADDLE_ENFORCE_LE(dims, MAX_RANK_SUPPORTED,
-                        platform::errors::InvalidArgument(
-                            "The rank of the input 'Out@GRAD' for tile_grad op "
-                            "must be less than or equal "
-                            "to %d, but the value received is %d.",
-                            MAX_RANK_SUPPORTED, dims));
-      switch (dims) {
-        case 1:
-          TileBackward<1>(context, reshape_dims_vec, reduce_dims_vec);
-          break;
-        case 2:
-          TileBackward<2>(context, reshape_dims_vec, reduce_dims_vec);
-          break;
-        case 3:
-          TileBackward<3>(context, reshape_dims_vec, reduce_dims_vec);
-          break;
-        case 4:
-          TileBackward<4>(context, reshape_dims_vec, reduce_dims_vec);
-          break;
-        case 5:
-          TileBackward<5>(context, reshape_dims_vec, reduce_dims_vec);
-          break;
-        case 6:
-          TileBackward<6>(context, reshape_dims_vec, reduce_dims_vec);
-          break;
-        default:
-          PADDLE_THROW(platform::errors::InvalidArgument(
-              "Only support tensor with rank being between 1 and 6. But "
-              "received tensor's rank = %d.",
-              dims));
-      }
-    }
-  }
-
- protected:
-  template <int Dims>
-  void TileBackward(const framework::ExecutionContext& context,
-                    const std::vector<int>& reshape_dims_vec,
-                    const std::vector<int>& reduce_dims_vec) const {
-    size_t reshape_size = reshape_dims_vec.size();
-    size_t reduce_size = reduce_dims_vec.size();
-    auto* in0 = context.Input<Tensor>(framework::GradVarName("Out"));
-    auto* out0 = context.Output<Tensor>(framework::GradVarName("X"));
-    out0->mutable_data<T>(context.GetPlace());
-    auto x_grad = EigenVector<T>::Flatten(*out0);
-    Eigen::DSizes<Eigen::DenseIndex, Dims * 2> reshape_dims;
-    for (size_t i = 0; i < reshape_size; ++i) {
-      reshape_dims[i] = reshape_dims_vec[i];
-    }
-    Eigen::DSizes<Eigen::DenseIndex, Dims> reduce_dims;
-    for (size_t i = 0; i < reduce_size; ++i) {
-      reduce_dims[i] = reduce_dims_vec[i];
-    }
-
-    auto out_grad = EigenVector<T>::Flatten(*in0);
-    auto& place =
-        *context.template device_context<DeviceContext>().eigen_device();
-    EigenBroadcastGrad<std::decay_t<decltype(place)>, T, Dims>::Eval(
-        place, x_grad, out_grad, reduce_dims, reshape_dims);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/tile_op_functor.h b/paddle/fluid/operators/tile_op_functor.h
new file mode 100644
index 0000000000000000000000000000000000000000..95bfb9f4e1a9d374c66997567f5d80df8b5d8701
--- /dev/null
+++ b/paddle/fluid/operators/tile_op_functor.h
@@ -0,0 +1,67 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <vector>
+
+#include "paddle/fluid/framework/operator.h"
+
+#define MAX_RANK_SUPPORTED 6
+
+namespace paddle {
+namespace operators {
+
+inline std::vector<int> get_repeat_times(
+    const framework::ExecutionContext& ctx) {
+  if (ctx.HasInput("RepeatTimes")) {
+    auto* repeat_tensor = ctx.Input<framework::LoDTensor>("RepeatTimes");
+    auto* repeat_data = repeat_tensor->data<int>();
+    framework::Tensor cpu_repeat_tensor;
+    if (platform::is_gpu_place(repeat_tensor->place()) ||
+        platform::is_xpu_place(repeat_tensor->place()) ||
+        platform::is_npu_place(repeat_tensor->place())) {
+      paddle::framework::TensorCopySync(*repeat_tensor, platform::CPUPlace(),
+                                        &cpu_repeat_tensor);
+      repeat_data = cpu_repeat_tensor.data<int>();
+    }
+    auto vec_repeat_times =
+        std::vector<int>(repeat_data, repeat_data + repeat_tensor->numel());
+    return vec_repeat_times;
+  }
+
+  auto list_repeat_times_tensor =
+      ctx.MultiInput<framework::Tensor>("repeat_times_tensor");
+  if (list_repeat_times_tensor.size() > 0) {
+    // get tensor from
+    std::vector<int> vec_repeat_times;
+    for (size_t i = 0; i < list_repeat_times_tensor.size(); ++i) {
+      auto tensor = list_repeat_times_tensor[i];
+      if (platform::is_gpu_place(tensor->place()) ||
+          platform::is_xpu_place(tensor->place()) ||
+          platform::is_npu_place(tensor->place())) {
+        framework::Tensor temp;
+        paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(), &temp);
+        vec_repeat_times.push_back(*temp.data<int32_t>());
+      } else {
+        vec_repeat_times.push_back(*tensor->data<int32_t>());
+      }
+    }
+    return vec_repeat_times;
+  } else {
+    return ctx.Attr<std::vector<int>>("repeat_times");
+  }
+}
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/tile_op_npu.cc b/paddle/fluid/operators/tile_op_npu.cc
index 9e306c7be537bc7403812f4907541e1a9671c12a..cea6b458aec782923722cb37fe41c1c4d59292e5 100644
--- a/paddle/fluid/operators/tile_op_npu.cc
+++ b/paddle/fluid/operators/tile_op_npu.cc
@@ -11,7 +11,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/tile_op.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/tile_op_functor.h"
 #include "paddle/fluid/platform/device/npu/npu_op_runner.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/tile_op_xpu.cc b/paddle/fluid/operators/tile_op_xpu.cc
index 6b60b167a2465fcb03d8ec088cfa288f9fb14af1..598377587d6f73e0c21abbc4d3819d16eacb1f23 100644
--- a/paddle/fluid/operators/tile_op_xpu.cc
+++ b/paddle/fluid/operators/tile_op_xpu.cc
@@ -11,11 +11,14 @@ limitations under the License. */
 
 #ifdef PADDLE_WITH_XPU
 
-#include "paddle/fluid/operators/tile_op.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/tile_op_functor.h"
 
 namespace paddle {
 namespace operators {
 
+using Tensor = framework::Tensor;
+
 template <typename T>
 class TileXPUKernel : public framework::OpKernel<T> {
  public:
diff --git a/paddle/fluid/operators/top_k_function_cuda.h b/paddle/fluid/operators/top_k_function_cuda.h
index d60976928e00cb5ecfde6ca65e0a1b0d5b1ef938..80c9935057cb5d5809fde545bdd0772afdaf2702 100644
--- a/paddle/fluid/operators/top_k_function_cuda.h
+++ b/paddle/fluid/operators/top_k_function_cuda.h
@@ -51,6 +51,19 @@ namespace operators {
 
 using Tensor = framework::Tensor;
 
+inline void GetDims(const phi::DDim& dim, int axis, int* pre, int* n,
+                    int* post) {
+  *pre = 1;
+  *post = 1;
+  *n = dim[axis];
+  for (int i = 0; i < axis; ++i) {
+    (*pre) *= dim[i];
+  }
+  for (int i = axis + 1; i < dim.size(); ++i) {
+    (*post) *= dim[i];
+  }
+}
+
 struct SegmentOffsetIter {
   EIGEN_DEVICE_FUNC
   explicit SegmentOffsetIter(int num_cols) : num_cols_(num_cols) {}
diff --git a/paddle/fluid/operators/top_k_v2_op.cc b/paddle/fluid/operators/top_k_v2_op.cc
index 810afc901df57bfa3c518b2363fb9153ee353762..d1add111e1d24cb711955a9aff06eb19feb35dc9 100644
--- a/paddle/fluid/operators/top_k_v2_op.cc
+++ b/paddle/fluid/operators/top_k_v2_op.cc
@@ -12,9 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/top_k_v2_op.h"
 #include <memory>
 
+#include "paddle/fluid/framework/op_registry.h"
+
 namespace paddle {
 namespace operators {
 
@@ -173,15 +174,3 @@ REGISTER_OPERATOR(top_k_v2, ops::TopkV2Op, ops::TopkV2OpMaker,
                   ops::TopkV2GradOpMaker<paddle::imperative::OpBase>);
 
 REGISTER_OPERATOR(top_k_v2_grad, ops::TopkV2OpGrad);
-
-REGISTER_OP_CPU_KERNEL(top_k_v2,
-                       ops::TopkV2Kernel<paddle::platform::CPUPlace, float>,
-                       ops::TopkV2Kernel<paddle::platform::CPUPlace, double>,
-                       ops::TopkV2Kernel<paddle::platform::CPUPlace, int32_t>,
-                       ops::TopkV2Kernel<paddle::platform::CPUPlace, int64_t>)
-
-REGISTER_OP_CPU_KERNEL(
-    top_k_v2_grad, ops::TopkV2GradKernel<paddle::platform::CPUPlace, float>,
-    ops::TopkV2GradKernel<paddle::platform::CPUPlace, double>,
-    ops::TopkV2GradKernel<paddle::platform::CPUPlace, int32_t>,
-    ops::TopkV2GradKernel<paddle::platform::CPUPlace, int64_t>)
diff --git a/paddle/fluid/operators/top_k_v2_op.cu b/paddle/fluid/operators/top_k_v2_op.cu
deleted file mode 100644
index 84d8eef53bf72c5dbd5404a889925541374c9823..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/top_k_v2_op.cu
+++ /dev/null
@@ -1,296 +0,0 @@
-// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/top_k_function_cuda.h"
-#include "paddle/fluid/operators/top_k_v2_op.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-
-#define FIXED_BLOCK_DIM_BASE(dim, ...) \
-  case (dim): {                        \
-    constexpr auto kBlockDim = (dim);  \
-    __VA_ARGS__;                       \
-  } break
-
-#define FIXED_BLOCK_DIM(...)                \
-  FIXED_BLOCK_DIM_BASE(256, ##__VA_ARGS__); \
-  FIXED_BLOCK_DIM_BASE(128, ##__VA_ARGS__); \
-  FIXED_BLOCK_DIM_BASE(64, ##__VA_ARGS__);  \
-  FIXED_BLOCK_DIM_BASE(32, ##__VA_ARGS__)
-
-template <typename DeviceContext, typename T>
-class TopkV2OpCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    PADDLE_ENFORCE_EQ(
-        platform::is_gpu_place(ctx.GetPlace()), true,
-        platform::errors::InvalidArgument(
-            "It must use CUDAPlace, you must check your device set."));
-    auto* input = ctx.Input<Tensor>("X");
-    auto* output = ctx.Output<Tensor>("Out");
-    auto* indices = ctx.Output<Tensor>("Indices");
-
-    // get the attributes
-    int k = static_cast<int>(ctx.Attr<int>("k"));
-    int axis = static_cast<int>(ctx.Attr<int>("axis"));
-    const bool& sorted = static_cast<bool>(ctx.Attr<bool>("sorted"));
-    const bool& largest = static_cast<bool>(ctx.Attr<bool>("largest"));
-
-    // get the input dims
-    const auto& in_dims = input->dims();
-    // calcluate the real axis
-    if (axis < 0) axis += in_dims.size();
-
-    auto* k_t = ctx.Input<Tensor>("K");
-    if (k_t) {
-      Tensor k_host;
-      framework::TensorCopySync(*k_t, platform::CPUPlace(), &k_host);
-      k = k_host.data<int>()[0];
-      framework::DDim output_dims = output->dims();
-      output_dims[axis] = k;
-      output->Resize(output_dims);
-      indices->Resize(output_dims);
-    }
-
-    const auto& out_dims = output->dims();
-
-    const T* input_data = input->data<T>();
-    T* output_data = output->mutable_data<T>(ctx.GetPlace());
-    int64_t* indices_data = indices->mutable_data<int64_t>(ctx.GetPlace());
-
-    if (axis == in_dims.size() - 1) {
-      // if get the topK from the last axis
-      const int64_t& input_height =
-          phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1));
-      const int64_t& input_width = in_dims[in_dims.size() - 1];
-      const auto& dev_ctx = ctx.cuda_device_context();
-
-      if (k > input_width) k = input_width;
-
-      // The conclusion is drawn from the data through multiple sets of
-      // statistics
-      if (input_width >= 128 && k >= input_width * 0.75) {
-        if (SortTopk<T>(dev_ctx, input, input_width, input_height, k, output,
-                        indices, largest)) {
-          // Successed, return.
-          return;
-        } else {
-          LOG(INFO) << "TopKOP: Some errors happened when use cub sorting, use "
-                       "default topk kernel.";
-        }
-      }
-
-      // NOTE: pass lds and dim same to input width.
-      // NOTE: old matrix implementation of stride is different to eigen.
-      const int kMaxHeight = 2048;
-      int gridx = input_height < kMaxHeight ? input_height : kMaxHeight;
-      switch (GetDesiredBlockDim(input_width)) {
-#ifdef PADDLE_WITH_HIP
-        FIXED_BLOCK_DIM(
-            KeMatrixTopK<T, 20,
-                         kBlockDim><<<gridx, kBlockDim, 0, dev_ctx.stream()>>>(
-                output_data, k, indices_data, input_data, input_width,
-                input_width, static_cast<int>(k), gridx, input_height,
-                largest));
-#else
-        FIXED_BLOCK_DIM(
-            KeMatrixTopK<T, 5,
-                         kBlockDim><<<gridx, kBlockDim, 0, dev_ctx.stream()>>>(
-                output_data, k, indices_data, input_data, input_width,
-                input_width, static_cast<int>(k), gridx, input_height,
-                largest));
-#endif
-        default:
-          PADDLE_THROW(platform::errors::Fatal(
-              "the input data shape has error in the topk cuda kernel."));
-      }
-    } else {
-      // if get topK not from the last axis, will tranpose the tensor and get
-      // TopK
-
-      // first step, prepare the trans args for the tranpose
-      std::vector<int> trans;
-      for (int i = 0; i < axis; i++) {
-        trans.emplace_back(i);
-      }
-      trans.emplace_back(in_dims.size() - 1);
-      for (int i = axis + 1; i < in_dims.size() - 1; i++) {
-        trans.emplace_back(i);
-      }
-      trans.emplace_back(axis);
-
-      framework::DDim trans_dims(in_dims);
-      framework::DDim trans_out_dims(output->dims());
-      for (int i = 0; i < trans.size(); i++) {
-        trans_dims[i] = in_dims[trans[i]];
-        trans_out_dims[i] = out_dims[trans[i]];
-      }
-      // second step, tranpose the input
-      Tensor trans_input;
-      trans_input.mutable_data<T>(trans_dims, ctx.GetPlace());
-      int ndims = trans.size();
-      const auto& dev_ctx = ctx.cuda_device_context();
-      TransCompute<platform::CUDADeviceContext, T>(ndims, dev_ctx, *input,
-                                                   &trans_input, trans);
-      // third step, calcluate the topk
-      // allocate the tmp cuda memory for the tmp result
-      Tensor trans_ind;
-      trans_ind.mutable_data<int64_t>(trans_out_dims, ctx.GetPlace());
-      Tensor trans_out;
-      trans_out.mutable_data<T>(trans_out_dims, ctx.GetPlace());
-
-      const int64_t input_height =
-          phi::product(phi::slice_ddim(trans_dims, 0, trans_dims.size() - 1));
-      const int64_t input_width = trans_dims[trans_dims.size() - 1];
-
-      if (k > input_width) k = input_width;
-
-      // The conclusion is drawn from the data through multiple sets of
-      // statistics
-      if (input_width >= 128 && k >= input_width * 0.75) {
-        if (SortTopk<T>(dev_ctx, &trans_input, input_width, input_height, k,
-                        &trans_out, &trans_ind, largest)) {
-          // last step, tranpose back the indices and output
-          TransCompute<platform::CUDADeviceContext, int64_t>(
-              ndims, dev_ctx, trans_ind, indices, trans);
-          TransCompute<platform::CUDADeviceContext, T>(
-              ndims, dev_ctx, trans_out, output, trans);
-          return;
-        } else {
-          LOG(INFO) << "TopKOP: Some errors happened when use cub sorting, use "
-                       "default topk kernel.";
-        }
-      }
-
-      const int kMaxHeight = 2048;
-      int gridx = input_height < kMaxHeight ? input_height : kMaxHeight;
-      switch (GetDesiredBlockDim(input_width)) {
-#ifdef PADDLE_WITH_HIP
-        FIXED_BLOCK_DIM(
-            KeMatrixTopK<T, 20,
-                         kBlockDim><<<gridx, kBlockDim, 0, dev_ctx.stream()>>>(
-                trans_out.data<T>(), k, trans_ind.data<int64_t>(),
-                trans_input.data<T>(), input_width, input_width,
-                static_cast<int>(k), gridx, input_height, largest));
-#else
-        FIXED_BLOCK_DIM(
-            KeMatrixTopK<T, 5,
-                         kBlockDim><<<gridx, kBlockDim, 0, dev_ctx.stream()>>>(
-                trans_out.data<T>(), k, trans_ind.data<int64_t>(),
-                trans_input.data<T>(), input_width, input_width,
-                static_cast<int>(k), gridx, input_height, largest));
-#endif
-        default:
-          PADDLE_THROW(platform::errors::Fatal(
-              "the input data shape has error in the topk cuda kernel."));
-      }
-
-      // last step, tranpose back the indices and output
-      TransCompute<platform::CUDADeviceContext, int64_t>(
-          ndims, dev_ctx, trans_ind, indices, trans);
-      TransCompute<platform::CUDADeviceContext, T>(ndims, dev_ctx, trans_out,
-                                                   output, trans);
-    }
-  }
-};
-
-#undef FIXED_BLOCK_DIM_BASE
-#undef FIXED_BLOCK_DIM
-template <typename DeviceContext, typename T>
-class TopkV2OpGradCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    PADDLE_ENFORCE_EQ(
-        platform::is_gpu_place(context.GetPlace()), true,
-        platform::errors::InvalidArgument(
-            "It must use CUDAPlace, you must check your device set."));
-    auto* x = context.Input<Tensor>("X");
-    auto* out_grad = context.Input<Tensor>(framework::GradVarName("Out"));
-    auto* indices = context.Input<Tensor>("Indices");
-    auto* x_grad = context.Output<Tensor>(framework::GradVarName("X"));
-    int axis = context.Attr<int>("axis");
-
-    const auto& in_dims = x->dims();
-    const auto& out_dims = indices->dims();
-
-    // get the real the axis and the k
-    if (axis < 0) axis += in_dims.size();
-    const int& k = out_dims[axis];
-    const int& raw_height = in_dims[axis];
-
-    // allocate the cuda memory for the x_grad
-    T* x_grad_data = x_grad->mutable_data<T>(context.GetPlace());
-    const T* out_grad_data = out_grad->data<T>();
-    const int64_t* indices_data = indices->data<int64_t>();
-
-    int pre, n, post;
-    GetDims(in_dims, axis, &pre, &n, &post);
-
-    // calcluate the block and grid num
-    auto& dev_ctx = context.cuda_device_context();
-    auto ComputeBlockSize = [](int col) {
-      if (col > 512)
-        return 1024;
-      else if (col > 256 && col <= 512)
-        return 512;
-      else if (col > 128 && col <= 256)
-        return 256;
-      else if (col > 64 && col <= 128)
-        return 128;
-      else
-        return 64;
-    };
-    int block_size = ComputeBlockSize(post * k);
-    int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
-    const int max_blocks = std::max(((max_threads - 1) / block_size + 1), 1);
-    int grid_size = std::min(max_blocks, pre);
-
-    // lanuch the cuda kernel to assign the grad
-    AssignGradWithAxis<T><<<grid_size, block_size, 64 * 4, dev_ctx.stream()>>>(
-        out_grad_data, indices_data, x_grad_data, pre, post, n, k);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-REGISTER_OP_CUDA_KERNEL(
-    top_k_v2,
-    paddle::operators::TopkV2OpCUDAKernel<paddle::platform::CUDADeviceContext,
-                                          float>,
-    paddle::operators::TopkV2OpCUDAKernel<paddle::platform::CUDADeviceContext,
-                                          double>,
-    paddle::operators::TopkV2OpCUDAKernel<paddle::platform::CUDADeviceContext,
-                                          int>,
-    paddle::operators::TopkV2OpCUDAKernel<paddle::platform::CUDADeviceContext,
-                                          int64_t>,
-    paddle::operators::TopkV2OpCUDAKernel<paddle::platform::CUDADeviceContext,
-                                          paddle::platform::float16>);
-
-REGISTER_OP_CUDA_KERNEL(
-    top_k_v2_grad, paddle::operators::TopkV2OpGradCUDAKernel<
-                       paddle::platform::CUDADeviceContext, float>,
-    paddle::operators::TopkV2OpGradCUDAKernel<
-        paddle::platform::CUDADeviceContext, double>,
-    paddle::operators::TopkV2OpGradCUDAKernel<
-        paddle::platform::CUDADeviceContext, int>,
-    paddle::operators::TopkV2OpGradCUDAKernel<
-        paddle::platform::CUDADeviceContext, int64_t>,
-    paddle::operators::TopkV2OpGradCUDAKernel<
-        paddle::platform::CUDADeviceContext, paddle::platform::float16>);
diff --git a/paddle/fluid/operators/top_k_v2_op.h b/paddle/fluid/operators/top_k_v2_op.h
deleted file mode 100644
index a808207476f3b9be2636741d7b0ac06002ccba08..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/top_k_v2_op.h
+++ /dev/null
@@ -1,335 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-/*
-  The reason why we need the topk v2 is because the compatibility. We redefine
-  the NaN is maximum value
-  in the process of comparing. If do not add the topk v2,  will affect the
-  inference result of model that traing
-  by the older version paddlepaddle.
-*/
-
-#pragma once
-#include <algorithm>
-#include <iostream>
-#include <utility>
-#include <vector>
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/top_k_op.h"
-#include "paddle/fluid/operators/transpose_op.h"
-
-namespace paddle {
-namespace operators {
-
-inline void GetDims(const framework::DDim& dim, int axis, int* pre, int* n,
-                    int* post) {
-  *pre = 1;
-  *post = 1;
-  *n = dim[axis];
-  for (int i = 0; i < axis; ++i) {
-    (*pre) *= dim[i];
-  }
-  for (int i = axis + 1; i < dim.size(); ++i) {
-    (*post) *= dim[i];
-  }
-}
-
-template <typename T, typename Type>
-static void FullTopK(Type input_height, Type input_width, int input_dim,
-                     const framework::Tensor* input, T* t_out, Type* t_indices,
-                     const int& k, const bool& largest, const bool& sorted) {
-  // when the k is small, will the partial sort
-  bool partial_sort_flag = (k * 64) < input_width;
-
-#ifdef PADDLE_WITH_MKLML
-#pragma omp parallel for
-#endif
-  // Eigen::DSizes<int, 2> flat2dims(input_height, input_width);
-  for (Type i = 0; i < input_height; ++i) {
-    std::vector<std::pair<T, Type>> col_vec;
-    col_vec.reserve(input_width);
-    if (input_dim == 1) {
-      auto e_input = framework::EigenVector<T>::Flatten(*input);
-      for (Type j = 0; j < input_width; ++j) {
-        col_vec.emplace_back(std::pair<T, Type>(e_input(j), j));
-      }
-    } else {
-      auto e_input = framework::EigenMatrix<T>::Reshape(*input, input_dim - 1);
-      for (Type j = 0; j < input_width; ++j) {
-        col_vec.emplace_back(std::pair<T, Type>(e_input(i, j), j));
-      }
-    }
-    if (partial_sort_flag) {
-      std::partial_sort(
-          col_vec.begin(), col_vec.begin() + k, col_vec.end(),
-          [&largest](const std::pair<T, Type>& l, const std::pair<T, Type>& r) {
-            if (largest) {
-              return (std::isnan(static_cast<double>(l.first)) &&
-                      !std::isnan(static_cast<double>(r.first))) ||
-                     (l.first > r.first);
-            } else {
-              return (!std::isnan(static_cast<double>(l.first)) &&
-                      std::isnan(static_cast<double>(r.first))) ||
-                     (l.first < r.first);
-            }
-          });
-    } else {
-      // use the nth-element to get the K-larger or K-small element
-      if (largest) {
-        std::nth_element(
-            col_vec.begin(), col_vec.begin() + k - 1, col_vec.end(),
-            [](const std::pair<T, Type>& l, const std::pair<T, Type>& r) {
-              return (std::isnan(static_cast<double>(l.first)) &&
-                      !std::isnan(static_cast<double>(r.first))) ||
-                     (l.first > r.first);
-            });
-        // the nth-element will get the unorder elements, sort the element
-        if (sorted) {
-          std::sort(col_vec.begin(), col_vec.begin() + k - 1,
-                    [&largest](const std::pair<T, Type>& l,
-                               const std::pair<T, Type>& r) {
-                      return (std::isnan(static_cast<double>(l.first)) &&
-                              !std::isnan(static_cast<double>(r.first))) ||
-                             (l.first > r.first);
-                    });
-        }
-      } else {
-        std::nth_element(
-            col_vec.begin(), col_vec.begin() + k - 1, col_vec.end(),
-            [](const std::pair<T, Type>& l, const std::pair<T, Type>& r) {
-              return (!std::isnan(static_cast<double>(l.first)) &&
-                      std::isnan(static_cast<double>(r.first))) ||
-                     (l.first < r.first);
-            });
-        // the nth-element will get the unorder elements, sort the element
-        if (sorted) {
-          std::sort(
-              col_vec.begin(), col_vec.begin() + k - 1,
-              [](const std::pair<T, Type>& l, const std::pair<T, Type>& r) {
-                return (!std::isnan(static_cast<double>(l.first)) &&
-                        std::isnan(static_cast<double>(r.first))) ||
-                       (l.first < r.first);
-              });
-        }
-      }
-    }
-    for (Type j = 0; j < k; ++j) {
-      t_out[i * k + j] = col_vec[j].first;
-      t_indices[i * k + j] = col_vec[j].second;
-    }
-  }
-}
-
-template <typename T, typename Type>
-static void FullTopKAssign(const Type& input_height, const Type& input_width,
-                           const int& input_dim, const framework::Tensor* input,
-                           const framework::Tensor* indices, T* output_data,
-                           const int& k) {
-#ifdef PADDLE_WITH_MKLML
-#pragma omp parallel for
-#endif
-  for (Type i = 0; i < input_height; ++i) {
-    if (input_dim == 1) {
-      auto e_input = framework::EigenVector<T>::Flatten(*input);
-      auto e_indices = framework::EigenVector<Type>::Flatten(*indices);
-      for (Type j = 0; j < k; ++j) {
-        output_data[i * input_width + e_indices(j)] = e_input(j);
-      }
-    } else {
-      auto e_input = framework::EigenMatrix<T>::Reshape(*input, input_dim - 1);
-      auto e_indices =
-          framework::EigenMatrix<Type>::Reshape(*indices, input_dim - 1);
-      for (Type j = 0; j < k; ++j) {
-        output_data[i * input_width + e_indices(i, j)] = e_input(i, j);
-      }
-    }
-  }
-}
-
-template <typename DeviceContext, typename T>
-class TopkV2Kernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    // Get the top k elements of each row of input tensor
-    auto* input = context.Input<Tensor>("X");
-    auto* output = context.Output<Tensor>("Out");
-    auto* indices = context.Output<Tensor>("Indices");
-    const auto& in_dims = input->dims();
-    int k = static_cast<int>(context.Attr<int>("k"));
-    const auto& sorted = static_cast<bool>(context.Attr<bool>("sorted"));
-    const auto& largest = static_cast<bool>(context.Attr<bool>("largest"));
-
-    // axis < 0, cacluate the real axis
-    int axis = static_cast<int>(context.Attr<int>("axis"));
-    if (axis < 0) axis += in_dims.size();
-
-    // if K tensor is not null, will the use K tesnor as k
-    auto* k_t = context.Input<Tensor>("K");
-    if (k_t) {
-      k = k_t->data<int>()[0];
-      framework::DDim output_dims = output->dims();
-      // accroding to axis to set K value in the dim
-      output_dims[axis] = k;
-      output->Resize(output_dims);
-      indices->Resize(output_dims);
-    }
-
-    T* output_data = output->mutable_data<T>(context.GetPlace());
-    int64_t* indices_data = indices->mutable_data<int64_t>(context.GetPlace());
-    const auto& out_dims = output->dims();
-    if (axis + 1 == in_dims.size()) {
-      const int64_t& input_height =
-          phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1));
-      const int64_t& input_width = in_dims[in_dims.size() - 1];
-      FullTopK<T, int64_t>(input_height, input_width, in_dims.size(), input,
-                           output_data, indices_data, k, largest, sorted);
-    } else {
-      // if the topk dims is not last dim, will tranpose and do topk
-      std::vector<int> trans;
-      for (int i = 0; i < axis; i++) {
-        trans.emplace_back(i);
-      }
-      trans.push_back(in_dims.size() - 1);
-      for (int i = axis + 1; i < in_dims.size() - 1; i++) {
-        trans.emplace_back(i);
-      }
-      trans.emplace_back(axis);
-
-      // get the trans input_dims, out_dims
-      framework::DDim trans_dims(in_dims);
-      framework::DDim trans_out_dims(output->dims());
-      for (size_t i = 0; i < trans.size(); i++) {
-        trans_dims[i] = in_dims[trans[i]];
-      }
-      for (size_t i = 0; i < trans.size(); i++) {
-        trans_out_dims[i] = out_dims[trans[i]];
-      }
-
-      Tensor trans_inp;
-      trans_inp.mutable_data<T>(trans_dims, context.GetPlace());
-      int ndims = trans.size();
-      auto& dev_context =
-          context.template device_context<platform::CPUDeviceContext>();
-
-      // transpose the input value
-      TransCompute<platform::CPUDeviceContext, T>(ndims, dev_context, *input,
-                                                  &trans_inp, trans);
-
-      const int64_t input_height =
-          phi::product(phi::slice_ddim(trans_dims, 0, trans_dims.size() - 1));
-      const int64_t input_width = trans_dims[trans_dims.size() - 1];
-
-      // Allocate the temp tensor to the save the topk indices, values
-      Tensor tmp_out;
-      T* t_out = tmp_out.mutable_data<T>(trans_out_dims, context.GetPlace());
-      Tensor tmp_indices;
-      auto* t_ind =
-          tmp_indices.mutable_data<int64_t>(trans_out_dims, context.GetPlace());
-
-      // get the TopK value
-      FullTopK<T, int64_t>(input_height, input_width, in_dims.size(),
-                           &trans_inp, t_out, t_ind, k, largest, sorted);
-      // transpose back
-      TransCompute<platform::CPUDeviceContext, int64_t>(
-          ndims, dev_context, tmp_indices, indices, trans);
-      TransCompute<platform::CPUDeviceContext, T>(ndims, dev_context, tmp_out,
-                                                  output, trans);
-    }
-  }
-};
-
-template <typename DeviceContext, typename T>
-class TopkV2GradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* x = context.Input<Tensor>("X");
-    auto* out_grad = context.Input<Tensor>(framework::GradVarName("Out"));
-    auto* indices = context.Input<Tensor>("Indices");
-    auto* x_grad = context.Output<Tensor>(framework::GradVarName("X"));
-    int axis = static_cast<int>(context.Attr<int>("axis"));
-
-    const auto& in_dims = x->dims();
-    const auto& out_dims = indices->dims();
-
-    // axis < 0, get the real axis
-    axis = (axis < 0) ? (in_dims.size() + axis) : axis;
-    const size_t& k = out_dims[axis];
-
-    T* x_grad_data = x_grad->mutable_data<T>(context.GetPlace());
-    if (axis + 1 == in_dims.size()) {
-      // allocate the memory for the input_grad
-
-      // assign the out_grad to input_grad directly
-      const int64_t input_height =
-          phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1));
-      const int64_t input_width = in_dims[in_dims.size() - 1];
-
-      // init the output grad with 0, because some input elements has no grad
-      memset(x_grad_data, 0, x_grad->numel() * sizeof(T));
-      // Assign the output_grad to input_grad
-      FullTopKAssign(input_height, input_width, in_dims.size(), out_grad,
-                     indices, x_grad_data, k);
-    } else {
-      // can not assign grad to input_grad, must do the transpose
-      std::vector<int> trans;
-      for (int i = 0; i < axis; i++) {
-        trans.emplace_back(i);
-      }
-      trans.emplace_back(out_dims.size() - 1);
-      for (int i = axis + 1; i < out_dims.size() - 1; i++) {
-        trans.emplace_back(i);
-      }
-      trans.emplace_back(axis);
-      framework::DDim trans_dims(out_dims);
-      framework::DDim trans_in_dims(in_dims);
-      for (size_t i = 0; i < trans.size(); i++) {
-        trans_dims[i] = out_dims[trans[i]];
-        trans_in_dims[i] = in_dims[trans[i]];
-      }
-      // transpose the out_grad, indices
-      Tensor trans_dO;
-      trans_dO.mutable_data<T>(trans_dims, context.GetPlace());
-      Tensor trans_ind;
-      trans_ind.mutable_data<int64_t>(trans_dims, context.GetPlace());
-      int ndims = trans.size();
-      auto& dev_context =
-          context.template device_context<platform::CPUDeviceContext>();
-
-      // Do transpose
-      TransCompute<platform::CPUDeviceContext, T>(ndims, dev_context, *out_grad,
-                                                  &trans_dO, trans);
-      TransCompute<platform::CPUDeviceContext, int64_t>(
-          ndims, dev_context, *indices, &trans_ind, trans);
-      const int64_t input_height = phi::product(
-          phi::slice_ddim(trans_in_dims, 0, trans_in_dims.size() - 1));
-      const int64_t input_width = trans_in_dims[trans_in_dims.size() - 1];
-
-      // Assign the out_grad to tranpose input_grad
-      Tensor tmp_out;
-      T* t_out = tmp_out.mutable_data<T>(trans_in_dims, context.GetPlace());
-      memset(t_out, 0, x_grad->numel() * sizeof(T));
-
-      FullTopKAssign<T, int64_t>(input_height, input_width, in_dims.size(),
-                                 &trans_dO, &trans_ind, t_out, k);
-
-      // Transpose back
-      TransCompute<platform::CPUDeviceContext, T>(ndims, dev_context, tmp_out,
-                                                  x_grad, trans);
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/top_k_v2_op_mlu.cc b/paddle/fluid/operators/top_k_v2_op_mlu.cc
index 5b8a6b3e75449508afa5d316d81f97ab815c9ea9..caaae02124c926b9e4be08744e4192dab20ca5d0 100644
--- a/paddle/fluid/operators/top_k_v2_op_mlu.cc
+++ b/paddle/fluid/operators/top_k_v2_op_mlu.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/top_k_v2_op.h"
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/mlu/mlu_baseop.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/top_k_v2_op_npu.cc b/paddle/fluid/operators/top_k_v2_op_npu.cc
index e11070638834c46a6628d652216e1ddddeb2487d..dff5c2d3f39378486bb5d2f8010d005d57b20550 100644
--- a/paddle/fluid/operators/top_k_v2_op_npu.cc
+++ b/paddle/fluid/operators/top_k_v2_op_npu.cc
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/top_k_v2_op.h"
 #include <string>
 #include <vector>
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/device/npu/npu_op_runner.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/top_k_v2_op_xpu.cc b/paddle/fluid/operators/top_k_v2_op_xpu.cc
index 49daac2ff0da63c542a807dc97925c6989559f14..4d9c39be92eff029e66cdde900318b045c2b531f 100644
--- a/paddle/fluid/operators/top_k_v2_op_xpu.cc
+++ b/paddle/fluid/operators/top_k_v2_op_xpu.cc
@@ -16,7 +16,7 @@ limitations under the License. */
 
 #include <memory>
 
-#include "paddle/fluid/operators/top_k_op.h"
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/transpose_op.h"
 #include "xpu/refactor/math.h"
 
diff --git a/paddle/fluid/operators/trace_op.cc b/paddle/fluid/operators/trace_op.cc
index 63b914a31a86aef48e952a4877c7beb670075cc4..c6c0fa3c0019eac742a9c70ea53a438f5a474895 100644
--- a/paddle/fluid/operators/trace_op.cc
+++ b/paddle/fluid/operators/trace_op.cc
@@ -61,7 +61,7 @@ the 2-D planes specified by dim1 and dim2.
 )DOC");
   }
 };
-class TraceOpGrad : public framework::OperatorWithKernel {
+class TraceGradOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
@@ -107,14 +107,14 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(TraceGradNoNeedBufferVarsInferer, "Input");
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-DELCARE_INFER_SHAPE_FUNCTOR(trace, TraceInferShapeFunctor,
-                            PT_INFER_META(phi::TraceInferMeta));
+DECLARE_INFER_SHAPE_FUNCTOR(trace, TraceInferShapeFunctor,
+                            PD_INFER_META(phi::TraceInferMeta));
 REGISTER_OPERATOR(trace, ops::TraceOp, ops::TraceOpMaker,
                   ops::TraceGradOpMaker<paddle::framework::OpDesc>,
                   ops::TraceGradOpMaker<paddle::imperative::OpBase>,
                   TraceInferShapeFunctor);
 
-REGISTER_OPERATOR(trace_grad, ops::TraceOpGrad,
+REGISTER_OPERATOR(trace_grad, ops::TraceGradOp,
                   ops::TraceGradNoNeedBufferVarsInferer);
 
 /* ==========================  register checkpoint ===========================*/
diff --git a/paddle/fluid/operators/transpose_op_npu_test.cc b/paddle/fluid/operators/transpose_op_npu_test.cc
index 5617d728a51dc1c5e21053a2af05d062ecc1a22b..fb39034c8e92c1ac39aa1ca6e57d5a08ca1ca9d6 100644
--- a/paddle/fluid/operators/transpose_op_npu_test.cc
+++ b/paddle/fluid/operators/transpose_op_npu_test.cc
@@ -24,7 +24,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/operators/dropout_op.h"
 #include "paddle/fluid/string/printf.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
diff --git a/paddle/fluid/operators/triangular_solve_op.cc b/paddle/fluid/operators/triangular_solve_op.cc
index 9233917b0931b98d30b736ec9b69fd68c0604d18..35b925ca172b7ccb665978010dbcdd2cb10c9678 100644
--- a/paddle/fluid/operators/triangular_solve_op.cc
+++ b/paddle/fluid/operators/triangular_solve_op.cc
@@ -12,8 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/triangular_solve_op.h"
-#include "paddle/fluid/operators/solve_op.h"
+#include "paddle/fluid/framework/infershape_utils.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/phi/infermeta/binary.h"
 
 namespace paddle {
 namespace operators {
@@ -22,58 +23,6 @@ class TriangularSolveOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "TriangularSolve");
-    OP_INOUT_CHECK(ctx->HasInput("Y"), "Input", "Y", "TriangularSolve");
-    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "TriangularSolve");
-
-    auto x_dims = ctx->GetInputDim("X");
-    auto y_dims = ctx->GetInputDim("Y");
-
-    auto x_dims_n = x_dims.size();
-    auto y_dims_n = y_dims.size();
-
-    PADDLE_ENFORCE_GE(
-        x_dims_n, 2, platform::errors::InvalidArgument(
-                         "The input tensor X's dimensions of TriangularSolveOp "
-                         "should be >= 2. But received X's "
-                         "dimensions = %d, X's shape = [%s]",
-                         x_dims.size(), x_dims));
-
-    PADDLE_ENFORCE_GE(
-        y_dims_n, 2, platform::errors::InvalidArgument(
-                         "The input tensor Y's dimensions of TriangularSolveOp "
-                         "should be >=2. But received Y's "
-                         "dimensions = %d, Y's shape = [%s]",
-                         y_dims.size(), y_dims));
-
-    PADDLE_ENFORCE_EQ(x_dims[x_dims_n - 2], x_dims[x_dims_n - 1],
-                      platform::errors::InvalidArgument(
-                          "The inner-most 2 dimensions of Input(X) all should "
-                          "be square matrices "
-                          "But received X's shape[-2] = %d and shape[-1] = %d.",
-                          x_dims[x_dims_n - 2], x_dims[x_dims_n - 1]));
-
-    std::vector<int64_t> x_dims_vec = phi::vectorize(x_dims);
-    std::vector<int64_t> y_dims_vec = phi::vectorize(y_dims);
-
-    std::vector<int64_t> x_dims_vec_cut(x_dims_vec.begin(),
-                                        x_dims_vec.end() - 2);
-    std::vector<int64_t> y_dims_vec_cut(y_dims_vec.begin(),
-                                        y_dims_vec.end() - 2);
-
-    std::vector<int64_t> expand_batch_portion =
-        get_broadcast_batch_portion(x_dims_vec_cut, y_dims_vec_cut);
-
-    std::vector<int64_t> y_broadcast_dims({expand_batch_portion});
-    y_broadcast_dims.insert(y_broadcast_dims.end(), {y_dims_vec[y_dims_n - 2],
-                                                     y_dims_vec[y_dims_n - 1]});
-
-    // dim of 'Out' is the same with 'Y' after broadcast
-    ctx->SetOutputDim("Out", phi::make_ddim(y_broadcast_dims));
-    ctx->ShareLoD("X", /*->*/ "Out");
-  }
-
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const {
     return framework::OpKernelType(
@@ -168,20 +117,15 @@ class TriangularSolveOpGradMaker : public framework::SingleGradOpMaker<T> {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
+
+DECLARE_INFER_SHAPE_FUNCTOR(triangular_solve, TriangularSolveInferShapeFunctor,
+                            PD_INFER_META(phi::TriangularSolveInferMeta));
+
 REGISTER_OPERATOR(triangular_solve, ops::TriangularSolveOp,
                   ops::TriangularSolveOpMaker,
                   ops::TriangularSolveOpInferVarType,
                   ops::TriangularSolveOpGradMaker<paddle::framework::OpDesc>,
-                  ops::TriangularSolveOpGradMaker<paddle::imperative::OpBase>);
+                  ops::TriangularSolveOpGradMaker<paddle::imperative::OpBase>,
+                  TriangularSolveInferShapeFunctor);
 
 REGISTER_OPERATOR(triangular_solve_grad, ops::TriangularSolveGradOp);
-
-REGISTER_OP_CPU_KERNEL(
-    triangular_solve,
-    ops::TriangularSolveKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::TriangularSolveKernel<paddle::platform::CPUDeviceContext, double>);
-
-REGISTER_OP_CPU_KERNEL(
-    triangular_solve_grad,
-    ops::TriangularSolveGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::TriangularSolveGradKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/triangular_solve_op.cu b/paddle/fluid/operators/triangular_solve_op.cu
deleted file mode 100644
index 7df98517e8418905f0f8c8ce603762967a8b5f38..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/triangular_solve_op.cu
+++ /dev/null
@@ -1,65 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/reduce_ops/reduce_op.h"
-#include "paddle/fluid/operators/triangular_solve_op.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T>
-class MatrixReduceSumFunctor<platform::CUDADeviceContext, T> {
- public:
-  void operator()(const Tensor& in, Tensor* out,
-                  const framework::ExecutionContext& ctx) {
-    // For example: in's dim = [5, 3, 2, 7, 3] ; out's dim = [3, 1, 7, 3]
-    // out_reduce_dim should be [0, 2]
-    const std::vector<std::int64_t> in_dims = phi::vectorize(in.dims());
-    auto in_size = in_dims.size();
-    const std::vector<std::int64_t> out_dims = phi::vectorize(out->dims());
-    auto out_size = out_dims.size();
-
-    std::vector<std::int64_t> out_bst_dims(in_size);
-
-    std::fill(out_bst_dims.data(), out_bst_dims.data() + in_size - out_size, 1);
-    std::copy(out_dims.data(), out_dims.data() + out_size,
-              out_bst_dims.data() + in_size - out_size);
-
-    std::vector<int> out_reduce_dims;
-    for (size_t idx = 0; idx <= in_size - 3; idx++) {
-      if (in_dims[idx] != 1 && out_bst_dims[idx] == 1) {
-        out_reduce_dims.push_back(idx);
-      }
-    }
-    gpuStream_t stream = ctx.cuda_device_context().stream();
-    TensorReduceImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
-        ctx.cuda_device_context(), in, out, kps::IdentityFunctor<T>(),
-        out_reduce_dims, stream);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    triangular_solve,
-    ops::TriangularSolveKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::TriangularSolveKernel<paddle::platform::CUDADeviceContext, double>);
-
-REGISTER_OP_CUDA_KERNEL(
-    triangular_solve_grad,
-    ops::TriangularSolveGradKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::TriangularSolveGradKernel<paddle::platform::CUDADeviceContext,
-                                   double>);
diff --git a/paddle/fluid/operators/triangular_solve_op.h b/paddle/fluid/operators/triangular_solve_op.h
deleted file mode 100644
index 4e68add096ff28f5378b02689248c3957c1e8ae9..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/triangular_solve_op.h
+++ /dev/null
@@ -1,229 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include "glog/logging.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/framework/tensor_util.h"
-#include "paddle/fluid/operators/reduce_ops/reduce_op.h"
-#include "paddle/fluid/operators/solve_op.h"
-#include "paddle/fluid/operators/tril_triu_op.h"
-#include "paddle/phi/core/ddim.h"
-#include "paddle/phi/kernels/funcs/blas/blas.h"
-#include "paddle/phi/kernels/funcs/complex_functors.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-
-template <typename DeviceContext, typename T>
-static void triangular_solve(const DeviceContext& context, const Tensor& x,
-                             const Tensor& y, Tensor* out, bool upper,
-                             bool transpose, bool unitriangular) {
-  // Tensor broadcast use eigen
-  std::vector<int64_t> x_bst_dims_vec;
-  std::vector<int64_t> y_bst_dims_vec;
-  std::tie(x_bst_dims_vec, y_bst_dims_vec) = get_broadcast_dims(x, y);
-
-  Tensor x_bst(x.type());
-  TensorExpand<T, DeviceContext>(context, x, &x_bst, x_bst_dims_vec);
-
-  Tensor y_bst(y.type());
-  TensorExpand<T, DeviceContext>(context, y, &y_bst, y_bst_dims_vec);
-
-  // TriangularSolveFunctor performs calculations in-place
-  // x_clone should be a copy of 'x' after broadcast
-  // out should be a copy of 'y' after broadcast
-  Tensor x_clone(x.type());
-  x_clone.Resize(phi::make_ddim(x_bst_dims_vec));
-  x_clone.mutable_data<T>(context.GetPlace());
-  framework::TensorCopy(x_bst, context.GetPlace(), context, &x_clone);
-
-  out->Resize(phi::make_ddim(y_bst_dims_vec));
-  out->mutable_data<T>(context.GetPlace());
-  framework::TensorCopy(y_bst, context.GetPlace(), context, out);
-
-  math::TriangularSolveFunctor<DeviceContext, T> functor;
-  functor(context, &x_clone, out, /*left=*/true, upper, transpose,
-          unitriangular);
-}
-
-template <typename DeviceContext, typename T>
-class MatrixReduceSumFunctor {
- public:
-  void operator()(const Tensor& input, Tensor* output,
-                  const framework::ExecutionContext& ctx);
-};
-
-template <typename T>
-class MatrixReduceSumFunctor<platform::CPUDeviceContext, T> {
- public:
-  void operator()(const Tensor& in, Tensor* out,
-                  const framework::ExecutionContext& ctx) {
-    // For example: in's dim = [5, 3, 2, 7, 3] ; out's dim = [3, 1, 7, 3]
-    // out_reduce_dim should be [0, 2]
-    const std::vector<std::int64_t> in_dims = phi::vectorize(in.dims());
-    auto in_size = in_dims.size();
-    const std::vector<std::int64_t> out_dims = phi::vectorize(out->dims());
-    auto out_size = out_dims.size();
-
-    std::vector<std::int64_t> out_bst_dims(in_size);
-
-    std::fill(out_bst_dims.data(), out_bst_dims.data() + in_size - out_size, 1);
-    std::copy(out_dims.data(), out_dims.data() + out_size,
-              out_bst_dims.data() + in_size - out_size);
-    out->Resize(phi::make_ddim(out_bst_dims));
-
-    std::vector<int> out_reduce_dims;
-    for (size_t idx = 0; idx <= in_size - 3; idx++) {
-      if (in_dims[idx] != 1 && out_bst_dims[idx] == 1) {
-        out_reduce_dims.push_back(idx);
-      }
-    }
-
-    ReduceKernelFunctor<platform::CPUDeviceContext, T, SumFunctor>(
-        &in, out, out_reduce_dims, true, false, ctx)
-        .template apply<T>();
-    out->Resize(phi::make_ddim(out_dims));
-  }
-};
-
-template <typename DeviceContext, typename T>
-class TriangularSolveKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    const auto* x = ctx.Input<framework::Tensor>("X");
-    const auto* y = ctx.Input<framework::Tensor>("Y");
-    auto* out = ctx.Output<framework::Tensor>("Out");
-
-    bool upper = ctx.template Attr<bool>("upper");
-    bool transpose = ctx.template Attr<bool>("transpose");
-    bool unitriangular = ctx.template Attr<bool>("unitriangular");
-
-    const auto& dev_ctx = ctx.template device_context<DeviceContext>();
-    triangular_solve<DeviceContext, T>(dev_ctx, *x, *y, out, upper, transpose,
-                                       unitriangular);
-  }
-};
-
-template <typename DeviceContext, typename T>
-class TriangularSolveGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    const auto* x = ctx.Input<framework::Tensor>("X");
-    const auto* y = ctx.Input<framework::Tensor>("Y");
-    const auto* out = ctx.Input<framework::Tensor>("Out");
-    const auto* dout =
-        ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
-
-    auto* dx = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
-    auto* dy = ctx.Output<framework::Tensor>(framework::GradVarName("Y"));
-
-    bool upper = ctx.template Attr<bool>("upper");
-    bool transpose = ctx.template Attr<bool>("transpose");
-    bool unitriangular = ctx.template Attr<bool>("unitriangular");
-
-    auto& dev_ctx = ctx.template device_context<DeviceContext>();
-
-    std::vector<int64_t> x_bst_dims_vec;
-    std::vector<int64_t> y_bst_dims_vec;
-    std::tie(x_bst_dims_vec, y_bst_dims_vec) = get_broadcast_dims(*x, *y);
-
-    Tensor dy_bst(y->type());
-    if (dy) {
-      dy->mutable_data<T>(y->dims(), dev_ctx.GetPlace());
-      dy_bst.Resize(phi::make_ddim(y_bst_dims_vec));
-      dy_bst.mutable_data<T>(dev_ctx.GetPlace());
-
-      // calculate x's conjugate for complex
-      Tensor x_conj(x->type());
-      platform::ForRange<DeviceContext> x_for_range(dev_ctx, x->numel());
-      phi::funcs::ConjFunctor<T> x_functor(
-          x->data<T>(), x->numel(),
-          x_conj.mutable_data<T>(x->dims(), dev_ctx.GetPlace()));
-      x_for_range(x_functor);
-
-      // reuse forward to get dy_bst, and the result has been broadcated.
-      triangular_solve<DeviceContext, T>(dev_ctx, x_conj, *dout, &dy_bst, upper,
-                                         !transpose, unitriangular);
-
-      if (dy_bst.dims() == dy->dims()) {
-        framework::TensorCopy(dy_bst, dev_ctx.GetPlace(), dev_ctx, dy);
-      } else {
-        MatrixReduceSumFunctor<DeviceContext, T> functor;
-        functor(dy_bst, dy, ctx);
-        dy->Resize(y->dims());
-      }
-    }
-
-    Tensor dx_bst(x->type());
-    if (dx) {
-      dx->mutable_data<T>(x->dims(), dev_ctx.GetPlace());
-      dx_bst.Resize(phi::make_ddim(x_bst_dims_vec));
-      dx_bst.mutable_data<T>(dev_ctx.GetPlace());
-
-      // calculate out's conjugate for complex
-      Tensor out_conj(out->type());
-      platform::ForRange<DeviceContext> out_for_range(dev_ctx, out->numel());
-      phi::funcs::ConjFunctor<T> out_functor(
-          out->data<T>(), out->numel(),
-          out_conj.mutable_data<T>(out->dims(), dev_ctx.GetPlace()));
-      out_for_range(out_functor);
-
-      auto blas = phi::funcs::GetBlas<DeviceContext, T>(ctx);
-      if (transpose) {
-        auto mat_dim_a =
-            phi::funcs::CreateMatrixDescriptor(out_conj.dims(), 0, false);
-        auto mat_dim_b =
-            phi::funcs::CreateMatrixDescriptor(dy_bst.dims(), 0, true);
-        blas.MatMul(out_conj, mat_dim_a, dy_bst, mat_dim_b, static_cast<T>(-1),
-                    &dx_bst, static_cast<T>(0));
-      } else {
-        auto mat_dim_a =
-            phi::funcs::CreateMatrixDescriptor(dy_bst.dims(), 0, false);
-        auto mat_dim_b =
-            phi::funcs::CreateMatrixDescriptor(out_conj.dims(), 0, true);
-        blas.MatMul(dy_bst, mat_dim_a, out_conj, mat_dim_b, static_cast<T>(-1),
-                    &dx_bst, static_cast<T>(0));
-      }
-
-      Tensor dx_bst_upper(x->type());
-      // get upper or lower triangular
-      dx_bst_upper.Resize(dx_bst.dims());
-      dx_bst_upper.mutable_data<T>(dev_ctx.GetPlace());
-
-      const auto& dims = dx_bst.dims();
-      const auto H = dims[dims.size() - 2];
-      const auto W = dims[dims.size() - 1];
-      platform::ForRange<DeviceContext> x_for_range(dev_ctx, dx_bst.numel());
-      TrilTriuCompute<T> tril_triu_computer(dx_bst.data<T>(), unitriangular,
-                                            !upper, H, W,
-                                            dx_bst_upper.data<T>());
-      x_for_range(tril_triu_computer);
-
-      if (dx_bst_upper.dims() == dx->dims()) {
-        framework::TensorCopy(dx_bst_upper, dev_ctx.GetPlace(), dev_ctx, dx);
-      } else {
-        MatrixReduceSumFunctor<DeviceContext, T> functor;
-        functor(dx_bst_upper, dx, ctx);
-        dx->Resize(x->dims());
-      }
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/tril_triu_op_xpu.cc b/paddle/fluid/operators/tril_triu_op_xpu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e36cbcf228cfbf30c8fcd5562ac40f38a5467cdb
--- /dev/null
+++ b/paddle/fluid/operators/tril_triu_op_xpu.cc
@@ -0,0 +1,53 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.  Licensed under
+the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef PADDLE_WITH_XPU
+
+#include "paddle/fluid/operators/tril_triu_op.h"
+#include "paddle/fluid/platform/device/device_wrapper.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+template <typename DeviceContext, typename T>
+class TrilTriuXPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    const auto* x = context.Input<framework::Tensor>("X");
+    const auto* x_data = x->data<T>();
+    auto* out = context.Output<framework::Tensor>("Out");
+    auto* out_data = out->mutable_data<T>(context.GetPlace());
+
+    const int diagonal = context.Attr<int>("diagonal");
+    const bool lower = context.Attr<bool>("lower");
+    auto xshape = phi::vectorize<int>(x->dims());
+    auto& dev_ctx = context.template device_context<DeviceContext>();
+    int r = 0;
+    if (lower) {
+      r = xpu::tril(dev_ctx.x_context(), x_data, out_data, xshape, diagonal);
+      PADDLE_ENFORCE_XDNN_SUCCESS(r, "tril_op");
+    } else {
+      r = xpu::triu(dev_ctx.x_context(), x_data, out_data, xshape, diagonal);
+      PADDLE_ENFORCE_XDNN_SUCCESS(r, "triu_op");
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_XPU_KERNEL(
+    tril_triu, ops::TrilTriuXPUKernel<paddle::platform::XPUDeviceContext, int>,
+    ops::TrilTriuXPUKernel<paddle::platform::XPUDeviceContext, float>);
+#endif
diff --git a/paddle/fluid/operators/trunc_op.cc b/paddle/fluid/operators/trunc_op.cc
index 54f4deac80a74e2e471036c2e25d08a582e29a9d..b77775f5a8c094fc7aa05f2f017834681424207f 100644
--- a/paddle/fluid/operators/trunc_op.cc
+++ b/paddle/fluid/operators/trunc_op.cc
@@ -69,8 +69,8 @@ class TruncGradOpMaker : public framework::SingleGradOpMaker<T> {
 }  // namespace operators
 }  // namespace paddle
 
-DELCARE_INFER_SHAPE_FUNCTOR(trunc, TruncInferShapeFunctor,
-                            PT_INFER_META(phi::UnchangedInferMeta));
+DECLARE_INFER_SHAPE_FUNCTOR(trunc, TruncInferShapeFunctor,
+                            PD_INFER_META(phi::UnchangedInferMeta));
 
 namespace ops = paddle::operators;
 REGISTER_OPERATOR(trunc, ops::TruncOp, ops::TruncOpMaker,
diff --git a/paddle/fluid/operators/truncated_gaussian_random_op.cc b/paddle/fluid/operators/truncated_gaussian_random_op.cc
index 6eb7f922dfdbec41aa1c47d11e1decc259d08689..dc5a66dce16d698f9cfac01e3bdc776d08c2af19 100644
--- a/paddle/fluid/operators/truncated_gaussian_random_op.cc
+++ b/paddle/fluid/operators/truncated_gaussian_random_op.cc
@@ -17,8 +17,10 @@ limitations under the License. */
 #include <vector>
 
 #include "paddle/fluid/framework/generator.h"
+#include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/truncated_gaussian_random_op.h"
+#include "paddle/phi/infermeta/nullary.h"
 
 namespace paddle {
 namespace operators {
@@ -27,26 +29,6 @@ class TruncatedGaussianRandomOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE_EQ(
-        ctx->HasOutput("Out"), true,
-        platform::errors::NotFound(
-            "Output(Out) of TruncatedGaussianRandomOp should not be null."));
-    auto shape = ctx->Attrs().Get<std::vector<int>>("shape");
-    std::vector<int64_t> out_dim;
-    out_dim.reserve(shape.size());
-    for (auto dim : shape) {
-      out_dim.push_back(static_cast<int64_t>(dim));
-    }
-    PADDLE_ENFORCE_GT(
-        shape.size(), 0UL,
-        platform::errors::InvalidArgument(
-            "the input shape of TruncatedGaussianRandomOp must be set, "
-            "But the rank of shape we received is %d",
-            shape.size()));
-    ctx->SetOutputDim("Out", phi::make_ddim(out_dim));
-  }
-
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
@@ -99,6 +81,14 @@ Used to initialize tensors with truncated gaussian random generator.
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP_WITHOUT_GRADIENT(truncated_gaussian_random,
-                             ops::TruncatedGaussianRandomOp,
-                             ops::TruncatedGaussianRandomOpMaker);
+
+DECLARE_INFER_SHAPE_FUNCTOR(
+    truncated_gaussian_random, TruncatedGaussianRandomInferShapeFunctor,
+    PD_INFER_META(phi::TruncatedGaussianRandomInferMeta));
+
+REGISTER_OPERATOR(
+    truncated_gaussian_random, ops::TruncatedGaussianRandomOp,
+    ops::TruncatedGaussianRandomOpMaker,
+    paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
+    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
+    TruncatedGaussianRandomInferShapeFunctor);
diff --git a/paddle/fluid/operators/truncated_gaussian_random_op.h b/paddle/fluid/operators/truncated_gaussian_random_op.h
index a6ff2f686cb76bb03de8074014f82d6ff9e57bd3..8af6e281424eaabd8d6ea86843b3c13aa36cba47 100644
--- a/paddle/fluid/operators/truncated_gaussian_random_op.h
+++ b/paddle/fluid/operators/truncated_gaussian_random_op.h
@@ -1,11 +1,8 @@
 /* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
-
     http://www.apache.org/licenses/LICENSE-2.0
-
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -140,19 +137,9 @@ T Erfinv(T x) {
 template <typename T>
 struct TruncatedNormal {
   T mean, std;
-  T a_normal_cdf;
-  T b_normal_cdf;
-  TruncatedNormal(T mean, T std) : mean(mean), std(std) {
-    auto normal_cdf = [](T x) {
-      return (1.0 + std::erf(x / std::sqrt(2.0))) / 2.0;
-    };
-    a_normal_cdf = normal_cdf(-2.0);
-    b_normal_cdf = normal_cdf(2.0);
-  }
-
+  TruncatedNormal(T mean, T std) : mean(mean), std(std) {}
   T operator()(T value) const {
-    auto p = a_normal_cdf + (b_normal_cdf - a_normal_cdf) * value;
-    return std::sqrt(2.0) * Erfinv(2 * p - 1) * std + mean;
+    return std::sqrt(2.0) * Erfinv(value) * std + mean;
   }
 };
 
diff --git a/paddle/fluid/operators/truncated_gaussian_random_op_npu.cc b/paddle/fluid/operators/truncated_gaussian_random_op_npu.cc
index 261d9cee2d5cd25c510aacb280b9623f985eb1f7..4ed0dd22ec086923bbe47af192cab8d001ae734f 100644
--- a/paddle/fluid/operators/truncated_gaussian_random_op_npu.cc
+++ b/paddle/fluid/operators/truncated_gaussian_random_op_npu.cc
@@ -84,8 +84,13 @@ class NPUTruncatedGaussianRandomKernel : public framework::OpKernel<T> {
     Tensor cpu_tensor(tensor->dtype());
     cpu_tensor.Resize(tensor->dims());
     T* cpu_data = cpu_tensor.mutable_data<T>(platform::CPUPlace());
-    std::uniform_real_distribution<T> dist(std::numeric_limits<float>::min(),
-                                           1.0);
+    auto normal_cdf = [](float x) {
+      return (1.0 + std::erf(x / std::sqrt(2.0))) / 2.0;
+    };
+    float a_normal_cdf = normal_cdf((-2.0 - mean) / std);
+    float b_normal_cdf = normal_cdf((2.0 - mean) / std);
+    std::uniform_real_distribution<float> dist(2.0 * a_normal_cdf - 1.0,
+                                               2.0 * b_normal_cdf - 1.0);
     TruncatedNormal<T> truncated_normal(mean, std);
     int64_t size = tensor->numel();
 
diff --git a/paddle/fluid/operators/truncated_gaussian_random_op_xpu.cc b/paddle/fluid/operators/truncated_gaussian_random_op_xpu.cc
index 803b61fbe813f85f48b71d1de7fc41eb26e4b8da..984d9f397cc655b4cfd7e0bc211db1665252272f 100644
--- a/paddle/fluid/operators/truncated_gaussian_random_op_xpu.cc
+++ b/paddle/fluid/operators/truncated_gaussian_random_op_xpu.cc
@@ -32,8 +32,13 @@ class XPUTruncatedGaussianRandomKernel : public framework::OpKernel<T> {
     auto* tensor = context.Output<framework::Tensor>("Out");
     T* data = tensor->mutable_data<T>(context.GetPlace());
 
-    std::uniform_real_distribution<T> dist(std::numeric_limits<float>::min(),
-                                           1.0);
+    auto normal_cdf = [](float x) {
+      return (1.0 + std::erf(x / std::sqrt(2.0))) / 2.0;
+    };
+    float a_normal_cdf = normal_cdf((-2.0 - mean) / std);
+    float b_normal_cdf = normal_cdf((2.0 - mean) / std);
+    std::uniform_real_distribution<float> dist(2.0 * a_normal_cdf - 1.0,
+                                               2.0 * b_normal_cdf - 1.0);
     TruncatedNormal<T> truncated_normal(mean, std);
     int64_t size = tensor->numel();
 
diff --git a/paddle/fluid/operators/unfold_op.cc b/paddle/fluid/operators/unfold_op.cc
index c45b839d5b40bd1d0db25743406bb8cc319f1280..02fed3de6cef74f19a5dd4d8500017e6097a56a4 100644
--- a/paddle/fluid/operators/unfold_op.cc
+++ b/paddle/fluid/operators/unfold_op.cc
@@ -119,8 +119,8 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(UnfoldGradOpNoNeedBufferVarsInferer, "X");
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-DELCARE_INFER_SHAPE_FUNCTOR(unfold, UnfoldInferShapeFunctor,
-                            PT_INFER_META(phi::UnfoldInferMeta));
+DECLARE_INFER_SHAPE_FUNCTOR(unfold, UnfoldInferShapeFunctor,
+                            PD_INFER_META(phi::UnfoldInferMeta));
 REGISTER_OPERATOR(unfold, ops::UnfoldOp, ops::UnfoldOpMaker,
                   ops::UnfoldGradMaker<paddle::framework::OpDesc>,
                   ops::UnfoldGradMaker<paddle::imperative::OpBase>,
diff --git a/paddle/fluid/operators/uniform_random_op.h b/paddle/fluid/operators/uniform_random_op.h
index a864c48ad757411861b6d2b3be40361c347601f8..b941dc21c3ab213e5abc2c4c908413b2b6222c41 100644
--- a/paddle/fluid/operators/uniform_random_op.h
+++ b/paddle/fluid/operators/uniform_random_op.h
@@ -25,8 +25,9 @@ DECLARE_bool(use_curand);
 #include <thrust/random.h>
 #include <thrust/transform.h>
 #include "paddle/fluid/framework/generator.h"
-#include "paddle/fluid/operators/index_impl.cu.h"
 #include "paddle/phi/kernels/full_kernel.h"
+#include "paddle/phi/kernels/funcs/distribution_helper.h"
+#include "paddle/phi/kernels/funcs/index_impl.cu.h"
 #endif
 
 namespace paddle {
@@ -206,21 +207,21 @@ void UniformRandom(const framework::ExecutionContext& context,
   if (gen_cuda->GetIsInitPy() && seed_flag) {
     if (FLAGS_use_curand) {
       using MT = typename details::MPTypeTrait<T>::Type;
-      distribution::uniform_distribution<MT> dist;
-      distribution::uniform_transform<MT> trans(min, max);
-      distribution::distribution_and_transform<T>(dev_cxt, tensor, dist, trans);
+      phi::funcs::uniform_distribution<MT> dist;
+      phi::funcs::uniform_real_transform<MT> trans(min, max);
+      phi::funcs::distribution_and_transform<T>(dev_cxt, tensor, dist, trans);
     } else {
       auto seed_offset = gen_cuda->IncrementOffset(1);
       int64_t gen_offset = size * seed_offset.second;
       auto func =
           UniformGeneratorOffset<T>(min, max, seed_offset.first, diag_num,
                                     diag_step, diag_val, gen_offset);
-      IndexKernel<T, UniformGeneratorOffset<T>>(dev_cxt, tensor, func);
+      phi::IndexKernel<T, UniformGeneratorOffset<T>>(dev_cxt, tensor, func);
     }
   } else {
     auto func =
         UniformGenerator<T>(min, max, seed, diag_num, diag_step, diag_val);
-    IndexKernel<T, UniformGenerator<T>>(dev_cxt, tensor, func);
+    phi::IndexKernel<T, UniformGenerator<T>>(dev_cxt, tensor, func);
   }
 }
 #endif
diff --git a/paddle/fluid/operators/unity_build_rule.cmake b/paddle/fluid/operators/unity_build_rule.cmake
index 5ab2004617810b34276632fa487e8f12d7b3b915..1be8f3387dbad85e0dce3593ad61b9c116b10ef0 100644
--- a/paddle/fluid/operators/unity_build_rule.cmake
+++ b/paddle/fluid/operators/unity_build_rule.cmake
@@ -236,7 +236,6 @@ register_unity_group(cc
     scatter_nd_add_op.cc
     scatter_op.cc
     seed_op.cc
-    segment_pool_op.cc
     select_input_op.cc
     select_output_op.cc)
 register_unity_group(cc
@@ -496,8 +495,7 @@ register_unity_group(cu
     scale_op.cu
     scatter_nd_add_op.cu
     scatter_op.cu
-    seed_op.cu
-    segment_pool_op.cu)
+    seed_op.cu)
 register_unity_group(cu
     roi_pool_op.cu
     selu_op.cu
diff --git a/paddle/fluid/operators/unsqueeze_op.h b/paddle/fluid/operators/unsqueeze_op.h
index 7f676cbb65ee460cdf639641330d49b5774f95a5..f6112fb59c12252255861825ff9d7b534c542665 100644
--- a/paddle/fluid/operators/unsqueeze_op.h
+++ b/paddle/fluid/operators/unsqueeze_op.h
@@ -16,7 +16,6 @@ limitations under the License. */
 
 #include <vector>
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/pooling.h"
 #include "paddle/fluid/operators/utils.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/phi/kernels/funcs/blas/blas.h"
diff --git a/paddle/fluid/operators/unsqueeze_op_npu_test.cc b/paddle/fluid/operators/unsqueeze_op_npu_test.cc
index 3e11c952d15f3460f987f6fa2cb28970f97cc96b..a8ced783744a961eb8ce64983de7e9615763c1b6 100644
--- a/paddle/fluid/operators/unsqueeze_op_npu_test.cc
+++ b/paddle/fluid/operators/unsqueeze_op_npu_test.cc
@@ -24,7 +24,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/operators/dropout_op.h"
 #include "paddle/fluid/string/printf.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
diff --git a/paddle/fluid/operators/viterbi_decode_op.cc b/paddle/fluid/operators/viterbi_decode_op.cc
index bf1cdeed65a8427c19410347209faa099673cb7c..602376d54e0d2a49b6cf4f6a78d332154c188a7e 100644
--- a/paddle/fluid/operators/viterbi_decode_op.cc
+++ b/paddle/fluid/operators/viterbi_decode_op.cc
@@ -9,8 +9,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/viterbi_decode_op.h"
+#include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/phi/core/infermeta_utils.h"
+#include "paddle/phi/infermeta/ternary.h"
 
 namespace paddle {
 namespace operators {
@@ -19,47 +21,6 @@ class ViterbiDecodeOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "ViterbiDecode");
-    OP_INOUT_CHECK(ctx->HasInput("Transition"), "Input", "Transition",
-                   "ViterbiDecode");
-    OP_INOUT_CHECK(ctx->HasInput("Length"), "Input", "Length", "ViterbiDecode");
-    OP_INOUT_CHECK(ctx->HasOutput("Scores"), "Output", "Scores",
-                   "ViterbiDecode");
-    OP_INOUT_CHECK(ctx->HasOutput("Path"), "Output", "Path", "ViterbiDecode");
-    auto in_dims = ctx->GetInputDim("Input");
-    PADDLE_ENFORCE_EQ(in_dims.size(), 3,
-                      platform::errors::InvalidArgument(
-                          "The rank of Input in ViterbiDecode  must be 3. But "
-                          "received Input's rank is %d.",
-                          in_dims.size()));
-    auto length_dims = ctx->GetInputDim("Length");
-    PADDLE_ENFORCE_EQ(length_dims.size(), 1,
-                      platform::errors::InvalidArgument(
-                          "The rank of Length in ViterbiDecode must be 1. But "
-                          "received Length's rank is %d.",
-                          length_dims.size()));
-    auto transition_dims = ctx->GetInputDim("Transition");
-    PADDLE_ENFORCE_EQ(
-        transition_dims.size(), 2,
-        platform::errors::InvalidArgument(
-            "The rank of Transition in ViterbiDecode must be 2. But "
-            "received Transition's rank is %d.",
-            transition_dims.size()));
-    if (ctx->IsRuntime()) {
-      PADDLE_ENFORCE_EQ(
-          in_dims[0], length_dims[0],
-          platform::errors::InvalidArgument(
-              "The batch size of Input and Length should be equal."));
-      PADDLE_ENFORCE_EQ(in_dims[2], transition_dims[0],
-                        platform::errors::InvalidArgument(
-                            "The number of tags of Input (%d) and Transition "
-                            "(%d) should be equal.",
-                            transition_dims[0], in_dims[2]));
-    }
-    ctx->SetOutputDim("Scores", length_dims);
-  }
-
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
@@ -102,8 +63,8 @@ class ViterbiDecodeOpMaker : public framework::OpProtoAndCheckerMaker {
 
 namespace ops = paddle::operators;
 namespace platform = paddle::platform;
+DECLARE_INFER_SHAPE_FUNCTOR(viterbi_decode, ViterbiDecodeInferShapeFunctor,
+                            PD_INFER_META(phi::ViterbiDecodeInferMeta));
 REGISTER_OP_WITHOUT_GRADIENT(viterbi_decode, ops::ViterbiDecodeOp,
-                             ops::ViterbiDecodeOpMaker);
-REGISTER_OP_CPU_KERNEL(
-    viterbi_decode, ops::ViterbiDecodeKernel<platform::CPUDeviceContext, float>,
-    ops::ViterbiDecodeKernel<platform::CPUDeviceContext, double>);
+                             ops::ViterbiDecodeOpMaker,
+                             ViterbiDecodeInferShapeFunctor);
diff --git a/paddle/fluid/operators/viterbi_decode_op.cu b/paddle/fluid/operators/viterbi_decode_op.cu
deleted file mode 100644
index 68628fb2748c424996e7f0ae24594ff04649f8d6..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/viterbi_decode_op.cu
+++ /dev/null
@@ -1,206 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/elementwise/elementwise_functor.h"
-#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h"
-#include "paddle/fluid/operators/viterbi_decode_op.h"
-#include "paddle/phi/kernels/funcs/gather.cu.h"
-
-#ifdef __NVCC__
-#include "cub/cub.cuh"
-#endif
-#ifdef __HIPCC__
-#include <hipcub/hipcub.hpp>
-namespace cub = hipcub;
-#endif
-
-namespace paddle {
-namespace operators {
-
-#define FIXED_BLOCK_DIM_CASE_BASE(log2_block_dim, ...)  \
-  case (1 << (log2_block_dim)): {                       \
-    constexpr auto kBlockDim = (1 << (log2_block_dim)); \
-    __VA_ARGS__;                                        \
-  } break
-
-#define FIXED_BLOCK_DIM_CASE(...)               \
-  FIXED_BLOCK_DIM_CASE_BASE(10, ##__VA_ARGS__); \
-  FIXED_BLOCK_DIM_CASE_BASE(9, ##__VA_ARGS__);  \
-  FIXED_BLOCK_DIM_CASE_BASE(8, ##__VA_ARGS__);  \
-  FIXED_BLOCK_DIM_CASE_BASE(7, ##__VA_ARGS__);  \
-  FIXED_BLOCK_DIM_CASE_BASE(6, ##__VA_ARGS__);  \
-  FIXED_BLOCK_DIM_CASE_BASE(5, ##__VA_ARGS__);  \
-  FIXED_BLOCK_DIM_CASE_BASE(4, ##__VA_ARGS__);  \
-  FIXED_BLOCK_DIM_CASE_BASE(3, ##__VA_ARGS__);
-
-int64_t ComputeBlockSize(int64_t col) {
-  if (col > 512)
-    return 1024;
-  else if (col > 256)
-    return 512;
-  else if (col > 128)
-    return 256;
-  else if (col > 64)
-    return 128;
-  else if (col > 32)
-    return 64;
-  else if (col > 16)
-    return 32;
-  else if (col > 8)
-    return 16;
-  else
-    return 8;
-}
-
-template <template <typename T> typename BinaryFunctor, typename T>
-struct BinaryOperation<platform::CUDADeviceContext, BinaryFunctor, T> {
-  void operator()(const platform::CUDADeviceContext& dev_ctx,
-                  const framework::Tensor& lhs, const framework::Tensor& rhs,
-                  framework::Tensor* output) {
-    std::vector<const framework::Tensor*> ins{&lhs, &rhs};
-    std::vector<framework::Tensor*> outs{output};
-    paddle::operators::LaunchElementwiseCudaKernel<ElementwiseType::kBinary, T,
-                                                   T>(dev_ctx, ins, &outs, -1,
-                                                      BinaryFunctor<T>());
-  }
-};
-
-template <template <typename InT, typename OutT> typename CompareFunctor,
-          typename T>
-struct GetMask<platform::CUDADeviceContext, CompareFunctor, T> {
-  void operator()(const framework::ExecutionContext& ctx,
-                  const framework::Tensor& lhs, const framework::Tensor& rhs,
-                  framework::Tensor* mask) {
-    std::vector<const framework::Tensor*> ins = {&lhs, &rhs};
-    std::vector<framework::Tensor*> outs = {mask};
-    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
-    paddle::operators::LaunchSameDimsElementwiseCudaKernel<T>(
-        dev_ctx, ins, &outs, CompareFunctor<int64_t, T>());
-  }
-};
-
-template <typename T, typename IndType, size_t BlockDim>
-__global__ void ArgmaxCUDAKernel(const int64_t height,     // n * h
-                                 const int64_t width,      // c
-                                 const int64_t post_size,  // h
-                                 const T* in, IndType* out_idx, T* out) {
-  typedef cub::BlockReduce<cub::KeyValuePair<int, T>, BlockDim> BlockReduce;
-  __shared__ typename BlockReduce::TempStorage temp_storage;
-  cub::ArgMax reducer;
-  T init = (std::numeric_limits<T>::lowest)();  // for windows compile
-  for (int idx = blockIdx.x; idx < height; idx += gridDim.x) {
-    cub::KeyValuePair<int, T> kv_pair = {-1, init};
-    int h = idx / post_size;
-    int w = idx % post_size;
-    for (int k = threadIdx.x; k < width; k += blockDim.x) {
-      kv_pair =
-          reducer({k, in[h * width * post_size + k * post_size + w]}, kv_pair);
-    }
-    kv_pair = BlockReduce(temp_storage).Reduce(kv_pair, reducer);
-    if (threadIdx.x == 0) {
-      // return max, argmax
-      if (out_idx != nullptr) out_idx[idx] = static_cast<IndType>(kv_pair.key);
-      if (out != nullptr) out[idx] = kv_pair.value;
-    }
-    __syncthreads();
-  }
-}
-
-__global__ void ARangeKernel(int64_t* data, int num, int64_t scale) {
-  int idx = blockIdx.x * blockDim.x + threadIdx.x;
-  for (int start = idx; idx < num; idx += gridDim.x) {
-    data[idx] = idx * scale;
-  }
-}
-
-template <>
-struct ARange<platform::CUDADeviceContext> {
-  void operator()(const platform::CUDADeviceContext& dev_ctx, int64_t* data,
-                  int num, int64_t scale) {
-    int64_t kBlockDim = ComputeBlockSize(num);
-    // kBlockDim > num at most of time, so we can set grid = 1
-    ARangeKernel<<<1, kBlockDim, 0, dev_ctx.stream()>>>(data, num, scale);
-  }
-};
-
-template <typename T, typename IndType>
-struct Argmax<platform::CUDADeviceContext, T, IndType> {
-  void operator()(const framework::ExecutionContext& ctx,
-                  const framework::Tensor& input, framework::Tensor* out_idx,
-                  framework::Tensor* out, int axis) {
-    framework::DDim input_dims = input.dims();
-    int64_t numel = input.numel();
-    int64_t groups = numel / input_dims[axis];
-    int64_t pre = 1;
-    int64_t post = 1;
-    int64_t n = input_dims[axis];
-    for (int i = 0; i < axis; i++) {
-      pre *= input_dims[i];
-    }
-    for (int i = axis + 1; i < input_dims.size(); i++) {
-      post *= input_dims[i];
-    }
-    const auto& dev_ctx = ctx.cuda_device_context();
-    auto cu_stream = dev_ctx.stream();
-    int64_t max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize()[0];
-    int64_t height = pre * post;
-    int64_t width = n;
-    int64_t grid_size = height < max_grid_dimx ? height : max_grid_dimx;
-    const T* in_data = input.data<T>();
-    IndType* out_idx_data = out_idx->data<IndType>();
-    T* out_data = out->data<T>();
-    switch (ComputeBlockSize(width)) {
-      FIXED_BLOCK_DIM_CASE(
-          ArgmaxCUDAKernel<T, IndType,
-                           kBlockDim><<<grid_size, kBlockDim, 0, cu_stream>>>(
-              height, width, post, in_data, out_idx_data, out_data));
-    }
-  }
-};
-
-template <typename T>
-struct GetMaxValue<platform::CUDADeviceContext, T> {
-  void operator()(const platform::CUDADeviceContext& dev_ctx,
-                  const framework::Tensor& input, T* max_value) {
-    framework::Tensor out_data;
-    out_data.Resize(phi::make_ddim({1}));
-    out_data.mutable_data<T>(platform::CUDAPlace());
-    switch (ComputeBlockSize(input.numel())) {
-      FIXED_BLOCK_DIM_CASE(
-          ArgmaxCUDAKernel<T, T,
-                           kBlockDim><<<1, kBlockDim, 0, dev_ctx.stream()>>>(
-              1, input.numel(), 1, input.data<int64_t>(), nullptr,
-              out_data.data<int64_t>()));
-    }
-    framework::Tensor max_value_tensor;
-    framework::TensorCopy(out_data, platform::CPUPlace(), &max_value_tensor);
-    *max_value = max_value_tensor.data<T>()[0];
-  }
-};
-
-template <typename T, typename IndexT>
-struct Gather<platform::CUDADeviceContext, T, IndexT> {
-  void operator()(const platform::CUDADeviceContext& ctx,
-                  const framework::Tensor& src, const framework::Tensor& index,
-                  framework::Tensor* output) {
-    phi::funcs::GPUGather<T, IndexT>(ctx, src, index, output);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-namespace platform = paddle::platform;
-REGISTER_OP_CUDA_KERNEL(
-    viterbi_decode,
-    ops::ViterbiDecodeKernel<platform::CUDADeviceContext, float>,
-    ops::ViterbiDecodeKernel<platform::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/viterbi_decode_op.h b/paddle/fluid/operators/viterbi_decode_op.h
deleted file mode 100644
index 0974177e6c736bbbf1fc7ce709e336561d04a3e3..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/viterbi_decode_op.h
+++ /dev/null
@@ -1,437 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <algorithm>
-#include <memory>
-#include <string>
-#include <vector>
-#include "paddle/fluid/operators/controlflow/compare_op.h"
-#include "paddle/fluid/operators/elementwise/elementwise_functor.h"
-#include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
-#include "paddle/fluid/operators/math/concat_and_split.h"
-#include "paddle/fluid/operators/transpose_op.h"
-#include "paddle/fluid/operators/unique_op.h"
-#include "paddle/phi/kernels/funcs/gather.h"
-#ifdef PADDLE_WITH_MKLML
-#include <omp.h>
-#endif
-
-namespace paddle {
-namespace operators {
-
-template <typename DeviceContext, typename T, typename IndType>
-struct Argmax {
-  void operator()(const framework::ExecutionContext& ctx,
-                  const framework::Tensor& input, framework::Tensor* out_idx,
-                  framework::Tensor* out, int axis) {
-    framework::DDim input_dims = input.dims();
-    int64_t pre = 1;
-    int64_t post = 1;
-    int64_t n = input_dims[axis];
-    for (int i = 0; i < axis; i++) {
-      pre *= input_dims[i];
-    }
-    for (int i = axis + 1; i < input_dims.size(); i++) {
-      post *= input_dims[i];
-    }
-    int64_t height = pre * post;
-    int64_t width = n;
-    const T* in_data = input.data<T>();
-    IndType* out_idx_data = out_idx->data<IndType>();
-    T* out_data = out->data<T>();
-// Reduce
-#ifdef PADDLE_WITH_MKLML
-#pragma omp parallel for
-#endif
-    for (int64_t i = 0; i < height; ++i) {
-      int64_t h = i / post;
-      int64_t w = i % post;
-      IndType max_idx = -1;
-      T max_value = (std::numeric_limits<T>::lowest)();  // for windows compile
-      for (int64_t j = 0; j < width; ++j) {
-        if (in_data[h * width * post + j * post + w] > max_value) {
-          max_value = in_data[h * width * post + j * post + w];
-          max_idx = j;
-        }
-      }
-      out_data[i] = max_value;
-      out_idx_data[i] = max_idx;
-    }
-  }
-};
-
-template <typename DeviceContext>
-struct ARange {
-  void operator()(const DeviceContext& dev_ctx, int64_t* data, int end,
-                  int64_t scale) {
-    for (int i = 0; i < end; ++i) {
-      data[i] = i * scale;
-    }
-  }
-};
-
-template <typename DeviceContext, typename T>
-struct GetMaxValue {
-  void operator()(const DeviceContext& dev_ctx, const framework::Tensor& input,
-                  T* max_value) {
-    auto input_ptr = input.data<T>();
-    auto num = input.numel();
-    *max_value = *std::max_element(input_ptr, input_ptr + num);
-  }
-};
-
-template <typename DeviceContext, typename T, typename IndexT = int>
-struct Gather {
-  void operator()(const DeviceContext& ctx, const framework::Tensor& src,
-                  const framework::Tensor& index, framework::Tensor* output) {
-    phi::funcs::CPUGather<T, IndexT>(ctx, src, index, output);
-  }
-};
-
-template <typename T, typename Functor, typename OutT = T>
-void SameDimsBinaryOP(const framework::Tensor& lhs,
-                      const framework::Tensor& rhs, framework::Tensor* out) {
-  const T* lhs_ptr = lhs.data<T>();
-  const T* rhs_ptr = rhs.data<T>();
-  OutT* out_ptr = out->data<OutT>();
-  Functor functor;
-#ifdef PADDLE_WITH_MKLML
-#pragma omp parallel for
-#endif
-  for (int i = 0; i < out->numel(); ++i) {
-    out_ptr[i] = functor(lhs_ptr[i], rhs_ptr[i]);
-  }
-}
-
-template <typename DeviceContext,
-          template <typename InT, typename OutT> typename CompareFunctor,
-          typename T>
-struct GetMask {
-  void operator()(const framework::ExecutionContext& ctx,
-                  const framework::Tensor& lhs, const framework::Tensor& rhs,
-                  framework::Tensor* mask) {
-    SameDimsBinaryOP<int64_t, CompareFunctor<int64_t, T>, T>(lhs, rhs, mask);
-  }
-};
-
-template <bool is_multi_threads>
-struct GetInputIndex {
-  void operator()(const std::vector<int>& lhs_dims,
-                  const std::vector<int>& rhs_dims,
-                  const std::vector<int>& output_dims,
-                  const std::vector<int>& lhs_strides,
-                  const std::vector<int>& rhs_strides,
-                  const std::vector<int>& output_strides, int output_idx,
-                  int* index_array, int* lhs_idx, int* rhs_idx) {
-    int out_dims_size = output_strides.size();
-    for (int j = 0; j < out_dims_size; ++j) {
-      int curr_idx = output_idx / output_strides[j];
-      output_idx %= output_strides[j];
-      *lhs_idx += (lhs_dims[j] > 1) ? curr_idx * lhs_strides[j] : 0;
-      *rhs_idx += (rhs_dims[j] > 1) ? curr_idx * rhs_strides[j] : 0;
-    }
-  }
-};
-
-template <>
-struct GetInputIndex<false> {
-  void operator()(const std::vector<int>& lhs_dims,
-                  const std::vector<int>& rhs_dims,
-                  const std::vector<int>& output_dims,
-                  const std::vector<int>& lhs_strides,
-                  const std::vector<int>& rhs_strides,
-                  const std::vector<int>& output_strides, int output_idx,
-                  int* index_array, int* lhs_idx, int* rhs_idx) {
-    int out_dims_size = output_strides.size();
-    *lhs_idx = phi::funcs::GetElementwiseIndex(lhs_dims.data(), out_dims_size,
-                                               index_array);
-    *rhs_idx = phi::funcs::GetElementwiseIndex(rhs_dims.data(), out_dims_size,
-                                               index_array);
-    phi::funcs::UpdateElementwiseIndexArray(output_dims.data(), out_dims_size,
-                                            index_array);
-  }
-};
-
-template <typename T, typename Functor, bool is_multi_threads = false>
-void SimpleBroadcastBinaryOP(const framework::Tensor& lhs,
-                             const framework::Tensor& rhs,
-                             framework::Tensor* out) {
-  const T* lhs_ptr = lhs.data<T>();
-  const T* rhs_ptr = rhs.data<T>();
-  T* out_ptr = out->data<T>();
-  int out_size = static_cast<int>(out->dims().size());
-  std::vector<int> out_dims(out_size);
-  std::vector<int> lhs_dims(out_size);
-  std::vector<int> rhs_dims(out_size);
-  std::copy(lhs.dims().Get(), lhs.dims().Get() + out_size, lhs_dims.data());
-  std::copy(rhs.dims().Get(), rhs.dims().Get() + out_size, rhs_dims.data());
-  std::copy(out->dims().Get(), out->dims().Get() + out_size, out_dims.data());
-  std::vector<int> output_strides(out_size, 1);
-  std::vector<int> lhs_strides(out_size, 1);
-  std::vector<int> rhs_strides(out_size, 1);
-  std::vector<int> index_array(out_size, 0);
-  // calculate strides
-  for (int i = out_size - 2; i >= 0; --i) {
-    output_strides[i] = output_strides[i + 1] * out_dims[i + 1];
-    lhs_strides[i] = lhs_strides[i + 1] * lhs_dims[i + 1];
-    rhs_strides[i] = rhs_strides[i + 1] * rhs_dims[i + 1];
-  }
-  Functor functor;
-  GetInputIndex<is_multi_threads> get_input_index;
-#ifdef PADDLE_WITH_MKLML
-#pragma omp parallel for
-#endif
-  for (int i = 0; i < out->numel(); ++i) {
-    int lhs_idx = 0;
-    int rhs_idx = 0;
-    get_input_index(lhs_dims, rhs_dims, out_dims, lhs_strides, rhs_strides,
-                    output_strides, i, index_array.data(), &lhs_idx, &rhs_idx);
-    out_ptr[i] = functor(lhs_ptr[lhs_idx], rhs_ptr[rhs_idx]);
-  }
-}
-
-template <typename DeviceContext, template <typename T> typename BinaryFunctor,
-          typename T>
-struct BinaryOperation {
-  void operator()(const DeviceContext& dev_ctx, const framework::Tensor& lhs,
-                  const framework::Tensor& rhs, framework::Tensor* output) {
-    if (lhs.dims() == rhs.dims()) {
-      SameDimsBinaryOP<T, BinaryFunctor<T>>(lhs, rhs, output);
-    } else {
-      bool is_multi_threads = false;
-#ifdef PADDLE_WITH_MKLML
-      if (omp_get_max_threads() > 1) {
-        is_multi_threads = true;
-      }
-#endif
-      if (is_multi_threads) {
-        SimpleBroadcastBinaryOP<T, BinaryFunctor<T>, true>(lhs, rhs, output);
-      } else {
-        SimpleBroadcastBinaryOP<T, BinaryFunctor<T>, false>(lhs, rhs, output);
-      }
-    }
-  }
-};
-
-class TensorBuffer {
- public:
-  explicit TensorBuffer(const framework::LoDTensor& in)
-      : buffer_(in), offset_(0) {
-    buffer_.Resize({buffer_.numel()});
-  }
-  framework::Tensor GetBufferBlock(std::initializer_list<int64_t> shape) {
-    int64_t size = std::accumulate(shape.begin(), shape.end(), 1,
-                                   std::multiplies<int64_t>());
-    framework::Tensor block = buffer_.Slice(offset_, offset_ + size);
-    offset_ += size;
-    block.Resize(shape);
-    return block;
-  }
-
- private:
-  framework::LoDTensor buffer_;  // need to resize 1-D Tensor
-  int offset_;
-};
-
-template <typename DeviceContext, typename T>
-class ViterbiDecodeKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    bool include_bos_eos_tag = ctx.Attr<bool>("include_bos_eos_tag");
-    auto& dev_ctx = ctx.template device_context<DeviceContext>();
-    auto curr_place = ctx.GetPlace();
-    auto* input = ctx.Input<framework::Tensor>("Input");
-    auto batch_size = static_cast<int>(input->dims()[0]);
-    auto seq_len = static_cast<int>(input->dims()[1]);
-    auto n_labels = static_cast<int>(input->dims()[2]);
-    phi::funcs::SetConstant<DeviceContext, T> float_functor;
-    phi::funcs::SetConstant<DeviceContext, int64_t> int_functor;
-    std::vector<framework::Tensor> historys;
-    // We create tensor buffer in order to avoid allocating memory frequently
-    // 10 means allocate 10*batch_size bytes memory, such as int_mask, zero...
-    int buffer_size = batch_size * (n_labels + 1) * seq_len + 10 * batch_size;
-    framework::LoDTensor int_buffer;
-    int_buffer.Resize(phi::make_ddim({buffer_size}));
-    int_buffer.mutable_data<int64_t>(ctx.GetPlace());
-    TensorBuffer int_tensor_buffer(int_buffer);
-    // create float tensor buffer
-    // 10 means allocate 10*batch_size*n_labels bytes, such as alpha, alpha_max
-    buffer_size = batch_size * (seq_len + 10) * n_labels +
-                  (batch_size + 2) * n_labels * n_labels;
-    framework::LoDTensor float_buffer;
-    float_buffer.Resize(phi::make_ddim({buffer_size}));
-    float_buffer.mutable_data<T>(ctx.GetPlace());
-    TensorBuffer float_tensor_buffer(float_buffer);
-    auto* length = ctx.Input<framework::Tensor>("Length");
-    framework::Tensor left_length =
-        int_tensor_buffer.GetBufferBlock({batch_size, 1});
-    framework::TensorCopy(*length, curr_place, dev_ctx, &left_length);
-    int64_t max_seq_len = 0;
-    GetMaxValue<DeviceContext, int64_t> get_max_value;
-    get_max_value(dev_ctx, left_length, &max_seq_len);
-
-    auto* scores = ctx.Output<framework::Tensor>("Scores");
-    scores->mutable_data<T>(curr_place);
-    auto* path = ctx.Output<framework::Tensor>("Path");
-    path->Resize({batch_size, max_seq_len});
-    path->mutable_data<int64_t>(curr_place);
-    framework::Tensor tpath =
-        int_tensor_buffer.GetBufferBlock({max_seq_len, batch_size});
-    auto batch_path = Unbind(tpath);
-    for (auto it = batch_path.begin(); it != batch_path.end(); ++it) {
-      it->Resize({batch_size});
-    }
-    // create and init required tensor
-    framework::Tensor input_exp =
-        float_tensor_buffer.GetBufferBlock({seq_len, batch_size, n_labels});
-    TransCompute<DeviceContext, T>(3, dev_ctx, *input, &input_exp, {1, 0, 2});
-    auto* transition = ctx.Input<framework::Tensor>("Transition");
-    framework::Tensor trans_exp =
-        float_tensor_buffer.GetBufferBlock({n_labels, n_labels});
-    framework::TensorCopy(*transition, curr_place, dev_ctx, &trans_exp);
-    trans_exp.Resize({1, n_labels, n_labels});
-    framework::Tensor alpha =
-        float_tensor_buffer.GetBufferBlock({batch_size, n_labels});
-    framework::Tensor zero = int_tensor_buffer.GetBufferBlock({batch_size, 1});
-    int_functor(dev_ctx, &zero, 0);
-    framework::Tensor one = int_tensor_buffer.GetBufferBlock({batch_size, 1});
-    int_functor(dev_ctx, &one, 1);
-    framework::Tensor float_one =
-        float_tensor_buffer.GetBufferBlock({batch_size, 1});
-    float_functor(dev_ctx, &float_one, static_cast<T>(1.0));
-    framework::Tensor alpha_trn_sum =
-        float_tensor_buffer.GetBufferBlock({batch_size, n_labels, n_labels});
-    framework::Tensor alpha_max =
-        float_tensor_buffer.GetBufferBlock({batch_size, n_labels});
-    framework::Tensor alpha_argmax =
-        int_tensor_buffer.GetBufferBlock({seq_len, batch_size, n_labels});
-    auto alpha_argmax_unbind = Unbind(alpha_argmax);
-    framework::Tensor alpha_nxt =
-        float_tensor_buffer.GetBufferBlock({batch_size, n_labels});
-    framework::Tensor int_mask = int_tensor_buffer.GetBufferBlock({batch_size});
-    framework::Tensor zero_len_mask =
-        int_tensor_buffer.GetBufferBlock({batch_size});
-    framework::Tensor float_mask =
-        float_tensor_buffer.GetBufferBlock({batch_size, 1});
-    framework::Tensor stop_trans =
-        float_tensor_buffer.GetBufferBlock({1, 1, n_labels});
-    framework::Tensor start_trans =
-        float_tensor_buffer.GetBufferBlock({1, 1, n_labels});
-    framework::Tensor rest_trans =
-        float_tensor_buffer.GetBufferBlock({1, n_labels - 2, n_labels});
-    framework::Tensor last_ids = int_tensor_buffer.GetBufferBlock({batch_size});
-    framework::Tensor last_ids_tmp =
-        int_tensor_buffer.GetBufferBlock({batch_size});
-    framework::Tensor batch_offset =
-        int_tensor_buffer.GetBufferBlock({batch_size});
-    framework::Tensor gather_idx =
-        int_tensor_buffer.GetBufferBlock({batch_size});
-    std::vector<const framework::Tensor*> shape{&rest_trans, &stop_trans,
-                                                &start_trans};
-    std::vector<framework::Tensor*> outputs{&rest_trans, &stop_trans,
-                                            &start_trans};
-    math::SplitFunctor<DeviceContext, T> split_functor;
-    split_functor(dev_ctx, trans_exp, shape, 1, &outputs);
-    stop_trans.Resize({1, n_labels});
-    start_trans.Resize({1, n_labels});
-    auto logit0 = input_exp.Slice(0, 1);
-    logit0.Resize({batch_size, n_labels});
-    BinaryOperation<DeviceContext, AddFunctor, T> AddFloat;
-    BinaryOperation<DeviceContext, AddFunctor, int64_t> AddInt;
-    BinaryOperation<DeviceContext, MulFunctor, T> MulFloat;
-    BinaryOperation<DeviceContext, MulFunctor, int64_t> MulInt;
-    BinaryOperation<DeviceContext, SubFunctor, T> SubFloat;
-    BinaryOperation<DeviceContext, SubFunctor, int64_t> SubInt;
-    if (include_bos_eos_tag) {
-      AddFloat(dev_ctx, logit0, start_trans, &alpha);
-      GetMask<DeviceContext, EqualFunctor, T>()(ctx, left_length, one,
-                                                &float_mask);
-      MulFloat(dev_ctx, stop_trans, float_mask, &alpha_nxt);
-      AddFloat(dev_ctx, alpha, alpha_nxt, &alpha);
-    } else {
-      alpha = logit0;
-    }
-    SubInt(dev_ctx, left_length, one, &left_length);
-    Argmax<DeviceContext, T, int64_t> argmax;
-    for (int64_t i = 1; i < max_seq_len; ++i) {
-      framework::Tensor logit = input_exp.Slice(i, i + 1);
-      logit.Resize({batch_size, n_labels});
-      framework::Tensor& alpha_exp = alpha.Resize({batch_size, n_labels, 1});
-      AddFloat(dev_ctx, alpha_exp, trans_exp, &alpha_trn_sum);
-      auto alpha_argmax_temp = alpha_argmax_unbind[i - 1];
-      alpha_argmax_temp.Resize({batch_size, n_labels});
-      argmax(ctx, alpha_trn_sum, &alpha_argmax_temp, &alpha_max, 1);
-      historys.emplace_back(alpha_argmax_temp);
-      AddFloat(dev_ctx, alpha_max, logit, &alpha_nxt);
-      alpha.Resize({batch_size, n_labels});
-      // mask = paddle.cast((left_length > 0), dtype='float32')
-      // alpha = mask * alpha_nxt + (1 - mask) * alpha
-      GetMask<DeviceContext, GreaterThanFunctor, T>()(ctx, left_length, zero,
-                                                      &float_mask);
-      // alpha_nxt = mask * alpha_nxt
-      MulFloat(dev_ctx, alpha_nxt, float_mask, &alpha_nxt);
-      // inv_mask = 1 - mask
-      SubFloat(dev_ctx, float_one, float_mask, &float_mask);
-      // alpha = (1 - mask) * alpha
-      MulFloat(dev_ctx, alpha, float_mask, &alpha);
-      // alpha += alpha_nxt
-      AddFloat(dev_ctx, alpha, alpha_nxt, &alpha);
-      if (include_bos_eos_tag) {
-        GetMask<DeviceContext, EqualFunctor, T>()(ctx, left_length, one,
-                                                  &float_mask);
-        // alpha += mask * trans_exp[:, self.stop_idx]
-        MulFloat(dev_ctx, stop_trans, float_mask, &alpha_nxt);
-        AddFloat(dev_ctx, alpha, alpha_nxt, &alpha);
-      }
-      SubInt(dev_ctx, left_length, one, &left_length);
-    }
-    argmax(ctx, alpha, &last_ids, scores, 1);
-    left_length.Resize({batch_size});
-    GetMask<DeviceContext, GreaterEqualFunctor, int64_t>()(ctx, left_length,
-                                                           zero, &int_mask);
-    // last_ids_update = last_ids * tag_mask
-    int last_ids_index = 1;
-    int actual_len = (std::min)(seq_len, static_cast<int>(max_seq_len));
-    MulInt(dev_ctx, last_ids, int_mask,
-           &batch_path[actual_len - last_ids_index]);
-    // The algorithm below can refer to
-    // https://github.com/PaddlePaddle/PaddleNLP/blob/develop/paddlenlp/layers/crf.py#L438
-    ARange<DeviceContext> arange;
-    arange(dev_ctx, batch_offset.data<int64_t>(), batch_size, n_labels);
-    Gather<DeviceContext, int64_t, int64_t> gather;
-    for (auto hist = historys.rbegin(); hist != historys.rend(); ++hist) {
-      ++last_ids_index;
-      AddInt(dev_ctx, left_length, one, &left_length);
-      AddInt(dev_ctx, batch_offset, last_ids, &gather_idx);
-      framework::Tensor& last_ids_update =
-          batch_path[actual_len - last_ids_index];
-      hist->Resize({batch_size * n_labels});
-      gather(dev_ctx, *hist, gather_idx, &last_ids_update);
-      GetMask<DeviceContext, GreaterThanFunctor, int64_t>()(ctx, left_length,
-                                                            zero, &int_mask);
-      MulInt(dev_ctx, last_ids_update, int_mask, &last_ids_update);
-      GetMask<DeviceContext, EqualFunctor, int64_t>()(ctx, left_length, zero,
-                                                      &zero_len_mask);
-      MulInt(dev_ctx, last_ids, zero_len_mask, &last_ids_tmp);
-      SubInt(dev_ctx, one, zero_len_mask, &zero_len_mask);
-      MulInt(dev_ctx, last_ids_update, zero_len_mask, &last_ids_update);
-      AddInt(dev_ctx, last_ids_update, last_ids_tmp, &last_ids_update);
-      GetMask<DeviceContext, LessThanFunctor, int64_t>()(ctx, left_length, zero,
-                                                         &int_mask);
-      MulInt(dev_ctx, last_ids, int_mask, &last_ids);
-      AddInt(dev_ctx, last_ids_update, last_ids, &last_ids);
-    }
-    TransCompute<DeviceContext, int64_t>(2, dev_ctx, tpath, path, {1, 0});
-  }
-};
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/where_index_op.cc b/paddle/fluid/operators/where_index_op.cc
index 2bffeb500ce50e3bc5a3d72a085da826d06e849d..733d0f7af92d727bd3eff31a87e7e88b3d073829 100644
--- a/paddle/fluid/operators/where_index_op.cc
+++ b/paddle/fluid/operators/where_index_op.cc
@@ -12,7 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/where_index_op.h"
+#include "paddle/fluid/framework/infershape_utils.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/phi/core/infermeta_utils.h"
+#include "paddle/phi/infermeta/unary.h"
 
 namespace paddle {
 namespace operators {
@@ -21,16 +24,6 @@ class WhereIndexOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    OP_INOUT_CHECK(ctx->HasInput("Condition"), "Input", "Condition", "where");
-    PADDLE_ENFORCE_GE(
-        ctx->GetInputDim("Condition").size(), 1UL,
-        platform::errors::InvalidArgument(
-            "Input(Condition) should have number of dimension at least 1"));
-    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "where");
-    ctx->SetOutputDim("Out", {-1, ctx->GetInputDim("Condition").size()});
-  }
-
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
@@ -53,11 +46,10 @@ class WhereIndexOpMaker : public framework::OpProtoAndCheckerMaker {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP_WITHOUT_GRADIENT(where_index, ops::WhereIndexOp,
-                             ops::WhereIndexOpMaker);
-REGISTER_OP_CPU_KERNEL(where_index, ops::CPUWhereIndexKernel<int64_t>,
-                       ops::CPUWhereIndexKernel<int>,
-                       ops::CPUWhereIndexKernel<int16_t>,
-                       ops::CPUWhereIndexKernel<bool>,
-                       ops::CPUWhereIndexKernel<float>,
-                       ops::CPUWhereIndexKernel<double>);
+DECLARE_INFER_SHAPE_FUNCTOR(where_index, WhereIndexInferShapeFunctor,
+                            PD_INFER_META(phi::WhereIndexInferMeta));
+REGISTER_OPERATOR(
+    where_index, ops::WhereIndexOp, ops::WhereIndexOpMaker,
+    paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
+    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
+    WhereIndexInferShapeFunctor);
diff --git a/paddle/fluid/operators/where_index_op.cu b/paddle/fluid/operators/where_index_op.cu
deleted file mode 100644
index c594e478aa0f3cb36b2bb63bdd1dc22e87613bf0..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/where_index_op.cu
+++ /dev/null
@@ -1,164 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifdef __NVCC__
-#include "cub/cub.cuh"
-#endif
-#ifdef __HIPCC__
-#include <hipcub/hipcub.hpp>
-namespace cub = hipcub;
-#endif
-
-#include <algorithm>
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/where_index_op.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
-#include "paddle/fluid/platform/for_range.h"
-#include "paddle/phi/core/ddim.h"
-
-namespace paddle {
-namespace operators {
-
-using CUDADeviceContext = paddle::platform::CUDADeviceContext;
-
-template <typename T>
-__global__ void GetTrueNum(const T *cond_data, const int64_t numel,
-                           int64_t *true_num_array) {
-  const int64_t tid = blockIdx.x * blockDim.x + threadIdx.x;
-
-  for (int64_t idx = tid; idx < numel; idx += gridDim.x * blockDim.x) {
-    true_num_array[idx] =
-        static_cast<int64_t>(static_cast<bool>(cond_data[idx]));
-  }
-}
-
-template <typename T>
-__global__ void SetTrueIndex(int64_t *out_ptr, const T *cond_data,
-                             const int64_t numel, const int64_t *stride_array,
-                             const int64_t rank,
-                             const int64_t *true_num_array) {
-  const int64_t tid = blockIdx.x * blockDim.x + threadIdx.x;
-
-  for (int64_t idx = tid; idx < numel; idx += gridDim.x * blockDim.x) {
-    // true_num_array is calculated by cub::InclusiveSum,
-    // cause the first element of true_num_array is 1,
-    // so we need substract 1 to get true index.
-    const int64_t true_index = true_num_array[idx] - 1;
-    if (static_cast<bool>(cond_data[idx])) {
-      int64_t rank_index = idx;
-      for (int j = 0; j < rank; j++) {
-        const int64_t out_index = rank_index / stride_array[j];
-        out_ptr[true_index * rank + j] = out_index;
-        rank_index -= out_index * stride_array[j];
-      }
-    }
-  }
-}
-
-template <typename T>
-class CUDAWhereIndexKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &context) const override {
-    auto *condition = context.Input<framework::Tensor>("Condition");
-    auto *out = context.Output<framework::Tensor>("Out");
-    auto &dev_ctx = context.template device_context<CUDADeviceContext>();
-
-    const T *cond_data = condition->data<T>();
-    const int64_t numel = condition->numel();
-    auto dims = condition->dims();
-    const int rank = dims.size();
-
-    auto d_array_mem = memory::Alloc(dev_ctx, (numel + rank) * sizeof(int64_t));
-    auto h_array_mem =
-        memory::Alloc(platform::CPUPlace(), (rank + 1) * sizeof(int64_t));
-
-    // "stride_array" is an array and len(stride_array)==rank,
-    // each element is the stride of each dimension -- the length from i to i+1.
-    int64_t *h_stride_array = reinterpret_cast<int64_t *>(h_array_mem->ptr());
-    int64_t *d_stride_array = reinterpret_cast<int64_t *>(d_array_mem->ptr());
-
-    // "true_num_array" is an array and len(stride_array)==numel,
-    // at the beginning,
-    // "true_num_array" will set 1 if condition[i] == true else 0,
-    // then it will be calculated by cub::InclusiveSum,
-    // so that we can get the true number before i as the out index
-    int64_t *d_true_num_array = d_stride_array + rank;
-
-    // the total_true_num is the total number of condition[i] == true
-    int64_t *h_total_true_num = h_stride_array + rank;
-
-    // alloce cub memory
-    size_t cub_size = 0;
-    cub::DeviceScan::InclusiveSum(nullptr, cub_size, d_true_num_array,
-                                  d_true_num_array, numel, dev_ctx.stream());
-    auto cub_mem = memory::Alloc(dev_ctx, cub_size * sizeof(int64_t));
-    void *cub_data = cub_mem->ptr();
-
-    // set d_true_num_array[i]=1 if cond_data[i]==true else 0
-    const int threads = std::min(numel, static_cast<int64_t>(128));
-    const int64_t need_grids = (numel + threads - 1) / threads;
-    const int grids = std::min(need_grids, static_cast<int64_t>(256));
-    GetTrueNum<T><<<grids, threads, 0, dev_ctx.stream()>>>(cond_data, numel,
-                                                           d_true_num_array);
-
-    // calculate the inclusive prefix sum of "true_num_array"
-    // to get the index of "out" tensor,
-    // and the total number of cond_data[i]==true.
-    // Example:
-    // condition: F T T F F F T T
-    // before:    0 1 1 0 0 0 1 1
-    // after:     0 1 2 2 2 2 3 4
-    // out:       1 2 6 7
-    cub::DeviceScan::InclusiveSum(cub_data, cub_size, d_true_num_array,
-                                  d_true_num_array, numel, dev_ctx.stream());
-
-    // calculate each dimension's stride
-    h_stride_array[rank - 1] = 1;
-    for (int i = rank - 2; i >= 0; i--) {
-      h_stride_array[i] = h_stride_array[i + 1] * dims[i + 1];
-    }
-    memory::Copy(dev_ctx.GetPlace(), d_stride_array, platform::CPUPlace(),
-                 h_stride_array, rank * sizeof(int64_t), dev_ctx.stream());
-
-    // get total ture number and set output size
-    // the last element of cub::InclusiveSum is the total number
-    memory::Copy(platform::CPUPlace(), h_total_true_num, dev_ctx.GetPlace(),
-                 d_true_num_array + numel - 1, sizeof(int64_t),
-                 dev_ctx.stream());
-    dev_ctx.Wait();
-
-    int64_t true_num = *h_total_true_num;
-    out->Resize(phi::make_ddim({static_cast<int64_t>(true_num), rank}));
-    auto out_data = out->mutable_data<int64_t>(context.GetPlace());
-
-    if (true_num == 0) {
-      return;
-    }
-
-    // using true_num_array and stride_array to calculate the output index
-    SetTrueIndex<T><<<grids, threads, 0, dev_ctx.stream()>>>(
-        out_data, cond_data, numel, d_stride_array, rank, d_true_num_array);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(where_index, ops::CUDAWhereIndexKernel<int64_t>,
-                        ops::CUDAWhereIndexKernel<int>,
-                        ops::CUDAWhereIndexKernel<int16_t>,
-                        ops::CUDAWhereIndexKernel<bool>,
-                        ops::CUDAWhereIndexKernel<float>,
-                        ops::CUDAWhereIndexKernel<double>);
diff --git a/paddle/fluid/operators/where_index_op.h b/paddle/fluid/operators/where_index_op.h
deleted file mode 100644
index 193a2386e6bd1eb19d30b7c9e146eb8b77b8e851..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/where_index_op.h
+++ /dev/null
@@ -1,95 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <functional>
-#include <vector>
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/platform/for_range.h"
-#include "paddle/phi/kernels/funcs/math_function.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T>
-struct WhereIndexFunctor {
-  WhereIndexFunctor(const T* true_index, int true_num, const T* stride,
-                    int rank, T* out)
-      : true_index_(true_index),
-        true_num_(true_num),
-        stride_(stride),
-        rank_(rank),
-        out_ptr_(out) {}
-
-  HOSTDEVICE void operator()(size_t idx) const {
-    T index = true_index_[idx];
-    for (int j = 0; j < rank_; j++) {
-      out_ptr_[idx * rank_ + j] = index / stride_[j];
-      index -= out_ptr_[idx * rank_ + j] * stride_[j];
-    }
-  }
-
-  const T* true_index_;
-  int true_num_;
-  const T* stride_;
-  int rank_;
-  T* out_ptr_;
-};
-
-using CPUDeviceContext = paddle::platform::CPUDeviceContext;
-
-template <typename T>
-class CPUWhereIndexKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* condition = context.Input<framework::Tensor>("Condition");
-    auto* out = context.Output<framework::Tensor>("Out");
-
-    const T* cond_data = condition->data<T>();
-    auto numel = condition->numel();
-    auto dims = condition->dims();
-    const int rank = dims.size();
-
-    std::vector<int64_t> true_index;
-    for (auto i = 0; i < numel; i++) {
-      if (static_cast<bool>(cond_data[i])) {
-        true_index.push_back(i);
-      }
-    }
-    auto true_num = true_index.size();
-
-    out->Resize(phi::make_ddim({static_cast<int64_t>(true_num), rank}));
-    auto out_ptr = out->mutable_data<int64_t>(context.GetPlace());
-
-    if (true_num == 0) {
-      return;
-    }
-
-    std::vector<int64_t> stride(rank);
-    stride[rank - 1] = 1;
-    for (int i = rank - 2; i >= 0; i--) {
-      stride[i] = stride[i + 1] * dims[i + 1];
-    }
-
-    auto& dev_ctx = context.template device_context<CPUDeviceContext>();
-    WhereIndexFunctor<int64_t> functor(true_index.data(), true_num,
-                                       stride.data(), rank, out_ptr);
-    platform::ForRange<CPUDeviceContext> for_range(dev_ctx, true_num);
-    for_range(functor);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/where_index_op_npu.cc b/paddle/fluid/operators/where_index_op_npu.cc
index 59f598d2ad6a3275158cadf32bd1bf2086a3487a..2f8744c2c0448881901656102f0ee65279f159a2 100644
--- a/paddle/fluid/operators/where_index_op_npu.cc
+++ b/paddle/fluid/operators/where_index_op_npu.cc
@@ -12,8 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/where_index_op.h"
+#include <vector>
+
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/device/npu/npu_op_runner.h"
+#include "paddle/phi/kernels/funcs/math_function.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/where_index_op_xpu.cc b/paddle/fluid/operators/where_index_op_xpu.cc
deleted file mode 100644
index 3322eefd887e3d4dce5363cca842305931822a23..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/where_index_op_xpu.cc
+++ /dev/null
@@ -1,73 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifdef PADDLE_WITH_XPU
-
-#include "paddle/fluid/operators/where_index_op.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-
-template <typename T>
-class WhereIndexXPUKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* condition = context.Input<framework::Tensor>("Condition");
-    auto* out = context.Output<framework::Tensor>("Out");
-
-    const T* cond_data = condition->data<T>();
-    auto numel = condition->numel();
-    auto dims = condition->dims();
-    const int rank = dims.size();
-
-    auto& dev_ctx =
-        context.template device_context<paddle::platform::XPUDeviceContext>();
-    xpu::ctx_guard RAII_GUARD(dev_ctx.x_context());
-    int* true_num = RAII_GUARD.alloc_l3_or_gm<int32_t>(1);
-    int true_num_cpu;
-    int ret =
-        xpu::nonzero_count(dev_ctx.x_context(), cond_data, true_num, numel);
-    PADDLE_ENFORCE_EQ(
-        ret, XPU_SUCCESS,
-        platform::errors::External(
-            "XPU nonzero_count kernel return wrong value[%d %s] in WhereIndex",
-            ret, XPUAPIErrorMsg[ret]));
-
-    memory::Copy(platform::CPUPlace(), static_cast<void*>(&true_num_cpu),
-                 context.GetPlace(), static_cast<void*>(true_num),
-                 sizeof(int32_t));
-
-    out->Resize(phi::make_ddim({static_cast<int64_t>(true_num_cpu), rank}));
-    auto out_data = out->mutable_data<int64_t>(context.GetPlace());
-    if (true_num_cpu == 0) {
-      return;
-    }
-
-    auto condition_shape = phi::vectorize<int>(dims);
-    ret = xpu::where(dev_ctx.x_context(), cond_data, out_data, condition_shape,
-                     true_num_cpu);
-    PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
-                      platform::errors::External(
-                          "XPU masked_select kernel return wrong value[%d %s]",
-                          ret, XPUAPIErrorMsg[ret]));
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_XPU_KERNEL(where_index, ops::WhereIndexXPUKernel<int>,
-                       ops::WhereIndexXPUKernel<bool>,
-                       ops::WhereIndexXPUKernel<float>);
-#endif
diff --git a/paddle/fluid/operators/where_op.cc b/paddle/fluid/operators/where_op.cc
index 0f10efefa137b698b59db23b67122df990cfa366..acbfee30670b120c292ab802d8d178295e4a4b57 100644
--- a/paddle/fluid/operators/where_op.cc
+++ b/paddle/fluid/operators/where_op.cc
@@ -117,8 +117,8 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(WhereGradNoNeedBufferVarsInferer, "X", "Y");
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-DELCARE_INFER_SHAPE_FUNCTOR(where, WhereInferShapeFunctor,
-                            PT_INFER_META(phi::WhereInferMeta));
+DECLARE_INFER_SHAPE_FUNCTOR(where, WhereInferShapeFunctor,
+                            PD_INFER_META(phi::WhereInferMeta));
 REGISTER_OPERATOR(where, ops::WhereOp, ops::WhereOpMaker,
                   ops::WhereOpGradMaker<paddle::framework::OpDesc>,
                   ops::WhereOpGradMaker<paddle::imperative::OpBase>,
diff --git a/paddle/fluid/platform/device/CMakeLists.txt b/paddle/fluid/platform/device/CMakeLists.txt
index ecad5340d71c1ae32339ab1c79bf37d947402747..cbf3fdd263b488e40a0bd94b9c12f4cb45a29d22 100644
--- a/paddle/fluid/platform/device/CMakeLists.txt
+++ b/paddle/fluid/platform/device/CMakeLists.txt
@@ -1,15 +1,3 @@
-IF(WITH_CUSTOM_DEVICE)
-cc_library(callback_manager SRCS callback_manager.cc DEPS enforce place)
-
-cc_library(device_guard SRCS device_guard.cc DEPS enforce place)
-
-cc_library(stream SRCS stream.cc DEPS callback_manager)
-
-cc_library(event SRCS event.cc DEPS enforce place)
-
-cc_library(device_base SRCS device_base.cc DEPS stream event callback_manager device_guard device_context flags)
-
-ENDIF()
 
 set(DEV_LIBS custom_device)
 
@@ -37,11 +25,3 @@ ENDIF()
 IF(WITH_MLU)
   add_subdirectory(mlu)
 ENDIF()
-
-# CUSTOM
-IF(WITH_CUSTOM_DEVICE)
-  add_subdirectory(custom)
-
-  cc_library(device_manager SRCS device_manager.cc DEPS custom_device)
-  set(GLOB_DEV_LIB device_manager custom_device CACHE INTERNAL "Global DEV library")
-ENDIF()
diff --git a/paddle/fluid/platform/device/custom/CMakeLists.txt b/paddle/fluid/platform/device/custom/CMakeLists.txt
deleted file mode 100644
index f39c60c0c68edcdaca4bd4a0b25a9ec07453280e..0000000000000000000000000000000000000000
--- a/paddle/fluid/platform/device/custom/CMakeLists.txt
+++ /dev/null
@@ -1,4 +0,0 @@
-IF(WITH_CUSTOM_DEVICE)
-cc_library(custom_device SRCS custom_device.cc DEPS device_base device_context)
-cc_test(custom_device_test SRCS custom_device_test.cc DEPS device_manager device_context )
-ENDIF()
diff --git a/paddle/fluid/platform/device/custom/enforce_custom.h b/paddle/fluid/platform/device/custom/enforce_custom.h
index fbdb4627aba2662a2a12cc933a3a4c6e61aa55d5..ba92b4ac7deaecda8ed6553651ef2f80510bea70 100644
--- a/paddle/fluid/platform/device/custom/enforce_custom.h
+++ b/paddle/fluid/platform/device/custom/enforce_custom.h
@@ -14,7 +14,10 @@ limitations under the License. */
 
 #pragma once
 #ifdef PADDLE_WITH_CUSTOM_DEVICE
-#include "paddle/fluid/platform/device/device_ext.h"
+#include <string>
+
+#include "paddle/fluid/platform/enforce.h"
+#include "paddle/phi/backends/device_ext.h"
 
 namespace paddle {
 namespace platform {
diff --git a/paddle/fluid/platform/device/device_wrapper.h b/paddle/fluid/platform/device/device_wrapper.h
index ba3461d8c14871561b2d069f9350698306e22366..6803a39a4fd7fd3099aa96e685b5b68acf96750e 100644
--- a/paddle/fluid/platform/device/device_wrapper.h
+++ b/paddle/fluid/platform/device/device_wrapper.h
@@ -40,10 +40,10 @@ limitations under the License. */
 #endif
 
 #ifdef PADDLE_WITH_CUSTOM_DEVICE
-#include "paddle/fluid/platform/device/callback_manager.h"
 #include "paddle/fluid/platform/device/custom/enforce_custom.h"
-#include "paddle/fluid/platform/device/device_guard.h"
-#include "paddle/fluid/platform/device/device_manager.h"
-#include "paddle/fluid/platform/device/event.h"
-#include "paddle/fluid/platform/device/stream.h"
+#include "paddle/phi/backends/callback_manager.h"
+#include "paddle/phi/backends/device_guard.h"
+#include "paddle/phi/backends/device_manager.h"
+#include "paddle/phi/backends/event.h"
+#include "paddle/phi/backends/stream.h"
 #endif
diff --git a/paddle/fluid/platform/device/gpu/cuda/cuda_helper.h b/paddle/fluid/platform/device/gpu/cuda/cuda_helper.h
index ab7d474c1ac38eee83addfba09eca11601c05976..a32db3a9921e3db0f9eee933b9f98264050d695b 100644
--- a/paddle/fluid/platform/device/gpu/cuda/cuda_helper.h
+++ b/paddle/fluid/platform/device/gpu/cuda/cuda_helper.h
@@ -19,6 +19,7 @@
 
 #include "paddle/fluid/platform/device/gpu/gpu_types.h"
 #include "paddle/fluid/platform/dynload/cublas.h"
+#include "paddle/fluid/platform/dynload/cublasLt.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/macros.h"
 
@@ -110,5 +111,28 @@ class CublasHandleHolder {
   mutable std::mutex mtx_;
 };
 
+class CublasLtHandleHolder {
+ public:
+  CublasLtHandleHolder() {
+    PADDLE_RETRY_CUDA_SUCCESS(dynload::cublasLtCreate(&handle_));
+  }
+  const cublasLtHandle_t& GetCublasLtHandle() const { return handle_; }
+
+  ~CublasLtHandleHolder() PADDLE_MAY_THROW {
+    PADDLE_RETRY_CUDA_SUCCESS(dynload::cublasLtDestroy(handle_));
+  }
+
+  inline void Call(const std::function<void(blasLtHandle_t)>& callback) const {
+    std::lock_guard<std::mutex> guard(mtx_);
+    callback(handle_);
+  }
+
+ private:
+  DISABLE_COPY_AND_ASSIGN(CublasLtHandleHolder);
+
+  cublasLtHandle_t handle_;
+  mutable std::mutex mtx_;
+};
+
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/fluid/platform/device/gpu/gpu_types.h b/paddle/fluid/platform/device/gpu/gpu_types.h
index d7362fe9cbd81d65299987da32067c2ad54084ce..d0b48eca5021bdd971eecd38642cc780e9d6a6bb 100644
--- a/paddle/fluid/platform/device/gpu/gpu_types.h
+++ b/paddle/fluid/platform/device/gpu/gpu_types.h
@@ -1,4 +1,5 @@
 // Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+// Copyright (c) 2022 NVIDIA Corporation. All rights reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -24,6 +25,7 @@
 #else
 #include <cuda_runtime.h>
 #include "paddle/fluid/platform/dynload/cublas.h"
+#include "paddle/fluid/platform/dynload/cublasLt.h"
 #include "paddle/fluid/platform/dynload/cudnn.h"
 #endif
 
@@ -70,6 +72,10 @@ DECLARE_TYPE_FOR_GPU(dnnHandle_t, cudnnHandle_t, miopenHandle_t);
 
 DECLARE_TYPE_FOR_GPU(blasHandle_t, cublasHandle_t, rocblas_handle);
 
+// TODO(Ming Huang): Since there is no blasLt handler,
+// use rocblas_handle for workround.
+DECLARE_TYPE_FOR_GPU(blasLtHandle_t, cublasLtHandle_t, rocblas_handle);
+
 using CUDAGraphID = unsigned long long;  // NOLINT
 
 #undef DECLARE_TYPE_FOR_GPU
diff --git a/paddle/fluid/platform/device/npu/hccl_helper.h b/paddle/fluid/platform/device/npu/hccl_helper.h
index efbc56bee720b0b5ed4e6aa91d6a7ad0ad14b197..134ec04030d75f5ab98bd9789b8b5fe87341333f 100644
--- a/paddle/fluid/platform/device/npu/hccl_helper.h
+++ b/paddle/fluid/platform/device/npu/hccl_helper.h
@@ -53,6 +53,23 @@ inline HcclDataType ToHCCLDataType(framework::proto::VarType::Type type) {
   }
 }
 
+inline HcclDataType ToHCCLDataType(experimental::DataType type) {
+  if (type == experimental::DataType::FLOAT32) {
+    return HCCL_DATA_TYPE_FP32;
+  } else if (type == experimental::DataType::FLOAT16) {
+    return HCCL_DATA_TYPE_FP16;
+  } else if (type == experimental::DataType::INT64) {
+    return HCCL_DATA_TYPE_INT64;
+  } else if (type == experimental::DataType::INT32) {
+    return HCCL_DATA_TYPE_INT32;
+  } else if (type == experimental::DataType::INT8) {
+    return HCCL_DATA_TYPE_INT8;
+  } else {
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "This datatype in hccl is not supported."));
+  }
+}
+
 // NOTE(minqiyang): according to the ncclGroupEnd documentations:
 // https://docs.nvidia.com/deeplearning/sdk/nccl-api/ncclapidoc.html,
 // ncclGroupEnd will wait for all communicators to be initialized, which will
diff --git a/paddle/fluid/platform/device/xpu/xpu2_op_list.h b/paddle/fluid/platform/device/xpu/xpu2_op_list.h
index e6b08ed7bc340b5150078fe0deb6a3187fb8e17b..14f516235a720c1fb8f46fe6606ac8f0bdb149f9 100644
--- a/paddle/fluid/platform/device/xpu/xpu2_op_list.h
+++ b/paddle/fluid/platform/device/xpu/xpu2_op_list.h
@@ -249,7 +249,7 @@ XPUOpMap& get_kl2_ops() {
       {"not_equal", XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()),
                                   pOpKernelType(vartype::INT32, XPUPlace()),
                                   pOpKernelType(vartype::FP32, XPUPlace())})},
-      {"one_hot_v2", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
+      {"one_hot_v2", XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace()),
                                    pOpKernelType(vartype::INT64, XPUPlace())})},
       {"pool2d_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
                                     pOpKernelType(vartype::FP16, XPUPlace())})},
@@ -323,6 +323,8 @@ XPUOpMap& get_kl2_ops() {
        XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
       {"split", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
                               pOpKernelType(vartype::INT32, XPUPlace())})},
+      {"square_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"square", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
       {"squeeze2_grad",
        XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()),
                      pOpKernelType(vartype::INT64, XPUPlace()),
@@ -349,6 +351,8 @@ XPUOpMap& get_kl2_ops() {
                                   pOpKernelType(vartype::FP16, XPUPlace())})},
       {"tanh", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
                              pOpKernelType(vartype::FP16, XPUPlace())})},
+      {"tril_triu", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
+                                  pOpKernelType(vartype::INT32, XPUPlace())})},
       {"tile", XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace()),
                              pOpKernelType(vartype::INT64, XPUPlace()),
                              pOpKernelType(vartype::BOOL, XPUPlace()),
diff --git a/paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h b/paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h
index c5dff84723ccf4f40065f5a1d13cf5cdce8b3a0f..ce9b09f60ca352a0cc33d2e477134ca2e10c2ad2 100644
--- a/paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h
+++ b/paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h
@@ -56,6 +56,9 @@ XPUOpMap& get_kp_ops() {
       {"hard_shrink", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
       {"hard_sigmoid",
        XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"swish", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"thresholded_relu",
+       XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
   };
 
   return s_xpu_kp_kernels;
diff --git a/paddle/fluid/platform/device/xpu/xpu_op_list.cc b/paddle/fluid/platform/device/xpu/xpu_op_list.cc
index 6127fcfa8def6f2a6723416c6a29bd41a4871b74..073851433620130a3c3c6d256a4d6ca3b3f74555 100644
--- a/paddle/fluid/platform/device/xpu/xpu_op_list.cc
+++ b/paddle/fluid/platform/device/xpu/xpu_op_list.cc
@@ -23,12 +23,9 @@ namespace paddle {
 namespace platform {
 
 bool is_xpu_support_op(const std::string& op_name, const pOpKernelType& type) {
-  auto& ops = get_kl1_ops();
   auto v = get_xpu_version(type.place_.device);
-  if (v == phi::backends::xpu::XPUVersion::XPU2) {
-    ops = get_kl2_ops();
-  }
-
+  auto& ops = (v == phi::backends::xpu::XPUVersion::XPU1) ? get_kl1_ops()
+                                                          : get_kl2_ops();
   if (ops.find(op_name) != ops.end() &&
       ops[op_name].find(type) != ops[op_name].end()) {
     return true;
@@ -78,12 +75,9 @@ bool is_in_xpu_black_list(const std::string& op_name) {
 #ifdef PADDLE_WITH_XPU_KP
 bool is_xpu_kp_support_op(const std::string& op_name,
                           const pOpKernelType& type) {
-  auto& ops = get_kl1_ops();
   auto v = get_xpu_version(type.place_.device);
-  if (v == phi::backends::xpu::XPUVersion::XPU2) {
-    ops = get_kp_ops();
-  }
-
+  auto& ops = (v == phi::backends::xpu::XPUVersion::XPU1) ? get_kl1_ops()
+                                                          : get_kp_ops();
   if (ops.find(op_name) != ops.end() &&
       ops[op_name].find(type) != ops[op_name].end()) {
     return true;
@@ -117,6 +111,22 @@ bool is_in_xpu_kpwhite_list(const std::string& op_name) {
 }
 #endif
 
+#ifdef PADDLE_WITH_XPU_KP
+std::vector<vartype::Type> get_xpu_kp_op_support_type(
+    const std::string& op_name, phi::backends::xpu::XPUVersion version) {
+  std::vector<vartype::Type> res;
+  auto& ops = version == phi::backends::xpu::XPUVersion::XPU1 ? get_kl1_ops()
+                                                              : get_kp_ops();
+  if (ops.find(op_name) != ops.end()) {
+    XPUKernelSet& type_set = ops[op_name];
+    for (auto& item : type_set) {
+      res.push_back(item.data_type_);
+    }
+  }
+  return res;
+}
+#endif
+
 std::vector<vartype::Type> get_xpu_op_support_type(
     const std::string& op_name, phi::backends::xpu::XPUVersion version) {
   std::vector<vartype::Type> res;
diff --git a/paddle/fluid/platform/device/xpu/xpu_op_list.h b/paddle/fluid/platform/device/xpu/xpu_op_list.h
index 455a38e36fe0ad756021eb5ac23c012f65cc0c6a..60926dd9a5660ee13be7d61eb453740207994029 100644
--- a/paddle/fluid/platform/device/xpu/xpu_op_list.h
+++ b/paddle/fluid/platform/device/xpu/xpu_op_list.h
@@ -31,6 +31,8 @@ bool is_in_xpu_black_list(const std::string& op_name);
 bool is_xpu_kp_support_op(const std::string& op_name,
                           const pOpKernelType& type);
 bool is_in_xpu_kpwhite_list(const std::string& op_name);
+std::vector<vartype::Type> get_xpu_kp_op_support_type(
+    const std::string& op_name, phi::backends::xpu::XPUVersion version);
 #endif
 
 std::vector<vartype::Type> get_xpu_op_support_type(
diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc
index 6a7956628f80464740e3cd812b0b663cc36d6fc6..18ac979b48ef39fbab841927bf41c9c8579a5766 100644
--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@@ -1,4 +1,6 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+Copyright (c) 2022 NVIDIA Corporation. All rights reserved.
+
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
@@ -465,6 +467,9 @@ CUDAContext::CUDAContext(const CUDAPlace& place,
   InitCuBlasContext();
   InitCuDNNContext();
 #ifndef PADDLE_WITH_HIP
+#if CUDA_VERSION >= 11060
+  InitCuBlasLtContext();
+#endif
   InitCuSparseContext();
   InitCuSolverContext();
 #endif
@@ -476,6 +481,9 @@ void CUDAContext::SetStream(gpuStream_t stream) {
     DestoryCuDNNContext();
     DestoryCuBlasContext();
 #ifndef PADDLE_WITH_HIP
+#if CUDA_VERSION >= 11060
+    DestoryCuBlasLtContext();
+#endif
     DestoryCuSolverContext();
 #endif
 
@@ -485,6 +493,9 @@ void CUDAContext::SetStream(gpuStream_t stream) {
     InitCuBlasContext();
     InitCuDNNContext();
 #ifndef PADDLE_WITH_HIP
+#if CUDA_VERSION >= 11060
+    InitCuBlasLtContext();
+#endif
     InitCuSolverContext();
 #endif
   }
@@ -495,6 +506,9 @@ CUDAContext::~CUDAContext() {
   DestoryCuDNNContext();
   DestoryCuBlasContext();
 #ifndef PADDLE_WITH_HIP
+#if CUDA_VERSION >= 11060
+  InitCuBlasLtContext();
+#endif
   DestoryCuSparseContext();
   DestoryCuSolverContext();
 #endif
@@ -551,6 +565,14 @@ cublasHandle_t CUDADeviceContext::cublas_handle() const {
   }
   return phi::GPUContext::cublas_handle();
 }
+#if CUDA_VERSION >= 11060
+cublasLtHandle_t CUDADeviceContext::cublaslt_handle() const {
+  if (thread_ctx_.count(this)) {
+    return context()->CublasLtHandle()->GetCublasLtHandle();
+  }
+  return phi::GPUContext::cublaslt_handle();
+}
+#endif
 cusparseHandle_t CUDADeviceContext::cusparse_handle() const {
   if (thread_ctx_.count(this)) {
     return context()->CusparseHandle()->GetCusparseHandle();
@@ -903,7 +925,7 @@ MKLDNNDeviceContext::BlobPtr_t<void> MKLDNNDeviceContext::GetBlob(
 CustomDeviceContext::CustomDeviceContext(CustomPlace place)
     : phi::CustomContext(place) {
   Init();
-  stream_.reset(new platform::stream::Stream(place, stream()));
+  stream_.reset(new phi::stream::Stream(place, stream()));
 }
 
 CustomDeviceContext::~CustomDeviceContext() {}
diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h
index e9124dfc1f8a7ad3a88c843c1a1573ba3503d80b..e104170ca24954b356232af4e0c3e97cfb222858 100644
--- a/paddle/fluid/platform/device_context.h
+++ b/paddle/fluid/platform/device_context.h
@@ -1,4 +1,6 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+Copyright (c) 2022 NVIDIA Corporation. All rights reserved.
+
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
@@ -29,6 +31,7 @@ limitations under the License. */
 #ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/platform/device/gpu/gpu_helper.h"
 #include "paddle/fluid/platform/dynload/cublas.h"
+#include "paddle/fluid/platform/dynload/cublasLt.h"
 #include "paddle/fluid/platform/dynload/cudnn.h"
 #include "paddle/fluid/platform/dynload/cusolver.h"
 #include "paddle/fluid/platform/dynload/cusparse.h"
@@ -72,8 +75,8 @@ limitations under the License. */
 #include "paddle/fluid/platform/device/npu/npu_stream.h"
 #endif
 
-#include "paddle/fluid/platform/device/device_ext.h"
-#include "paddle/fluid/platform/device/stream.h"
+#include "paddle/phi/backends/device_ext.h"
+#include "paddle/phi/backends/stream.h"
 
 #if !defined(PADDLE_WITH_XPU_KP) || defined(__xpu_on_host__)
 #include "unsupported/Eigen/CXX11/Tensor"
@@ -332,6 +335,12 @@ class CUDAContext {
   }
 
 #ifndef PADDLE_WITH_HIP
+#if CUDA_VERSION >= 11060
+  const std::unique_ptr<CublasLtHandleHolder>& CublasLtHandle() const {
+    return cublaslt_handle_;
+  }
+#endif
+
   const std::unique_ptr<CusparseHandleHolder>& CusparseHandle() const {
     return cusparse_handle_;
   }
@@ -348,6 +357,14 @@ class CUDAContext {
   }
 
 #ifndef PADDLE_WITH_HIP
+#if CUDA_VERSION >= 11060
+  /*! \brief  Call cublasLt function safely. */
+  inline void CublasLtCall(
+      const std::function<void(blasLtHandle_t)>& callback) const {
+    cublaslt_handle_->Call(callback);
+  }
+#endif
+
   /*! \brief  Call cusparse function safely. */
   inline void CusparseCall(
       const std::function<void(phi::sparseHandle_t)>& callback) const {
@@ -394,6 +411,12 @@ class CUDAContext {
 #endif
 
 #ifndef PADDLE_WITH_HIP
+#if CUDA_VERSION >= 11060
+  void InitCuBlasLtContext() {
+    cublaslt_handle_.reset(new CublasLtHandleHolder());
+  }
+#endif
+
   void InitCuSparseContext() {
     cusparse_handle_.reset(new CusparseHandleHolder(RawStream()));
   }
@@ -472,6 +495,10 @@ class CUDAContext {
   }
 
 #ifndef PADDLE_WITH_HIP
+#if CUDA_VERSION >= 11060
+  void DestoryCuBlasLtContext() { cublaslt_handle_.reset(); }
+#endif
+
   void DestoryCuSparseContext() { cusparse_handle_.reset(); }
 #endif
 
@@ -497,6 +524,9 @@ class CUDAContext {
   std::unique_ptr<CublasHandleHolder> cublas_tensor_core_handle_;
   std::unique_ptr<CublasHandleHolder> cublas_tf32_tensor_core_handle_;
 #ifndef PADDLE_WITH_HIP
+#if CUDA_VERSION >= 11060
+  std::unique_ptr<CublasLtHandleHolder> cublaslt_handle_;
+#endif
   cusolverDnHandle_t cusolver_dn_handle_;
   std::unique_ptr<CusparseHandleHolder> cusparse_handle_;
 #endif
@@ -559,6 +589,7 @@ class CUDADeviceContext : public phi::GPUContext {
   rocblas_handle cublas_handle() const;
 #else
   cublasHandle_t cublas_handle() const;
+  cublasLtHandle_t cublaslt_handle() const;
   cusparseHandle_t cusparse_handle() const;
 #endif
 
@@ -838,7 +869,7 @@ class CustomDeviceContext : public phi::CustomContext {
   void WaitStreamCallback() const { return stream_->WaitCallback(); }
 
  private:
-  std::shared_ptr<platform::stream::Stream> stream_;
+  std::shared_ptr<phi::stream::Stream> stream_;
 };
 template <>
 struct DefaultDeviceContextType<platform::CustomPlace> {
diff --git a/paddle/fluid/platform/dynload/CMakeLists.txt b/paddle/fluid/platform/dynload/CMakeLists.txt
index 87aa5dcde626bafd5e605cc9e35de7cf1b589569..1f95e1212710412cfae1c7284f08a341d8d0dc48 100644
--- a/paddle/fluid/platform/dynload/CMakeLists.txt
+++ b/paddle/fluid/platform/dynload/CMakeLists.txt
@@ -46,8 +46,6 @@ if (WITH_MKLML)
     cc_library(dynload_mklml SRCS mklml.cc DEPS dynamic_loader mklml phi_dynload_mklml)
 endif()
 
-cc_library(dynload_lapack SRCS lapack.cc DEPS dynamic_loader phi_dynload_lapack)
-add_dependencies(dynload_lapack extern_lapack)
 # TODO(TJ): add iomp, mkldnn?
 
 if (MKL_FOUND AND WITH_ONEMKL)
diff --git a/paddle/fluid/platform/dynload/lapack.h b/paddle/fluid/platform/dynload/lapack.h
deleted file mode 100644
index 59e04dbd2a1e78912d16f01c2d26048d9c74aed5..0000000000000000000000000000000000000000
--- a/paddle/fluid/platform/dynload/lapack.h
+++ /dev/null
@@ -1,68 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <complex>
-#include <mutex>
-#include "paddle/phi/backends/dynload/lapack.h"
-#include "paddle/phi/common/complex.h"
-
-namespace paddle {
-namespace platform {
-namespace dynload {
-
-/**
- * The following macro definition can generate structs
- * (for each function) to dynamic load lapack routine
- * via operator overloading.
- */
-#define DYNAMIC_LOAD_LAPACK_WRAP(__name)                     \
-  using DynLoad__##__name = phi::dynload::DynLoad__##__name; \
-  extern DynLoad__##__name __name
-
-#define DECLARE_DYNAMIC_LOAD_LAPACK_WRAP(__name) \
-  DYNAMIC_LOAD_LAPACK_WRAP(__name)
-
-#define LAPACK_ROUTINE_EACH(__macro) \
-  __macro(dgetrf_);                  \
-  __macro(sgetrf_);                  \
-  __macro(zheevd_);                  \
-  __macro(cheevd_);                  \
-  __macro(dsyevd_);                  \
-  __macro(ssyevd_);                  \
-  __macro(dgeev_);                   \
-  __macro(sgeev_);                   \
-  __macro(zgeev_);                   \
-  __macro(cgeev_);                   \
-  __macro(dgels_);                   \
-  __macro(sgels_);                   \
-  __macro(dgelsd_);                  \
-  __macro(sgelsd_);                  \
-  __macro(dgelsy_);                  \
-  __macro(sgelsy_);                  \
-  __macro(dgelss_);                  \
-  __macro(sgelss_);                  \
-  __macro(zpotrs_);                  \
-  __macro(cpotrs_);                  \
-  __macro(dpotrs_);                  \
-  __macro(spotrs_);
-
-LAPACK_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_LAPACK_WRAP);
-
-#undef DYNAMIC_LOAD_LAPACK_WRAP
-
-}  // namespace dynload
-}  // namespace platform
-}  // namespace paddle
diff --git a/paddle/fluid/platform/fast_divmod.h b/paddle/fluid/platform/fast_divmod.h
index f26c4fdd17ad7290c71eddf80874f7fa9e115e4f..39eefab774dbe84801bda98c9821d8c801e7fd25 100644
--- a/paddle/fluid/platform/fast_divmod.h
+++ b/paddle/fluid/platform/fast_divmod.h
@@ -15,7 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include <cstdint>
-#include "paddle/fluid/platform/aligned_vector.h"
+#include "paddle/phi/kernels/funcs/aligned_vector.h"
 
 #define INT_BITS 32
 
@@ -25,7 +25,7 @@ namespace platform {
 struct FastDivMod {
   // 1st value represents the result of input number divides by recorded divisor
   // 2nd value represents the result of input number modulo by recorded divisor
-  using DivModT = AlignedVector<uint32_t, 2>;
+  using DivModT = phi::AlignedVector<uint32_t, 2>;
 
   FastDivMod() {}
   HOSTDEVICE FastDivMod(uint32_t d) : divisor(d) {
diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc
index 71fd0d20143a08b065eb596d6f5f9ac6531057f4..293a71dbd968c6068579527bf7294453d39f96d9 100644
--- a/paddle/fluid/platform/init.cc
+++ b/paddle/fluid/platform/init.cc
@@ -28,6 +28,7 @@ limitations under the License. */
 #include "paddle/fluid/platform/device/device_wrapper.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/init.h"
+#include "paddle/fluid/platform/os_info.h"
 #include "paddle/fluid/platform/place.h"
 
 #ifdef PADDLE_WITH_XPU
@@ -54,7 +55,7 @@ limitations under the License. */
 #include "paddle/fluid/platform/device/ipu/ipu_info.h"
 #endif
 
-#include "paddle/fluid/framework/custom_kernel.h"
+#include "paddle/phi/core/custom_kernel.h"
 
 DECLARE_int32(paddle_num_threads);
 PADDLE_DEFINE_EXPORTED_int32(
@@ -144,7 +145,7 @@ void InitCupti() {
 #ifdef PADDLE_WITH_CUSTOM_DEVICE
 void LoadCustomDevice(const std::string &library_dir) {
   LOG(INFO) << "Try loading custom device libs from: [" << library_dir << "]";
-  std::vector<std::string> libs = platform::ListAllLibraries(library_dir);
+  std::vector<std::string> libs = phi::ListAllLibraries(library_dir);
   for (const auto &lib_path : libs) {
     auto dso_handle = dlopen(lib_path.c_str(), RTLD_NOW);
     PADDLE_ENFORCE_NOT_NULL(
@@ -152,15 +153,17 @@ void LoadCustomDevice(const std::string &library_dir) {
         platform::errors::InvalidArgument(
             "Fail to open library: %s with error: %s", lib_path, dlerror()));
 
-    platform::LoadCustomRuntimeLib(lib_path, dso_handle);
-    framework::LoadCustomKernelLib(lib_path, dso_handle);
+    phi::LoadCustomRuntimeLib(lib_path, dso_handle);
   }
+  phi::CustomKernelMap::Instance().RegisterCustomKernels();
   LOG(INFO) << "Finished in LoadCustomDevice with libs_path: [" << library_dir
             << "]";
 }
 #endif
 
 void InitDevices() {
+  // set name at the entry point of Paddle
+  platform::SetCurrentThreadName("MainThread");
 // CUPTI attribute should be set before any CUDA context is created (see CUPTI
 // documentation about CUpti_ActivityAttribute).
 #ifdef PADDLE_WITH_CUDA
@@ -256,9 +259,9 @@ void InitDevices(const std::vector<int> devices) {
       LOG(INFO) << "ENV [CUSTOM_DEVICE_ROOT]=" << custom_kernel_root;
       LoadCustomDevice(custom_kernel_root);
 
-      auto device_types = platform::DeviceManager::GetAllCustomDeviceTypes();
+      auto device_types = phi::DeviceManager::GetAllCustomDeviceTypes();
       for (auto &dev_type : device_types) {
-        auto device_count = platform::DeviceManager::GetDeviceCount(dev_type);
+        auto device_count = phi::DeviceManager::GetDeviceCount(dev_type);
         LOG(INFO) << "CustomDevice: " << dev_type
                   << ", visible devices count: " << device_count;
         for (size_t i = 0; i < device_count; i++) {
diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h
index 01de7349f4823a66b2d180f3d1493477f361273a..1254331835bbdf4dfc698021a52208d846651dd5 100644
--- a/paddle/fluid/platform/mkldnn_reuse.h
+++ b/paddle/fluid/platform/mkldnn_reuse.h
@@ -618,7 +618,7 @@ class BinaryMKLDNNHandler
                       const dnnl::engine engine, platform::Place cpu_place,
                       const Tensor* x, const Tensor* y, Tensor* z,
                       float scale_x, float scale_y, float scale_z,
-                      const dnnl::post_ops& post_ops = dnnl::post_ops())
+                      const dnnl::post_ops& post_ops = dnnl::post_ops{})
       : platform::MKLDNNHandlerNoCachingT<T, dnnl::binary>(engine, cpu_place) {
     PADDLE_ENFORCE_EQ(
         x->layout(), DataLayout::kMKLDNN,
@@ -676,8 +676,8 @@ class BinaryMKLDNNHandler
     const auto dst_md = memory::desc(dst_tz, platform::MKLDNNGetDataType<T>(),
                                      MKLDNNMemoryFormat::any);
 
-    auto attributes = CreateAttributes(algo, scale_x, scale_y, scale_z);
-    attributes.set_post_ops(post_ops);
+    auto attributes =
+        CreateAttributes(algo, scale_x, scale_y, scale_z, post_ops);
 
     this->AcquireForwardPrimitiveDescriptor(attributes, algo, src0_md, src1_md,
                                             dst_md);
@@ -690,10 +690,9 @@ class BinaryMKLDNNHandler
   }
 
  private:
-  static inline dnnl::primitive_attr CreateAttributes(dnnl::algorithm op,
-                                                      float scale_x,
-                                                      float scale_y,
-                                                      float scale_z) {
+  static inline dnnl::primitive_attr CreateAttributes(
+      dnnl::algorithm op, float scale_x, float scale_y, float scale_z,
+      dnnl::post_ops post_ops = dnnl::post_ops{}) {
     // Scales set in attributes for inputs contibute to the output equation
     // in the following way (assuming no broadcasting takes place):
     // output_i = scale_0 * x_i <+ or *> scale_1 * y_i;
@@ -718,6 +717,7 @@ class BinaryMKLDNNHandler
                           {scale_0});
     attributes.set_scales(/* input_y_id = */ DNNL_ARG_SRC_1, /* mask = */ 0,
                           {scale_1});
+    if (post_ops.len() > 0) attributes.set_post_ops(post_ops);
     return attributes;
   }
 };
diff --git a/paddle/fluid/platform/os_info_test.cc b/paddle/fluid/platform/os_info_test.cc
index b309bb985122d8bbe28a8014bca51b4a5a6b9b10..b3311f1d19e6304a0b232cd936397559224e9b96 100644
--- a/paddle/fluid/platform/os_info_test.cc
+++ b/paddle/fluid/platform/os_info_test.cc
@@ -30,8 +30,7 @@ TEST(ThreadInfo, TestThreadNameUtils) {
   using paddle::platform::GetCurrentThreadName;
   using paddle::platform::SetCurrentThreadName;
   using paddle::platform::GetAllThreadNames;
-  EXPECT_EQ("unset", GetCurrentThreadName());
-  EXPECT_TRUE(SetCurrentThreadName("MainThread"));
+  SetCurrentThreadName("MainThread");
   EXPECT_FALSE(SetCurrentThreadName("MainThread"));
   auto names = GetAllThreadNames();
   EXPECT_TRUE(names.find(GetCurrentThreadStdId()) != names.end());
diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc
index 866bf3c66aa2a077f406921da15ee1fa2acd7d40..940fc98d3b32021ae8b278305c54d8819292daaf 100644
--- a/paddle/fluid/platform/profiler.cc
+++ b/paddle/fluid/platform/profiler.cc
@@ -77,7 +77,9 @@ RecordEvent::RecordEvent(const char *name, const TracerEventType type,
 #endif
 #endif
   if (FLAGS_enable_host_event_recorder_hook == false) {
-    OriginalConstruct(name, role, "none");
+    if (g_state != ProfilerState::kDisabled) {  // avoid temp string
+      OriginalConstruct(name, role, "none");
+    }
     return;
   }
   if (UNLIKELY(HostTraceLevel::GetInstance().NeedTrace(level) == false)) {
@@ -165,8 +167,8 @@ void RecordEvent::End() {
   }
 #endif
 #endif
-  uint64_t end_ns = PosixInNsec();
   if (LIKELY(FLAGS_enable_host_event_recorder_hook && is_enabled_)) {
+    uint64_t end_ns = PosixInNsec();
     if (LIKELY(shallow_copy_name_ != nullptr)) {
       HostEventRecorder::GetInstance().RecordEvent(
           shallow_copy_name_, start_ns_, end_ns, role_, type_);
@@ -190,6 +192,7 @@ void RecordEvent::End() {
   // lock is not needed, the code below is thread-safe
   DeviceTracer *tracer = GetDeviceTracer();
   if (tracer) {
+    uint64_t end_ns = PosixInNsec();
     tracer->AddCPURecords(CurAnnotationName(), start_ns_, end_ns, BlockDepth(),
                           g_thread_id);
   }
@@ -489,6 +492,10 @@ void NvprofDisableRecordEvent() { g_enable_nvprof_hook = false; }
 
 void EnableHostEventRecorder() { FLAGS_enable_host_event_recorder_hook = true; }
 
+void DisableHostEventRecorder() {
+  FLAGS_enable_host_event_recorder_hook = false;
+}
+
 std::string PrintHostEvents() {
   std::ostringstream oss;
   auto host_evt_sec = HostEventRecorder::GetInstance().GatherEvents();
diff --git a/paddle/fluid/platform/profiler.h b/paddle/fluid/platform/profiler.h
index 122e19b7c28080c3c7b38ba90cd8d5c3d61e6ecf..78275341cbbf747d618eeb5f96352325d29bbafb 100644
--- a/paddle/fluid/platform/profiler.h
+++ b/paddle/fluid/platform/profiler.h
@@ -216,6 +216,7 @@ void NvprofEnableRecordEvent();
 void NvprofDisableRecordEvent();
 
 void EnableHostEventRecorder();
+void DisableHostEventRecorder();
 
 // Defined for UT
 std::string PrintHostEvents();
diff --git a/paddle/fluid/platform/profiler/host_event_recorder.h b/paddle/fluid/platform/profiler/host_event_recorder.h
index 49f9362527591744dd0685375e0244673a7b3081..afd4135246556624cb022243e0e98b5ad9f9f6da 100644
--- a/paddle/fluid/platform/profiler/host_event_recorder.h
+++ b/paddle/fluid/platform/profiler/host_event_recorder.h
@@ -189,7 +189,10 @@ struct ThreadEventSection {
 
 class ThreadEventRecorder {
  public:
-  ThreadEventRecorder() { thread_id_ = GetCurrentThreadSysId(); }
+  ThreadEventRecorder() {
+    thread_id_ = GetCurrentThreadSysId();
+    thread_name_ = GetCurrentThreadName();
+  }
 
   DISABLE_COPY_AND_ASSIGN(ThreadEventRecorder);
 
@@ -202,7 +205,7 @@ class ThreadEventRecorder {
 
   ThreadEventSection GatherEvents() {
     ThreadEventSection thr_sec;
-    thr_sec.thread_name = GetCurrentThreadName();
+    thr_sec.thread_name = thread_name_;
     thr_sec.thread_id = thread_id_;
     thr_sec.events = std::move(base_evt_cntr_.Reduce());
     return thr_sec;
@@ -210,6 +213,7 @@ class ThreadEventRecorder {
 
  private:
   uint64_t thread_id_;
+  std::string thread_name_;
   EventContainer<CommonEvent> base_evt_cntr_;
 };
 
diff --git a/paddle/fluid/platform/profiler/profiler.cc b/paddle/fluid/platform/profiler/profiler.cc
index 35dbc96874d3c8e09fec3ad9b440619b180647fe..46cbb3358c6c4d6b2b17cfc1e549db6376931389 100644
--- a/paddle/fluid/platform/profiler/profiler.cc
+++ b/paddle/fluid/platform/profiler/profiler.cc
@@ -95,7 +95,7 @@ std::unique_ptr<ProfilerResult> Profiler::Stop() {
       collector.ThreadNames();
   for (const auto& kv : thread_names) {
     extrainfo.AddExtraInfo(string_format(std::string("%llu"), kv.first),
-                           kv.second);
+                           std::string("%s"), kv.second.c_str());
   }
   return std::unique_ptr<ProfilerResult>(
       new platform::ProfilerResult(std::move(tree), extrainfo));
diff --git a/paddle/fluid/platform/stream_callback_manager.cc b/paddle/fluid/platform/stream_callback_manager.cc
index 7fce0296d437a096cd9aa99a30476fc44e34c703..7148afee273fda61f991e386353fc323dd2f2ea2 100644
--- a/paddle/fluid/platform/stream_callback_manager.cc
+++ b/paddle/fluid/platform/stream_callback_manager.cc
@@ -80,10 +80,10 @@ void StreamCallbackManager<Stream>::AddCallback(
 #endif
 
 #if PADDLE_WITH_MLU
-  VLOG(3) << "MLULaunchCallback at stream: " << stream_;
-  LOG(ERROR) << "failed to call MLULaunchCallback, "
-             << "because mlu not support StreamAddCallback yet. "
-             << "function: " << func;
+  VLOG(3) << "MLULaunchCallback at stream: " << stream_
+          << " Failed to call MLULaunchCallback, "
+          << "because mlu not support StreamAddCallback yet. "
+          << "function: " << func;
 #endif
 }
 
diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt
index 48d42f803a8248f733c6b4b0a9a52c2c70a3ef32..7b223f7ed27e2249d84539a81312658f8c2260f0 100644
--- a/paddle/fluid/pybind/CMakeLists.txt
+++ b/paddle/fluid/pybind/CMakeLists.txt
@@ -2,7 +2,7 @@ set(PYBIND_DEPS init pybind python proto_desc memory executor fleet_wrapper box_
   feed_fetch_method pass generate_pass pass_builder parallel_executor profiler layer tracer engine scope_pool
   analysis_predictor imperative_profiler imperative_flag save_load_util dlpack_tensor device_context
   gloo_wrapper infer_io_utils heter_wrapper generator op_version_registry ps_gpu_wrapper custom_operator
-  cost_model cuda_graph_with_memory_pool fleet_executor global_utils phi_utils tcp_store)
+  cost_model cuda_graph_with_memory_pool fleet_executor global_utils phi_utils tcp_store new_profiler)
 
 if (WITH_PSCORE)
   set(PYBIND_DEPS ${PYBIND_DEPS} ps_service)
@@ -44,6 +44,9 @@ endif()
 if(NOT WIN32)
   set(PYBIND_DEPS ${PYBIND_DEPS} data_loader)
   set(PYBIND_DEPS ${PYBIND_DEPS} mmap_allocator)
+  if (WITH_GPU)
+    set(PYBIND_DEPS ${PYBIND_DEPS} cuda_ipc_allocator)
+  endif()
   if (WITH_NCCL OR WITH_RCCL)
     set(PYBIND_DEPS ${PYBIND_DEPS} nccl_context)
     set(PYBIND_DEPS ${PYBIND_DEPS} heter_ccl_context)
@@ -80,11 +83,21 @@ set(PYBIND_SRCS
   communication.cc
   cuda_streams_py.cc)
 
+if (WITH_ONNXRUNTIME)
+  set(PYBIND_DEPS ${PYBIND_DEPS} onnxruntime_predictor)
+endif()
+
 if(NOT ON_INFER)
   set (PYBIND_DEPS ${PYBIND_DEPS} processgroup eager_reducer)
   if (WITH_NCCL)
     set (PYBIND_DEPS ${PYBIND_DEPS} processgroup_nccl)
   endif()
+  if (WITH_GLOO)
+    set (PYBIND_DEPS ${PYBIND_DEPS} processgroup_gloo)
+  endif()
+  if(WITH_ASCEND_CL)
+    set (PYBIND_DEPS ${PYBIND_DEPS} processgroup_hccl)
+  endif()
   set(PYBIND_SRCS ${PYBIND_SRCS} distributed_py.cc)
 endif()
 
@@ -146,6 +159,10 @@ if(WITH_PYTHON)
     list(APPEND OP_FUNCTION_GENERETOR_DEPS hccl_context)
   endif(WITH_ASCEND_CL)
 
+  if (WITH_ONNXRUNTIME)
+    list(APPEND OP_FUNCTION_GENERETOR_DEPS onnxruntime_predictor)
+  endif()
+
   if(WITH_CNCL)
     list(APPEND OP_FUNCTION_GENERETOR_DEPS cncl_context)
   endif(WITH_CNCL)
@@ -236,6 +253,19 @@ if(WITH_PYTHON)
         list(APPEND OP_IMPL_DEPS ${op_impl_path}/mkldnn.dll)
         list(APPEND EAGER_OP_IMPL_DEPS ${op_impl_path}/mkldnn.dll)
     endif()
+    if(WITH_ONNXRUNTIME)
+      ADD_CUSTOM_COMMAND(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/paddle2onnx.dll
+        COMMAND ${CMAKE_COMMAND} -E copy ${PADDLE2ONNX_SHARED_LIB} ${CMAKE_CURRENT_BINARY_DIR}
+        DEPENDS paddle2onnx)
+      list(APPEND OP_IMPL_DEPS ${CMAKE_CURRENT_BINARY_DIR}/paddle2onnx.dll)
+      list(APPEND EAGER_OP_IMPL_DEPS ${CMAKE_CURRENT_BINARY_DIR}/paddle2onnx.dll)
+
+      ADD_CUSTOM_COMMAND(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/onnxruntime.dll
+        COMMAND ${CMAKE_COMMAND} -E copy ${ONNXRUNTIME_SHARED_LIB} ${CMAKE_CURRENT_BINARY_DIR} 
+        DEPENDS onnxruntime)
+      list(APPEND OP_IMPL_DEPS ${CMAKE_CURRENT_BINARY_DIR}/onnxruntime.dll)
+      list(APPEND EAGER_OP_IMPL_DEPS ${CMAKE_CURRENT_BINARY_DIR}/onnxruntime.dll)
+    endif()
 
     add_custom_command(OUTPUT ${impl_file}
       COMMAND ${CMAKE_BINARY_DIR}/paddle/fluid/pybind/op_function_generator_retry.bat
@@ -254,6 +284,28 @@ if(WITH_PYTHON)
     # copy these *.so to current directory and append current directory to
     # LD_LIBRARY_PATH. This is different with Windows platformm, which search
     # *.dll in current directory automatically.
+    if(WITH_ONNXRUNTIME)
+      if (APPLE)
+        set(PADDLE2ONNX_PYBIND_OUT ${CMAKE_CURRENT_BINARY_DIR}/libpaddle2onnx.dylib)
+        set(ONNXRUNTIME_PYBIND_OUT ${CMAKE_CURRENT_BINARY_DIR}/libonnxruntime.dylib)
+      else()
+        set(PADDLE2ONNX_PYBIND_OUT ${CMAKE_CURRENT_BINARY_DIR}/libpaddle2onnx.so)
+        set(ONNXRUNTIME_PYBIND_OUT ${CMAKE_CURRENT_BINARY_DIR}/libonnxruntime.so)
+      endif()
+
+      ADD_CUSTOM_COMMAND(OUTPUT ${PADDLE2ONNX_PYBIND_OUT}
+        COMMAND ${CMAKE_COMMAND} -E copy ${PADDLE2ONNX_LIB} ${CMAKE_CURRENT_BINARY_DIR}
+        DEPENDS paddle2onnx)
+      list(APPEND OP_IMPL_DEPS ${PADDLE2ONNX_PYBIND_OUT})
+      list(APPEND EAGER_OP_IMPL_DEPS ${PADDLE2ONNX_PYBIND_OUT})
+
+      ADD_CUSTOM_COMMAND(OUTPUT ${ONNXRUNTIME_PYBIND_OUT}
+        COMMAND ${CMAKE_COMMAND} -E copy ${ONNXRUNTIME_LIB} ${CMAKE_CURRENT_BINARY_DIR} 
+        DEPENDS onnxruntime)
+      list(APPEND OP_IMPL_DEPS ${ONNXRUNTIME_PYBIND_OUT})
+      list(APPEND EAGER_OP_IMPL_DEPS ${ONNXRUNTIME_PYBIND_OUT})
+    endif()
+
     if(WITH_MKLML)
       ADD_CUSTOM_COMMAND(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/libiomp5.so
         COMMAND ${CMAKE_COMMAND} -E copy ${MKLML_SHARED_IOMP_LIB} ${CMAKE_CURRENT_BINARY_DIR}
@@ -299,7 +351,7 @@ if(WITH_PYTHON)
   if(NOT ((NOT WITH_PYTHON) AND ON_INFER))
     cc_library(paddle_eager
     SRCS eager.cc eager_functions.cc eager_method.cc eager_properties.cc eager_utils.cc
-    DEPS eager_api autograd_meta backward grad_node_info phi op_function_common final_dygraph_function final_dygraph_node dygraph_function dygraph_node accumulation_node global_utils utils python)
+    DEPS eager_api autograd_meta backward grad_node_info phi op_function_common final_dygraph_function final_dygraph_node dygraph_function dygraph_node accumulation_node global_utils utils python custom_operator custom_operator_node)
     add_dependencies(paddle_eager eager_codegen)
     add_dependencies(paddle_eager eager_op_function_generator_cmd)
     list(APPEND PYBIND_DEPS paddle_eager)
diff --git a/paddle/fluid/pybind/communication.cc b/paddle/fluid/pybind/communication.cc
index a0d2777f825dc592e19230bc2ba4412f943d0c2b..1a6a395545a96b1980cae73ff65de3daef0acafc 100644
--- a/paddle/fluid/pybind/communication.cc
+++ b/paddle/fluid/pybind/communication.cc
@@ -30,12 +30,42 @@ namespace pybind {
 
 using TCPStore = paddle::distributed::TCPStore;
 
-void BindTCPStore(py::module* m) {
-  py::class_<TCPStore>(*m, "TCPStore")
-      .def(
-          py::init<std::string, uint16_t, bool, size_t, std::chrono::seconds>())
-      .def("add", &TCPStore::add)
-      .def("get", &TCPStore::get);
+void BindTCPStore(py::module *m) {
+  auto Store =
+      py::class_<distributed::Store, std::shared_ptr<distributed::Store>>(
+          *m, "Store")
+          .def(py::init<>())
+          .def("set",
+               [](distributed::Store &self, const std::string &key,
+                  const std::string &value) {
+                 std::vector<uint8_t> data(value.begin(), value.end());
+                 self.set(key, data);
+               },
+               py::arg("key"), py::arg("value"),
+               py::call_guard<py::gil_scoped_release>())
+          .def("get",
+               [](distributed::Store &self,
+                  const std::string &key) -> py::bytes {
+                 auto data = self.get(key);
+                 return py::bytes(reinterpret_cast<char *>(data.data()),
+                                  data.size());
+               },
+               py::arg("key"), py::call_guard<py::gil_scoped_release>())
+          .def("add", &distributed::Store::add,
+               py::call_guard<py::gil_scoped_release>())
+          .def("wait", &distributed::Store::wait,
+               py::call_guard<py::gil_scoped_release>());
+
+  py::class_<TCPStore, std::shared_ptr<TCPStore>>(*m, "TCPStore", Store)
+      .def(py::init([](std::string hostname, uint16_t port, bool is_master,
+                       size_t world_size, std::chrono::seconds timeout) {
+             return std::make_shared<TCPStore>(hostname, port, is_master,
+                                               world_size, timeout);
+           }),
+           py::arg("hostname"), py::arg("port"), py::arg("is_master"),
+           py::arg("world_size"),
+           py::arg("timeout") = distributed::tcputils::kNoTimeout,
+           py::call_guard<py::gil_scoped_release>());
 }
 
 }  // namespace pybind
diff --git a/paddle/fluid/pybind/custom_handwrite_op_funcs.h b/paddle/fluid/pybind/custom_handwrite_op_funcs.h
new file mode 100644
index 0000000000000000000000000000000000000000..3b898ce77ce6fb43ca9aaba38e5db9e01a1d19d3
--- /dev/null
+++ b/paddle/fluid/pybind/custom_handwrite_op_funcs.h
@@ -0,0 +1,50 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+
+#include <iostream>
+
+static PyObject *eager_api_run_program(PyObject *self, PyObject *args,
+                                       PyObject *kwargs) {
+  PyThreadState *tstate = nullptr;
+  try {
+    auto X = GetTensorListFromArgs("run_program", "X", args, 0, false);
+    auto Params = GetTensorListFromArgs("run_program", "Params", args, 1, true);
+    auto Out = GetTensorPtrListFromArgs("run_program", "Out", args, 2, false);
+    auto OutScope =
+        GetScopePtrListFromArgs("run_program", "OutScope", args, 3, false);
+    auto DOut = GetTensorPtrListFromArgs("run_program", "DOut", args, 4, true);
+    framework::AttributeMap attrs;
+    ConstructAttrMapFromPyArgs("run_program", args, 5, PyTuple_GET_SIZE(args),
+                               attrs);
+
+    tstate = PyEval_SaveThread();
+    run_program_dygraph_function(X, Params, Out, OutScope, DOut, attrs);
+    PyEval_RestoreThread(tstate);
+    tstate = nullptr;
+  } catch (...) {
+    if (tstate) {
+      PyEval_RestoreThread(tstate);
+    }
+    ThrowExceptionToPython(std::current_exception());
+  }
+  Py_RETURN_NONE;
+}
+
+static PyMethodDef CustomEagerFinalStateMethods[] = {
+    {"run_program", (PyCFunction)(void (*)(void))eager_api_run_program,
+     METH_VARARGS | METH_KEYWORDS,
+     "C++ interface function for run_program in dygraph."},
+
+    {nullptr, nullptr, 0, nullptr}};
diff --git a/paddle/fluid/pybind/distributed_py.cc b/paddle/fluid/pybind/distributed_py.cc
index a4a1d07db2cb9771530ddb5be0696cef38b2c344..1df917b8c3594d4505d9e92cd9a8c64bffd50279 100644
--- a/paddle/fluid/pybind/distributed_py.cc
+++ b/paddle/fluid/pybind/distributed_py.cc
@@ -35,6 +35,15 @@ limitations under the License. */
 #include "paddle/fluid/distributed/collective/ProcessGroupNCCL.h"
 #endif
 
+#if defined(PADDLE_WITH_ASCEND_CL)
+#include "paddle/fluid/distributed/collective/ProcessGroupHCCL.h"
+#endif
+
+#if defined(PADDLE_WITH_GLOO)
+#include "paddle/fluid/distributed/collective/ProcessGroupGloo.h"
+#include "paddle/fluid/distributed/store/tcp_store.h"
+#endif
+
 namespace py = pybind11;
 
 namespace paddle {
@@ -42,6 +51,26 @@ namespace pybind {
 
 using Tensor = paddle::experimental::Tensor;
 
+std::shared_ptr<distributed::EagerReducer> CreateEagerReducer(
+    py::handle py_tensors,
+    const std::vector<std::vector<size_t>> &group_indices,
+    const std::vector<bool> &is_sparse_gradient,
+    std::shared_ptr<distributed::ProcessGroup> process_group,
+    const std::vector<size_t> &group_size_limits, bool find_unused_parameters) {
+  auto params = CastPyArg2VectorOfTensor(py_tensors.ptr(), 0);
+  return std::make_shared<distributed::EagerReducer>(
+      params, group_indices, is_sparse_gradient, process_group,
+      group_size_limits, find_unused_parameters);
+}
+
+#if defined(PADDLE_WITH_GLOO)
+using ProcessGroupGloo = paddle::distributed::ProcessGroupGloo;
+using GlooStore = paddle::distributed::ProcessGroupGloo::GlooStore;
+using GlooOptions = paddle::distributed::ProcessGroupGloo::GlooOptions;
+#endif
+
+static std::string GLOO_SOCKET_IFNAME_ENV = "GLOO_SOCKET_IFNAME";  // NOLINT
+
 void BindDistributed(py::module *m) {
   py::enum_<distributed::ReduceOp>(*m, "ReduceOp")
       .value("SUM", distributed::ReduceOp::SUM)
@@ -64,6 +93,11 @@ void BindDistributed(py::module *m) {
       .def(py::init<>())
       .def_readwrite("place_ids", &distributed::BarrierOptions::place_ids);
 
+  py::class_<distributed::ReduceOptions>(*m, "ReduceOptions")
+      .def(py::init<>())
+      .def_readwrite("reduce_op", &distributed::ReduceOptions::reduce_op)
+      .def_readwrite("source_root", &distributed::ReduceOptions::root_rank);
+
   auto ProcessGroup =
       py::class_<distributed::ProcessGroup,
                  std::shared_ptr<distributed::ProcessGroup>>(*m, "ProcessGroup")
@@ -121,14 +155,75 @@ void BindDistributed(py::module *m) {
                  return self.Recv(tensors, src);
                },
                py::arg("tensor"), py::arg("src"),
+               py::call_guard<py::gil_scoped_release>())
+
+          .def("all_gather",
+               [](distributed::ProcessGroup &self, py::handle py_in_tensor,
+                  py::handle py_out_tensor) {
+                 auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0);
+                 auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0);
+                 std::vector<Tensor> in_tensors = {in_tensor};
+                 std::vector<Tensor> out_tensors = {out_tensor};
+                 return self.AllGather(in_tensors, out_tensors);
+               },
+               py::arg("in"), py::arg("out"),
+               py::call_guard<py::gil_scoped_release>())
+
+          .def("alltoall",
+               [](distributed::ProcessGroup &self, py::handle py_in_tensor,
+                  py::handle py_out_tensor) {
+                 auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0);
+                 auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0);
+                 std::vector<Tensor> in_tensors = {in_tensor};
+                 std::vector<Tensor> out_tensors = {out_tensor};
+                 return self.AllToAll(in_tensors, out_tensors);
+               },
+               py::arg("in"), py::arg("out"),
+               py::call_guard<py::gil_scoped_release>())
+
+          .def("reduce",
+               [](distributed::ProcessGroup &self, py::handle py_in_tensor,
+                  int dst, distributed::ReduceOp op) {
+                 auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0);
+                 distributed::ReduceOptions opts;
+                 opts.reduce_op = op;
+                 opts.root_rank = dst;
+                 std::vector<Tensor> tensors = {in_tensor};
+                 return self.Reduce(tensors, opts);
+               },
+               py::arg("tensor"), py::arg("dst"),
+               py::arg("op") = distributed::ReduceOp::SUM,
+               py::call_guard<py::gil_scoped_release>())
+
+          .def("scatter",
+               [](distributed::ProcessGroup &self, py::handle py_in_tensor,
+                  py::handle py_out_tensor, int src) {
+                 auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0);
+                 auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0);
+                 distributed::ScatterOptions opts;
+                 opts.root_rank = src;
+                 std::vector<Tensor> in_tensors = {in_tensor};
+                 std::vector<Tensor> out_tensors = {out_tensor};
+                 return self.Scatter(in_tensors, out_tensors, opts);
+               },
+               py::arg("in"), py::arg("out"), py::arg("src"),
                py::call_guard<py::gil_scoped_release>());
 
 #if defined(PADDLE_WITH_NCCL)
   py::class_<distributed::ProcessGroupNCCL,
              std::shared_ptr<distributed::ProcessGroupNCCL>>(
       *m, "ProcessGroupNCCL", ProcessGroup)
-      .def(py::init<const distributed::ProcessGroupStrategy &, int, int>(),
+      .def(py::init<const std::shared_ptr<distributed::Store> &, int, int>(),
            py::call_guard<py::gil_scoped_release>());
+#endif
+
+#if defined(PADDLE_WITH_ASCEND_CL)
+  py::class_<distributed::ProcessGroupHCCL,
+             std::shared_ptr<distributed::ProcessGroupHCCL>>(
+      *m, "ProcessGroupHCCL", ProcessGroup)
+      .def(py::init<const std::shared_ptr<distributed::Store> &, int, int>(),
+           py::call_guard<py::gil_scoped_release>());
+#endif
 
   py::class_<distributed::ProcessGroup::Task,
              std::shared_ptr<distributed::ProcessGroup::Task>>(*m, "task")
@@ -138,45 +233,43 @@ void BindDistributed(py::module *m) {
            py::call_guard<py::gil_scoped_release>())
       .def("synchronize", &distributed::ProcessGroup::Task::Synchronize,
            py::call_guard<py::gil_scoped_release>());
-#endif
 
-  // define parallel strategy, it will be removed
-  py::class_<distributed::ProcessGroupStrategy> pg_strategy(
-      *m, "ProcessGroupStrategy", "");
-  pg_strategy.def(py::init())
-      .def_property("nranks",
-                    [](const distributed::ProcessGroupStrategy &self) {
-                      return self.nranks_;
-                    },
-                    [](distributed::ProcessGroupStrategy &self, int nranks) {
-                      self.nranks_ = nranks;
-                    })
-      .def_property("local_rank",
-                    [](const distributed::ProcessGroupStrategy &self) {
-                      return self.local_rank_;
-                    },
-                    [](distributed::ProcessGroupStrategy &self,
-                       int local_rank) { self.local_rank_ = local_rank; })
-      .def_property(
-          "trainer_endpoints",
-          [](const distributed::ProcessGroupStrategy &self) {
-            return self.trainer_endpoints_;
-          },
-          [](distributed::ProcessGroupStrategy &self,
-             std::vector<std::string> eps) { self.trainer_endpoints_ = eps; })
-      .def_property("current_endpoint",
-                    [](const distributed::ProcessGroupStrategy &self) {
-                      return self.current_endpoint_;
-                    },
-                    [](distributed::ProcessGroupStrategy &self,
-                       const std::string &ep) { self.current_endpoint_ = ep; })
-      .def_property("nrings",
-                    [](const distributed::ProcessGroupStrategy &self) {
-                      return self.nrings_;
-                    },
-                    [](distributed::ProcessGroupStrategy &self, int nrings) {
-                      self.nrings_ = nrings;
-                    });
+#if defined(PADDLE_WITH_GLOO)
+  py::class_<GlooOptions>(*m, "GlooOptions")
+      .def(py::init<>())
+      .def_readwrite("_device", &GlooOptions::device)
+      .def_static("create", &GlooOptions::create);
+
+  py::class_<GlooStore, std::shared_ptr<GlooStore>>(*m, "GlooStore")
+      .def(py::init(
+               [](const std::shared_ptr<paddle::distributed::TCPStore> &store) {
+                 return std::make_shared<GlooStore>(store);
+               }),
+           py::call_guard<py::gil_scoped_release>());
+
+  py::class_<ProcessGroupGloo, std::shared_ptr<ProcessGroupGloo>>(
+      *m, "ProcessGroupGloo", ProcessGroup)
+      .def(py::init<const std::shared_ptr<GlooStore> &, int, int,
+                    std::shared_ptr<GlooOptions> &>(),
+           py::call_guard<py::gil_scoped_release>())
+      .def(py::init([](const std::shared_ptr<GlooStore> &store, int rank,
+                       int world_size) {
+             auto opts = GlooOptions::create();
+             char *ifname = getenv(GLOO_SOCKET_IFNAME_ENV.c_str());
+             if (ifname && strlen(ifname) > 1) {
+               opts->device = ProcessGroupGloo::createDeviceForInterface(
+                   std::string(ifname));
+             } else {
+               opts->device = ProcessGroupGloo::createDefaultDevice();
+             }
+             return std::make_shared<ProcessGroupGloo>(store, rank, world_size,
+                                                       opts);
+           }),
+           py::arg("store"), py::arg("rank"), py::arg("world_size"),
+           py::call_guard<py::gil_scoped_release>())
+      .def_static("create_default_device",
+                  &ProcessGroupGloo::createDefaultDevice);
+#endif
 
   m->def("eager_assign_group_by_size",
          [](py::handle py_tensors, std::vector<bool> is_sparse_gradient,
@@ -190,6 +283,17 @@ void BindDistributed(py::module *m) {
          py::arg("group_size_limits") = std::vector<size_t>{25 * 1024 * 1024},
          py::arg("tensor_indices") = std::vector<int64_t>{},
          py::call_guard<py::gil_scoped_release>());
+
+  py::class_<distributed::EagerReducer,
+             std::shared_ptr<distributed::EagerReducer>>(*m, "EagerReducer",
+                                                         R"DOC()DOC")
+      .def(py::init(&CreateEagerReducer))
+      .def("prepare_for_backward",
+           [](distributed::EagerReducer &self, py::handle py_tensors) {
+             auto params = CastPyArg2VectorOfTensor(py_tensors.ptr(), 0);
+             self.PrepareForBackward(params);
+           },
+           py::arg("tensors"), py::call_guard<py::gil_scoped_release>());
 }
 
 }  // end namespace pybind
diff --git a/paddle/fluid/pybind/eager_functions.cc b/paddle/fluid/pybind/eager_functions.cc
index 0b04dc7347ce78f87d6f8d81e30eb4135fd965ed..e110432c67d395c865d934a47eaa4a803053db8b 100644
--- a/paddle/fluid/pybind/eager_functions.cc
+++ b/paddle/fluid/pybind/eager_functions.cc
@@ -21,21 +21,25 @@ limitations under the License. */
 #include "paddle/fluid/eager/api/all.h"
 #include "paddle/fluid/eager/autograd_meta.h"
 #include "paddle/fluid/eager/backward.h"
+#include "paddle/fluid/eager/custom_operator/custom_operator_node.h"
 #include "paddle/fluid/eager/utils.h"
 #include "paddle/fluid/framework/convert_utils.h"
+#include "paddle/fluid/framework/custom_operator.h"
+#include "paddle/fluid/framework/op_meta_info_helper.h"
 #include "paddle/fluid/memory/allocation/allocator.h"
 #include "paddle/fluid/memory/memcpy.h"
+#include "paddle/fluid/platform/dynload/dynamic_loader.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/pybind/eager.h"
 #include "paddle/fluid/pybind/eager_utils.h"
 #include "paddle/fluid/pybind/exception.h"
+#include "paddle/phi/api/ext/op_meta_info.h"
 #include "paddle/phi/api/lib/utils/allocator.h"
 #include "paddle/phi/api/lib/utils/storage.h"
 #include "paddle/phi/api/lib/utils/tensor_utils.h"
 #include "paddle/phi/common/data_type.h"
 #include "paddle/phi/core/compat/convert_utils.h"
 #include "paddle/phi/core/dense_tensor.h"
-
 namespace paddle {
 namespace pybind {
 
@@ -168,7 +172,276 @@ static PyObject* eager_api_read_next_tensor_list(PyObject* self, PyObject* args,
   EAGER_CATCH_AND_THROW_RETURN_NULL
 }
 
+static void ConstructFwdAndBwdMap(
+    const std::vector<paddle::OpMetaInfo>& vec_map,
+    const std::string& op_type) {
+  auto& in_out_map = egr::Controller::Instance().GetCustomEdgesSlotMap();
+  if (in_out_map.find(op_type) != in_out_map.end()) {
+    VLOG(7) << "Find Exist CustomEdgesSlotMap Skip >>>> ";
+    return;
+  } else {
+    VLOG(7) << "Construct CustomEdgesSlotMap ";
+    auto inputs_names =
+        paddle::framework::OpMetaInfoHelper::GetInputs(vec_map[0]);
+    auto outputs_names =
+        paddle::framework::OpMetaInfoHelper::GetOutputs(vec_map[0]);
+    auto attrs_names =
+        paddle::framework::OpMetaInfoHelper::GetAttrs(vec_map[0]);
+    auto grad_outputs_names =
+        paddle::framework::OpMetaInfoHelper::GetOutputs(vec_map[1]);
+    auto grad_inputs_names =
+        paddle::framework::OpMetaInfoHelper::GetInputs(vec_map[1]);
+    auto grad_attrs_names =
+        paddle::framework::OpMetaInfoHelper::GetAttrs(vec_map[1]);
+    std::vector<std::unordered_map<int, int>> res(5);
+    in_out_map.insert({op_type, res});
+    // Prepare pos map for grad_outputs
+    VLOG(7) << "Prepare pos map for grad_outputs";
+    PADDLE_ENFORCE_LE(
+        grad_outputs_names.size(), inputs_names.size(),
+        paddle::platform::errors::InvalidArgument(
+            "Grad outputs num should be less equal than forward inputs num."));
+    for (size_t i = 0; i < grad_outputs_names.size(); i++) {
+      size_t end = grad_outputs_names[i].find("@GRAD");
+      PADDLE_ENFORCE_NE(
+          end, std::string::npos,
+          paddle::platform::errors::NotFound(
+              "All Grad outputs should be grad and we got %s is not grad var, "
+              "please check your op and change to fit the rule.",
+              grad_outputs_names[i]));
+      for (size_t j = 0; j < inputs_names.size(); j++) {
+        if (grad_outputs_names[i].substr(0, end) == inputs_names[j]) {
+          VLOG(7) << " ==== Custom Operator: " << op_type << "'s No." << j
+                  << " inputs: " << inputs_names[j] << " related to No." << i
+                  << " grad_outputs: " << grad_outputs_names[i];
+          in_out_map[op_type][0][j] = i;
+        }
+      }
+    }
+    // Prepare pos map for grad_inputs
+    for (size_t i = 0; i < grad_inputs_names.size(); i++) {
+      size_t end = grad_inputs_names[i].find("@GRAD");
+      if (end != std::string::npos) {
+        for (size_t j = 0; j < outputs_names.size(); j++) {
+          if (grad_inputs_names[i].substr(0, end) == outputs_names[j]) {
+            VLOG(7) << " ==== Custom Operator: " << op_type << "'s No." << j
+                    << " outputs: " << outputs_names[j] << " related to No."
+                    << i << " grad_inputs's grad: " << grad_inputs_names[i];
+            in_out_map[op_type][1][j] = i;
+          }
+        }
+      } else {
+        if (std::find(outputs_names.begin(), outputs_names.end(),
+                      grad_inputs_names[i]) != outputs_names.end()) {
+          for (size_t j = 0; j < outputs_names.size(); j++) {
+            if (grad_inputs_names[i] == outputs_names[j]) {
+              VLOG(7) << " ==== Custom Operator: " << op_type << "'s No." << j
+                      << " outputs: " << outputs_names[j] << " related to No."
+                      << i
+                      << " grad_inputs fwd outputs: " << grad_inputs_names[i];
+              in_out_map[op_type][2][j] = i;
+            }
+          }
+        } else {
+          for (size_t j = 0; j < inputs_names.size(); j++) {
+            if (grad_inputs_names[i] == inputs_names[j]) {
+              VLOG(7) << " ==== Custom Operator: " << op_type << "'s No." << j
+                      << " inputs: " << inputs_names[j] << " related to No."
+                      << i
+                      << " grad_inputs fwd inputs: " << grad_inputs_names[i];
+              in_out_map[op_type][3][j] = i;
+            }
+          }
+        }
+      }
+    }
+
+    // Prepare pos map for grad attrs_
+    for (size_t i = 0; i < grad_attrs_names.size(); i++) {
+      auto end = std::find(attrs_names.begin(), attrs_names.end(),
+                           grad_attrs_names[i]);
+      PADDLE_ENFORCE_NE(end, attrs_names.end(),
+                        paddle::platform::errors::NotFound(
+                            "All Grad attrs should be one of forward attrs and "
+                            "we got %s is not one of them, please check your "
+                            "op and change to fit the rule.",
+                            grad_attrs_names[i]));
+      for (size_t j = 0; j < attrs_names.size(); j++) {
+        if (grad_attrs_names[i] == attrs_names[j]) {
+          VLOG(7) << " ==== Custom Operator: " << op_type << "'s No." << j
+                  << " attrs: " << attrs_names[j] << " related to No." << i
+                  << " grad_attrs: " << grad_attrs_names[i];
+          in_out_map[op_type][4][j] = i;
+        }
+      }
+    }
+  }
+}
+
+static std::vector<paddle::any> CastAttrsToTragetType(
+    const std::vector<paddle::any>& src,
+    const std::vector<std::string>& attrs_names) {
+  std::vector<paddle::any> res;
+  PADDLE_ENFORCE_EQ(src.size(), attrs_names.size(),
+                    paddle::platform::errors::InvalidArgument(
+                        "We Expected same size of attrs and attrs_name list, "
+                        "if u got this error indicate your custom op setting "
+                        "%s attrs, but you just give %s",
+                        attrs_names.size(), src.size()));
+  for (size_t i = 0; i < src.size(); i++) {
+    size_t end = attrs_names[i].find(": ");
+    std::string type_name =
+        attrs_names[i].substr(end + 2, attrs_names.size() - end - 2);
+    if (type_name == "int") {
+      if (src[i].type() == typeid(bool)) {
+        res.emplace_back(static_cast<int>(paddle::any_cast<bool>(src[i])));
+      } else if (src[i].type() == typeid(int)) {
+        res.emplace_back(src[i]);
+      } else {
+        PADDLE_THROW(platform::errors::InvalidArgument(
+            "Your No. %s attrs should only can be bool or int32, other type is "
+            "forbidden for now but we got %s. Check your code first please",
+            i, src[i].type().name()));
+      }
+    } else if (type_name == "int64_t") {
+      if (src[i].type() == typeid(bool)) {
+        res.emplace_back(static_cast<int64_t>(paddle::any_cast<bool>(src[i])));
+      } else if (src[i].type() == typeid(int)) {
+        res.emplace_back(static_cast<int64_t>(paddle::any_cast<int>(src[i])));
+      } else if (src[i].type() == typeid(int64_t)) {
+        res.emplace_back(src[i]);
+      } else {
+        PADDLE_THROW(platform::errors::InvalidArgument(
+            "Your No. %s attrs should only can be bool or int32 or int64_t, "
+            "other type is forbidden for now but we got %s. Check your code "
+            "first please",
+            i, src[i].type().name()));
+      }
+    } else {
+      res.emplace_back(src[i]);
+    }
+  }
+  return res;
+}
+
+static PyObject* eager_api_run_costum_op(PyObject* self, PyObject* args,
+                                         PyObject* kwargs) {
+  EAGER_TRY
+  paddle::CustomOpKernelContext ctx =
+      CastPyArg2CustomOpKernelContext(PyTuple_GET_ITEM(args, 0), 0);
+  std::string op_type = CastPyArg2AttrString(PyTuple_GET_ITEM(args, 1), 1);
+  bool trace_backward = CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 2), 2);
+  VLOG(7) << "Get things for python for Custom Op: " << op_type
+          << ", trace_backward is: " << trace_backward;
+  auto meta_info_map = egr::Controller::Instance().GetOpMetaInfoMap();
+  PADDLE_ENFORCE_NE(meta_info_map.find(op_type), meta_info_map.end(),
+                    paddle::platform::errors::NotFound(
+                        "Can't find %s in Eager OpMetaInfoMap which should be "
+                        "created by LoadOpMetaInfoAndRegisterOp, please make "
+                        "sure you registered your op first and try again. ",
+                        op_type));
+  VLOG(7) << "Run Kernel of Custom Op: " << op_type;
+  std::vector<paddle::any> res_attrs = CastAttrsToTragetType(
+      ctx.Attrs(), paddle::framework::OpMetaInfoHelper::GetAttrs(
+                       meta_info_map.at(op_type)[0]));
+  ctx.EmplaceBackAttrs(res_attrs);
+  const auto& vec_map = meta_info_map.at(op_type);
+  (*paddle::framework::OpMetaInfoHelper::GetKernelFn(vec_map[0]))(&ctx);
+
+  VLOG(7) << "Get AutogradMeta for inputs and outputs for Custom Op";
+  std::vector<std::vector<egr::AutogradMeta*>> ins_auto_grad_metas;
+  std::vector<std::vector<egr::AutogradMeta*>> outs_auto_grad_metas;
+  VLOG(7) << "We got slot num of ins is: " << ctx.InputRange().size();
+  ins_auto_grad_metas.resize(ctx.InputRange().size());
+  VLOG(7) << "We got slot num of outs is: " << ctx.OutputRange().size();
+  outs_auto_grad_metas.resize(ctx.OutputRange().size());
+  for (size_t i = 0; i < ctx.InputRange().size(); i++) {
+    ins_auto_grad_metas[i] =
+        egr::EagerUtils::nullable_autograd_meta(ctx.InputsBetween(
+            ctx.InputRangeAt(i).first, ctx.InputRangeAt(i).second));
+  }
+  for (size_t i = 0; i < ctx.OutputRange().size(); i++) {
+    outs_auto_grad_metas[i] =
+        egr::EagerUtils::unsafe_autograd_meta(ctx.OutputsBetweeen(
+            ctx.OutputRangeAt(i).first, ctx.OutputRangeAt(i).second));
+  }
+  bool require_any_grad = false;
+  for (size_t i = 0; i < ins_auto_grad_metas.size(); i++) {
+    require_any_grad =
+        require_any_grad || egr::EagerUtils::ComputeRequireGrad(
+                                trace_backward, &(ins_auto_grad_metas[i]));
+  }
+  if (require_any_grad) {
+    VLOG(6) << " Construct Grad for Custom Op: " << op_type;
+    ConstructFwdAndBwdMap(vec_map, op_type);
+    for (size_t i = 0; i < outs_auto_grad_metas.size(); i++) {
+      egr::EagerUtils::PassStopGradient(false, &(outs_auto_grad_metas[i]));
+    }
+    auto grad_node = std::make_shared<egr::RunCustomOpNode>(
+        outs_auto_grad_metas.size(), ins_auto_grad_metas.size(), op_type);
+    auto slot_map =
+        egr::Controller::Instance().GetCustomEdgesSlotMap().at(op_type);
+    // Prepare Grad outputs
+    size_t no_grad_cnt = 0;
+    for (size_t i = 0; i < ins_auto_grad_metas.size(); i++) {
+      if (slot_map[0].find(i) != slot_map[0].end()) {
+        grad_node->SetGradOutMeta(&ins_auto_grad_metas[i], slot_map[0][i]);
+        grad_node->AddEdges(&ins_auto_grad_metas[i], slot_map[0][i]);
+      } else {
+        grad_node->SetGradOutMeta(&ins_auto_grad_metas[i],
+                                  ins_auto_grad_metas.size() - 1 - no_grad_cnt);
+        grad_node->AddEdges(&ins_auto_grad_metas[i],
+                            ins_auto_grad_metas.size() - 1 - no_grad_cnt);
+        no_grad_cnt++;
+      }
+    }
+    // Prepare Grad inputs with grad of fwd outputs
+    for (size_t i = 0; i < outs_auto_grad_metas.size(); i++) {
+      egr::EagerUtils::SetOutRankWithSlot(&(outs_auto_grad_metas[i]), i);
+      egr::EagerUtils::SetHistory(&(outs_auto_grad_metas[i]), grad_node);
+      grad_node->SetGradInMeta(&(outs_auto_grad_metas[i]), i);
+      egr::EagerUtils::CheckAndRetainGrad(ctx.OutputsBetweeen(
+          ctx.OutputRangeAt(i).first, ctx.OutputRangeAt(i).second));
+    }
+
+    // Prepare Grad inputs with fwd outputs
+    for (auto it = slot_map[2].begin(); it != slot_map[2].end(); it++) {
+      VLOG(7) << "Prepare fwd_outs: " << it->first
+              << " to grad_inputs: " << it->second;
+      grad_node->fwd_outs[it->second] =
+          egr::RunCustomOpNode::ConstructTensorWrapper(
+              ctx.OutputsBetweeen(ctx.OutputRangeAt(it->first).first,
+                                  ctx.OutputRangeAt(it->first).second));
+    }
+
+    // Prepare Grad inputs with fwd inputs
+    for (auto it = slot_map[3].begin(); it != slot_map[3].end(); it++) {
+      VLOG(7) << "Prepare fwd_ins: " << it->first
+              << " to grad_inputs: " << it->second;
+      grad_node->fwd_ins[it->second] =
+          egr::RunCustomOpNode::ConstructTensorWrapper(
+              ctx.InputsBetween(ctx.InputRangeAt(it->first).first,
+                                ctx.InputRangeAt(it->first).second));
+    }
+
+    auto attrs_names = paddle::framework::OpMetaInfoHelper::GetAttrs(
+        meta_info_map.at(op_type)[1]);
+    std::vector<paddle::any> attrs(attrs_names.size());
+    // Prepare attrs for Grad node
+    for (auto it = slot_map[4].begin(); it != slot_map[4].end(); it++) {
+      VLOG(7) << "Prepare fwd attrs: " << it->first
+              << " to grad_attrs: " << it->second;
+      attrs[it->second] = res_attrs[it->first];
+    }
+    grad_node->SetAttrs(attrs);
+  }
+  Py_INCREF(Py_None);
+  return Py_None;
+  EAGER_CATCH_AND_THROW_RETURN_NULL
+}
+
 PyMethodDef variable_functions[] = {
+    // TODO(jiabin): Remove scale when we have final state tests
     {"scale", (PyCFunction)(void (*)(void))eager_api_scale,
      METH_VARARGS | METH_KEYWORDS, NULL},
     {"_set_expected_place",
@@ -179,6 +452,8 @@ PyMethodDef variable_functions[] = {
      METH_VARARGS | METH_KEYWORDS, NULL},
     {"run_backward", (PyCFunction)(void (*)(void))eager_api_run_backward,
      METH_VARARGS | METH_KEYWORDS, NULL},
+    {"_run_custom_op", (PyCFunction)(void (*)(void))eager_api_run_costum_op,
+     METH_VARARGS | METH_KEYWORDS, NULL},
     {"tensor_copy", (PyCFunction)(void (*)(void))eager_api_tensor_copy,
      METH_VARARGS | METH_KEYWORDS, NULL},
     {"read_next_tensor_list",
diff --git a/paddle/fluid/pybind/eager_method.cc b/paddle/fluid/pybind/eager_method.cc
index f11a2ab2517fb481f184c9b68b2558c999d88ec9..082ec382c79cd9c98ac75db14bc552883088b885 100644
--- a/paddle/fluid/pybind/eager_method.cc
+++ b/paddle/fluid/pybind/eager_method.cc
@@ -19,6 +19,7 @@ limitations under the License. */
 
 #include "paddle/fluid/eager/accumulation/accumulation_node.h"
 #include "paddle/fluid/eager/api/all.h"
+#include "paddle/fluid/eager/api/generated/fluid_generated/dygraph_forward_api.h"
 #include "paddle/fluid/eager/autograd_meta.h"
 #include "paddle/fluid/eager/grad_node_info.h"
 #include "paddle/fluid/eager/hooks.h"
@@ -30,10 +31,12 @@ limitations under the License. */
 #include "paddle/fluid/pybind/eager.h"
 #include "paddle/fluid/pybind/eager_utils.h"
 #include "paddle/fluid/pybind/exception.h"
+#include "paddle/fluid/pybind/slice_utils.h"
 #include "paddle/phi/api/include/api.h"
 #include "paddle/phi/common/data_type.h"
 #include "paddle/phi/core/compat/convert_utils.h"
 #include "paddle/phi/core/dense_tensor.h"
+
 namespace paddle {
 namespace pybind {
 
@@ -119,6 +122,29 @@ extern void InitTensorWithNumpyValue(TensorObject* self,
 
 extern PyTypeObject* p_tensor_type;
 
+Py_ssize_t GetSliceIndexFromPyObject(PyObject* obj) {
+  if (PyObject_IsInstance(obj, reinterpret_cast<PyObject*>(p_tensor_type))) {
+    VLOG(6) << "Call GetSliceIndexFromTensor in Eager";
+    paddle::experimental::Tensor tensor = CastPyArg2Tensor(obj, 0);
+    PADDLE_ENFORCE_EQ(
+        tensor.initialized(), true,
+        paddle::platform::errors::InvalidArgument(
+            "We can only support initialized tensor in slice, however we got "
+            "uninitialized tensor %s, please check your code.",
+            tensor.name()));
+    return GetSliceIndexFromTensor((*static_cast<phi::DenseTensor*>(
+        CastPyArg2Tensor(obj, 0).impl().get())));
+  } else {
+    PADDLE_THROW(platform::errors::InvalidArgument(
+        "We should only get paddle::experimental::Tensor or VarBase in this "
+        "method, when you reach this means we got another type index."));
+  }
+}
+
+bool PyCheckTensor(PyObject* obj) {
+  return PyObject_IsInstance(obj, reinterpret_cast<PyObject*>(p_tensor_type));
+}
+
 static PyObject* tensor_method_numpy(TensorObject* self, PyObject* args,
                                      PyObject* kwargs) {
   EAGER_TRY
@@ -188,8 +214,8 @@ static PyObject* tensor_method__is_initialized(TensorObject* self,
 static PyObject* tensor_method__copy_to(TensorObject* self, PyObject* args,
                                         PyObject* kwargs) {
   EAGER_TRY
-  bool blocking = CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 0), 0);
-  auto place = CastPyArg2Place(PyTuple_GET_ITEM(args, 1), 1);
+  auto place = CastPyArg2Place(PyTuple_GET_ITEM(args, 0), 0);
+  bool blocking = CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 1), 1);
   auto cp_tensor =
       self->tensor.copy_to(phi::TransToPhiBackend(place), blocking);
   egr::EagerUtils::autograd_meta(&cp_tensor)->SetStopGradient(true);
@@ -468,16 +494,111 @@ static PyObject* tensor_method_get_underline_tensor(TensorObject* self,
   EAGER_CATCH_AND_THROW_RETURN_NULL
 }
 
-// NOTE(wuweilong): Set value and not change self's original place
-static PyObject* tensor_method_set_value(TensorObject* self, PyObject* args,
-                                         PyObject* kwargs) {
+static PyObject* tensor__getitem_index_not_tensor(TensorObject* self,
+                                                  PyObject* args,
+                                                  PyObject* kwargs) {
   EAGER_TRY
-  VLOG(4) << "Value " << self->tensor.name();
-  pybind11::object numpy_value =
-      pybind11::object(pybind11::handle(PyTuple_GET_ITEM(args, 0)), true);
-  InitTensorWithNumpyValue(self, numpy_value, false);
-  Py_INCREF(Py_None);
-  return Py_None;
+  PyObject* _index = PyTuple_GET_ITEM(args, 0);
+  VLOG(4) << "Call _getitem_index_not_tensor";
+  std::vector<int> slice_axes, slice_starts, slice_ends, slice_strides,
+      decrease_axis, none_axes, infer_flags, list_select_idxs;
+  // if index is a list, list_select_flag will be true
+  bool list_select_flag = false;
+  PADDLE_ENFORCE_EQ(
+      self->tensor.is_initialized(), true,
+      platform::errors::InvalidArgument(
+          "tensor %s has not been initialized, we can only slice initialized "
+          "tensor please init it first with numpy or other tensor.",
+          self->tensor.name()));
+  auto tensor = static_cast<phi::DenseTensor*>(self->tensor.impl().get());
+  ParseIndexingSlice(tensor, _index, &slice_axes, &slice_starts, &slice_ends,
+                     &slice_strides, &decrease_axis, &none_axes, &infer_flags,
+                     &list_select_idxs, &list_select_flag);
+
+  auto out = slice_axes.empty() && !list_select_flag
+                 ? self->tensor
+                 : paddle::experimental::Tensor(
+                       egr::Controller::Instance().GenerateUniqueName());
+
+  if (!slice_axes.empty()) {
+    framework::AttributeMap attrs = {{"axes", slice_axes},
+                                     {"starts", slice_starts},
+                                     {"ends", slice_ends},
+                                     {"infer_flags", infer_flags},
+                                     {"decrease_axis", decrease_axis}};
+    std::string op_type = "slice";
+    for (auto stride : slice_strides) {
+      if (stride != 1) {
+        op_type = "strided_slice";
+        attrs.insert({"strides", slice_strides});
+        attrs.erase("decrease_axis");
+        break;
+      }
+    }
+    if (op_type == "slice") {
+      out = slice_dygraph_function(self->tensor, paddle::experimental::Tensor(),
+                                   paddle::experimental::Tensor(),
+                                   std::move(attrs));
+    } else if (op_type == "strided_slice") {
+      out = strided_slice_dygraph_function(self->tensor, attrs);
+    } else {
+      PADDLE_THROW(platform::errors::InvalidArgument(
+          "Slice is only support slice and strided_slice, but we got %s which "
+          "is impossible, please check your code first or contact us by "
+          "issue. ",
+          op_type));
+    }
+  }
+
+  if (!none_axes.empty()) {
+    // Deal with cases when all axes are decreased.
+    // After slice, the shape of out is [1], which should have been
+    // [], but Paddle doesn't support scalar.
+    // In order to ensure the correctness of the final shape of out,
+    // one dimension of out needs to be decreased.
+    // For example:
+    // # x.shape: (2,3,4)
+    // out = x[0, 1, 1, None] # out.shape : (1)
+    if (static_cast<int>(decrease_axis.size()) == tensor->dims().size()) {
+      none_axes.pop_back();
+    }
+    if (!none_axes.empty()) {
+      // Deal with cases that decrease_axes is not empty
+      // For example:
+      // # x.shape: (2,3,4)
+      // out = x[0, 0:2, None] # out.shape : (2, 1, 4)
+      for (auto& axis : none_axes) {
+        int len = 0;
+        for (int da : decrease_axis) {
+          if (da < axis) {
+            len++;
+          }
+        }
+        axis -= len;
+      }
+
+      paddle::experimental::Tensor new_out;
+      framework::AttributeMap attrs = {{"axes", none_axes}};
+      new_out = std::get<0>(unsqueeze2_dygraph_function(out, std::move(attrs)));
+      return ToPyObject(new_out);
+    }
+  }
+
+  // the index is a list
+  if (list_select_flag) {
+    auto select_index = paddle::experimental::Tensor(
+        egr::Controller::Instance().GenerateUniqueName());
+    auto idx_tensor = std::make_shared<phi::DenseTensor>();
+    auto* dev_ctx = platform::DeviceContextPool::Instance().Get(
+        egr::Controller::Instance().GetExpectedPlace());
+    paddle::framework::TensorFromVector(list_select_idxs, *dev_ctx,
+                                        idx_tensor.get());
+    framework::AttributeMap attrs = {{"dim", 0}};
+    out = index_select_dygraph_function(self->tensor, select_index,
+                                        std::move(attrs));
+  }
+
+  return ToPyObject(out);
   EAGER_CATCH_AND_THROW_RETURN_NULL
 }
 
@@ -567,6 +688,21 @@ static PyObject* tensor_register_reduce_hook(TensorObject* self, PyObject* args,
   EAGER_CATCH_AND_THROW_RETURN_NULL
 }
 
+static PyObject* set_grad_type(TensorObject* self, PyObject* args,
+                               PyObject* kwargs) {
+  EAGER_TRY
+  auto var_type = pybind::CastPyArg2ProtoType(PyTuple_GET_ITEM(args, 0), 0);
+  auto grad_tensor =
+      egr::EagerUtils::unsafe_autograd_meta(self->tensor)->Grad();
+  if (var_type == framework::proto::VarType::LOD_TENSOR) {
+    grad_tensor.set_impl(std::make_shared<phi::DenseTensor>());
+  } else if (var_type == framework::proto::VarType::SELECTED_ROWS) {
+    grad_tensor.set_impl(std::make_shared<phi::SelectedRows>());
+  }
+  return Py_None;
+  EAGER_CATCH_AND_THROW_RETURN_NULL
+}
+
 PyMethodDef variable_methods[] = {
     {"numpy", (PyCFunction)(void (*)(void))tensor_method_numpy,
      METH_VARARGS | METH_KEYWORDS, NULL},
@@ -602,7 +738,8 @@ PyMethodDef variable_methods[] = {
     {"get_tensor",
      (PyCFunction)(void (*)(void))tensor_method_get_underline_tensor,
      METH_VARARGS | METH_KEYWORDS, NULL},
-    {"_set_value", (PyCFunction)(void (*)(void))tensor_method_set_value,
+    {"_getitem_index_not_tensor",
+     (PyCFunction)(void (*)(void))tensor__getitem_index_not_tensor,
      METH_VARARGS | METH_KEYWORDS, NULL},
     {"_register_grad_hook",
      (PyCFunction)(void (*)(void))tensor_register_grad_hook,
@@ -612,6 +749,8 @@ PyMethodDef variable_methods[] = {
     {"_register_backward_hook",
      (PyCFunction)(void (*)(void))tensor_register_reduce_hook,
      METH_VARARGS | METH_KEYWORDS, NULL},
+    {"_set_grad_type", (PyCFunction)(void (*)(void))set_grad_type,
+     METH_VARARGS | METH_KEYWORDS, NULL},
     {NULL, NULL, 0, NULL}};
 
 }  // namespace pybind
diff --git a/paddle/fluid/pybind/eager_op_function_generator.cc b/paddle/fluid/pybind/eager_op_function_generator.cc
index c15c171799f4421fc3e8b40a84abdbb062709dc7..102cdbb91ab066c4a6d499688bca30c1c3d185ad 100644
--- a/paddle/fluid/pybind/eager_op_function_generator.cc
+++ b/paddle/fluid/pybind/eager_op_function_generator.cc
@@ -17,6 +17,7 @@
 #include <iostream>
 #include <set>
 #include <string>
+#include <unordered_set>
 #ifndef _WIN32
 #include <unistd.h>
 #endif
@@ -129,6 +130,12 @@ static PyObject * %s(PyObject *self, PyObject *args, PyObject *kwargs)
 
 const char* PYBIND_ITEM_TEMPLATE = R"(  {"%s", (PyCFunction)(void(*)(void))%s, METH_VARARGS | METH_KEYWORDS, "C++ interface function for %s in dygraph."},)";
 
+// These operators will skip automatical code generatrion and
+// need to be handwritten in CUSTOM_HANDWRITE_OP_FUNC_FILE
+std::unordered_set<std::string> CUSTOM_HANDWRITE_OPS_SET = {"run_program"};
+const char* CUSTOM_HANDWRITE_OP_FUNC_FILE =
+  "#include \"paddle/fluid/pybind/custom_handwrite_op_funcs.h\"\n";
+
 // clang-format on
 static inline bool FindInsMap(const std::string& op_type,
                               const std::string& in_name) {
@@ -355,7 +362,7 @@ GenerateOpFunctions() {
 
   std::vector<std::string> op_function_list, bind_function_list;
   auto& all_kernels = paddle::framework::OperatorWithKernel::AllOpKernels();
-
+  bool append_custom_head_file = false;
   for (auto& pair : op_info_map) {
     auto& op_info = pair.second;
     auto op_proto = op_info.proto_;
@@ -363,7 +370,12 @@ GenerateOpFunctions() {
       continue;
     }
     auto& op_type = op_proto->type();
-    // Skip ooerator which is not inherit form OperatorWithKernel, like while,
+    // Skip operators that will be handwriten in CUSTOM_HANDWRITE_OP_FUNC_FILE.
+    if (CUSTOM_HANDWRITE_OPS_SET.count(op_type)) {
+      append_custom_head_file = true;
+      continue;
+    }
+    // Skip operator which is not inherit form OperatorWithKernel, like while,
     // since only OperatorWithKernel can run in dygraph mode.
     // if the phi lib contains op kernel, we still generate ops method
     if (!all_kernels.count(op_type) &&
@@ -380,6 +392,9 @@ GenerateOpFunctions() {
     op_function_list.emplace_back(std::move(op_function_str));
     bind_function_list.emplace_back(std::move(bind_function_str));
   }
+  if (append_custom_head_file) {
+    op_function_list.emplace_back(CUSTOM_HANDWRITE_OP_FUNC_FILE);
+  }
   return std::make_tuple(op_function_list, bind_function_list);
 }
 
@@ -449,6 +464,11 @@ int main(int argc, char* argv[]) {
       << "    PADDLE_THROW(platform::errors::Fatal (\"Add functions to "
          "core.eager.ops failed!\"));\n"
       << "  }\n\n"
+      << "  if (PyModule_AddFunctions(m.ptr(), CustomEagerFinalStateMethods) < "
+         "0) {\n"
+      << "    PADDLE_THROW(platform::errors::Fatal (\"Add functions to "
+         "core.eager.ops failed!\"));\n"
+      << "  }\n\n"
       << "}\n\n"
       << "} // namespace pybind\n"
       << "} // namespace paddle\n";
diff --git a/paddle/fluid/pybind/eager_properties.cc b/paddle/fluid/pybind/eager_properties.cc
index 2e1390cb96155c4832a8ceace889e331039ed43f..2572866b8f5198b2414163d4198e06b54d11fedc 100644
--- a/paddle/fluid/pybind/eager_properties.cc
+++ b/paddle/fluid/pybind/eager_properties.cc
@@ -72,7 +72,7 @@ PyObject* tensor_properties_get_grad(TensorObject* self, void* closure) {
   EAGER_TRY
   VLOG(6) << "Get grad for tensor: " << self->tensor.name();
   auto meta = egr::EagerUtils::nullable_autograd_meta(self->tensor);
-  if (meta) {
+  if (meta && meta->Grad().initialized()) {
     return ToPyObject(meta->Grad());
   } else {
     Py_INCREF(Py_None);
diff --git a/paddle/fluid/pybind/eager_utils.cc b/paddle/fluid/pybind/eager_utils.cc
index c1e8822eec22179266d69d3b97890aebe678b187..217edad0c0a105cc649c6c8c4433b0c8eab0119b 100644
--- a/paddle/fluid/pybind/eager_utils.cc
+++ b/paddle/fluid/pybind/eager_utils.cc
@@ -16,22 +16,27 @@ limitations under the License. */
 
 #include "paddle/fluid/eager/api/all.h"
 #include "paddle/fluid/eager/autograd_meta.h"
+#include "paddle/fluid/framework/convert_utils.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/framework/scope_guard.h"
 #include "paddle/fluid/memory/allocation/allocator.h"
 #include "paddle/fluid/operators/py_func_op.h"
+#include "paddle/fluid/operators/utils.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/pybind/eager.h"
 #include "paddle/fluid/pybind/eager_utils.h"
 #include "paddle/fluid/pybind/op_function_common.h"
 #include "paddle/fluid/pybind/tensor_py.h"
+#include "paddle/phi/api/ext/op_meta_info.h"
 #include "paddle/phi/common/data_type.h"
 #include "paddle/phi/core/compat/convert_utils.h"
 #include "paddle/phi/core/dense_tensor.h"
-
 namespace paddle {
 namespace pybind {
 
 extern PyTypeObject* p_tensor_type;
 
+extern PyTypeObject* g_framework_scope_pytype;
 extern PyTypeObject* g_vartype_pytype;
 extern PyTypeObject* g_place_pytype;
 extern PyTypeObject* g_cudaplace_pytype;
@@ -41,6 +46,7 @@ extern PyTypeObject* g_npuplace_pytype;
 extern PyTypeObject* g_cudapinnedplace_pytype;
 extern PyTypeObject* g_framework_tensor_pytype;
 extern PyTypeObject* g_framework_lodtensorarray_pytype;
+extern PyTypeObject* g_custom_op_kernel_ctx_pytype;
 
 int TensorDtype2NumpyDtype(phi::DataType dtype) {
   switch (dtype) {
@@ -179,11 +185,16 @@ paddle::experimental::Tensor CastPyArg2Tensor(PyObject* obj, ssize_t arg_pos) {
   } else {
     PADDLE_THROW(platform::errors::InvalidArgument(
         "argument (position %d) must be "
-        "EagerVariable, but got %s",
+        "Tensor, but got %s",
         arg_pos + 1, reinterpret_cast<PyTypeObject*>(obj->ob_type)->tp_name));
   }
 }
 
+std::shared_ptr<imperative::VarBase> CastPyArg2VarBase(PyObject* obj,
+                                                       ssize_t arg_pos) {
+  return py::cast<std::shared_ptr<imperative::VarBase>>(obj);
+}
+
 std::vector<paddle::experimental::Tensor> CastPyArg2VectorOfTensor(
     PyObject* obj, ssize_t arg_pos) {
   std::vector<paddle::experimental::Tensor> result;
@@ -309,7 +320,7 @@ framework::Tensor CastPyArg2FrameworkTensor(PyObject* obj, ssize_t arg_pos) {
   } else {
     PADDLE_THROW(platform::errors::InvalidArgument(
         "argument (position %d) must be "
-        "EagerVariable, but got %s",
+        "DenseTensor, but got %s",
         arg_pos + 1, reinterpret_cast<PyTypeObject*>(obj->ob_type)->tp_name));
   }
 }
@@ -381,6 +392,19 @@ paddle::framework::proto::VarType::Type CastPyArg2ProtoType(PyObject* obj,
   return dtype;
 }
 
+paddle::CustomOpKernelContext CastPyArg2CustomOpKernelContext(PyObject* obj,
+                                                              ssize_t arg_pos) {
+  if (PyObject_IsInstance(
+          obj, reinterpret_cast<PyObject*>(g_custom_op_kernel_ctx_pytype))) {
+    return ::pybind11::handle(obj).cast<paddle::CustomOpKernelContext>();
+  } else {
+    PADDLE_THROW(platform::errors::InvalidArgument(
+        "argument (position %d) must be "
+        "one of(Place,CUDAPlace,CPUPlace,XPUPlace,NPUPlace,CUDAPinnedPlace), "
+        "but got %s",
+        arg_pos + 1, reinterpret_cast<PyTypeObject*>(obj->ob_type)->tp_name));
+  }
+}
 PyObject* ToPyObject(bool value) {
   if (value) {
     Py_INCREF(Py_True);
@@ -579,14 +603,9 @@ paddle::optional<paddle::experimental::Tensor> GetOptionalTensorFromArgs(
       reinterpret_cast<TensorObject*>(obj)->tensor);
 }
 
-// For Intermediate State Dygraph,
-// we use an uninitialized Tensor to represent dispensable Tensor
-paddle::experimental::Tensor& GetTensorFromArgs(const std::string& op_type,
-                                                const std::string& arg_name,
-                                                PyObject* args, ssize_t arg_idx,
-                                                bool dispensable) {
-  PyObject* obj = PyTuple_GET_ITEM(args, arg_idx);
-
+static paddle::experimental::Tensor& GetTensorFromPyObject(
+    const std::string& op_type, const std::string& arg_name, PyObject* obj,
+    ssize_t arg_idx, bool dispensable) {
   if (PyTuple_Check(obj)) {
     obj = PyTuple_GET_ITEM(obj, 0);
   }
@@ -604,6 +623,16 @@ paddle::experimental::Tensor& GetTensorFromArgs(const std::string& op_type,
   return reinterpret_cast<TensorObject*>(obj)->tensor;
 }
 
+// For Intermediate State Dygraph,
+// we use an uninitialized Tensor to represent dispensable Tensor
+paddle::experimental::Tensor& GetTensorFromArgs(const std::string& op_type,
+                                                const std::string& arg_name,
+                                                PyObject* args, ssize_t arg_idx,
+                                                bool dispensable) {
+  PyObject* obj = PyTuple_GET_ITEM(args, arg_idx);
+  return GetTensorFromPyObject(op_type, arg_name, obj, arg_idx, dispensable);
+}
+
 std::vector<paddle::experimental::Tensor> GetTensorListFromArgs(
     const std::string& op_type, const std::string& arg_name, PyObject* args,
     ssize_t arg_idx, bool dispensable) {
@@ -737,5 +766,181 @@ std::vector<paddle::experimental::Tensor*> GetTensorPtrListFromArgs(
 
   return result;
 }
+
+paddle::experimental::Scalar CastPyArg2Scalar(PyObject* obj,
+                                              const std::string& op_type,
+                                              ssize_t arg_pos) {
+  if (obj == Py_None) {
+    PADDLE_THROW(platform::errors::InvalidArgument(
+        "%s(): argument (position %d) must be "
+        "int, float, bool or Tensor, but got %s",
+        op_type, arg_pos + 1,
+        ((PyTypeObject*)obj->ob_type)->tp_name));  // NOLINT
+  }
+
+  // obj could be: int, float, bool, paddle.Tensor
+  PyTypeObject* type = obj->ob_type;
+  auto type_name = std::string(type->tp_name);
+  if (type_name == "int") {
+    int value = CastPyArg2Int(obj, op_type, arg_pos);
+    return paddle::experimental::Scalar(value);
+  } else if (type_name == "float") {
+    float value = CastPyArg2Float(obj, op_type, arg_pos);
+    return paddle::experimental::Scalar(value);
+
+  } else if (type_name == "bool") {
+    bool value = CastPyArg2Boolean(obj, op_type, arg_pos);
+    return paddle::experimental::Scalar(value);
+
+  } else if (type_name == "paddle.Tensor") {
+    paddle::experimental::Tensor& value = GetTensorFromPyObject(
+        op_type, "" /*arg_name*/, obj, arg_pos, false /*dispensable*/);
+    return paddle::experimental::Scalar(value);
+
+  } else {
+    PADDLE_THROW(platform::errors::InvalidArgument(
+        "%s(): argument (position %d) must be "
+        "int, float, bool or Tensor, but got %s",
+        op_type, arg_pos + 1,
+        ((PyTypeObject*)obj->ob_type)->tp_name));  // NOLINT
+  }
+
+  // Fake a Scalar
+  return paddle::experimental::Scalar(1.0);
+}
+
+paddle::experimental::ScalarArray CastPyArg2ScalarArray(
+    PyObject* obj, const std::string& op_type, ssize_t arg_pos) {
+  // In case of ScalarArray, only two possible PyObjects:
+  // 1. list of int
+  // 2. Tensor
+  if (obj == Py_None) {
+    PADDLE_THROW(platform::errors::InvalidArgument(
+        "%s(): argument (position %d) must be "
+        "list or Tensor, but got %s",
+        op_type, arg_pos + 1,
+        ((PyTypeObject*)obj->ob_type)->tp_name));  // NOLINT
+  }
+
+  // obj could be: int, float, bool, paddle.Tensor
+  PyTypeObject* type = obj->ob_type;
+  auto type_name = std::string(type->tp_name);
+  if (type_name == "list") {
+    std::vector<int> value = CastPyArg2Ints(obj, op_type, arg_pos);
+    return paddle::experimental::ScalarArray(value);
+
+  } else if (type_name == "paddle.Tensor") {
+    paddle::experimental::Tensor& value = GetTensorFromPyObject(
+        op_type, "" /*arg_name*/, obj, arg_pos, false /*dispensable*/);
+    return paddle::experimental::ScalarArray(value);
+
+  } else {
+    PADDLE_THROW(platform::errors::InvalidArgument(
+        "%s(): argument (position %d) must be "
+        "list or Tensor, but got %s",
+        op_type, arg_pos + 1,
+        ((PyTypeObject*)obj->ob_type)->tp_name));  // NOLINT
+  }
+
+  // Fake a ScalarArray
+  return paddle::experimental::ScalarArray({1});
+}
+
+paddle::framework::Scope* CastPyArg2ScopePtr(PyObject* obj) {
+  if (PyObject_IsInstance(
+          obj, reinterpret_cast<PyObject*>(g_framework_scope_pytype))) {
+    return ::pybind11::handle(obj).cast<paddle::framework::Scope*>();
+  } else {
+    PADDLE_THROW(platform::errors::InvalidArgument(
+        "PyObject can not be cast into framework::Scope"));
+  }
+}
+
+std::vector<paddle::framework::Scope*> GetScopePtrListFromArgs(
+    const std::string& op_type, const std::string& arg_name, PyObject* args,
+    ssize_t arg_idx, bool dispensable) {
+  PyObject* list = PyTuple_GET_ITEM(args, arg_idx);
+  if (list == nullptr) {
+    if (!dispensable) {
+      PADDLE_THROW(platform::errors::InvalidArgument(
+          "%s(): argument '%s' (position %d) must be list of scope, but got "
+          "None",
+          op_type, arg_name, arg_idx));
+    }
+  }
+
+  std::vector<paddle::framework::Scope*> result;
+  if (PyList_Check(list)) {
+    Py_ssize_t len = PyList_Size(list);
+    if (len == 0) {
+      PADDLE_THROW(platform::errors::InvalidArgument(
+          "%s(): argument '%s' (position %d) must be list of scope, but got "
+          "empty list",
+          op_type, arg_name, arg_idx));
+    }
+    for (Py_ssize_t i = 0; i < len; i++) {
+      result.emplace_back(CastPyArg2ScopePtr(PyList_GetItem(list, i)));
+    }
+  } else if (PyTuple_Check(list)) {
+    Py_ssize_t len = PyTuple_Size(list);
+    if (len == 0) {
+      PADDLE_THROW(platform::errors::InvalidArgument(
+          "%s(): argument '%s' (position %d) must be list of scope, but got "
+          "empty list",
+          op_type, arg_name, arg_idx));
+    }
+    for (Py_ssize_t i = 0; i < len; i++) {
+      result.emplace_back(CastPyArg2ScopePtr(PyList_GetItem(list, i)));
+    }
+  } else if (list == Py_None) {
+    return {};
+  } else {
+    PADDLE_THROW(platform::errors::InvalidArgument(
+        "%s(): argument '%s' (position %d) must be list of Tensors, but got "
+        "%s",
+        op_type, arg_name, arg_idx,
+        (reinterpret_cast<PyTypeObject*>(list->ob_type))->tp_name));
+  }
+  return result;
+}
+
+paddle::experimental::Backend CastPyArg2Backend(PyObject* obj,
+                                                const std::string& op_type,
+                                                ssize_t arg_pos) {
+  if (obj == Py_None) {
+    PADDLE_THROW(platform::errors::InvalidArgument(
+        "%s(): argument (position %d) must be "
+        "int or place, but got %s",
+        op_type, arg_pos + 1,
+        ((PyTypeObject*)obj->ob_type)->tp_name));  // NOLINT
+  }
+
+  PyTypeObject* type = obj->ob_type;
+  auto type_name = std::string(type->tp_name);
+  if (type_name == "int") {
+    int value = CastPyArg2Int(obj, op_type, arg_pos);
+    return static_cast<paddle::experimental::Backend>(value);
+  } else {
+    platform::Place place = CastPyArg2Place(obj, arg_pos);
+    return phi::TransToPhiBackend(place);
+  }
+
+  return paddle::experimental::Backend::CPU;
+}
+
+paddle::experimental::DataType CastPyArg2DataType(PyObject* obj,
+                                                  const std::string& op_type,
+                                                  ssize_t arg_pos) {
+  if (obj == Py_None) {
+    PADDLE_THROW(platform::errors::InvalidArgument(
+        "%s(): argument (position %d) must be "
+        "data_type, but got %s",
+        op_type, arg_pos + 1,
+        ((PyTypeObject*)obj->ob_type)->tp_name));  // NOLINT
+  }
+
+  framework::proto::VarType::Type type = CastPyArg2ProtoType(obj, arg_pos);
+  return framework::TransToPhiDataType(type);
+}
 }  // namespace pybind
 }  // namespace paddle
diff --git a/paddle/fluid/pybind/eager_utils.h b/paddle/fluid/pybind/eager_utils.h
index 0c721d6124791edda7f41d46dcbbbfcccc80fb95..2187555e1c3c7f64bd864e4212bfc6ebe1fb1684 100644
--- a/paddle/fluid/pybind/eager_utils.h
+++ b/paddle/fluid/pybind/eager_utils.h
@@ -11,11 +11,19 @@ limitations under the License. */
 #pragma once
 
 #include <Python.h>
+#include "paddle/phi/common/backend.h"
+#include "paddle/phi/common/data_type.h"
+#include "paddle/phi/common/scalar.h"
+#include "paddle/phi/common/scalar_array.h"
 #include "paddle/phi/core/dense_tensor.h"
+
 #include "pybind11/pybind11.h"
 #include "pybind11/stl.h"
-
 namespace paddle {
+class CustomOpKernelContext;
+namespace framework {
+class Scope;
+}
 namespace pybind {
 
 typedef struct {
@@ -32,7 +40,11 @@ int CastPyArg2AttrInt(PyObject* obj, ssize_t arg_pos);
 int64_t CastPyArg2AttrLong(PyObject* obj, ssize_t arg_pos);
 float CastPyArg2AttrFloat(PyObject* obj, ssize_t arg_pos);
 std::string CastPyArg2AttrString(PyObject* obj, ssize_t arg_pos);
+paddle::CustomOpKernelContext CastPyArg2CustomOpKernelContext(PyObject* obj,
+                                                              ssize_t arg_pos);
 paddle::experimental::Tensor CastPyArg2Tensor(PyObject* obj, ssize_t arg_pos);
+std::shared_ptr<imperative::VarBase> CastPyArg2VarBase(PyObject* obj,
+                                                       ssize_t arg_pos);
 std::vector<paddle::experimental::Tensor> CastPyArg2VectorOfTensor(
     PyObject* obj, ssize_t arg_pos);
 platform::Place CastPyArg2Place(PyObject* obj, ssize_t arg_pos);
@@ -42,6 +54,7 @@ std::vector<framework::LoDTensor> CastPyArg2VectorOfTensorBase(PyObject* obj,
 std::vector<int> CastPyArg2VectorOfInt(PyObject* obj, size_t arg_pos);
 framework::proto::VarType::Type CastPyArg2ProtoType(PyObject* obj,
                                                     ssize_t arg_pos);
+
 PyObject* ToPyObject(int value);
 PyObject* ToPyObject(bool value);
 PyObject* ToPyObject(int64_t value);
@@ -89,6 +102,21 @@ PyObject* ToPyObject(const std::tuple<Args...>& out) {
   return result;
 }
 
+paddle::experimental::Scalar CastPyArg2Scalar(PyObject* obj,
+                                              const std::string& op_type,
+                                              ssize_t arg_pos);
+
+paddle::experimental::ScalarArray CastPyArg2ScalarArray(
+    PyObject* obj, const std::string& op_type, ssize_t arg_pos);
+
+paddle::experimental::Backend CastPyArg2Backend(PyObject* obj,
+                                                const std::string& op_type,
+                                                ssize_t arg_pos);
+
+paddle::experimental::DataType CastPyArg2DataType(PyObject* obj,
+                                                  const std::string& op_type,
+                                                  ssize_t arg_pos);
+
 paddle::optional<paddle::experimental::Tensor> GetOptionalTensorFromArgs(
     const std::string& op_type, const std::string& arg_name, PyObject* args,
     ssize_t arg_idx, bool dispensable = false);
@@ -112,5 +140,11 @@ std::vector<paddle::experimental::Tensor*> GetTensorPtrListFromArgs(
     const std::string& op_type, const std::string& arg_name, PyObject* args,
     ssize_t arg_idx, bool dispensable = false);
 
+// end of Slice related methods
+
+std::vector<paddle::framework::Scope*> GetScopePtrListFromArgs(
+    const std::string& op_type, const std::string& arg_name, PyObject* args,
+    ssize_t arg_idx, bool dispensable);
+
 }  // namespace pybind
 }  // namespace paddle
diff --git a/paddle/fluid/pybind/exception.cc b/paddle/fluid/pybind/exception.cc
index 362a3e44fab6254bef591bfd144e071821846271..4f25a6f1a5ca8d1a7926d148830934370e323e0f 100644
--- a/paddle/fluid/pybind/exception.cc
+++ b/paddle/fluid/pybind/exception.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/pybind/exception.h"
-
+#include "paddle/phi/api/ext/exception.h"
 namespace paddle {
 namespace pybind {
 
@@ -122,6 +122,8 @@ void ThrowExceptionToPython(std::exception_ptr p) {
         PyErr_SetString(EnforceNotMetException, e.what());
         break;
     }
+  } catch (const paddle::PD_Exception& e) {
+    PyErr_SetString(PyExc_OSError, e.what());
   }
 }
 }  // namespace pybind
diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc
index 8c5ed2d11830195a6fb70c54d12c9ef3eb3fc8b2..9b373a58181f165b52fe809b817977a076b7d961 100644
--- a/paddle/fluid/pybind/imperative.cc
+++ b/paddle/fluid/pybind/imperative.cc
@@ -54,7 +54,9 @@ limitations under the License. */
 #include "paddle/fluid/operators/utils.h"
 #include "paddle/fluid/pybind/op_function.h"
 #include "paddle/fluid/pybind/pybind_boost_headers.h"
+#include "paddle/fluid/pybind/slice_utils.h"
 #include "paddle/fluid/pybind/tensor_py.h"
+#include "paddle/phi/core/compat/arg_map_context.h"
 
 namespace paddle {
 namespace pybind {
@@ -319,6 +321,23 @@ static std::string GetTypeName(const imperative::VarBase &var) {
   }
 }
 
+Py_ssize_t GetSliceIndexFromPyObject(PyObject *obj) {
+  if (py::isinstance<imperative::VarBase>(obj)) {
+    VLOG(6) << "Call GetSliceIndexFromTensor in Imperative";
+    return GetSliceIndexFromTensor(
+        py::cast<std::shared_ptr<imperative::VarBase>>(obj)
+            ->Var()
+            .Get<framework::LoDTensor>());
+  } else {
+    PADDLE_THROW(platform::errors::InvalidArgument(
+        "We should only get paddle::experimental::Tensor or VarBase in this "
+        "method, when you reach this means we got another type index."));
+  }
+}
+
+bool PyCheckTensor(PyObject *obj) {
+  return py::isinstance<imperative::VarBase>(obj);
+}
 using PyNameVarBaseMap = std::unordered_map<std::string, py::handle>;
 
 // NOTE(zjl): py::handle is a very light wrapper of PyObject *.
@@ -360,18 +379,6 @@ GetVarBaseListFromPyHandle(const py::handle &handle) {
 
   return result;
 }
-static bool IsNumpyType(PyObject *obj) {
-  // It is not a good way to judge the type of obj by its type'name. Maybe using
-  // `PyArray_IsScalar` will be better. However, this interface cannot be used
-  // by including pybind11, and it needs to compile with numpy.
-  auto type_name = std::string(Py_TYPE(obj)->tp_name);
-  return type_name == "numpy.int64" || type_name == "numpy.longlong" ||
-         type_name == "numpy.int32" || type_name == "numpy.int16";
-}
-
-static bool PyCheckTensor(PyObject *obj) {
-  return py::isinstance<imperative::VarBase>(obj);
-}
 
 // cast numpy type form S to T, this may allocate new memory
 template <class T, class S>
@@ -429,260 +436,6 @@ static imperative::NameVarBaseMap ConvertToNameVarBaseMap(
   return result;
 }
 
-static bool PyCheckInteger(PyObject *obj) {
-#if PY_VERSION_HEX < 0x03000000
-  return (PyLong_Check(obj) || PyInt_Check(obj)) && !PyBool_Check(obj);
-#else
-  return PyLong_Check(obj) && !PyBool_Check(obj);
-#endif
-}
-
-static Py_ssize_t GetSliceIndexFromTensor(
-    const std::shared_ptr<imperative::VarBase> &tensor_index) {
-  const auto &tensor = tensor_index->Var().Get<framework::LoDTensor>();
-  if (tensor.numel() == 1) {
-    if (framework::TransToProtoVarType(tensor.dtype()) ==
-        framework::proto::VarType::INT32) {
-      return static_cast<Py_ssize_t>(operators::GetValue<int32_t>(&tensor));
-    } else if (framework::TransToProtoVarType(tensor.dtype()) ==
-               framework::proto::VarType::INT64) {
-      return static_cast<Py_ssize_t>(operators::GetValue<int64_t>(&tensor));
-    } else {
-      PADDLE_THROW(platform::errors::InvalidArgument(
-          "Currently, the type of tensor in slice indices only allows "
-          "int32 and int64, please check the type of index tensor."));
-    }
-  } else {
-    PADDLE_THROW(platform::errors::InvalidArgument(
-        "Currently, tensor in slice indices only allows 1 element, "
-        "but received %d.",
-        tensor.numel()));
-  }
-}
-
-// NOTE(zhiqiu): Revised version of PySlice_GetIndices. From:
-// https://github.com/python/cpython/blob/8d21aa21f2cbc6d50aab3f420bb23be1d081dac4/Objects/sliceobject.c#L103
-// Original PySlice_GetIndices return wrong result when
-// slice_item contains long int, such as arr[:180L].
-// NOT sure why this happens !!!
-// Besides, PySlice_GetIndices cannot raise error when float in slice item.
-// So, I make a revised version of PySlice_GetIndices, named to
-// _PySlice_GetIndices. Try to use _PySlice_Unpack which is more robust than
-// PySlice_GetIndices in the future.
-static int _PySlice_GetIndices(PySliceObject *r, Py_ssize_t length,
-                               Py_ssize_t *start, Py_ssize_t *stop,
-                               Py_ssize_t *step) {
-  /* XXX support long ints */
-  if (r->step == Py_None) {
-    *step = 1;
-  } else {
-    if (PyCheckInteger(r->step) || IsNumpyType(r->step)) {
-      *step = PyLong_AsLong(r->step);
-    } else if (PyCheckTensor(r->step)) {
-      *step = GetSliceIndexFromTensor(
-          py::cast<std::shared_ptr<imperative::VarBase>>(r->step));
-    } else {
-      PADDLE_THROW(platform::errors::InvalidArgument(
-          "Currently, slice indices only allows None, integers, "
-          "tensor(int) and numpy(int) in slice item, but received %s.",
-          std::string(Py_TYPE(r->step)->tp_name)));
-    }
-  }
-  if (r->start == Py_None) {
-    *start = *step < 0 ? length - 1 : 0;
-  } else {
-    if (PyCheckInteger(r->start) || IsNumpyType(r->start)) {
-      *start = PyLong_AsLong(r->start);
-    } else if (PyCheckTensor(r->start)) {
-      *start = GetSliceIndexFromTensor(
-          py::cast<std::shared_ptr<imperative::VarBase>>(r->start));
-    } else {
-      PADDLE_THROW(platform::errors::InvalidArgument(
-          "Currently, slice indices only allows None, integers, "
-          "tensor(int) and numpy(int) in slice item, but received %s.",
-          std::string(Py_TYPE(r->start)->tp_name)));
-    }
-    if (*start < 0) *start += length;
-    *start = std::max(*start, static_cast<Py_ssize_t>(0));
-  }
-  if (r->stop == Py_None) {
-    *stop = *step < 0 ? -1 : length;
-  } else {
-    if (PyCheckInteger(r->stop) || IsNumpyType(r->stop)) {
-      *stop = PyLong_AsLong(r->stop);
-    } else if (PyCheckTensor(r->stop)) {
-      *stop = GetSliceIndexFromTensor(
-          py::cast<std::shared_ptr<imperative::VarBase>>(r->stop));
-    } else {
-      PADDLE_THROW(platform::errors::InvalidArgument(
-          "Currently, slice indices only allows None, integers, "
-          "tensor(int) and numpy(int) in slice item, but received %s.",
-          std::string(Py_TYPE(r->stop)->tp_name)));
-    }
-    if (0 < *step && *stop < 0) *stop += length;
-    *stop = std::min(*stop, length);
-  }
-  if (*stop > length) return -1;
-  if (*start >= length) return -1;
-  if (*step == 0) return -1;
-  return 0;
-}
-
-static void ParseIndexingSlice(
-    framework::LoDTensor *tensor, PyObject *_index,
-    std::vector<int> *slice_axes, std::vector<int> *slice_starts,
-    std::vector<int> *slice_ends, std::vector<int> *slice_strides,
-    std::vector<int> *decrease_axis, std::vector<int> *none_axes,
-    std::vector<int> *infer_flags, std::vector<int> *list_select_idxs,
-    bool *list_select_flag) {
-  // We allow indexing by Integers, Slices, Ellipsis, None, tuples of those
-  // types, and list of Bool and Integers.
-  // wrap to tuple
-
-  // NOTE(zhiqiu): PyTuple_Pack increases refcount.
-  PyObject *index = !PyTuple_Check(_index) ? PyTuple_Pack(1, _index) : _index;
-  DEFINE_PADDLE_SCOPE_GUARD([index, _index]() {
-    if (!PyTuple_Check(_index)) {
-      Py_DECREF(index);
-      VLOG(4) << "Call Py_DECREF";
-    }
-  });
-  PADDLE_ENFORCE_EQ(
-      tensor->IsInitialized(), true,
-      platform::errors::InvalidArgument("tensor has not been initialized"));
-  const auto &shape = tensor->dims();
-  const int rank = shape.size();
-  const int size = PyTuple_GET_SIZE(index);
-
-  // specified_dims is the number of dimensions which indexed by Interger,
-  // Slices.
-  int specified_dims = 0;
-  int ell_count = 0;
-  for (int dim = 0; dim < size; ++dim) {
-    PyObject *slice_item = PyTuple_GetItem(index, dim);
-    if (PyCheckInteger(slice_item) || PySlice_Check(slice_item)) {
-      specified_dims++;
-    } else if (slice_item == Py_Ellipsis) {
-      ell_count++;
-    }
-  }
-
-  PADDLE_ENFORCE_LE(ell_count, 1,
-                    platform::errors::InvalidArgument(
-                        "An index can only have a single ellipsis ('...')"));
-  int none_count = 0;
-  for (int i = 0, dim = 0; i < size; ++i) {
-    PyObject *slice_item = PyTuple_GetItem(index, i);
-
-    infer_flags->push_back(1);
-    int dim_len = shape[dim];
-    if (PyCheckInteger(slice_item) || IsNumpyType(slice_item)) {
-      // integer, PyLong_AsLong supports both int and long
-      int start = static_cast<int>(PyLong_AsLong(slice_item));
-      auto s_t = start;
-      start = start < 0 ? start + dim_len : start;
-      if (start >= dim_len || start < 0) {
-        std::string str_error_message =
-            "The starting index " + std::to_string(s_t) +
-            " of slice is out of bounds in tensor " + std::to_string(dim) +
-            "-th axis, it shound be in the range of [" +
-            std::to_string(-dim_len) + ", " + std::to_string(dim_len) + ")";
-        // py::index_error is corresponding to IndexError in Python
-        // Used to indicate out of bounds access in __getitem__, __setitem__
-        throw py::index_error(str_error_message);
-      }
-      slice_axes->push_back(dim);
-      slice_starts->push_back(start);
-      slice_ends->push_back(start + 1);
-      slice_strides->push_back(1);
-      decrease_axis->push_back(dim);
-      dim++;
-    } else if (PySlice_Check(slice_item)) {
-      // slice item
-      Py_ssize_t start, end, step;
-      PySliceObject *p = reinterpret_cast<PySliceObject *>(slice_item);
-      _PySlice_GetIndices(p, dim_len, &start, &end, &step);
-
-      // :: or : or 0:dim_len:1
-      if (start == 0 && end == dim_len && step == 1) {
-        dim++;
-        continue;
-      }
-      slice_axes->push_back(dim);
-      slice_starts->push_back(start);
-      slice_ends->push_back(end);
-      slice_strides->push_back(step);
-      dim++;
-    } else if (slice_item == Py_Ellipsis) {
-      dim += rank - specified_dims;
-    } else if (slice_item == Py_None) {
-      none_axes->push_back(dim + none_count);
-      none_count++;
-    } else if (PyList_Check(slice_item)) {
-      *list_select_flag = true;
-      PADDLE_ENFORCE_EQ(
-          size, 1,
-          platform::errors::InvalidArgument(
-              "When index contains a list, its length is excepted to 1, "
-              "but received %d",
-              size));
-      bool all_bool = true;
-      int list_size = PyList_GET_SIZE(slice_item);
-      for (int j = 0; j < list_size; ++j) {
-        PyObject *list_item = PyList_GetItem(slice_item, j);
-        if (PyCheckInteger(list_item)) {
-          all_bool = false;
-        } else if (!PyBool_Check(list_item)) {
-          PADDLE_THROW(platform::errors::InvalidArgument(
-              "Only support int or bool in index list."));
-        }
-      }
-      if (all_bool) {
-        PADDLE_ENFORCE_EQ(
-            list_size, shape[0],
-            platform::errors::InvalidArgument(
-                "The dimension of bool index doesn't match indexed array along "
-                "dimension 0, the target dimension is %d, but received %d.",
-                shape[0], list_size));
-
-        for (int j = 0; j < list_size; ++j) {
-          PyObject *list_item = PyList_GetItem(slice_item, j);
-          if (list_item == Py_True) {
-            list_select_idxs->push_back(j);
-          }
-        }
-      } else {
-        for (int j = 0; j < list_size; ++j) {
-          PyObject *list_item = PyList_GetItem(slice_item, j);
-          if (PyCheckInteger(list_item)) {
-            list_select_idxs->push_back(
-                static_cast<int>(PyLong_AsLong(list_item)));
-          } else if (list_item == Py_True) {
-            list_select_idxs->push_back(1);
-          } else {
-            list_select_idxs->push_back(0);
-          }
-        }
-      }
-
-    } else {
-      PADDLE_THROW(platform::errors::InvalidArgument(
-          "Currently, Tensor.__indices__() only allows indexing "
-          "by Integers, Slices, Ellipsis, None, tuples of these types "
-          "and list of Bool and Integers, but received "
-          "%s in %dth slice item",
-          std::string(Py_TYPE(slice_item)->tp_name), i + 1));
-    }
-  }
-
-  // valid_index is the number of dimensions exclude None index
-  const int valid_indexs = size - none_axes->size() - ell_count;
-  PADDLE_ENFORCE_EQ(valid_indexs <= rank, true,
-                    platform::errors::InvalidArgument(
-                        "Too many indices (%d) for tensor of dimension %d.",
-                        valid_indexs, rank));
-}
-
 template <typename P>
 static void VarBaseCopy(std::shared_ptr<imperative::VarBase> &src,  // NOLINT
                         imperative::VarBase &dst,                   // NOLINT
@@ -2321,6 +2074,26 @@ void BindImperative(py::module *m_ptr) {
                  *(imperative::AmpOperators::Instance().GetMutableAllowOps()),
                  *(imperative::AmpOperators::Instance().GetMutableBlockOps()));
            })
+      .def("_get_kernel_signature",
+           [](imperative::Tracer &self, const std::string &type,
+              const PyNameVarBaseMap &ins, const PyNameVarBaseMap &outs,
+              framework::AttributeMap attrs) {
+             // TODO(xiongkun): move this function outside of tracer.
+             auto ins_map = ConvertToNameVarBaseMap(ins);
+             auto outs_map = ConvertToNameVarBaseMap(outs);
+             {
+               auto to_vector = [](paddle::SmallVector<std::string> &vec) {
+                 return std::vector<std::string>(vec.begin(), vec.end());
+               };
+               auto ret = self.GetExpectedKernelSignature(type, ins_map,
+                                                          outs_map, attrs);
+               auto kernelsig_ins = to_vector(std::get<0>(ret.args));
+               auto kernelsig_attrs = to_vector(std::get<1>(ret.args));
+               auto kernelsig_outs = to_vector(std::get<2>(ret.args));
+               return std::make_tuple(kernelsig_ins, kernelsig_attrs,
+                                      kernelsig_outs);
+             }
+           })
       .def("trace",
            [](imperative::Tracer &self, const std::string &type,
               const PyNameVarBaseMap &ins, const PyNameVarBaseMap &outs,
diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc
index 9b5041154c95a8555e93a8995d2c230cd537f71b..b008308e27d9afaa9d8c47290489d50a762f2a41 100644
--- a/paddle/fluid/pybind/inference_api.cc
+++ b/paddle/fluid/pybind/inference_api.cc
@@ -33,6 +33,10 @@
 #include "paddle/fluid/inference/api/paddle_pass_builder.h"
 #include "paddle/fluid/inference/utils/io_utils.h"
 
+#ifdef PADDLE_WITH_ONNXRUNTIME
+#include "paddle/fluid/inference/api/onnxruntime_predictor.h"
+#endif
+
 namespace py = pybind11;
 
 namespace pybind11 {
@@ -556,6 +560,10 @@ void BindAnalysisConfig(py::module *m) {
            py::arg("device_id") = 0)
       .def("enable_npu", &AnalysisConfig::EnableNpu, py::arg("device_id") = 0)
       .def("disable_gpu", &AnalysisConfig::DisableGpu)
+      .def("enable_onnxruntime", &AnalysisConfig::EnableONNXRuntime)
+      .def("disable_onnxruntime", &AnalysisConfig::DisableONNXRuntime)
+      .def("onnxruntime_enabled", &AnalysisConfig::use_onnxruntime)
+      .def("enable_ort_optimization", &AnalysisConfig::EnableORTOptimization)
       .def("use_gpu", &AnalysisConfig::use_gpu)
       .def("use_xpu", &AnalysisConfig::use_xpu)
       .def("use_npu", &AnalysisConfig::use_npu)
diff --git a/paddle/fluid/pybind/kernel_signature_generator.cc b/paddle/fluid/pybind/kernel_signature_generator.cc
index f0d5a4e477fe47e755c08baa9611d04eea3cc78c..8d78adaf5a4735d87e2206df6c8b55875db68118 100644
--- a/paddle/fluid/pybind/kernel_signature_generator.cc
+++ b/paddle/fluid/pybind/kernel_signature_generator.cc
@@ -44,35 +44,41 @@ int main(int argc, char **argv) {
   paddle::framework::InitDefaultKernelSignatureMap();
   auto &kernel_signature_map = phi::DefaultKernelSignatureMap::Instance();
   auto &kernel_factory = phi::KernelFactory::Instance();
-  std::cout << "{";
+  std::string kernel_signature_map_str{"{"};
   for (const auto &op_kernel_pair : kernel_factory.kernels()) {
     if (kernel_signature_map.Has(op_kernel_pair.first)) {
-      std::cout << "\"" << op_kernel_pair.first << "\":{";
+      kernel_signature_map_str =
+          kernel_signature_map_str + "\"" + op_kernel_pair.first + "\":{";
       auto &args = kernel_signature_map.Get(op_kernel_pair.first).args;
 
-      std::cout << "\"inputs\":[";
+      kernel_signature_map_str += "\"inputs\":[";
       auto inputs_ = std::get<0>(args);
-      if (inputs_.size() > 0) std::cout << inputs_[0];
-      for (size_t i = 1; i < inputs_.size(); i++) {
-        std::cout << ",\"" << inputs_[i] << "\"";
+      for (size_t i = 0; i < inputs_.size(); i++) {
+        kernel_signature_map_str =
+            kernel_signature_map_str + "\"" + inputs_[i] + "\",";
       }
+      if (inputs_.size()) kernel_signature_map_str.pop_back();
 
-      std::cout << "],\"attrs\":[";
+      kernel_signature_map_str += "],\"attrs\":[";
       auto attrs_ = std::get<1>(args);
-      if (attrs_.size() > 0) std::cout << attrs_[0];
-      for (size_t i = 1; i < attrs_.size(); i++) {
-        std::cout << ",\"" << attrs_[i] << "\"";
+      for (size_t i = 0; i < attrs_.size(); i++) {
+        kernel_signature_map_str =
+            kernel_signature_map_str + "\"" + attrs_[i] + "\",";
       }
-
-      std::cout << "],\"outputs\":[";
+      if (attrs_.size()) kernel_signature_map_str.pop_back();
+      kernel_signature_map_str += "],\"outputs\":[";
       auto outputs_ = std::get<2>(args);
-      for (size_t i = 1; i < outputs_.size(); i++) {
-        std::cout << ",\"" << outputs_[i] << "\"";
+      for (size_t i = 0; i < outputs_.size(); i++) {
+        kernel_signature_map_str =
+            kernel_signature_map_str + "\"" + outputs_[i] + "\",";
       }
 
-      std::cout << "]},";
+      if (outputs_.size()) kernel_signature_map_str.pop_back();
+      kernel_signature_map_str += "]},";
     }
   }
-  std::cout << "}" << std::endl;
+  kernel_signature_map_str.pop_back();
+  kernel_signature_map_str += "}\n";
+  std::cout << kernel_signature_map_str;
   return 0;
 }
diff --git a/paddle/fluid/pybind/op_function_generator.h b/paddle/fluid/pybind/op_function_generator.h
index d23b3dd64ab05cf10d8096a84e317645972211d1..9e86e3df8a6884ec1b75b8525ad858ff8f2e233c 100644
--- a/paddle/fluid/pybind/op_function_generator.h
+++ b/paddle/fluid/pybind/op_function_generator.h
@@ -30,8 +30,8 @@ std::map<std::string, std::set<std::string>> op_ins_map = {
     {"layer_norm", {"X", "Scale", "Bias"}},
     {"bincount", {"X", "Weights"}},
     {"fused_attention",
-     {"X", "LnScale", "LnBias", "QKVW", "QKVBias", "SrcMask", "OutLinearW",
-      "OutLinearBias", "Ln2Scale", "Ln2Bias"}},
+     {"X", "LnScale", "LnBias", "QKVW", "QKVBias", "CacheKV", "SrcMask",
+      "OutLinearW", "OutLinearBias", "Ln2Scale", "Ln2Bias"}},
     {"instance_norm", {"X", "Scale", "Bias"}},
     {"gru_unit", {"Input", "HiddenPrev", "Weight", "Bias"}},
     {"label_smooth", {"X", "PriorDist"}},
@@ -104,11 +104,16 @@ std::map<std::string, std::set<std::string>> op_outs_map = {
     {"batch_norm",
      {"Y", "MeanOut", "VarianceOut", "SavedMean", "SavedVariance",
       "ReserveSpace"}},
-    {"fused_attention",
-     {"LnMean", "LnVariance", "LnOut", "QKVOut", "QKVBiasOut", "TransposeOut2",
-      "QKOut", "QKTVOut", "SoftmaxOut", "AttnDropoutMaskOut", "AttnDropoutOut",
-      "SrcMaskOut", "FMHAOut", "OutLinearOut", "DropoutMaskOut", "Ln2Mean",
-      "Ln2Variance", "BiasDropoutResidualOut", "Y"}},
+    {"fused_attention", {"LnMean",         "LnVariance",
+                         "LnOut",          "QKVOut",
+                         "QKVBiasOut",     "TransposeOut2",
+                         "QKOut",          "QKTVOut",
+                         "SoftmaxOut",     "AttnDropoutMaskOut",
+                         "AttnDropoutOut", "SrcMaskOut",
+                         "FMHAOut",        "OutLinearOut",
+                         "DropoutMaskOut", "Ln2Mean",
+                         "Ln2Variance",    "BiasDropoutResidualOut",
+                         "CacheKVOut",     "Y"}},
     {"sync_batch_norm",
      {"Y", "MeanOut", "VarianceOut", "SavedMean", "SavedVariance",
       "ReserveSpace"}},
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index 2d9272dd0ed27b7b6e129a8b4c1c689cd0442235..21bbc7f3e369bf66935487d3f3619c9a0890399b 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -1,4 +1,5 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+Copyright (c) 2022 NVIDIA Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -63,6 +64,9 @@ limitations under the License. */
 #include "paddle/fluid/imperative/amp_auto_cast.h"
 #include "paddle/fluid/imperative/layer.h"
 #include "paddle/fluid/memory/allocation/allocator_strategy.h"
+#ifdef PADDLE_WITH_CUDA
+#include "paddle/fluid/memory/allocation/cuda_ipc_allocator.h"
+#endif
 #include "paddle/fluid/memory/allocation/mmap_allocator.h"
 #include "paddle/fluid/operators/activation_op.h"
 #include "paddle/fluid/operators/common_infer_shape_functions.h"
@@ -77,9 +81,13 @@ limitations under the License. */
 #include "paddle/fluid/platform/monitor.h"
 #include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/platform/profiler.h"
+#include "paddle/fluid/platform/profiler/event_python.h"
+#include "paddle/fluid/platform/profiler/event_tracing.h"
+#include "paddle/fluid/platform/profiler/profiler.h"
 #include "paddle/fluid/pybind/cuda_streams_py.h"
 #include "paddle/fluid/pybind/distributed_py.h"
 #include "paddle/fluid/pybind/eager.h"
+#include "paddle/fluid/pybind/imperative.h"
 #include "paddle/fluid/pybind/io.h"
 #include "paddle/phi/core/compat/convert_utils.h"
 #include "paddle/phi/core/lod_utils.h"
@@ -101,7 +109,6 @@ limitations under the License. */
 #include "paddle/fluid/pybind/gloo_context_py.h"
 #include "paddle/fluid/pybind/gloo_wrapper_py.h"
 #include "paddle/fluid/pybind/heter_wrapper_py.h"
-#include "paddle/fluid/pybind/imperative.h"
 #include "paddle/fluid/pybind/inference_api.h"
 #include "paddle/fluid/pybind/ir.h"
 #include "paddle/fluid/pybind/metrics_py.h"
@@ -157,6 +164,9 @@ limitations under the License. */
 #include "paddle/fluid/pybind/fleet_py.h"
 #endif
 
+#include "paddle/fluid/eager/api/utils/global_utils.h"
+#include "paddle/fluid/pybind/eager_utils.h"
+#include "paddle/phi/api/ext/op_meta_info.h"
 #include "pybind11/stl.h"
 
 DECLARE_bool(use_mkldnn);
@@ -171,6 +181,7 @@ namespace paddle {
 namespace pybind {
 
 PyTypeObject *g_place_pytype = nullptr;
+PyTypeObject *g_framework_scope_pytype = nullptr;
 PyTypeObject *g_cudaplace_pytype = nullptr;
 PyTypeObject *g_cpuplace_pytype = nullptr;
 PyTypeObject *g_xpuplace_pytype = nullptr;
@@ -179,6 +190,7 @@ PyTypeObject *g_cudapinnedplace_pytype = nullptr;
 PyTypeObject *g_mluplace_pytype = nullptr;
 PyTypeObject *g_framework_tensor_pytype = nullptr;
 PyTypeObject *g_framework_lodtensorarray_pytype = nullptr;
+PyTypeObject *g_custom_op_kernel_ctx_pytype = nullptr;
 
 bool IsCompiledWithCUDA() {
 #if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP)
@@ -527,6 +539,7 @@ PYBIND11_MODULE(core_avx, m) {
 PYBIND11_MODULE(core_noavx, m) {
 #endif
 
+  BindImperative(&m);
   BindEager(&m);
   BindCudaStream(&m);
 
@@ -723,6 +736,13 @@ PYBIND11_MODULE(core_noavx, m) {
                lib[string]: the libarary, could be 'phi', 'fluid' and 'all'.
            )DOC");
 
+  // NOTE(Aganlengzi): KernelFactory static instance is initialized BEFORE
+  // plugins are loaded for custom kernels, but de-initialized AFTER they are
+  // unloaded. We need manually clear symbols(may contain plugins' symbols)
+  // stored in this static instance to avoid illegal memory access.
+  m.def("clear_kernel_factory",
+        []() { phi::KernelFactory::Instance().kernels().clear(); });
+
   // NOTE(zjl): ctest would load environment variables at the beginning even
   // though we have not `import paddle.fluid as fluid`. So we add this API
   // to enable eager deletion mode in unittest.
@@ -741,7 +761,56 @@ PYBIND11_MODULE(core_noavx, m) {
   m.def("_promote_types_if_complex_exists",
         &paddle::framework::PromoteTypesIfComplexExists);
 
-  BindImperative(&m);
+  py::class_<paddle::CustomOpKernelContext> custom_op_kernel_ctx(
+      m, "CustomOpKernelContext", R"DOC()DOC");
+  g_custom_op_kernel_ctx_pytype =
+      reinterpret_cast<PyTypeObject *>(custom_op_kernel_ctx.ptr());
+  custom_op_kernel_ctx.def(py::init<>())
+      .def("add_inputs",
+           [](paddle::CustomOpKernelContext &self, const py::handle &input) {
+             PyObject *obj = input.ptr();
+             if (PyList_Check(obj) || PyTuple_Check(obj)) {
+               self.EmplaceBackInputs(
+                   std::move(CastPyArg2VectorOfTensor(obj, 1)));
+             } else {
+               self.EmplaceBackInput(std::move(CastPyArg2Tensor(obj, 1)));
+             }
+           })
+      .def("add_outputs",
+           [](paddle::CustomOpKernelContext &self, py::handle &outputs) {
+             PyObject *obj = outputs.ptr();
+             if (PyList_Check(obj) || PyTuple_Check(obj)) {
+               self.EmplaceBackOutputs(
+                   std::move(CastPyArg2VectorOfTensor(obj, 1)));
+             } else {
+               self.EmplaceBackOutput(std::move(CastPyArg2Tensor(obj, 1)));
+             }
+           })
+      .def("add_attr", [](paddle::CustomOpKernelContext &self,
+                          bool attr) { self.EmplaceBackAttr(attr); })
+      .def("add_attr", [](paddle::CustomOpKernelContext &self,
+                          int attr) { self.EmplaceBackAttr(attr); })
+      .def("add_attr", [](paddle::CustomOpKernelContext &self,
+                          float attr) { self.EmplaceBackAttr(attr); })
+      .def("add_attr", [](paddle::CustomOpKernelContext &self,
+                          int64_t attr) { self.EmplaceBackAttr(attr); })
+      .def("add_attr",
+           [](paddle::CustomOpKernelContext &self, const std::string &attr) {
+             self.EmplaceBackAttr(attr);
+           })
+      .def("add_attr",
+           [](paddle::CustomOpKernelContext &self,
+              const std::vector<int> &attr) { self.EmplaceBackAttr(attr); })
+      .def("add_attr",
+           [](paddle::CustomOpKernelContext &self,
+              const std::vector<float> &attr) { self.EmplaceBackAttr(attr); })
+      .def("add_attr",
+           [](paddle::CustomOpKernelContext &self,
+              const std::vector<int64_t> &attr) { self.EmplaceBackAttr(attr); })
+      .def("add_attr", [](paddle::CustomOpKernelContext &self,
+                          const std::vector<std::string> &attr) {
+        self.EmplaceBackAttr(attr);
+      });
 
   py::class_<framework::Tensor> framework_tensor(m, "Tensor",
                                                  py::buffer_protocol());
@@ -1176,6 +1245,287 @@ PYBIND11_MODULE(core_noavx, m) {
            });
 #else
            })
+#ifdef PADDLE_WITH_CUDA
+      .def("_share_buffer_with",
+           [](framework::Tensor &self, const framework::Tensor src,
+              py::tuple t) {
+             auto *cuda_ipc_allocation =
+                 dynamic_cast<memory::allocation::CudaIpcAllocation *>(
+                     src.Holder().get());
+
+             PADDLE_ENFORCE_NOT_NULL(
+                 cuda_ipc_allocation,
+                 platform::errors::PreconditionNotMet(
+                     "Tensor is not Cuda IPC shared tensor. "
+                     "Now only Tensor shared by cuda ipc could use this "
+                     "api."));
+
+             size_t size = t[0].cast<size_t>();
+             auto dtype =
+                 static_cast<paddle::experimental::DataType>(t[1].cast<int>());
+             auto dims = phi::make_ddim(t[2].cast<std::vector<int>>());
+             auto lod_info = t[3].cast<framework::LoD>();
+             auto device_id = t[4].cast<int>();
+
+             auto shared_reader_holder =
+                 std::make_shared<memory::allocation::Allocation>(
+                     cuda_ipc_allocation->ptr(),
+                     cuda_ipc_allocation->base_ptr(), size,
+                     platform::CUDAPlace(device_id));
+
+             self.ResetHolderWithType(shared_reader_holder, dtype);
+             self.Resize(dims);
+             self.set_lod(lod_info);
+
+             VLOG(6) << "Reconstructed tensor with buffer shared!";
+           },
+           R"DOC(
+           Deserialize GPU Tensor for existed shared Cuda IPC tensor.
+
+           Params:
+               tensor: Shared Cuda IPC tensor.
+               tuple: contrains data size, data type,
+                      tensor dims, lod information, device index.
+
+       )DOC")
+      .def("_share_cuda",
+           [](framework::Tensor self) {
+             if (!self.IsInitialized() || self.numel() == 0)
+               throw std::runtime_error(
+                   "Tensor not initialized or numel is 0.  could not pass "
+                   "to shared memory. ");
+
+             auto *holder = dynamic_cast<memory::allocation::Allocation *>(
+                 self.Holder().get());
+             PADDLE_ENFORCE_EQ(
+                 platform::is_gpu_place(holder->place()), true,
+                 platform::errors::InvalidArgument(
+                     "Tensor is not on GPU. share_cuda only support GPU "
+                     "Tensor, share_filename is for CPU tensor."));
+
+             void *base_ptr = holder->base_ptr();
+             ptrdiff_t offset_bytes = reinterpret_cast<char *>(holder->ptr()) -
+                                      reinterpret_cast<char *>(base_ptr);
+
+             cudaIpcMemHandle_t handle;
+             PADDLE_ENFORCE_GPU_SUCCESS(cudaIpcGetMemHandle(&handle, base_ptr));
+
+             auto _handle = py::bytes(reinterpret_cast<char *>(&handle),
+                                      (py::ssize_t)CUDA_IPC_HANDLE_SIZE);
+
+             // TODO(ZHUI): use cuda event, to avoid sync.
+             const auto &device_id = paddle::platform::GetCurrentDeviceId();
+             auto stream =
+                 paddle::platform::stream::get_current_stream(device_id);
+             stream->Synchronize();
+
+             int type_idx = static_cast<int>(self.type());
+             size_t data_size =
+                 self.numel() *
+                 framework::SizeOfType(
+                     framework::TransToProtoVarType(self.type()));
+
+             return py::make_tuple(_handle, (py::size_t)offset_bytes, data_size,
+                                   type_idx, vectorize(self.dims()), self.lod(),
+                                   device_id);
+           },
+           R"DOC(
+           Serialize GPU Tensor by cudaIpcMemHandle.
+
+           Returns:
+               tuple: contrains handle, data size, data type,
+                      tensor dims, lod information, device index.
+
+           Examples:
+               .. code-block:: python
+
+                 import paddle
+                 tensor = paddle.ones([3,3])
+                 metainfo = tensor.value().get_tensor()._share_cuda()
+
+      )DOC")
+      .def("_new_shared_cuda",
+           [](py::tuple t) {
+             if (t.size() != 7)
+               throw std::runtime_error(
+                   "Invalid Tensor meta info for shared cuda tensor!");
+
+             // 1. Create a new C++ instance
+             framework::Tensor tensor;
+
+             // 2. Rebuild Allocation from handle
+             const std::string &handle = t[0].cast<std::string>();
+             ptrdiff_t offset_bytes = (ptrdiff_t)t[1].cast<int64_t>();
+             auto device_id = t[6].cast<int>();
+             auto base_ptr = memory::allocation::GetIpcBasePtr(handle);
+             size_t size = t[2].cast<size_t>();
+             void *dev = base_ptr.get();
+             dev = reinterpret_cast<char *>(dev) + offset_bytes;
+
+             auto shared_reader_holder =
+                 std::make_shared<memory::allocation::CudaIpcAllocation>(
+                     dev, size, device_id, std::move(base_ptr));
+
+             // 3. Rebuild Tensor
+             tensor.ResetHolderWithType(
+                 shared_reader_holder,
+                 static_cast<paddle::experimental::DataType>(t[3].cast<int>()));
+             tensor.Resize(phi::make_ddim(t[4].cast<std::vector<int>>()));
+             tensor.set_lod(t[5].cast<framework::LoD>());
+
+             return tensor;
+           },
+           R"DOC(
+           Deserialize GPU lod tensor from cudaIpcMemHandle.
+
+           Params:
+               tuple: contrains handle, data size, data type,
+                      tensor dims, lod information, device index.
+
+           Examples:
+               .. code-block:: python
+
+                 import paddle
+                 tensor = paddle.ones([3,3])
+                 metainfo = tensor.value().get_tensor()._share_cuda()
+                 tensor_from_shared = paddle.to_tensor(paddle.fluid.core.LoDTensor._new_shared_cuda(metainfo))
+
+        )DOC")
+#endif
+      .def("_share_filename",
+           [](framework::Tensor &self) {
+             if (!self.IsInitialized() || self.numel() == 0)
+               throw std::runtime_error(
+                   "Tensor not initialized or numel is 0. could not pass to "
+                   "shared memory. ");
+
+             auto holder = self.Holder();
+             PADDLE_ENFORCE_EQ(
+                 platform::is_cpu_place(holder->place()) ||
+                     platform::is_cuda_pinned_place(holder->place()),
+                 true, platform::errors::InvalidArgument(
+                           "Tensor is not on CPU. share_filename only "
+                           "support CPU Tensor."));
+
+             auto *mmap_allocation = dynamic_cast<
+                 memory::allocation::RefcountedMemoryMapAllocation *>(
+                 holder.get());
+             // If the tensor is not shared, allocate memory map allocation.
+             if (mmap_allocation == nullptr) {
+               void *data_ptr = self.data();
+               size_t data_size =
+                   self.numel() *
+                   framework::SizeOfType(
+                       framework::TransToProtoVarType(self.type()));
+
+               int flags = memory::allocation::MAPPED_SHAREDMEM |
+                           memory::allocation::MAPPED_EXCLUSIVE;
+               std::string handle = memory::allocation::GetIPCName();
+               auto shared_holder =
+                   memory::allocation::AllocateRefcountedMemoryMapAllocation(
+                       handle, flags, data_size);
+
+               // copy data & reset holder
+               if (platform::is_cuda_pinned_place(holder->place())) {
+#ifdef PADDLE_WITH_CUDA
+                 memory::Copy(platform::CPUPlace(), shared_holder->ptr(),
+                              platform::CUDAPinnedPlace(), data_ptr, data_size);
+#endif
+               } else {
+                 memory::Copy(platform::CPUPlace(), shared_holder->ptr(),
+                              platform::CPUPlace(), data_ptr, data_size);
+               }
+               self.ResetHolder(shared_holder);
+               mmap_allocation = shared_holder.get();
+             }
+             int type_idx = static_cast<int>(self.type());
+
+             return py::make_tuple(mmap_allocation->ipc_name(),
+                                   mmap_allocation->size(), type_idx,
+                                   vectorize(self.dims()), self.lod());
+           },
+           R"DOC(
+           Serialize CPU lod tensor in shared memory to tuple.
+           If the tensor is not in shared memory, we will copy it first.
+
+           Returns:
+               tuple: contrains ipc name, data size, data type,
+                      tensor dims and lod imformation.
+
+           Examples:
+               .. code-block:: python
+
+                 import paddle
+                 tensor = paddle.ones([3,3])
+                 metainfo = tensor.value().get_tensor()._share_filename()
+
+       )DOC")
+      .def("_new_shared_filename",
+           [](py::tuple t) {  // __setstate__
+             if (t.size() != 5)
+               throw std::runtime_error("Invalid Tensor meta info state!");
+
+             framework::Tensor tensor;
+
+             // 2. Rebuild Allocation
+             const std::string &ipc_name = t[0].cast<std::string>();
+             size_t size = t[1].cast<size_t>();
+             int flags = memory::allocation::MAPPED_SHAREDMEM |
+                         memory::allocation::MAPPED_NOCREATE;
+
+             auto shared_holder =
+                 memory::allocation::AllocateRefcountedMemoryMapAllocation(
+                     ipc_name, flags, size);
+
+             // 3. Rebuild Tensor
+             tensor.ResetHolderWithType(
+                 shared_holder,
+                 static_cast<paddle::experimental::DataType>(t[2].cast<int>()));
+             tensor.Resize(phi::make_ddim(t[3].cast<std::vector<int>>()));
+             tensor.set_lod(t[4].cast<framework::LoD>());
+
+             return tensor;
+           },
+           R"DOC(
+           Deserialize CPU lod tensor from shared memory.
+
+           Params:
+               tuple: contrains ipc file name, data size, data type,
+                      tensor dims and lod information.
+
+           Examples:
+               .. code-block:: python
+
+                 import paddle
+                 tensor = paddle.ones([3,3])
+                 metainfo = tensor.value().get_tensor()._share_filename()
+                 tensor_from_shared = paddle.to_tensor(paddle.fluid.core.LoDTensor._new_shared_filename(metainfo))
+
+        )DOC")
+      .def("_shared_incref",
+           [](framework::Tensor &self) {
+             auto *mmap_allocation = dynamic_cast<
+                 memory::allocation::RefcountedMemoryMapAllocation *>(
+                 self.Holder().get());
+             if (mmap_allocation) {
+               mmap_allocation->incref();
+             }
+           },
+           R"DOC(
+            Increase reference count of share_filename tensor.
+      )DOC")
+      .def("_shared_decref",
+           [](framework::Tensor &self) {
+             auto *mmap_allocation = dynamic_cast<
+                 memory::allocation::RefcountedMemoryMapAllocation *>(
+                 self.Holder().get());
+             if (mmap_allocation) {
+               mmap_allocation->decref();
+             }
+           },
+           R"DOC(
+            Decrease reference count of share_filename tensor.
+      )DOC")
       .def(py::pickle(
           [](const framework::Tensor &t) {  // __getstate__
             auto holder = t.Holder();
@@ -1349,7 +1699,7 @@ All parameter, weight, gradient are variables in Paddle.
 
   BindReader(&m);
 
-  py::class_<Scope>(m, "_Scope", R"DOC(
+  py::class_<Scope> _Scope(m, "_Scope", R"DOC(
     Scope is an association of a name to Variable. All variables belong to Scope.
 
     Variables in a parent scope can be retrieved from local scope.
@@ -1369,7 +1719,9 @@ All parameter, weight, gradient are variables in Paddle.
           param_array = np.full((height, row_numel), 5.0).astype("float32")
           param.set(param_array, place)
 
-        )DOC")
+        )DOC");
+  g_framework_scope_pytype = reinterpret_cast<PyTypeObject *>(_Scope.ptr());
+  _Scope
       .def("_remove_from_pool",
            [](Scope &self) { ScopePool::Instance().Remove(&self); })
       .def("var",
@@ -1669,7 +2021,7 @@ All parameter, weight, gradient are variables in Paddle.
   m.def("get_all_device_type", []() {
     std::vector<std::string> device_types;
 #ifdef PADDLE_WITH_CUSTOM_DEVICE
-    device_types = platform::DeviceManager::GetAllDeviceTypes();
+    device_types = phi::DeviceManager::GetAllDeviceTypes();
 #else
           LOG(WARNING) << string::Sprintf(
               "Cannot use get_all_device_type because you have installed"
@@ -1683,7 +2035,7 @@ All parameter, weight, gradient are variables in Paddle.
   m.def("get_all_custom_device_type", []() {
     std::vector<std::string> device_types;
 #ifdef PADDLE_WITH_CUSTOM_DEVICE
-    device_types = platform::DeviceManager::GetAllCustomDeviceTypes();
+    device_types = phi::DeviceManager::GetAllCustomDeviceTypes();
 #else
           LOG(WARNING) << string::Sprintf(
               "Cannot use get_all_custom_device_type because you have installed"
@@ -1697,7 +2049,7 @@ All parameter, weight, gradient are variables in Paddle.
   m.def("get_available_device", [] {
     std::vector<std::string> devices;
 #ifdef PADDLE_WITH_CUSTOM_DEVICE
-    devices = platform::DeviceManager::GetAllDeviceList();
+    devices = phi::DeviceManager::GetAllDeviceList();
 #else
           LOG(WARNING) << string::Sprintf(
               "Cannot use get_available_device because you have installed"
@@ -1711,7 +2063,7 @@ All parameter, weight, gradient are variables in Paddle.
   m.def("get_available_custom_device", [] {
     std::vector<std::string> devices;
 #ifdef PADDLE_WITH_CUSTOM_DEVICE
-    devices = platform::DeviceManager::GetAllCustomDeviceList();
+    devices = phi::DeviceManager::GetAllCustomDeviceList();
 #else
           LOG(WARNING) << string::Sprintf(
               "Cannot use get_available_custom_device because you have "
@@ -1748,10 +2100,10 @@ All parameter, weight, gradient are variables in Paddle.
                std::exit(-1);
              }
 
-             if (LIKELY(platform::DeviceManager::HasDeviceType(device_type) &&
-                        platform::DeviceManager::IsCustom(device_type))) {
+             if (LIKELY(phi::DeviceManager::HasDeviceType(device_type) &&
+                        phi::DeviceManager::IsCustom(device_type))) {
                int dev_count = static_cast<int>(
-                   platform::DeviceManager::GetDeviceCount(device_type));
+                   phi::DeviceManager::GetDeviceCount(device_type));
                if (UNLIKELY(dev_id >= dev_count)) {
                  if (dev_count == 0) {
                    LOG(ERROR) << "Cannot use " << device_type
@@ -1951,10 +2303,17 @@ All parameter, weight, gradient are variables in Paddle.
   m.def("get_xpu_device_count", platform::GetXPUDeviceCount);
   m.def("get_xpu_device_version",
         [](int device_id) { return platform::get_xpu_version(device_id); });
+#ifdef PADDLE_WITH_XPU_KP
+  m.def("get_xpu_device_op_support_types",
+        [](const std::string &op_name, phi::backends::xpu::XPUVersion version) {
+          return platform::get_xpu_kp_op_support_type(op_name, version);
+        });
+#else
   m.def("get_xpu_device_op_support_types",
         [](const std::string &op_name, phi::backends::xpu::XPUVersion version) {
           return platform::get_xpu_op_support_type(op_name, version);
         });
+#endif
   m.def("get_xpu_device_op_list", [](phi::backends::xpu::XPUVersion version) {
     return platform::get_xpu_op_list(version);
   });
@@ -2523,10 +2882,11 @@ All parameter, weight, gradient are variables in Paddle.
 
   m.def("init_gflags", framework::InitGflags);
   m.def("init_glog", framework::InitGLOG);
-  m.def("load_op_meta_info_and_register_op",
-        framework::LoadOpMetaInfoAndRegisterOp);
+  m.def("load_op_meta_info_and_register_op", [](const std::string dso_name) {
+    egr::Controller::Instance().MergeOpMetaInfoMap(
+        framework::LoadOpMetaInfoAndRegisterOp(dso_name));
+  });
   m.def("init_devices", []() { framework::InitDevices(); });
-
   m.def("is_compiled_with_cuda", IsCompiledWithCUDA);
   m.def("is_compiled_with_ascend", IsCompiledWithAscend);
   m.def("is_compiled_with_rocm", IsCompiledWithROCM);
@@ -2913,6 +3273,88 @@ All parameter, weight, gradient are variables in Paddle.
   });
 
   m.def("size_of_dtype", framework::SizeOfType);
+  py::class_<paddle::platform::ProfilerResult>(m, "_ProfilerResult")
+      .def(py::init<>())
+      .def("get_data", &paddle::platform::ProfilerResult::GetData,
+           py::return_value_policy::automatic_reference)
+      .def("save", &paddle::platform::ProfilerResult::Save)
+      .def("get_extra_info", &paddle::platform::ProfilerResult::GetExtraInfo);
+
+  py::class_<paddle::platform::DevicePythonNode>(m, "DevicePythonNode")
+      .def(py::init<>())
+      .def_readwrite("name", &paddle::platform::DevicePythonNode::name)
+      .def_readwrite("type", &paddle::platform::DevicePythonNode::type)
+      .def_readwrite("start_ns", &paddle::platform::DevicePythonNode::start_ns)
+      .def_readwrite("end_ns", &paddle::platform::DevicePythonNode::end_ns)
+      .def_readwrite("device_id",
+                     &paddle::platform::DevicePythonNode::device_id)
+      .def_readwrite("context_id",
+                     &paddle::platform::DevicePythonNode::context_id)
+      .def_readwrite("stream_id",
+                     &paddle::platform::DevicePythonNode::stream_id);
+
+  py::class_<paddle::platform::HostPythonNode>(m, "HostPythonNode")
+      .def(py::init<>())
+      .def_readwrite("name", &paddle::platform::HostPythonNode::name)
+      .def_readwrite("type", &paddle::platform::HostPythonNode::type)
+      .def_readwrite("start_ns", &paddle::platform::HostPythonNode::start_ns)
+      .def_readwrite("end_ns", &paddle::platform::HostPythonNode::end_ns)
+      .def_readwrite("process_id",
+                     &paddle::platform::HostPythonNode::process_id)
+      .def_readwrite("thread_id", &paddle::platform::HostPythonNode::thread_id)
+      .def_readwrite("children_node",
+                     &paddle::platform::HostPythonNode::children_node_ptrs)
+      .def_readwrite("runtime_node",
+                     &paddle::platform::HostPythonNode::runtime_node_ptrs)
+      .def_readwrite("device_node",
+                     &paddle::platform::HostPythonNode::device_node_ptrs);
+
+  py::class_<paddle::platform::Profiler>(m, "_Profiler")
+      .def("create", &paddle::platform::Profiler::Create,
+           py::return_value_policy::take_ownership)
+      .def("prepare",
+           [](paddle::platform::Profiler *profiler) {
+             platform::EnableHostEventRecorder();
+             profiler->Prepare();
+           })
+      .def("start", &paddle::platform::Profiler::Start)
+      .def("stop",
+           [](paddle::platform::Profiler *profiler) {
+             platform::DisableHostEventRecorder();
+             return profiler->Stop();
+           },
+           py::return_value_policy::automatic_reference);
+
+  py::class_<paddle::platform::ProfilerOptions>(m, "ProfilerOptions")
+      .def(py::init<>())
+      .def_readwrite("trace_switch",
+                     &paddle::platform::ProfilerOptions::trace_switch);
+
+  py::class_<platform::RecordEvent>(m, "_RecordEvent")
+      .def(py::init([](std::string name, platform::TracerEventType type) {
+        return std::make_unique<platform::RecordEvent>(
+            name, type, 1, paddle::platform::EventRole::kOrdinary);
+      }))
+      .def("end", [](platform::RecordEvent *event) { event->End(); });
+
+  py::enum_<paddle::platform::TracerEventType>(m, "TracerEventType")
+      .value("Operator", paddle::platform::TracerEventType::Operator)
+      .value("Dataloader", paddle::platform::TracerEventType::Dataloader)
+      .value("ProfileStep", paddle::platform::TracerEventType::ProfileStep)
+      .value("CudaRuntime", paddle::platform::TracerEventType::CudaRuntime)
+      .value("Kernel", paddle::platform::TracerEventType::Kernel)
+      .value("Memcpy", paddle::platform::TracerEventType::Memcpy)
+      .value("Memset", paddle::platform::TracerEventType::Memset)
+      .value("UserDefined", paddle::platform::TracerEventType::UserDefined)
+      .value("OperatorInner", paddle::platform::TracerEventType::OperatorInner)
+      .value("Forward", paddle::platform::TracerEventType::Forward)
+      .value("Backward", paddle::platform::TracerEventType::Backward)
+      .value("Optimization", paddle::platform::TracerEventType::Optimization)
+      .value("Communication", paddle::platform::TracerEventType::Communication)
+      .value("PythonOp", paddle::platform::TracerEventType::PythonOp)
+      .value("PythonUserDefined",
+             paddle::platform::TracerEventType::PythonUserDefined);
+  m.def("load_profiler_result", &paddle::platform::LoadProfilerResult);
 
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   m.def("set_cublas_switch", platform::SetAllowTF32Cublas);
@@ -3441,6 +3883,31 @@ All parameter, weight, gradient are variables in Paddle.
                         build_strategy = static.BuildStrategy()
                         build_strategy.fuse_elewise_add_act_ops = True
                      )DOC")
+      .def_property(
+          "fuse_gemm_epilogue",
+          [](const BuildStrategy &self) { return self.fuse_gemm_epilogue_; },
+          [](BuildStrategy &self, bool b) {
+            PADDLE_ENFORCE_NE(self.IsFinalized(), true,
+                              platform::errors::PreconditionNotMet(
+                                  "BuildStrategy has been finlaized, cannot be "
+                                  "configured again."));
+            self.fuse_gemm_epilogue_ = b;
+          },
+          R"DOC((bool, optional): fuse_gemm_epilogue indicate whether
+                to fuse matmul_op, elemenewist_add_op and activation_op,
+                it may make the execution faster. Default is False.
+
+                Examples:
+                    .. code-block:: python
+
+                        import paddle
+                        import paddle.static as static
+
+                        paddle.enable_static()
+
+                        build_strategy = static.BuildStrategy()
+                        build_strategy.fuse_gemm_epilogue = True
+                     )DOC")
       .def_property(
           "fuse_bn_act_ops",
           [](const BuildStrategy &self) { return self.fuse_bn_act_ops_; },
diff --git a/paddle/fluid/pybind/slice_utils.h b/paddle/fluid/pybind/slice_utils.h
new file mode 100644
index 0000000000000000000000000000000000000000..add332abd30eaaad1772a0b8e326ea0ae6c27e8b
--- /dev/null
+++ b/paddle/fluid/pybind/slice_utils.h
@@ -0,0 +1,292 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <Python.h>
+#include "paddle/fluid/framework/convert_utils.h"
+#include "paddle/fluid/framework/scope_guard.h"
+#include "paddle/fluid/operators/utils.h"
+#include "paddle/phi/common/data_type.h"
+#include "paddle/phi/core/compat/convert_utils.h"
+#include "paddle/phi/core/dense_tensor.h"
+#include "pybind11/pybind11.h"
+#include "pybind11/stl.h"
+
+namespace py = pybind11;
+
+namespace paddle {
+namespace pybind {
+
+static bool PyCheckTensor(PyObject* obj);
+static Py_ssize_t GetSliceIndexFromPyObject(PyObject* obj);
+// Slice related methods
+static bool PyCheckInteger(PyObject* obj) {
+#if PY_VERSION_HEX < 0x03000000
+  return (PyLong_Check(obj) || PyInt_Check(obj)) && !PyBool_Check(obj);
+#else
+  return PyLong_Check(obj) && !PyBool_Check(obj);
+#endif
+}
+
+static bool IsNumpyType(PyObject* obj) {
+  // It is not a good way to judge the type of obj by its type'name. Maybe using
+  // `PyArray_IsScalar` will be better. However, this interface cannot be used
+  // by including pybind11, and it needs to compile with numpy.
+  auto type_name = std::string(Py_TYPE(obj)->tp_name);
+  return type_name == "numpy.int64" || type_name == "numpy.longlong" ||
+         type_name == "numpy.int32" || type_name == "numpy.int16";
+}
+
+static Py_ssize_t GetSliceIndexFromTensor(const phi::DenseTensor& tensor) {
+  if (tensor.numel() == 1) {
+    if (framework::TransToProtoVarType(tensor.type()) ==
+        framework::proto::VarType::INT32) {
+      return static_cast<Py_ssize_t>(operators::GetValue<int32_t>(&tensor));
+    } else if (framework::TransToProtoVarType(tensor.type()) ==
+               framework::proto::VarType::INT64) {
+      return static_cast<Py_ssize_t>(operators::GetValue<int64_t>(&tensor));
+    } else {
+      PADDLE_THROW(platform::errors::InvalidArgument(
+          "Currently, the type of tensor in slice indices only allows "
+          "int32 and int64, please check the type of index tensor."));
+    }
+  } else {
+    PADDLE_THROW(platform::errors::InvalidArgument(
+        "Currently, tensor in slice indices only allows 1 element, "
+        "but received %d.",
+        tensor.numel()));
+  }
+}
+
+// NOTE(zhiqiu): Revised version of PySlice_GetIndices. From:
+// https://github.com/python/cpython/blob/8d21aa21f2cbc6d50aab3f420bb23be1d081dac4/Objects/sliceobject.c#L103
+// Original PySlice_GetIndices return wrong result when
+// slice_item contains long int, such as arr[:180L].
+// NOT sure why this happens !!!
+// Besides, PySlice_GetIndices cannot raise error when float in slice item.
+// So, I make a revised version of PySlice_GetIndices, named to
+// _PySlice_GetIndices. Try to use _PySlice_Unpack which is more robust than
+// PySlice_GetIndices in the future.
+static int _PySlice_GetIndices(PySliceObject* r, Py_ssize_t length,
+                               Py_ssize_t* start, Py_ssize_t* stop,
+                               Py_ssize_t* step) {
+  /* XXX support long ints */
+  if (r->step == Py_None) {
+    *step = 1;
+  } else {
+    if (PyCheckInteger(r->step) || IsNumpyType(r->step)) {
+      *step = PyLong_AsLong(r->step);
+    } else if (PyCheckTensor(r->step)) {
+      *step = GetSliceIndexFromPyObject(r->step);
+    } else {
+      PADDLE_THROW(platform::errors::InvalidArgument(
+          "Currently, slice indices only allows None, integers, "
+          "tensor(int) and numpy(int) in slice item, but received %s.",
+          std::string(Py_TYPE(r->step)->tp_name)));
+    }
+  }
+  if (r->start == Py_None) {
+    *start = *step < 0 ? length - 1 : 0;
+  } else {
+    if (PyCheckInteger(r->start) || IsNumpyType(r->start)) {
+      *start = PyLong_AsLong(r->start);
+    } else if (PyCheckTensor(r->start)) {
+      *start = GetSliceIndexFromPyObject(r->start);
+    } else {
+      PADDLE_THROW(platform::errors::InvalidArgument(
+          "Currently, slice indices only allows None, integers, "
+          "tensor(int) and numpy(int) in slice item, but received %s.",
+          std::string(Py_TYPE(r->start)->tp_name)));
+    }
+    if (*start < 0) *start += length;
+    *start = std::max(*start, static_cast<Py_ssize_t>(0));
+  }
+  if (r->stop == Py_None) {
+    *stop = *step < 0 ? -1 : length;
+  } else {
+    if (PyCheckInteger(r->stop) || IsNumpyType(r->stop)) {
+      *stop = PyLong_AsLong(r->stop);
+    } else if (PyCheckTensor(r->stop)) {
+      *stop = GetSliceIndexFromPyObject(r->stop);
+    } else {
+      PADDLE_THROW(platform::errors::InvalidArgument(
+          "Currently, slice indices only allows None, integers, "
+          "tensor(int) and numpy(int) in slice item, but received %s.",
+          std::string(Py_TYPE(r->stop)->tp_name)));
+    }
+    if (0 < *step && *stop < 0) *stop += length;
+    *stop = std::min(*stop, length);
+  }
+  if (*stop > length) return -1;
+  if (*start >= length) return -1;
+  if (*step == 0) return -1;
+  return 0;
+}
+
+static void ParseIndexingSlice(
+    framework::LoDTensor* tensor, PyObject* _index,
+    std::vector<int>* slice_axes, std::vector<int>* slice_starts,
+    std::vector<int>* slice_ends, std::vector<int>* slice_strides,
+    std::vector<int>* decrease_axis, std::vector<int>* none_axes,
+    std::vector<int>* infer_flags, std::vector<int>* list_select_idxs,
+    bool* list_select_flag) {
+  // We allow indexing by Integers, Slices, Ellipsis, None, tuples of those
+  // types, and list of Bool and Integers.
+  // wrap to tuple
+
+  // NOTE(zhiqiu): PyTuple_Pack increases refcount.
+  PyObject* index = !PyTuple_Check(_index) ? PyTuple_Pack(1, _index) : _index;
+  DEFINE_PADDLE_SCOPE_GUARD([index, _index]() {
+    if (!PyTuple_Check(_index)) {
+      Py_DECREF(index);
+      VLOG(4) << "Call Py_DECREF";
+    }
+  });
+  PADDLE_ENFORCE_EQ(
+      tensor->IsInitialized(), true,
+      platform::errors::InvalidArgument("tensor has not been initialized"));
+  const auto& shape = tensor->dims();
+  const int rank = shape.size();
+  const int size = PyTuple_GET_SIZE(index);
+
+  // specified_dims is the number of dimensions which indexed by Interger,
+  // Slices.
+  int specified_dims = 0;
+  int ell_count = 0;
+  for (int dim = 0; dim < size; ++dim) {
+    PyObject* slice_item = PyTuple_GetItem(index, dim);
+    if (PyCheckInteger(slice_item) || PySlice_Check(slice_item)) {
+      specified_dims++;
+    } else if (slice_item == Py_Ellipsis) {
+      ell_count++;
+    }
+  }
+
+  PADDLE_ENFORCE_LE(ell_count, 1,
+                    platform::errors::InvalidArgument(
+                        "An index can only have a single ellipsis ('...')"));
+  int none_count = 0;
+  for (int i = 0, dim = 0; i < size; ++i) {
+    PyObject* slice_item = PyTuple_GetItem(index, i);
+
+    infer_flags->push_back(1);
+    int dim_len = shape[dim];
+    if (PyCheckInteger(slice_item) || IsNumpyType(slice_item)) {
+      // integer, PyLong_AsLong supports both int and long
+      int start = static_cast<int>(PyLong_AsLong(slice_item));
+      auto s_t = start;
+      start = start < 0 ? start + dim_len : start;
+
+      PADDLE_ENFORCE(
+          0 <= start && start < dim_len,
+          platform::errors::OutOfRange("The starting index %d of slice is out "
+                                       "of bounds in tensor %d-th axis, it "
+                                       "shound be in the range of [%d, %d).",
+                                       s_t, dim, -dim_len, dim_len));
+
+      slice_axes->push_back(dim);
+      slice_starts->push_back(start);
+      slice_ends->push_back(start + 1);
+      slice_strides->push_back(1);
+      decrease_axis->push_back(dim);
+      dim++;
+    } else if (PySlice_Check(slice_item)) {
+      // slice item
+      Py_ssize_t start, end, step;
+      PySliceObject* p = reinterpret_cast<PySliceObject*>(slice_item);
+      _PySlice_GetIndices(p, dim_len, &start, &end, &step);
+
+      // :: or : or 0:dim_len:1
+      if (start == 0 && end == dim_len && step == 1) {
+        dim++;
+        continue;
+      }
+      slice_axes->push_back(dim);
+      slice_starts->push_back(start);
+      slice_ends->push_back(end);
+      slice_strides->push_back(step);
+      dim++;
+    } else if (slice_item == Py_Ellipsis) {
+      dim += rank - specified_dims;
+    } else if (slice_item == Py_None) {
+      none_axes->push_back(dim + none_count);
+      none_count++;
+    } else if (PyList_Check(slice_item)) {
+      *list_select_flag = true;
+      PADDLE_ENFORCE_EQ(
+          size, 1,
+          platform::errors::InvalidArgument(
+              "When index contains a list, its length is excepted to 1, "
+              "but received %d",
+              size));
+      bool all_bool = true;
+      int list_size = PyList_GET_SIZE(slice_item);
+      for (int j = 0; j < list_size; ++j) {
+        PyObject* list_item = PyList_GetItem(slice_item, j);
+        if (PyCheckInteger(list_item)) {
+          all_bool = false;
+        } else if (!PyBool_Check(list_item)) {
+          PADDLE_THROW(platform::errors::InvalidArgument(
+              "Only support int or bool in index list."));
+        }
+      }
+      if (all_bool) {
+        PADDLE_ENFORCE_EQ(
+            list_size, shape[0],
+            platform::errors::InvalidArgument(
+                "The dimension of bool index doesn't match indexed array along "
+                "dimension 0, the target dimension is %d, but received %d.",
+                shape[0], list_size));
+
+        for (int j = 0; j < list_size; ++j) {
+          PyObject* list_item = PyList_GetItem(slice_item, j);
+          if (list_item == Py_True) {
+            list_select_idxs->push_back(j);
+          }
+        }
+      } else {
+        for (int j = 0; j < list_size; ++j) {
+          PyObject* list_item = PyList_GetItem(slice_item, j);
+          if (PyCheckInteger(list_item)) {
+            list_select_idxs->push_back(
+                static_cast<int>(PyLong_AsLong(list_item)));
+          } else if (list_item == Py_True) {
+            list_select_idxs->push_back(1);
+          } else {
+            list_select_idxs->push_back(0);
+          }
+        }
+      }
+
+    } else {
+      PADDLE_THROW(platform::errors::InvalidArgument(
+          "Currently, Tensor.__indices__() only allows indexing "
+          "by Integers, Slices, Ellipsis, None, tuples of these types "
+          "and list of Bool and Integers, but received "
+          "%s in %dth slice item",
+          std::string(Py_TYPE(slice_item)->tp_name), i + 1));
+    }
+  }
+
+  // valid_index is the number of dimensions exclude None index
+  const int valid_indexs = size - none_axes->size() - ell_count;
+  PADDLE_ENFORCE_EQ(valid_indexs <= rank, true,
+                    platform::errors::InvalidArgument(
+                        "Too many indices (%d) for tensor of dimension %d.",
+                        valid_indexs, rank));
+}
+
+}  // namespace pybind
+}  // namespace paddle
diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h
index e7abd64ec4439611c307440597c7278cabb03ab9..6849fcb039410f95d829b9bb793a856f1485bd6c 100644
--- a/paddle/fluid/pybind/tensor_py.h
+++ b/paddle/fluid/pybind/tensor_py.h
@@ -393,10 +393,10 @@ void SetTensorFromPyArrayT(
   } else if (paddle::platform::is_custom_place(place)) {
 #ifdef PADDLE_WITH_CUSTOM_DEVICE
     platform::Place tmp_place = place;
-    platform::DeviceGuard guard(tmp_place);
+    phi::DeviceGuard guard(tmp_place);
     auto dst = self->mutable_data<T>(place);
 
-    platform::DeviceManager::GetDeviceWithPlace(tmp_place)->MemoryCopyH2D(
+    phi::DeviceManager::GetDeviceWithPlace(tmp_place)->MemoryCopyH2D(
         reinterpret_cast<void *>(dst),
         const_cast<void *>(reinterpret_cast<const void *>(array.data())),
         array.nbytes());
@@ -585,14 +585,20 @@ inline void _getSliceinfo(const framework::Tensor &self, py::object obj,
   auto &step = *pstep;
   auto &slicelength = *pslicelength;
   const framework::DDim &srcDDim = self.dims();
-  if (dim < 0 || dim >= srcDDim.size()) {
-    throw py::index_error();
-  }
+  PADDLE_ENFORCE(
+      0 <= dim && dim < srcDDim.size(),
+      platform::errors::OutOfRange("The dim %d of slice is out of bounds, it "
+                                   "shound be in the range of [0, %d).",
+                                   dim, srcDDim.size()));
+
   if (py::isinstance<py::slice>(obj)) {
     size_t lstart, lstop, lstep, lslicelength;
     py::slice s = static_cast<py::slice>(obj);
     if (!s.compute(srcDDim[dim], &lstart, &lstop, &lstep, &lslicelength)) {
-      throw py::index_error();
+      PADDLE_THROW(platform::errors::OutOfRange(
+          "Slice on dim: %d is error, please check the validity of tensor "
+          "dims or slice item.",
+          dim));
     }
     start = static_cast<int64_t>(lstart);
     stop = static_cast<int64_t>(lstop);
@@ -600,15 +606,19 @@ inline void _getSliceinfo(const framework::Tensor &self, py::object obj,
     slicelength = static_cast<int64_t>(lslicelength);
   } else if (py::isinstance<py::int_>(obj)) {
     start = static_cast<int64_t>(static_cast<py::int_>(obj));
-    if (std::abs(start) >= srcDDim[dim]) {
-      throw py::index_error();
-    }
+    PADDLE_ENFORCE(
+        std::abs(start) < srcDDim[dim],
+        platform::errors::OutOfRange("The start %d of slice is out of bounds, "
+                                     "it shound be in the range of (%d, %d).",
+                                     start, -srcDDim[dim], srcDDim[dim]));
     start = (start >= 0) ? start : srcDDim[dim] - start;
     stop = start + 1;
     step = 1;
     slicelength = 1;
   } else {
-    throw py::index_error();
+    PADDLE_THROW(
+        platform::errors::OutOfRange("Index object error, the index object for "
+                                     "slice only supports slice(::) and int."));
   }
 }
 
diff --git a/paddle/infrt/CMakeLists.txt b/paddle/infrt/CMakeLists.txt
index f2768f3dfa88d3405008baa7662f5e209ca3954c..4e273f6d551edd74ec979e6ec34aedabdb58bd10 100644
--- a/paddle/infrt/CMakeLists.txt
+++ b/paddle/infrt/CMakeLists.txt
@@ -90,14 +90,11 @@ add_subdirectory(tests)
 set(infrt_mlir_incs
         basic_kernels_inc
         test_kernels_inc
-        infrt_base_inc
         tensor_shape_inc
         dense_tensor_inc
         pd_ops_inc
         pd_extra_ops_inc
-        rewrite_inc
         trt_ops_inc
-        pd_lower_to_trt_inc
         )
 
 if (INFRT_WITH_PHI)
diff --git a/paddle/infrt/api/infrt_api.cc b/paddle/infrt/api/infrt_api.cc
index 28f63db49f4baec12bb43afa9034d5578d9f6cb1..0500a8123044cd05695c5167b1afaa48a6027b57 100644
--- a/paddle/infrt/api/infrt_api.cc
+++ b/paddle/infrt/api/infrt_api.cc
@@ -24,6 +24,7 @@
 
 #include "paddle/infrt/common/global.h"
 #include "paddle/infrt/dialect/dense_tensor.h"
+#include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h"
 #include "paddle/infrt/dialect/mlir_loader.h"
 #include "paddle/infrt/host_context/core_runtime.h"
 #include "paddle/infrt/host_context/kernel_registry.h"
@@ -41,7 +42,6 @@
 using namespace infrt::host_context;  // NOLINT
 using namespace infrt::tensor;        // NOLINT
 using namespace infrt::tensor;        // NOLINT
-using infrt::dt::TensorMapType;       // NOLINT
 
 namespace infrt {
 
@@ -129,7 +129,7 @@ class PredictExecutor : public MlirToRuntimeTranslator {
       auto arg = predict_func.getArgument(i);
       auto type = arg.getType();
       // this param is TensorMap
-      if (type.isa<TensorMapType>()) {
+      if (type.isa<infrt::DenseTensorMapType>()) {
         auto* value = new host_context::Value(std::move(*map));
         arguments_.push_back(value);
         AddValue(predict_func.getArgument(i), value);
@@ -144,7 +144,7 @@ class PredictExecutor : public MlirToRuntimeTranslator {
 
     // process results
     auto& last_op = predict_func.front().back();
-    if (last_op.getName().getStringRef() == "Infrt.return") {
+    if (last_op.getName().getStringRef() == "infrt.return") {
       for (size_t i = 0; i < last_op.getNumOperands(); ++i) {
         auto* value = AddValue(mlir::Value(last_op.getOperand(i)));
         results_.push_back(ValueRef(value));
diff --git a/paddle/infrt/backends/host/phi_context.h b/paddle/infrt/backends/host/phi_context.h
index 9d0e3bc4fbb3158147283c1992cf1fee70c9b90d..5713fdbbaf82b2ea2190d2ee1b1dc5d944f2c262 100644
--- a/paddle/infrt/backends/host/phi_context.h
+++ b/paddle/infrt/backends/host/phi_context.h
@@ -11,6 +11,7 @@ limitations under the License. */
 
 #pragma once
 
+#include "paddle/infrt/backends/host/phi_allocator.h"
 #include "paddle/phi/backends/cpu/cpu_context.h"
 
 namespace infrt {
@@ -20,6 +21,14 @@ class CpuPhiContext : public phi::CPUContext {
  public:
   using Base = phi::CPUContext;
   using phi::CPUContext::SetEigenDevice;
+
+  CpuPhiContext() {
+    Init();
+    SetAllocator(alloc_.get());
+  }
+
+ private:
+  std::unique_ptr<phi::Allocator> alloc_{std::make_unique<CpuPhiAllocator>()};
 };
 
 }  // namespace backends
diff --git a/paddle/infrt/backends/tensorrt/test_trt_engine.cc b/paddle/infrt/backends/tensorrt/test_trt_engine.cc
index 54b7bc3e8af835077fcd2ac00d33b15e4ae3f95c..12cf14060e27c1d58e3fd9b14cc12b3c1f7f8907 100644
--- a/paddle/infrt/backends/tensorrt/test_trt_engine.cc
+++ b/paddle/infrt/backends/tensorrt/test_trt_engine.cc
@@ -17,8 +17,8 @@
 #include <NvInfer.h>
 #include <NvInferRuntime.h>
 #include <NvInferRuntimeCommon.h>
-#include "glog/logging.h"
-#include "gtest/gtest.h"
+#include <glog/logging.h>
+#include <gtest/gtest.h>
 #include "paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h"
 #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
 #include "paddle/fluid/memory/allocation/allocator_facade.h"
@@ -86,7 +86,7 @@ TrtUniquePtr<nvinfer1::INetworkDefinition> ConstructNetwork(
 inline float sigmoid(float x) { return 1.f / (1.f + exp(-1 * x)); }
 
 TEST(trt, run_static) {
-  TRTEngine static_trt_engine(0);
+  TrtEngine static_trt_engine(0);
   auto net = ConstructNetwork(
       static_trt_engine.GetTrtBuilder(), nvinfer1::Dims3{3, 28, 28}, true);
   BuildOptions static_build_options;
@@ -164,7 +164,7 @@ TEST(trt, run_static) {
 }
 
 TEST(trt, run_dynamic) {
-  TRTEngine engine(0);
+  TrtEngine engine(0);
   auto net = ConstructNetwork(
       engine.GetTrtBuilder(), nvinfer1::Dims4{-1, 3, -1, -1}, false);
   BuildOptions build_options;
diff --git a/paddle/infrt/backends/tensorrt/trt_engine.cc b/paddle/infrt/backends/tensorrt/trt_engine.cc
index a204fe42b45080b0ba5526473622f34e4fe4ef41..232653e8c41f71fd9bb32c9eac302b047d122b66 100644
--- a/paddle/infrt/backends/tensorrt/trt_engine.cc
+++ b/paddle/infrt/backends/tensorrt/trt_engine.cc
@@ -17,7 +17,7 @@
 
 #include <NvInferRuntime.h>
 #include <NvInferRuntimeCommon.h>
-#include "glog/logging.h"
+#include <glog/logging.h>
 #include "paddle/phi/backends/dynload/tensorrt.h"
 #include "paddle/phi/backends/gpu/gpu_info.h"
 #include "paddle/phi/core/ddim.h"
@@ -40,26 +40,26 @@ static nvinfer1::IRuntime* createInferRuntime(
       phi::dynload::createInferRuntime_INTERNAL(&logger, NV_TENSORRT_VERSION));
 }
 
-TRTEngine::TRTEngine(int device_id) : device_id_(device_id) {
+TrtEngine::TrtEngine(int device_id) : device_id_(device_id) {
   FreshDeviceId();
   logger_.reset(new TrtLogger());
   builder_.reset(createInferBuilder(logger_->GetTrtLogger()));
   phi::dynload::initLibNvInferPlugins(&logger_->GetTrtLogger(), "");
 }
 
-nvinfer1::IBuilder* TRTEngine::GetTrtBuilder() {
+nvinfer1::IBuilder* TrtEngine::GetTrtBuilder() {
   CHECK_NOTNULL(builder_);
   return builder_.get();
 }
 
-void TRTEngine::Build(TrtUniquePtr<nvinfer1::INetworkDefinition> network,
+void TrtEngine::Build(TrtUniquePtr<nvinfer1::INetworkDefinition> network,
                       const BuildOptions& build_options) {
   FreshDeviceId();
   ModelToBuildEnv(std::move(network), build_options);
   CHECK_NOTNULL(engine_);
 }
 
-bool TRTEngine::ModelToBuildEnv(
+bool TrtEngine::ModelToBuildEnv(
     TrtUniquePtr<nvinfer1::INetworkDefinition> network,
     const BuildOptions& build) {
   CHECK_NOTNULL(builder_);
@@ -70,7 +70,7 @@ bool TRTEngine::ModelToBuildEnv(
   return true;
 }
 
-bool TRTEngine::NetworkToEngine(const BuildOptions& build) {
+bool TrtEngine::NetworkToEngine(const BuildOptions& build) {
   TrtUniquePtr<IBuilderConfig> config{builder_->createBuilderConfig()};
   CHECK_NOTNULL(config);
   CHECK(SetupNetworkAndConfig(build, *network_, *config));
@@ -91,7 +91,7 @@ bool TRTEngine::NetworkToEngine(const BuildOptions& build) {
   return true;
 }
 
-bool TRTEngine::SetupNetworkAndConfig(const BuildOptions& build,
+bool TrtEngine::SetupNetworkAndConfig(const BuildOptions& build,
                                       INetworkDefinition& network,
                                       IBuilderConfig& config) {
   builder_->setMaxBatchSize(build.max_batch);
@@ -235,7 +235,7 @@ bool TRTEngine::SetupNetworkAndConfig(const BuildOptions& build,
   return true;
 }
 
-bool TRTEngine::SetUpInference(
+bool TrtEngine::SetUpInference(
     const InferenceOptions& inference,
     const std::unordered_map<std::string, phi::DenseTensor*>& inputs,
     std::unordered_map<std::string, phi::DenseTensor*>* outputs) {
@@ -261,7 +261,7 @@ bool TRTEngine::SetUpInference(
   return true;
 }
 
-void TRTEngine::Run(const phi::GPUContext& ctx) {
+void TrtEngine::Run(const phi::GPUContext& ctx) {
   if (is_dynamic_shape_) {
     DynamicRun(ctx);
   } else {
@@ -269,7 +269,7 @@ void TRTEngine::Run(const phi::GPUContext& ctx) {
   }
 }
 
-void TRTEngine::StaticRun(const phi::GPUContext& ctx) {
+void TrtEngine::StaticRun(const phi::GPUContext& ctx) {
   const int num_bindings = engine_->getNbBindings();
   std::vector<void*> buffers(num_bindings, nullptr);
 
@@ -303,7 +303,7 @@ void TRTEngine::StaticRun(const phi::GPUContext& ctx) {
       runtime_batch, buffers.data(), ctx.stream(), nullptr);
 }
 
-void TRTEngine::DynamicRun(const phi::GPUContext& ctx) {
+void TrtEngine::DynamicRun(const phi::GPUContext& ctx) {
   const int num_bindings = engine_->getNbBindings();
   std::vector<void*> buffers(num_bindings, nullptr);
 
@@ -339,14 +339,14 @@ void TRTEngine::DynamicRun(const phi::GPUContext& ctx) {
   contexts_.front()->enqueueV2(buffers.data(), ctx.stream(), nullptr);
 }
 
-void TRTEngine::FreshDeviceId() {
+void TrtEngine::FreshDeviceId() {
   int count;
   cudaGetDeviceCount(&count);
   CHECK_LT(device_id_, count);
   phi::backends::gpu::SetDeviceId(device_id_);
 }
 
-void TRTEngine::GetEngineInfo() {
+void TrtEngine::GetEngineInfo() {
 #if IS_TRT_VERSION_GE(8200)
   LOG(INFO) << "====== engine info ======";
   std::unique_ptr<nvinfer1::IEngineInspector> infer_inspector(
diff --git a/paddle/infrt/backends/tensorrt/trt_engine.h b/paddle/infrt/backends/tensorrt/trt_engine.h
index f72bdaf3ac0b463d086e9aeda62823cc725f2db9..3c8243e3c3838e30eb70877f8c82d623c103eaff 100644
--- a/paddle/infrt/backends/tensorrt/trt_engine.h
+++ b/paddle/infrt/backends/tensorrt/trt_engine.h
@@ -56,13 +56,18 @@ using namespace nvinfer1;  // NOLINT
 //
 // We have encapsulated this logic, please use the following programming model.
 //
-// TRTEngine trt_engine;
+// TrtEngine trt_engine;
 // trt_engine.Build(...);
 // trt_engine.SetUpInference(...);
 // trt_engine.Run(...);
-class TRTEngine {
+class TrtEngine {
  public:
-  explicit TRTEngine(int device_id);
+  explicit TrtEngine(int device_id = 0);
+
+  TrtEngine(const TrtEngine&) = delete;
+  TrtEngine& operator=(const TrtEngine&) = delete;
+  TrtEngine(TrtEngine&&) = default;
+  TrtEngine& operator=(TrtEngine&&) = default;
 
   nvinfer1::IBuilder* GetTrtBuilder();
 
diff --git a/paddle/infrt/backends/tensorrt/trt_utils.h b/paddle/infrt/backends/tensorrt/trt_utils.h
index 4b129af1d53810c6d37d23270c1118023ae7b3f6..c66a850ffb1cc23a24074cbedaed62f7ec87beec 100644
--- a/paddle/infrt/backends/tensorrt/trt_utils.h
+++ b/paddle/infrt/backends/tensorrt/trt_utils.h
@@ -15,16 +15,17 @@
 
 #pragma once
 
+#include <NvInfer.h>
+#include <NvInferRuntime.h>
+#include <NvInferRuntimeCommon.h>
+#include <glog/logging.h>
+
 #include <algorithm>
 #include <cassert>
 #include <functional>
 #include <memory>
 #include <unordered_map>
 
-#include <NvInfer.h>
-#include <NvInferRuntime.h>
-#include <NvInferRuntimeCommon.h>
-#include "glog/logging.h"
 #include "paddle/phi/core/dense_tensor.h"
 
 namespace infrt {
diff --git a/paddle/infrt/dialect/CMakeLists.txt b/paddle/infrt/dialect/CMakeLists.txt
index e35989da2085b21f4dbfaadea05793fc9dcb8753..a3f2d0afafc417cc7a4cbba8a3d6bfa92c9bef00 100644
--- a/paddle/infrt/dialect/CMakeLists.txt
+++ b/paddle/infrt/dialect/CMakeLists.txt
@@ -2,26 +2,20 @@ core_gather_headers()
 
 gather_srcs(infrt_src SRCS
     dialect.cc
-    basic_kernels.cc
-    test_kernels.cc
-    infrt_base.cc
-    init_infrt_dialects.cc
+    init_dialects.cc
     tensor_shape.cc
     dense_tensor.cc
     mlir_loader.cc
     diagnostic_utils.cc
-    pd_types.cc
     pd_ops.cc
     )
 
-mlir_tablegen_on(basic_kernels)
-mlir_tablegen_on(test_kernels)
-mlir_tablegen_on(infrt_base DIALECT Infrt)
 mlir_tablegen_on(tensor_shape DIALECT ts)
 mlir_tablegen_on(dense_tensor DIALECT dt)
 mlir_tablegen_on(pd_op_base DIALECT pd)
 mlir_tablegen_on(pd_ops)
 mlir_tablegen_on(pd_extra_ops)
+
 mlir_add_rewriter(rewrite)
 
 # TODO(Superjomn) add a cmake function cc_executable to ecapsulate the following code
diff --git a/paddle/infrt/dialect/dense_tensor.cc b/paddle/infrt/dialect/dense_tensor.cc
index 49d6887ada0322065946f95c9e39d932f268375e..7b8d48ff7164765a1949659838e541068fdee4c4 100644
--- a/paddle/infrt/dialect/dense_tensor.cc
+++ b/paddle/infrt/dialect/dense_tensor.cc
@@ -38,23 +38,6 @@ void DTDialect::initialize() {
 #include "paddle/infrt/dialect/dense_tensor.cpp.inc"
       >();
 }
-
-TensorMapType TensorMapType::get() {
-  return Base::get(::infrt::Global::getMLIRContext());
-}
-
-TensorMapType TensorMapType::get(mlir::MLIRContext *context) {
-  return Base::get(context);
-}
-
-StringType StringType::get() {
-  return Base::get(::infrt::Global::getMLIRContext());
-}
-
-StringType StringType::get(mlir::MLIRContext *context) {
-  return Base::get(context);
-}
-
 static mlir::Type getTensorType(mlir::MLIRContext *context) {
   auto t_dialect = mlir::Identifier::get("t", context);
   return mlir::OpaqueType::get(t_dialect, "tensor");
diff --git a/paddle/infrt/dialect/dense_tensor.h b/paddle/infrt/dialect/dense_tensor.h
index b0a1ea412c53eb677fed1a1b76e704f3f3da11e5..7fbd1e8a4efe1e9dc1d022beb7673ee8a59c7e36 100644
--- a/paddle/infrt/dialect/dense_tensor.h
+++ b/paddle/infrt/dialect/dense_tensor.h
@@ -19,28 +19,7 @@
 
 #include <string>
 
-#include "paddle/infrt/dialect/infrt/infrt_dialect.h"
-
-namespace infrt {
-namespace dt {
-class TensorMapType : public mlir::Type::TypeBase<TensorMapType,
-                                                  mlir::Type,
-                                                  mlir::TypeStorage> {
- public:
-  using Base::Base;
-  static TensorMapType get();
-  static TensorMapType get(mlir::MLIRContext *context);
-};
-
-class StringType
-    : public mlir::Type::TypeBase<StringType, mlir::Type, mlir::TypeStorage> {
- public:
-  using Base::Base;
-  static StringType get();
-  static StringType get(mlir::MLIRContext *context);
-};
-}  // namespace dt
-}  // namespace infrt
+#include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h"
 
 #include "paddle/infrt/dialect/dense_tensor_dialect.hpp.inc"
 
diff --git a/paddle/infrt/dialect/dense_tensor.td b/paddle/infrt/dialect/dense_tensor.td
index 7e6e838a72372d2f850d4fb37f6b2218577ba0ed..666c7b300af33db0c27e5b3ab8a74aa4b1591c9b 100644
--- a/paddle/infrt/dialect/dense_tensor.td
+++ b/paddle/infrt/dialect/dense_tensor.td
@@ -2,7 +2,7 @@
 #else
 #define DT_OPS
 
-include "paddle/infrt/dialect/infrt_base.td"
+include "paddle/infrt/dialect/infrt/ir/infrt_base.td"
 include "paddle/infrt/dialect/tensor_shape_base.td"
 include "mlir/Interfaces/SideEffectInterfaces.td"
 
@@ -105,11 +105,10 @@ def LoadParamsOp : DT_Op<"load_params", [NoSideEffect]> {
   }];
 
   // input path of model params.
-  let arguments = (ins StringType:$path);
-  let results = (outs TensorMapType);
+  let arguments = (ins StrAttr:$path);
+  let results = (outs DenseTensorMap:$out);
 
-  let assemblyFormat = "`(` operands `)` attr-dict";
-  let verifier = ?;
+  let assemblyFormat = "`(``)`attr-dict";
 }
 
 
@@ -122,7 +121,7 @@ def TensorMapGetTensorOp : DT_Op<"tensor_map_get_tensor", [NoSideEffect]> {
 
   // input path of model params.
   let arguments = (ins
-          TensorMapType:$map,
+          DenseTensorMap:$map,
           StrAttr:$name
           );
   let results = (outs DenseTensor:$output);
@@ -137,7 +136,7 @@ def TensorMapGetSizeOp : DT_Op<"tensor_map_get_size", [NoSideEffect]> {
     An operation that get the size of a TensorMap.
   }];
 
-  let arguments = (ins TensorMapType:$map);
+  let arguments = (ins DenseTensorMap:$map);
   let results = (outs I32:$size);
   let assemblyFormat = "`(` $map `)` attr-dict `->` type($size)";
 }
diff --git a/paddle/infrt/dialect/infrt/CMakeLists.txt b/paddle/infrt/dialect/infrt/CMakeLists.txt
index daf710e0baf54549a2cc3e7a6e87c7b76a169f29..5f65336453fbdf82f30948aeea8dc52b0367159b 100644
--- a/paddle/infrt/dialect/infrt/CMakeLists.txt
+++ b/paddle/infrt/dialect/infrt/CMakeLists.txt
@@ -1,15 +1,3 @@
-core_gather_headers()
-
-gather_srcs(infrt_src SRCS
-    common_type.cc
-    infrt_dialect.cc
-    )
-
-
-add_mlir_dialect(infrt_ops infrt)
-
-set(LLVM_TARGET_DEFINITIONS infrt_ops.td)
-mlir_tablegen(infrt_opsAttributes.h.inc -gen-attrdef-decls -dialect=infrt)
-mlir_tablegen(infrt_opsAttributes.cpp.inc -gen-attrdef-defs -dialect=infrt)
-add_public_tablegen_target(MLIRinfrt_opsAttributesIncGen)
-add_dependencies(mlir-headers MLIRinfrt_opsAttributesIncGen)
+add_subdirectory(common)
+add_subdirectory(ir)
+add_subdirectory(pass)
diff --git a/paddle/infrt/dialect/infrt/common/CMakeLists.txt b/paddle/infrt/dialect/infrt/common/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..f693c82b5060ef35eecbc1ef9ad5053d6b93e4ad
--- /dev/null
+++ b/paddle/infrt/dialect/infrt/common/CMakeLists.txt
@@ -0,0 +1,6 @@
+core_gather_headers()
+
+gather_srcs(infrt_src SRCS
+    types.cc
+    utils.cc
+    )
diff --git a/paddle/infrt/dialect/infrt/common_type.cc b/paddle/infrt/dialect/infrt/common/types.cc
similarity index 77%
rename from paddle/infrt/dialect/infrt/common_type.cc
rename to paddle/infrt/dialect/infrt/common/types.cc
index 5cbd7b2cd6153f3724bc357811bdb0894eeb64ba..62419a196288bb052a9f240ecc25f34c102a5b35 100644
--- a/paddle/infrt/dialect/infrt/common_type.cc
+++ b/paddle/infrt/dialect/infrt/common/types.cc
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/infrt/dialect/infrt/common_type.h"
+#include "paddle/infrt/dialect/infrt/common/types.h"
 
 namespace infrt {
 
@@ -43,46 +43,49 @@ llvm::Optional<PrecisionType> GetPrecisionType(llvm::StringRef key) {
     return llvm::None;
 }
 
-llvm::raw_ostream &operator<<(llvm::raw_ostream &os, TargetType type) {
+llvm::StringRef GetString(TargetType type) {
+  llvm::StringRef str;
   switch (type) {
     case (TargetType::CPU):
-      os << "CPU";
+      str = "CPU";
       break;
     case (TargetType::GPU):
-      os << "GPU";
+      str = "GPU";
       break;
     default:
-      os << "Unsupported";
+      str = "Unsupported";
   }
-  return os;
+  return str;
 }
 
-llvm::raw_ostream &operator<<(llvm::raw_ostream &os, LayoutType type) {
+llvm::StringRef GetString(LayoutType type) {
+  llvm::StringRef str;
   switch (type) {
     case (LayoutType::NCHW):
-      os << "NCHW";
+      str = "NCHW";
       break;
     case (LayoutType::NHWC):
-      os << "NHWC";
+      str = "NHWC";
       break;
     default:
-      os << "Unsupported";
+      str = "Unsupported";
   }
-  return os;
+  return str;
 }
 
-llvm::raw_ostream &operator<<(llvm::raw_ostream &os, PrecisionType type) {
+llvm::StringRef GetString(PrecisionType type) {
+  llvm::StringRef str;
   switch (type) {
     case (PrecisionType::FLOAT32):
-      os << "FP32";
+      str = "FP32";
       break;
     case (PrecisionType::FLOAT16):
-      os << "FP16";
+      str = "FP16";
       break;
     default:
-      os << "Unsupported";
+      str = "Unsupported";
   }
-  return os;
+  return str;
 }
 
 }  // namespace infrt
diff --git a/paddle/infrt/dialect/infrt/common_type.h b/paddle/infrt/dialect/infrt/common/types.h
similarity index 85%
rename from paddle/infrt/dialect/infrt/common_type.h
rename to paddle/infrt/dialect/infrt/common/types.h
index 436e7920ca5c66d944fa766f52656339499912c0..2ebe2b8ccdba6943d81e46ec747144fd0835d7e0 100644
--- a/paddle/infrt/dialect/infrt/common_type.h
+++ b/paddle/infrt/dialect/infrt/common/types.h
@@ -54,8 +54,13 @@ llvm::Optional<TargetType> GetTargetType(llvm::StringRef key);
 llvm::Optional<LayoutType> GetLayoutType(llvm::StringRef key);
 llvm::Optional<PrecisionType> GetPrecisionType(llvm::StringRef key);
 
-llvm::raw_ostream &operator<<(llvm::raw_ostream &os, TargetType type);
-llvm::raw_ostream &operator<<(llvm::raw_ostream &os, LayoutType type);
-llvm::raw_ostream &operator<<(llvm::raw_ostream &os, PrecisionType type);
+llvm::StringRef GetString(TargetType type);
+llvm::StringRef GetString(LayoutType type);
+llvm::StringRef GetString(PrecisionType type);
 
+template <typename T>
+llvm::raw_ostream &operator<<(llvm::raw_ostream &os, T type) {
+  os << GetString(type);
+  return os;
+}
 }  // end namespace infrt
diff --git a/paddle/infrt/dialect/infrt/common/utils.cc b/paddle/infrt/dialect/infrt/common/utils.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0ffb23c490f8f52044d35d20508f42f3f9a89413
--- /dev/null
+++ b/paddle/infrt/dialect/infrt/common/utils.cc
@@ -0,0 +1,28 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/infrt/dialect/infrt/common/utils.h"
+
+mlir::SmallVector<mlir::Value, 4> infrt::cvtValueToValueRange(
+    const mlir::Value &operand) {
+  return mlir::SmallVector<mlir::Value, 4>(1, operand);
+}
+
+mlir::SmallVector<mlir::Value, 4> infrt::concatTwoValueRange(
+    mlir::ValueRange operand_0, mlir::ValueRange operand_1) {
+  mlir::SmallVector<mlir::Value, 4> operands;
+  operands.append(operand_0.begin(), operand_0.end());
+  operands.append(operand_1.begin(), operand_1.end());
+  return operands;
+}
diff --git a/paddle/fluid/operators/searchsorted_op.cu b/paddle/infrt/dialect/infrt/common/utils.h
similarity index 57%
rename from paddle/fluid/operators/searchsorted_op.cu
rename to paddle/infrt/dialect/infrt/common/utils.h
index 4633ab43efba121cf4c55a877d90b974690952ec..886407b56649a296046d570826cf2b1b0e8aade8 100644
--- a/paddle/fluid/operators/searchsorted_op.cu
+++ b/paddle/infrt/dialect/infrt/common/utils.h
@@ -12,12 +12,20 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/operators/searchsorted_op.h"
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
+#pragma once
 
-REGISTER_OP_CUDA_KERNEL(
-    searchsorted, ops::SearchSortedKernel<plat::CUDADeviceContext, float>,
-    ops::SearchSortedKernel<plat::CUDADeviceContext, double>,
-    ops::SearchSortedKernel<plat::CUDADeviceContext, int>,
-    ops::SearchSortedKernel<plat::CUDADeviceContext, int64_t>);
+#include <mlir/IR/Builders.h>
+#include <mlir/IR/Dialect.h>
+#include <mlir/IR/DialectImplementation.h>
+#include <mlir/IR/MLIRContext.h>
+#include <mlir/IR/TypeUtilities.h>
+#include <mlir/IR/Types.h>
+
+namespace infrt {
+
+mlir::SmallVector<mlir::Value, 4> cvtValueToValueRange(
+    const mlir::Value &operand);
+
+mlir::SmallVector<mlir::Value, 4> concatTwoValueRange(
+    mlir::ValueRange operand_0, mlir::ValueRange operand_1);
+}  // namespace infrt
diff --git a/paddle/infrt/dialect/infrt/infrt_ops.td b/paddle/infrt/dialect/infrt/infrt_ops.td
deleted file mode 100644
index 00f94805c7db22e170c7395598bfe647174339c1..0000000000000000000000000000000000000000
--- a/paddle/infrt/dialect/infrt/infrt_ops.td
+++ /dev/null
@@ -1,19 +0,0 @@
-include "paddle/infrt/dialect/infrt/infrt_ops_base.td"
-
-// Op definition
-class Infrt_Op<string mnemonic, list<OpTrait> traits = []> : Op<Infrt_Dialect, mnemonic, traits> {
-
-  // Each registered op needs to provide all of a printer, parser and verifier.
-  // let printer = [{ return infrt::print(p, *this); }];
-  // let verifier = [{ return infrt::verify(*this); }];
-  // let parser = [{ return infrt::parse$cppClass(parser, result); }];
-}
-
-def Infrt_KernelOp : Infrt_Op<"kernel", [NoSideEffect]> {
-  let summary = "kernel op";
-  let description = [{kernel op!}];
-  let arguments = (ins Variadic<AnyType>:$operands,
-                       StrAttr:$name,
-                       OptionalAttr<DictionaryAttr>:$attrs);
-  let results = (outs Variadic<AnyType>);
-}
diff --git a/paddle/infrt/dialect/infrt/infrt_ops_base.td b/paddle/infrt/dialect/infrt/infrt_ops_base.td
deleted file mode 100644
index f19912dc0cd59674b4b45448266b1ff4bed530b5..0000000000000000000000000000000000000000
--- a/paddle/infrt/dialect/infrt/infrt_ops_base.td
+++ /dev/null
@@ -1,56 +0,0 @@
-#ifndef INFRT_OPS_BASE
-#define INFRT_OPS_BASE
-
-include "mlir/IR/OpBase.td"
-include "mlir/Interfaces/SideEffectInterfaces.td"
-
-def Infrt_Dialect : Dialect {
-  let summary =
-    "A dialect containing the Infrt Attributes, Operations, and Types";
-
-  let name = "infrt";
-  let cppNamespace = "::infrt";
-}
-
-// Type definitions
-
-// Base class for Infrt dialect types.
-class Infrt_Type<string name, list<Trait> traits = [],
-                   string baseCppClass = "::mlir::Type">
-    : TypeDef<Infrt_Dialect, name, traits, baseCppClass> {
-}
-
-def LoDTensor : Infrt_Type<"LoDTensor"> {
-  let summary = "infrt lod tensor";
-  let description = [{lod_tensor<3x64x3x3xf32, 3>}];
-  let parameters = (ins
-    ArrayRefParameter<"int64_t">:$shape,
-    "mlir::Type":$elementType,
-    "int32_t":$lod_level
-  );
-}
-
-def DenseTensor : Infrt_Type<"DenseTensor"> {
-  let summary = "infrt dense tensor";
-  let description = [{dense_tensor<, 3>}];
-  let parameters = (ins
-    "::infrt::TargetType":$target,
-    "::infrt::PrecisionType":$precision,
-    "::infrt::LayoutType":$layout
-
-  );
-}
-
-// Type Constrait for concrete DenseTensor type.
-class DenseTensor<string target, string precision, string layout> :
-    Type<CPred<"$_self == ::infrt::DenseTensorType::get($_self.getContext(), ::infrt::TargetType::"#target#",::infrt::PrecisionType::"#precision#",::infrt::LayoutType::"#layout#")">, 
-    "!infrt.DenseTensor<"#target#","#precision#","#layout#">", 
-    "::infrt::DenseTensorType">;
-
-// Base class for infrt dialect attributes.
-class Infrt_Attr<string name, list<Trait> traits = [],
-                   string baseCppClass = "::mlir::Attribute">
-    : AttrDef<Infrt_Dialect, name, traits, baseCppClass> {
-  let mnemonic = ?;
-}
-#endif // INFRT_OPS_BASE
diff --git a/paddle/infrt/dialect/infrt/ir/CMakeLists.txt b/paddle/infrt/dialect/infrt/ir/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..7c009bdb267e6ea1dd5a5fb392f64dddb7a05f06
--- /dev/null
+++ b/paddle/infrt/dialect/infrt/ir/CMakeLists.txt
@@ -0,0 +1,18 @@
+core_gather_headers()
+
+gather_srcs(infrt_src SRCS
+    infrt_dialect.cc
+    basic_kernels.cc
+    test_kernels.cc
+    )
+
+add_mlir_dialect(infrt_ops infrt)
+
+set(LLVM_TARGET_DEFINITIONS infrt_ops.td)
+mlir_tablegen(infrt_opsAttributes.h.inc -gen-attrdef-decls -dialect=infrt)
+mlir_tablegen(infrt_opsAttributes.cpp.inc -gen-attrdef-defs -dialect=infrt)
+add_public_tablegen_target(MLIRinfrt_opsAttributesIncGen)
+add_dependencies(mlir-headers MLIRinfrt_opsAttributesIncGen)
+
+mlir_tablegen_on(basic_kernels)
+mlir_tablegen_on(test_kernels)
diff --git a/paddle/infrt/dialect/basic_kernels.cc b/paddle/infrt/dialect/infrt/ir/basic_kernels.cc
similarity index 63%
rename from paddle/infrt/dialect/basic_kernels.cc
rename to paddle/infrt/dialect/infrt/ir/basic_kernels.cc
index c1aa75fb24650b99ea8371c0ecbe7e572df2f0ce..ba83f3e36c94a173accad9fb6e746eaec0ec8e6c 100644
--- a/paddle/infrt/dialect/basic_kernels.cc
+++ b/paddle/infrt/dialect/infrt/ir/basic_kernels.cc
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/infrt/dialect/basic_kernels.h"
+#include "paddle/infrt/dialect/infrt/ir/basic_kernels.h"
 
 #include <llvm/ADT/STLExtras.h>
 #include <mlir/IR/Attributes.h>
@@ -30,23 +30,6 @@ namespace infrt {
 namespace dialect {
 using namespace mlir;  // NOLINT
 
-static ParseResult parseCallOp(OpAsmParser &parser,       // NOLINT
-                               OperationState &result) {  // NOLINT
-  SymbolRefAttr callee_attr;
-  FunctionType callee_type;
-  SmallVector<OpAsmParser::OperandType, 4> operands;
-  auto callee_loc = parser.getNameLoc();
-  if (parser.parseAttribute(callee_attr, "callee", result.attributes) ||
-      parser.parseOperandList(operands, OpAsmParser::Delimiter::Paren) ||
-      parser.parseOptionalAttrDict(result.attributes) ||
-      parser.parseColonType(callee_type) ||
-      parser.addTypesToList(callee_type.getResults(), result.types) ||
-      parser.resolveOperands(
-          operands, callee_type.getInputs(), callee_loc, result.operands))
-    return failure();
-  return success();
-}
-
 static ParseResult parseConstantOp(Type attrType,
                                    OpAsmParser &parser,       // NOLINT
                                    OperationState &result) {  // NOLINT
@@ -79,24 +62,6 @@ static ParseResult parseConstantI64Op(OpAsmParser &parser,       // NOLINT
       IntegerType::get(result.getContext(), 64), parser, result);
 }
 
-static ParseResult parseReturnOp(OpAsmParser &parser,       // NOLINT
-                                 OperationState &result) {  // NOLINT
-  SmallVector<OpAsmParser::OperandType, 2> opInfo;
-  SmallVector<Type, 2> types;
-  llvm::SMLoc loc = parser.getCurrentLocation();
-  return failure(parser.parseOperandList(opInfo) ||
-                 (!opInfo.empty() && parser.parseColonTypeList(types)) ||
-                 parser.resolveOperands(opInfo, types, loc, result.operands));
-}
-
-static void print(OpAsmPrinter &p, CallOp op) {  // NOLINT
-  p << op->getAttr("callee") << "(";
-  p.printOperands(op.getOperands());
-  p << ")";
-  p.printOptionalAttrDict(op->getAttrs(), {"callee"});
-  p << " : ";
-}
-
 static void printConstant(OpAsmPrinter &p, mlir::Operation *op) {  // NOLINT
   p << " ";
   p.printOptionalAttrDict(op->getAttrs(), /*elidedAttrs=*/{"value"});
@@ -127,37 +92,13 @@ static void print(OpAsmPrinter &p, ConstantI64Op op) {  // NOLINT
   printConstant(p, op);
 }
 
-static void print(OpAsmPrinter &p, ReturnOp op) {  // NOLINT
-  if (op.getNumOperands() > 0) {
-    p << ' ';
-    p.printOperands(op.getOperands());
-    p << " : ";
-    llvm::interleaveComma(op.getOperands(), p);
-  }
-}
-
-static LogicalResult verify(CallOp op) { return success(); }
-
 static LogicalResult verify(ConstantF32Op op) { return success(); }
 static LogicalResult verify(ConstantI32Op op) { return success(); }
 static LogicalResult verify(ConstantF64Op op) { return success(); }
 static LogicalResult verify(ConstantI64Op op) { return success(); }
 
-static LogicalResult verify(ReturnOp op) {
-  auto function = dyn_cast<FuncOp>(op->getParentOp());
-
-  if (!function) return success();
-
-  auto results = function.getType().getResults();
-  if (op.getNumOperands() != results.size())
-    return op.emitOpError("has ")
-           << op.getNumOperands()
-           << " operands, but enclosing function returns " << results.size();
-
-  return success();
-}
 }  // namespace dialect
 }  // namespace infrt
 
 #define GET_OP_CLASSES
-#include "paddle/infrt/dialect/basic_kernels.cpp.inc"
+#include "paddle/infrt/dialect/infrt/ir/basic_kernels.cpp.inc"
diff --git a/paddle/infrt/dialect/basic_kernels.h b/paddle/infrt/dialect/infrt/ir/basic_kernels.h
similarity index 92%
rename from paddle/infrt/dialect/basic_kernels.h
rename to paddle/infrt/dialect/infrt/ir/basic_kernels.h
index b82abcd52d28f45b18824d9ea6f9e12c2ec1c574..a36f55691b716dda51120e8c4be7c956df9b9f25 100644
--- a/paddle/infrt/dialect/basic_kernels.h
+++ b/paddle/infrt/dialect/infrt/ir/basic_kernels.h
@@ -18,4 +18,4 @@
 #include <mlir/Interfaces/SideEffectInterfaces.h>
 
 #define GET_OP_CLASSES
-#include "paddle/infrt/dialect/basic_kernels.hpp.inc"
+#include "paddle/infrt/dialect/infrt/ir/basic_kernels.hpp.inc"
diff --git a/paddle/infrt/dialect/basic_kernels.td b/paddle/infrt/dialect/infrt/ir/basic_kernels.td
similarity index 63%
rename from paddle/infrt/dialect/basic_kernels.td
rename to paddle/infrt/dialect/infrt/ir/basic_kernels.td
index aadc146e36280f79902f3b9ed90f3203fb9e5384..60315b45dd0dfaee8437c1dd312691445fdede56 100644
--- a/paddle/infrt/dialect/basic_kernels.td
+++ b/paddle/infrt/dialect/infrt/ir/basic_kernels.td
@@ -4,10 +4,10 @@
 #else
 #define BASIC_OPS
 
-include "paddle/infrt/dialect/infrt_base.td"
+include "paddle/infrt/dialect/infrt/ir/infrt_base.td"
 include "mlir/Interfaces/SideEffectInterfaces.td"
 
-class INFRT_Op<string mnemonic, list<OpTrait> traits = []> : Op<INFRT_Dialect, mnemonic, !listconcat(traits, [IsolatedFromAbove])> {
+class INFRT_Op<string mnemonic, list<OpTrait> traits = []> : Op<Infrt_Dialect, mnemonic, !listconcat(traits, [IsolatedFromAbove])> {
 
   // Each registered op needs to provide all of a printer, parser and verifier.
   let printer = [{ return infrt::dialect::print(p, *this); }];
@@ -15,23 +15,6 @@ class INFRT_Op<string mnemonic, list<OpTrait> traits = []> : Op<INFRT_Dialect, m
   let parser = [{ return infrt::dialect::parse$cppClass(parser, result); }];
 }
 
-def CallOp : INFRT_Op<"call"> {
-  let summary = "call a host operation";
-  let description = [{
-      The "infrt.call" operation represents a direct call to a function. The operands and result types of the call must match the specified function type.
-
-          %2 = infrt.call @add(%0, %1) : (f32, f32) -> f32
-    }];
-
-  let arguments = (ins FlatSymbolRefAttr:$callee, Variadic<AnyType>:$operands);
-  let results = (outs Variadic<AnyType>);
-
-  let extraClassDeclaration = [{
-      mlir::StringRef getCallee() { return callee(); }
-      mlir::FunctionType getCalleeType();
-    }];
-}
-
 class ConstantOp<string suffix, Type baseType, Attr attr>
     : INFRT_Op<"constant." # suffix, [NoSideEffect]> {
   let summary = "constant value constructor in host";
@@ -45,22 +28,6 @@ def ConstantI64Op : ConstantOp<"i64", I64, I64Attr>;
 def ConstantF32Op : ConstantOp<"f32", F32, F32Attr>;
 def ConstantF64Op : ConstantOp<"f64", F64, F64Attr>;
 
-def ReturnOp : INFRT_Op<"return", [Terminator]> {
-  let summary = "host executor return operation";
-  let description = [{
-      The "Infrt.return" operation represents a return operation within a function.
-
-        func @foo() : (i32, f8) {
-        Infrt.return %0, %1 : i32, f8
-        }
-    }];
-
-  let arguments = (ins Variadic<AnyType>:$operands);
-
-  let builders = [OpBuilder<(ins),
-                  [{ build($_builder, $_state, llvm::None); }]>];
-}
-
 class AddOp<string suffix, Type type> : INFRT_Op<"add." # suffix, [NoSideEffect]> {
   let summary = "infrt.add operation";
   let description = [{
@@ -111,25 +78,13 @@ def PrintI64Op : PrintOp<"i64", I64>;
 def PrintF32Op : PrintOp<"f32", F32>;
 def PrintF64Op : PrintOp<"f64", F64>;
 
-def GetStringOp : INFRT_Op<"get_string"> {
-  let summary = "Infrt.get_string";
-  let description = [{
-    Get a !infrt.string value from the given string attribute.
-  }];
-
-  let arguments = (ins StrAttr:$value);
-  let results = (outs StringType);
-  let assemblyFormat = "`(` $value `)` attr-dict";
-  let verifier = ?;
-}
-
 def PrintStringOp : INFRT_Op<"print_string"> {
-  let summary = "Infrt.print_string";
+  let summary = "infrt.print_string";
   let description = [{
       An operation that prints a string.
   }];
 
-  let arguments = (ins StringType:$input);
+  let arguments = (ins StrAttr:$input);
   let results = (outs);
   let assemblyFormat = "`(` $input `)` attr-dict";
   let verifier = ?;
diff --git a/paddle/infrt/dialect/infrt/ir/infrt_base.td b/paddle/infrt/dialect/infrt/ir/infrt_base.td
new file mode 100644
index 0000000000000000000000000000000000000000..c5130e89bb13a58a0aa0cf3aeae1b00e269eb259
--- /dev/null
+++ b/paddle/infrt/dialect/infrt/ir/infrt_base.td
@@ -0,0 +1,121 @@
+#ifndef INFRT_OPS_BASE
+#define INFRT_OPS_BASE
+
+include "mlir/IR/OpBase.td"
+include "mlir/Interfaces/SideEffectInterfaces.td"
+
+def Infrt_Dialect : Dialect {
+  let summary =
+    "A dialect containing the Infrt Attributes, Operations, and Types";
+
+  let name = "infrt";
+  let cppNamespace = "::infrt";
+  let useDefaultAttributePrinterParser = 1;
+}
+
+// Type definitions
+// Base class for Infrt dialect types.
+class Infrt_Type<string name, list<Trait> traits = [],
+                   string baseCppClass = "::mlir::Type">
+    : TypeDef<Infrt_Dialect, name, traits, baseCppClass> {
+}
+
+class Infrt_EnumParam<string cppEnumType, string stringToSymbolFnName,
+  string symbolToStringFnName, string desc = ""> : TypeParameter<cppEnumType, desc> {
+  let parser = [{[&]() -> ::mlir::FailureOr<}] # cppEnumType # [{> {
+    ::llvm::StringRef enumKeyword;
+    if (::mlir::failed($_parser.parseKeyword(&enumKeyword)))
+      return ::mlir::failure();
+    auto maybeEnum = }] # stringToSymbolFnName # [{(enumKeyword);
+    if (maybeEnum)
+      return *maybeEnum;
+    llvm_unreachable("}] # cppEnumType # [{ can not be found.");
+    return {};
+  }()}];
+  let printer = "$_printer << " # symbolToStringFnName # "($_self)";
+}
+
+def TargetParam : Infrt_EnumParam<"::infrt::TargetType", "GetTargetType", "GetString">;
+def PrecisionParam : Infrt_EnumParam<"::infrt::PrecisionType", "GetPrecisionType", "GetString">;
+def LayoutParam : Infrt_EnumParam<"::infrt::LayoutType", "GetLayoutType", "GetString">;
+
+def TargetAttr : AttrDef<Infrt_Dialect, "Target"> {
+  let mnemonic = "target";
+  let parameters = (ins
+    TargetParam:$target
+  );
+  let assemblyFormat = "`<` $target `>`";
+}
+
+def PrecisionAttr : AttrDef<Infrt_Dialect, "Precision"> {
+  let mnemonic = "precision";
+  let parameters = (ins
+    PrecisionParam:$precision
+  );
+  let assemblyFormat = "`<` $precision `>`";
+}
+
+def LayoutAttr : AttrDef<Infrt_Dialect, "Layout"> {
+  let mnemonic = "layout";
+  let parameters = (ins
+    LayoutParam:$layout
+  );
+  let assemblyFormat = "`<` $layout `>`";
+}
+
+def LoDTensor : Infrt_Type<"LoDTensor"> {
+  let summary = "infrt lod tensor";
+  let description = [{lod_tensor<3x64x3x3xf32, 3>}];
+  let parameters = (ins
+    ArrayRefParameter<"int64_t">:$shape,
+    "mlir::Type":$elementType,
+    "int32_t":$lod_level
+  );
+}
+
+def DenseTensor : Infrt_Type<"DenseTensor"> {
+  let summary = "infrt dense tensor";
+  let description = [{dense_tensor<, 3>}];
+  let parameters = (ins
+    "::infrt::TargetType":$target,
+    "::infrt::PrecisionType":$precision,
+    "::infrt::LayoutType":$layout
+  );
+}
+
+def DenseTensorMap :  Infrt_Type<"DenseTensorMap"> {
+  let summary = "infrt dense tensor map";
+  let description = [{dense_tensor map}];
+  let parameters = (ins);
+}
+
+// Type Constrait for concrete DenseTensor type.
+class DenseTensor<string target, string precision, string layout> :
+    Type<CPred<"$_self == ::infrt::DenseTensorType::get($_self.getContext(), ::infrt::TargetType::"#target#",::infrt::PrecisionType::"#precision#",::infrt::LayoutType::"#layout#")">, 
+    "!infrt.DenseTensor<"#target#","#precision#","#layout#">", 
+    "::infrt::DenseTensorType">;
+
+// Base class for infrt dialect attributes.
+class Infrt_Attr<string name, list<Trait> traits = [],
+                   string baseCppClass = "::mlir::Attribute">
+    : AttrDef<Infrt_Dialect, name, traits, baseCppClass> {
+  let mnemonic = ?;
+}
+
+// tools function. used for pattern rewriter
+class INFRT_createI32Attr<string value> : NativeCodeCall<
+    "$_builder.getI32IntegerAttr(" # value # ")">;
+
+class INFRT_createSI32Attr<string value> : NativeCodeCall<
+    "$_builder.getSI32IntegerAttr(" # value # ")">;
+
+class INFRT_createF32Attr<string value> : NativeCodeCall<
+    "$_builder.getF32FloatAttr(" # value # ")">;
+
+def INFRT_cvtValueToValueRange : NativeCodeCall<
+    "infrt::cvtValueToValueRange($0)">;
+
+def INFRT_concatTwoValueRange : NativeCodeCall<
+    "infrt::concatTwoValueRange($0, $1)">;
+
+#endif // INFRT_OPS_BASE
diff --git a/paddle/infrt/dialect/infrt/infrt_dialect.cc b/paddle/infrt/dialect/infrt/ir/infrt_dialect.cc
similarity index 84%
rename from paddle/infrt/dialect/infrt/infrt_dialect.cc
rename to paddle/infrt/dialect/infrt/ir/infrt_dialect.cc
index abb60016f90233cae68dc99e95885042517e9212..42de08ebc41938c40675435d4af10f758c52052b 100644
--- a/paddle/infrt/dialect/infrt/infrt_dialect.cc
+++ b/paddle/infrt/dialect/infrt/ir/infrt_dialect.cc
@@ -12,39 +12,52 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/infrt/dialect/infrt/infrt_dialect.h"
+#include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h"
 
+#include <llvm/ADT/TypeSwitch.h>
 #include <mlir/IR/Builders.h>
 #include <mlir/IR/BuiltinOps.h>
 #include <mlir/IR/DialectImplementation.h>
 #include "paddle/infrt/dialect/dense_tensor.h"
-#include "paddle/infrt/dialect/infrt/infrt_opsDialect.cpp.inc"
+#include "paddle/infrt/dialect/infrt/ir/infrt_opsDialect.cpp.inc"
 
 #define GET_TYPEDEF_CLASSES
-#include "paddle/infrt/dialect/infrt/infrt_opsTypes.cpp.inc"
+#include "paddle/infrt/dialect/infrt/ir/infrt_opsTypes.cpp.inc"
 
 #define GET_ATTRDEF_CLASSES
-#include "paddle/infrt/dialect/infrt/infrt_opsAttributes.cpp.inc"
+#include "paddle/infrt/dialect/infrt/ir/infrt_opsAttributes.cpp.inc"
 
 #define GET_OP_CLASSES
-#include "paddle/infrt/dialect/infrt/infrt_ops.cpp.inc"
+#include "paddle/infrt/dialect/infrt/ir/infrt_ops.cpp.inc"
+
+#include "paddle/infrt/dialect/infrt/ir/basic_kernels.h"
+
+#include "paddle/infrt/dialect/infrt/ir/test_kernels.h"
 
 namespace infrt {
 
 void InfrtDialect::initialize() {
   addTypes<
 #define GET_TYPEDEF_LIST
-#include "paddle/infrt/dialect/infrt/infrt_opsTypes.cpp.inc"  // NOLINT
+#include "paddle/infrt/dialect/infrt/ir/infrt_opsTypes.cpp.inc"  // NOLINT
       >();
 
   addAttributes<
 #define GET_ATTRDEF_LIST
-#include "paddle/infrt/dialect/infrt/infrt_opsAttributes.cpp.inc"  // NOLINT
+#include "paddle/infrt/dialect/infrt/ir/infrt_opsAttributes.cpp.inc"  // NOLINT
       >();
 
   addOperations<
 #define GET_OP_LIST
-#include "paddle/infrt/dialect/infrt/infrt_ops.cpp.inc"  // NOLINT
+#include "paddle/infrt/dialect/infrt/ir/infrt_ops.cpp.inc"  // NOLINT
+      >();
+  addOperations<
+#define GET_OP_LIST
+#include "paddle/infrt/dialect/infrt/ir/basic_kernels.cpp.inc"
+      >();
+  addOperations<
+#define GET_OP_LIST
+#include "paddle/infrt/dialect/infrt/ir/test_kernels.cpp.inc"
       >();
 }
 
@@ -127,7 +140,7 @@ mlir::Type InfrtDialect::parseType(::mlir::DialectAsmParser &parser) const {
 
 void InfrtDialect::printType(::mlir::Type type,
                              ::mlir::DialectAsmPrinter &os) const {
-  // print LoDTensorType, for example: !Infrt.lod_tensor<3x64x3x3xf32,5>
+  // print LoDTensorType, for example: !infrt.lod_tensor<3x64x3x3xf32,5>
   if (type.isa<infrt::LoDTensorType>()) {
     auto lod_tensor_type = type.cast<infrt::LoDTensorType>();
     os << "lod_tensor<";
diff --git a/paddle/infrt/dialect/infrt/infrt_dialect.h b/paddle/infrt/dialect/infrt/ir/infrt_dialect.h
similarity index 77%
rename from paddle/infrt/dialect/infrt/infrt_dialect.h
rename to paddle/infrt/dialect/infrt/ir/infrt_dialect.h
index ed5b36e556149dbc3026e732cf953c5562841921..3e6ea2a74c79d43015a62f166928e10adb48698a 100644
--- a/paddle/infrt/dialect/infrt/infrt_dialect.h
+++ b/paddle/infrt/dialect/infrt/ir/infrt_dialect.h
@@ -22,14 +22,14 @@
 #include <mlir/IR/Dialect.h>
 #include <mlir/IR/OpDefinition.h>
 #include <mlir/Interfaces/SideEffectInterfaces.h>
-#include "paddle/infrt/dialect/infrt/common_type.h"
+#include "paddle/infrt/dialect/infrt/common/types.h"
 
-#include "paddle/infrt/dialect/infrt/infrt_opsDialect.h.inc"
+#include "paddle/infrt/dialect/infrt/ir/infrt_opsDialect.h.inc"
 #define GET_TYPEDEF_CLASSES
-#include "paddle/infrt/dialect/infrt/infrt_opsTypes.h.inc"
+#include "paddle/infrt/dialect/infrt/ir/infrt_opsTypes.h.inc"
 
 #define GET_ATTRDEF_CLASSES
-#include "paddle/infrt/dialect/infrt/infrt_opsAttributes.h.inc"
+#include "paddle/infrt/dialect/infrt/ir/infrt_opsAttributes.h.inc"
 
 #define GET_OP_CLASSES
-#include "paddle/infrt/dialect/infrt/infrt_ops.h.inc"
+#include "paddle/infrt/dialect/infrt/ir/infrt_ops.h.inc"
diff --git a/paddle/infrt/dialect/infrt/ir/infrt_ops.td b/paddle/infrt/dialect/infrt/ir/infrt_ops.td
new file mode 100644
index 0000000000000000000000000000000000000000..f5430b03d0d75cfa8ba91f03ebc90ee0f73c25d7
--- /dev/null
+++ b/paddle/infrt/dialect/infrt/ir/infrt_ops.td
@@ -0,0 +1,61 @@
+include "paddle/infrt/dialect/infrt/ir/infrt_base.td"
+
+// Op definition
+class Infrt_Op<string mnemonic, list<OpTrait> traits = []> : Op<Infrt_Dialect, mnemonic, traits> {
+
+  // Each registered op needs to provide all of a printer, parser and verifier.
+  // let printer = [{ return infrt::print(p, *this); }];
+  // let verifier = [{ return infrt::verify(*this); }];
+  // let parser = [{ return infrt::parse$cppClass(parser, result); }];
+}
+
+def Infrt_KernelOp : Infrt_Op<"kernel", [NoSideEffect]> {
+  let summary = "kernel op";
+  let description = [{kernel op!}];
+  let arguments = (ins Variadic<AnyType>:$operands,
+                       StrAttr:$name,
+                       OptionalAttr<DictionaryAttr>:$attrs);
+  let results = (outs Variadic<AnyType>);
+}
+
+def Infrt_ReturnOp : Infrt_Op<"return", [Terminator]> {
+  let summary = "host executor return operation";
+  let description = [{
+      The "infrt.return" operation represents a return operation within a function.
+
+        func @foo() : (i32, f8) {
+        infrt.return %0, %1 : i32, f8
+        }
+    }];
+
+  let arguments = (ins Variadic<AnyType>:$operands);
+
+  let assemblyFormat = "attr-dict ($operands^ `:` type($operands))?";
+}
+
+def Infrt_CallOp : Infrt_Op<"call"> {
+  let summary = "call a host operation";
+  let description = [{
+      The "infrt.call" operation represents a direct call to a function. The operands and result types of the call must match the specified function type.
+
+          %2 = infrt.call @add(%0, %1) : (f32, f32) -> f32
+    }];
+
+  let arguments = (ins FlatSymbolRefAttr:$callee, Variadic<AnyType>:$operands);
+  let results = (outs Variadic<AnyType>);
+
+  //let extraClassDeclaration = [{
+  //    mlir::StringRef getCallee() { return callee(); }
+  //    mlir::FunctionType getCalleeType();
+  //  }];
+  let assemblyFormat = [{
+    $callee `(` $operands `)` attr-dict `:` functional-type($operands, results)
+  }];
+}
+
+def Infrt_CvtTensorOp : Infrt_Op<"cvt_tensor", [NoSideEffect]> {
+  let summary = "convert tensor type op";
+  let description = [{convert tensor type op!}];
+  let arguments = (ins AnyType:$input);
+  let results = (outs AnyType:$output);
+}
diff --git a/paddle/infrt/dialect/test_kernels.cc b/paddle/infrt/dialect/infrt/ir/test_kernels.cc
similarity index 96%
rename from paddle/infrt/dialect/test_kernels.cc
rename to paddle/infrt/dialect/infrt/ir/test_kernels.cc
index f0c4723b49a7906cf5327771e26eb87e8b1248c0..5f7f83a9dfa8011b3043e20da7d9f21f3afe5cf6 100644
--- a/paddle/infrt/dialect/test_kernels.cc
+++ b/paddle/infrt/dialect/infrt/ir/test_kernels.cc
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/infrt/dialect/test_kernels.h"
+#include "paddle/infrt/dialect/infrt/ir/test_kernels.h"
 
 #include <mlir/IR/Builders.h>
 #include <mlir/IR/OpDefinition.h>
@@ -147,7 +147,7 @@ static mlir::LogicalResult verify(BenchmarkOp op) {
   // Verify that the target benchmark region has exactly one return value.
   auto &region = op.region();
   auto &last_op = region.front().back();
-  if (last_op.getName().getStringRef() != "Infrt.return") {
+  if (last_op.getName().getStringRef() != "infrt.return") {
     return op.emitOpError("missing return statement");
   }
   if (last_op.getNumOperands() != 1) {
@@ -161,4 +161,4 @@ static mlir::LogicalResult verify(BenchmarkOp op) {
 }  // namespace infrt
 
 #define GET_OP_CLASSES
-#include "paddle/infrt/dialect/test_kernels.cpp.inc"
+#include "paddle/infrt/dialect/infrt/ir/test_kernels.cpp.inc"
diff --git a/paddle/infrt/dialect/test_kernels.h b/paddle/infrt/dialect/infrt/ir/test_kernels.h
similarity index 92%
rename from paddle/infrt/dialect/test_kernels.h
rename to paddle/infrt/dialect/infrt/ir/test_kernels.h
index 73c8a6fb387bca6ebc7ae393e4bba32ab94aa951..1fe5020b240046f71571e3a4c999b1eae07741a1 100644
--- a/paddle/infrt/dialect/test_kernels.h
+++ b/paddle/infrt/dialect/infrt/ir/test_kernels.h
@@ -17,4 +17,4 @@
 #include <mlir/Interfaces/SideEffectInterfaces.h>
 
 #define GET_OP_CLASSES
-#include "paddle/infrt/dialect/test_kernels.hpp.inc"
+#include "paddle/infrt/dialect/infrt/ir/test_kernels.hpp.inc"
diff --git a/paddle/infrt/dialect/test_kernels.td b/paddle/infrt/dialect/infrt/ir/test_kernels.td
similarity index 93%
rename from paddle/infrt/dialect/test_kernels.td
rename to paddle/infrt/dialect/infrt/ir/test_kernels.td
index 6e4bc26aa1496dcb4caed83f98fc42dab9e3cce0..0ce1f3f65e8f7f46cf32794b3191e66ae71e3eae 100644
--- a/paddle/infrt/dialect/test_kernels.td
+++ b/paddle/infrt/dialect/infrt/ir/test_kernels.td
@@ -4,12 +4,12 @@
 #else
 #define TEST_OPS
 
-include "paddle/infrt/dialect/infrt_base.td"
+include "paddle/infrt/dialect/infrt/ir/infrt_base.td"
 include "mlir/Interfaces/SideEffectInterfaces.td"
 
 // Base class for Test dialect ops.
 class Test_Op<string mnemonic, list<OpTrait> traits = []> :
-    Op<INFRT_Dialect, mnemonic, !listconcat(traits, [IsolatedFromAbove])> {
+    Op<Infrt_Dialect, mnemonic, !listconcat(traits, [IsolatedFromAbove])> {
 
   // Each registered op in the Test namespace needs to provide all of a printer,
   // parser and verifier.
@@ -45,7 +45,7 @@ def BenchmarkOp : Test_Op<"benchmark"> {
          // The following code benchmarks the infrt.add.i32 kernel.
          %x = infrt.add.i32 %c, %c
          // The benchmarked function needs to return exactly one value.
-         Infrt.return %x : i32
+         infrt.return %x : i32
        }
   }];
 
diff --git a/paddle/infrt/dialect/infrt/pass/CMakeLists.txt b/paddle/infrt/dialect/infrt/pass/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..19c12251a2e6b4a71211fe88772d3b6759164c71
--- /dev/null
+++ b/paddle/infrt/dialect/infrt/pass/CMakeLists.txt
@@ -0,0 +1,7 @@
+core_gather_headers()
+
+gather_srcs(infrt_src SRCS
+    infrt_op_fuse_pass.cc
+    )
+
+mlir_add_rewriter(infrt_op_fuse)
diff --git a/paddle/infrt/dialect/infrt/pass/infrt_op_fuse.td b/paddle/infrt/dialect/infrt/pass/infrt_op_fuse.td
new file mode 100644
index 0000000000000000000000000000000000000000..51addb4deb43824965806962613d1ab4bd1c1e3d
--- /dev/null
+++ b/paddle/infrt/dialect/infrt/pass/infrt_op_fuse.td
@@ -0,0 +1,23 @@
+#ifndef INFRT_OP_FUSE
+#define INFRT_OP_FUSE
+
+include "mlir/Interfaces/SideEffectInterfaces.td"
+include "paddle/infrt/dialect/infrt/ir/infrt_ops.td"
+include "paddle/infrt/dialect/pd_ops.td"
+
+def FuseCvtTensorPattern : Pat<
+       (Infrt_CvtTensorOp (Infrt_CvtTensorOp $arg)),
+       (Infrt_CvtTensorOp $arg)>;
+
+def FuseFeedCvtTensorPattern : Pat<
+       (Infrt_CvtTensorOp (PD_FeedOp $name)),
+       (PD_FeedOp $name)>;
+
+def TypesAreIdentical : Constraint<CPred<"$0.getType() == $1.getType()">>;
+def RedundantCvtTensorOptPattern : Pat<
+  (Infrt_CvtTensorOp:$res $arg), (replaceWithValue $arg),
+  [(TypesAreIdentical $res, $arg)]>;
+
+
+
+#endif // INFRT_OP_FUSE
diff --git a/paddle/infrt/dialect/infrt/pass/infrt_op_fuse_pass.cc b/paddle/infrt/dialect/infrt/pass/infrt_op_fuse_pass.cc
new file mode 100644
index 0000000000000000000000000000000000000000..25ecf2ae99dc3613944fcedaee427b540f0faae4
--- /dev/null
+++ b/paddle/infrt/dialect/infrt/pass/infrt_op_fuse_pass.cc
@@ -0,0 +1,52 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/infrt/dialect/infrt/pass/infrt_op_fuse_pass.h"
+
+#include <mlir/Transforms/GreedyPatternRewriteDriver.h>
+#include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h"
+#include "paddle/infrt/dialect/pd_ops.h"
+namespace {
+#include "paddle/infrt/dialect/infrt/pass/infrt_op_fuse.cpp.inc"  // NOLINT
+
+/*
+ * infrtOpFusePass.
+ */
+struct InfrtOpFusePass
+    : public mlir::PassWrapper<InfrtOpFusePass, mlir::FunctionPass> {
+ public:
+  ::llvm::StringRef getName() const override { return "infrtOpFusePass"; }
+  void runOnFunction() override;
+};
+// Implementation of the InfrtOpFusePass.
+void InfrtOpFusePass::runOnFunction() {
+  ::mlir::RewritePatternSet patterns(&getContext());
+  populateWithGenerated(patterns);
+  (void)applyPatternsAndFoldGreedily(getOperation(), std::move(patterns));
+  // Fuse pd.return Operation
+  auto terminator_op = getFunction().front().getTerminator();
+  if (nullptr == terminator_op) return;
+  for (auto operand : terminator_op->getOperands()) {
+    auto *op1 = operand.getDefiningOp();
+    auto cvt_op = ::llvm::dyn_cast<::infrt::CvtTensorOp>(op1);
+    if (!cvt_op) continue;
+    mlir::Value value = cvt_op.input();
+    operand.replaceAllUsesWith(value);
+    cvt_op.erase();
+  }
+}
+}  // namespace
+std::unique_ptr<mlir::Pass> infrt::createInfrtOpFusePass() {
+  return std::make_unique<InfrtOpFusePass>();
+}
diff --git a/paddle/infrt/dialect/infrt/pass/infrt_op_fuse_pass.h b/paddle/infrt/dialect/infrt/pass/infrt_op_fuse_pass.h
new file mode 100644
index 0000000000000000000000000000000000000000..ef349a7bbc4c6531bb7c02cb69a1bd5c427af080
--- /dev/null
+++ b/paddle/infrt/dialect/infrt/pass/infrt_op_fuse_pass.h
@@ -0,0 +1,24 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <mlir/Pass/Pass.h>
+
+namespace infrt {
+/*
+ * infrtOpFusePass.
+ */
+std::unique_ptr<mlir::Pass> createInfrtOpFusePass();
+
+}  // namespace infrt
diff --git a/paddle/infrt/dialect/infrt_base.cc b/paddle/infrt/dialect/infrt_base.cc
deleted file mode 100644
index 8c595c06745f1be8453c4d1f08ba00f4d9ceaf90..0000000000000000000000000000000000000000
--- a/paddle/infrt/dialect/infrt_base.cc
+++ /dev/null
@@ -1,77 +0,0 @@
-// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/infrt/dialect/infrt_base.h"
-
-#include "paddle/infrt/dialect/basic_kernels.h"
-#include "paddle/infrt/dialect/dense_tensor.h"
-#include "paddle/infrt/dialect/test_kernels.h"
-
-namespace infrt {
-namespace dialect {
-
-// ----INFRTDialect definition begin----
-void INFRTDialect::initialize() {
-  allowUnknownTypes();
-  allowUnknownOperations();
-
-  addTypes<infrt::dt::StringType>();
-  addTypes<infrt::dt::TensorMapType>();
-
-  addOperations<
-#define GET_OP_LIST
-#include "paddle/infrt/dialect/basic_kernels.cpp.inc"
-      >();
-  addOperations<
-#define GET_OP_LIST
-#include "paddle/infrt/dialect/test_kernels.cpp.inc"
-      >();
-}
-
-mlir::Type INFRTDialect::parseType(mlir::DialectAsmParser &parser) const {
-  llvm::StringRef keyword;
-  if (parser.parseKeyword(&keyword)) return mlir::Type();
-  // parse TensorMapType, for example: !infrt.tensor_map
-  if (keyword == "tensor_map") {
-    return infrt::dt::TensorMapType::get();
-  }
-  // parse StringType, for example: !infrt.string
-  if (keyword == "string") {
-    return infrt::dt::StringType::get();
-  }
-
-  parser.emitError(parser.getCurrentLocation(), "unknown infrt type: ")
-      << keyword;
-  return mlir::Type();
-}
-
-void INFRTDialect::printType(mlir::Type type,
-                             mlir::DialectAsmPrinter &printer) const {
-  // print TensorMapType, for example: !infrt.tensor_map
-  if (type.isa<infrt::dt::TensorMapType>()) {
-    printer << "tensor_map";
-    return;
-  }
-  // print StringType, for example: !infrt.string
-  if (type.isa<infrt::dt::StringType>()) {
-    printer << "string";
-    return;
-  }
-  llvm_unreachable("unknown infrt type.");
-}
-
-// ----INFRTDialect definition end----
-
-}  // namespace dialect
-}  // namespace infrt
diff --git a/paddle/infrt/dialect/infrt_base.h b/paddle/infrt/dialect/infrt_base.h
deleted file mode 100644
index 3ef73171dcdea4e0367837f4b3893405c29a1580..0000000000000000000000000000000000000000
--- a/paddle/infrt/dialect/infrt_base.h
+++ /dev/null
@@ -1,83 +0,0 @@
-// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <mlir/IR/Builders.h>
-#include <mlir/IR/Dialect.h>
-#include <mlir/IR/DialectImplementation.h>
-#include <mlir/IR/MLIRContext.h>
-#include <mlir/IR/TypeUtilities.h>
-#include <mlir/IR/Types.h>
-
-#include "paddle/infrt/dialect/infrt_base.hpp.inc"
-
-namespace infrt {
-namespace dialect {
-
-class INFRTDialect : public mlir::Dialect {
-  explicit INFRTDialect(mlir::MLIRContext *context)
-      : mlir::Dialect(
-            getDialectNamespace(), context, mlir::TypeID::get<INFRTDialect>()) {
-    initialize();
-  }
-
-  // parse types registered to the dialect.
-  mlir::Type parseType(mlir::DialectAsmParser &parser) const override;
-  // print types registered to the dialect.
-  void printType(mlir::Type type,
-                 mlir::DialectAsmPrinter &printer) const override;
-
-  void initialize();
-  friend class mlir::MLIRContext;
-
- public:
-  static ::llvm::StringRef getDialectNamespace() { return "Infrt"; }
-};
-}  // namespace dialect
-
-template <typename T>
-static mlir::IntegerAttr createI32Attr(mlir::OpBuilder &b,  // NOLINT
-                                       mlir::Location loc,
-                                       T constant) {
-  return b.getIntegerAttr(b.getI32Type(), constant);
-}
-
-template <typename T>
-static mlir::IntegerAttr createSI32Attr(mlir::OpBuilder &b,  // NOLINT
-                                        mlir::Location loc,
-                                        T constant) {
-  return b.getSI32IntegerAttr(constant);
-}
-
-template <typename T>
-static mlir::FloatAttr createF32Attr(mlir::OpBuilder &b,  // NOLINT
-                                     mlir::Location loc,
-                                     T constant) {
-  return b.getF32FloatAttr(constant);
-}
-
-static mlir::SmallVector<mlir::Value, 4> cvtValueToValueRange(
-    const mlir::Value &operand) {
-  return mlir::SmallVector<mlir::Value, 4>(1, operand);
-}
-
-static mlir::SmallVector<mlir::Value, 4> concatTwoValueRange(
-    mlir::ValueRange operand_0, mlir::ValueRange operand_1) {
-  mlir::SmallVector<mlir::Value, 4> operands;
-  operands.append(operand_0.begin(), operand_0.end());
-  operands.append(operand_1.begin(), operand_1.end());
-  return operands;
-}
-}  // namespace infrt
diff --git a/paddle/infrt/dialect/infrt_base.td b/paddle/infrt/dialect/infrt_base.td
deleted file mode 100644
index 0f50eb2d8fb4ac83578f13888d05188a9143382f..0000000000000000000000000000000000000000
--- a/paddle/infrt/dialect/infrt_base.td
+++ /dev/null
@@ -1,42 +0,0 @@
-#ifndef INFRT_BASE
-#define INFRT_BASE
-
-include "mlir/IR/OpBase.td"
-include "paddle/infrt/dialect/infrt/infrt_ops_base.td"
-
-def INFRT_Dialect : Dialect {
-  let name = "Infrt";
-
-  let description = [{
-    The INFRT host dialect.
-  }];
-
-  let cppNamespace = "::infrt::dialect";
-}
-
-// Type definitions
-def StringType :
-    Type<CPred<"$_self.isa<::infrt::dt::StringType>()">, "!infrt.string type">,
-    BuildableType<"$_builder.getType<::infrt::dt::StringType>()">;
-
-def TensorMapType :
-    Type<CPred<"$_self.isa<::infrt::dt::TensorMapType>()">, "!infrt.tensor_map type">,
-    BuildableType<"$_builder.getType<::infrt::dt::TensorMapType>()">;
-
-def BufferType : OpaqueType<"b", "buffer", "buffer">;
-
-class INFRT_createI32Attr<string value> : NativeCodeCall<
-    "infrt::createI32Attr($_builder, $_loc, " # value # ")">;
-
-class INFRT_createSI32Attr<string value> : NativeCodeCall<
-    "infrt::createSI32Attr($_builder, $_loc, " # value # ")">;
-
-class INFRT_createF32Attr<string value> : NativeCodeCall<
-    "infrt::createF32Attr($_builder, $_loc, " # value # ")">;
-
-def INFRT_cvtValueToValueRange : NativeCodeCall<
-    "infrt::cvtValueToValueRange($0)">;
-
-def INFRT_concatTwoValueRange : NativeCodeCall<
-    "infrt::concatTwoValueRange($0, $1)">;
-#endif  // INFRT_BASE
diff --git a/paddle/infrt/dialect/init_infrt_dialects.cc b/paddle/infrt/dialect/init_dialects.cc
similarity index 83%
rename from paddle/infrt/dialect/init_infrt_dialects.cc
rename to paddle/infrt/dialect/init_dialects.cc
index 5eae01719361dd5bc21c139b54cbcf16f226b4cc..0c5944ebf84750be8cf789552219157da3170c39 100644
--- a/paddle/infrt/dialect/init_infrt_dialects.cc
+++ b/paddle/infrt/dialect/init_dialects.cc
@@ -12,14 +12,14 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/infrt/dialect/init_infrt_dialects.h"
+#include "paddle/infrt/dialect/init_dialects.h"
 
 #include <glog/logging.h>
 
-#include "paddle/infrt/dialect/basic_kernels.h"
 #include "paddle/infrt/dialect/dense_tensor.h"
-#include "paddle/infrt/dialect/infrt/infrt_dialect.h"
-#include "paddle/infrt/dialect/infrt_base.h"
+#include "paddle/infrt/dialect/infrt/ir/basic_kernels.h"
+#include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h"
+
 #include "paddle/infrt/dialect/pd_ops.h"
 #include "paddle/infrt/dialect/phi/ir/infrt_phi_tensor.h"
 #include "paddle/infrt/dialect/phi/ir/phi_base.h"
@@ -30,8 +30,7 @@
 namespace infrt {
 void registerCinnDialects(mlir::DialectRegistry &registry) {  // NOLINT
   registry.insert<ts::TensorShapeDialect,
-                  dialect::INFRTDialect,
-                  infrt::InfrtDialect,
+                  InfrtDialect,
                   dt::DTDialect,
                   mlir::pd::PaddleDialect,
 #ifdef INFRT_WITH_PHI
diff --git a/paddle/infrt/dialect/init_infrt_dialects.h b/paddle/infrt/dialect/init_dialects.h
similarity index 100%
rename from paddle/infrt/dialect/init_infrt_dialects.h
rename to paddle/infrt/dialect/init_dialects.h
diff --git a/paddle/infrt/dialect/mlir_loader.cc b/paddle/infrt/dialect/mlir_loader.cc
index 1d0696e77dcda612eeb8c367958e44e2efed5354..19b8cba12df8687a907534ada30cdecc1321a3c2 100644
--- a/paddle/infrt/dialect/mlir_loader.cc
+++ b/paddle/infrt/dialect/mlir_loader.cc
@@ -28,7 +28,7 @@
 #include <vector>
 
 #include "paddle/infrt/dialect/diagnostic_utils.h"
-#include "paddle/infrt/dialect/init_infrt_dialects.h"
+#include "paddle/infrt/dialect/init_dialects.h"
 
 namespace infrt {
 namespace dialect {
@@ -63,6 +63,7 @@ mlir::OwningModuleRef LoadMlirFile(const std::string& file_name,
   mlir::DialectRegistry registry;
   registerCinnDialects(registry);
   context->appendDialectRegistry(registry);
+  context->loadAllAvailableDialects();
   mlir::ScopedDiagnosticHandler scope_handler(
       context, [](mlir::Diagnostic& diag) {
         if (diag.getSeverity() != mlir::DiagnosticSeverity::Error)
diff --git a/paddle/infrt/dialect/mlir_loader_test.cc b/paddle/infrt/dialect/mlir_loader_test.cc
index 2f721e49a63096d1c3168805d373cbc8809542da..8ccb07161d364e968ead568f20c4b98b18a7e04e 100644
--- a/paddle/infrt/dialect/mlir_loader_test.cc
+++ b/paddle/infrt/dialect/mlir_loader_test.cc
@@ -22,7 +22,7 @@
 
 #include <string>
 
-#include "paddle/infrt/dialect/init_infrt_dialects.h"
+#include "paddle/infrt/dialect/init_dialects.h"
 
 namespace infrt {
 namespace dialect {
@@ -32,13 +32,13 @@ TEST(MlirLoader, basic) {
 
   auto source = R"ROC(
 func @main() -> f32 {
-  %v0 = Infrt.constant.f32 1.0
-  %v1 = Infrt.constant.f32 2.0
-  %value = "Infrt.add.f32"(%v0, %v1) : (f32, f32) -> f32
+  %v0 = infrt.constant.f32 1.0
+  %v1 = infrt.constant.f32 2.0
+  %value = "infrt.add.f32"(%v0, %v1) : (f32, f32) -> f32
 
-  "Infrt.print.f32"(%v0) : (f32) -> ()
+  "infrt.print.f32"(%v0) : (f32) -> ()
 
-  Infrt.return %value : f32
+  infrt.return %value : f32
 }
 )ROC";
 
diff --git a/paddle/infrt/dialect/opt.cc b/paddle/infrt/dialect/opt.cc
index 5bcf5a23f4c532b1056ceaa54c80902b32e4061a..2006530958f0b5223edfcee87a5895e101f0e240 100644
--- a/paddle/infrt/dialect/opt.cc
+++ b/paddle/infrt/dialect/opt.cc
@@ -14,7 +14,7 @@
 
 #include <mlir/Support/MlirOptMain.h>
 #include <mlir/Transforms/Passes.h>
-#include "paddle/infrt/dialect/init_infrt_dialects.h"
+#include "paddle/infrt/dialect/init_dialects.h"
 
 int main(int argc, char **argv) {
   mlir::DialectRegistry registry;
diff --git a/paddle/infrt/dialect/pd_op_base.td b/paddle/infrt/dialect/pd_op_base.td
index 266bdf60de788df0507a5bf0ef679945cb7c2abc..f6af4c83aed8bd0b7ce04c172169b036e674777b 100644
--- a/paddle/infrt/dialect/pd_op_base.td
+++ b/paddle/infrt/dialect/pd_op_base.td
@@ -6,7 +6,7 @@
 
 include "mlir/IR/OpBase.td"
 include "mlir/Interfaces/SideEffectInterfaces.td"
-include "paddle/infrt/dialect/infrt/infrt_ops_base.td"
+include "paddle/infrt/dialect/infrt/ir/infrt_base.td"
 
 def PD_Dialect : Dialect {
   let name = "pd";
@@ -75,7 +75,7 @@ def PD_ElementType : Type<Or<[PD_Float.predicate,
 // def PD_Tensor : TensorOf<[PD_ElementType]>;
 def PD_Tensor1 : TensorOf<[PD_ElementType]>;
 
-def PD_Tensor :  AnyTypeOf<[PD_Tensor1, LoDTensor],"pd.ttype">;
+def PD_Tensor :  AnyTypeOf<[PD_Tensor1, LoDTensor, DenseTensor],"pd.ttype">;
 
 def PD_Tensor_Array : VectorOf<[PD_Tensor]>;
 
diff --git a/paddle/infrt/dialect/pd_ops.cc b/paddle/infrt/dialect/pd_ops.cc
index 338b04e001320289b71f6127318e7a073cefcacf..96e9e307f2fd3f33be3d2273a7aa66c363e4beb1 100644
--- a/paddle/infrt/dialect/pd_ops.cc
+++ b/paddle/infrt/dialect/pd_ops.cc
@@ -16,8 +16,6 @@
 
 #include <mlir/IR/Matchers.h>
 #include <mlir/IR/PatternMatch.h>
-#include "paddle/infrt/dialect/infrt/infrt_dialect.h"
-#include "paddle/infrt/dialect/infrt_base.h"
 
 #define GET_OP_CLASSES
 #include "paddle/infrt/dialect/pd_ops.cpp.inc"  // NOLINT
diff --git a/paddle/infrt/dialect/pd_ops.h b/paddle/infrt/dialect/pd_ops.h
index b48c68060d42efdeecccd68abc5efb3045643370..e6b0f30c059054189fe3a86bb112da923ad76423 100644
--- a/paddle/infrt/dialect/pd_ops.h
+++ b/paddle/infrt/dialect/pd_ops.h
@@ -28,6 +28,7 @@
 #include <mlir/Interfaces/InferTypeOpInterface.h>
 #include <mlir/Interfaces/LoopLikeInterface.h>
 #include <mlir/Interfaces/SideEffectInterfaces.h>
+#include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h"
 
 namespace mlir {
 namespace pd {
diff --git a/paddle/infrt/dialect/pd_types.h b/paddle/infrt/dialect/pd_types.h
deleted file mode 100644
index 0da888a9c076922fc21d5cce004dc839bd705762..0000000000000000000000000000000000000000
--- a/paddle/infrt/dialect/pd_types.h
+++ /dev/null
@@ -1,56 +0,0 @@
-// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// This file defines the types used in PaddlePaddle MLIR dialect.
-// We borrowed much ideas from tensorflow mlir dialect (tf_types.h in
-// tensorflow).
-
-#pragma once
-
-#include <mlir/IR/Diagnostics.h>
-#include <mlir/IR/Location.h>
-#include <mlir/IR/Operation.h>
-#include <mlir/IR/TypeUtilities.h>
-#include <mlir/IR/Types.h>
-
-namespace mlir {
-namespace PD {
-
-class PaddleType : public Type {
- public:
-  using Type::Type;
-
-  static bool classof(Type type);
-};
-
-namespace detail {
-
-template <typename Derived>
-class PaddleTypeImpl : public Type::TypeBase<Derived, PaddleType, TypeStorage> {
- public:
-  using Base = typename Type::TypeBase<Derived, PaddleType, TypeStorage>;
-  using PDBase = PaddleTypeImpl<Derived>;
-  using Base::Base;
-};
-
-}  // namespace detail
-
-#define HANDLE_PD_TYPE(pdtype, enumerant, name)                      \
-  class pdtype##Type : public detail::PaddleTypeImpl<pdtype##Type> { \
-   public:                                                           \
-    using PDBase::PDBase;                                            \
-  };
-
-}  // namespace PD
-}  // namespace mlir
diff --git a/paddle/infrt/dialect/phi/CMakeLists.txt b/paddle/infrt/dialect/phi/CMakeLists.txt
index d477b6b9bdc278b2408794fa4235d9c8bca5850a..4e73a533d99a79168b3e68b88d917f48ec811444 100644
--- a/paddle/infrt/dialect/phi/CMakeLists.txt
+++ b/paddle/infrt/dialect/phi/CMakeLists.txt
@@ -5,5 +5,11 @@ endif()
 add_subdirectory(ir)
 add_subdirectory(pass)
 
+add_executable(phi-ir-exec phi_ir_exec.cc)
+target_link_libraries(phi-ir-exec infrt)
+
 add_executable(phi-exec phi_exec.cc)
 target_link_libraries(phi-exec infrt)
+
+gather_srcs(infrt_src SRCS
+    data_type.cc)
diff --git a/paddle/infrt/dialect/phi/data_type.cc b/paddle/infrt/dialect/phi/data_type.cc
new file mode 100644
index 0000000000000000000000000000000000000000..bbc296ea748a39472bb7f57b04e9159b5fbd89f1
--- /dev/null
+++ b/paddle/infrt/dialect/phi/data_type.cc
@@ -0,0 +1,125 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/infrt/dialect/phi/data_type.h"
+
+namespace infrt {
+
+phi::Backend ConvertTargetToPhi(TargetType target) {
+  switch (target) {
+    case TargetType::CPU:
+      return phi::Backend::CPU;
+    case TargetType::GPU:
+      return phi::Backend::GPU;
+    default:
+      return phi::Backend::UNDEFINED;
+  }
+}
+
+TargetType ConvertTargetFromPhi(phi::Backend backend) {
+  switch (backend) {
+    case phi::Backend::CPU:
+      return TargetType::CPU;
+    case phi::Backend::GPU:
+      return TargetType::GPU;
+    default:
+      return TargetType::UNK;
+  }
+}
+
+phi::DataType ConvertPrecisionToPhi(PrecisionType precision) {
+#define CONVERT_PRECISION_TO_PHI(Precision) \
+  case PrecisionType::Precision:            \
+    return phi::DataType::Precision;
+
+  switch (precision) {
+    CONVERT_PRECISION_TO_PHI(FLOAT32)
+    CONVERT_PRECISION_TO_PHI(FLOAT16)
+    CONVERT_PRECISION_TO_PHI(FLOAT64)
+    CONVERT_PRECISION_TO_PHI(UINT8)
+    CONVERT_PRECISION_TO_PHI(INT8)
+    CONVERT_PRECISION_TO_PHI(INT16)
+    CONVERT_PRECISION_TO_PHI(INT32)
+    CONVERT_PRECISION_TO_PHI(INT64)
+    CONVERT_PRECISION_TO_PHI(COMPLEX64)
+    CONVERT_PRECISION_TO_PHI(COMPLEX128)
+    CONVERT_PRECISION_TO_PHI(BOOL)
+    default:
+      return phi::DataType::UNDEFINED;
+  }
+#undef CONVERT_PRECISION_TO_PHI
+}
+
+PrecisionType ConvertPrecisionFromPhi(phi::DataType datatype) {
+#define CONVERT_PRECISION_FROM_PHI(Precision) \
+  case phi::DataType::Precision:              \
+    return PrecisionType::Precision;
+
+  switch (datatype) {
+    CONVERT_PRECISION_FROM_PHI(FLOAT32)
+    CONVERT_PRECISION_FROM_PHI(FLOAT16)
+    CONVERT_PRECISION_FROM_PHI(FLOAT64)
+    CONVERT_PRECISION_FROM_PHI(UINT8)
+    CONVERT_PRECISION_FROM_PHI(INT8)
+    CONVERT_PRECISION_FROM_PHI(INT16)
+    CONVERT_PRECISION_FROM_PHI(INT32)
+    CONVERT_PRECISION_FROM_PHI(INT64)
+    CONVERT_PRECISION_FROM_PHI(COMPLEX64)
+    CONVERT_PRECISION_FROM_PHI(COMPLEX128)
+    CONVERT_PRECISION_FROM_PHI(BOOL)
+    default:
+      return PrecisionType::UNK;
+  }
+#undef CONVERT_PRECISION_FROM_PHI
+}
+
+phi::DataLayout ConvertLayoutToPhi(LayoutType layout) {
+  switch (layout) {
+    case LayoutType::NCHW:
+      return phi::DataLayout::NCHW;
+    case LayoutType::NHWC:
+      return phi::DataLayout::NHWC;
+    case LayoutType::ANY:
+      return phi::DataLayout::ANY;
+    default:
+      return phi::DataLayout::UNDEFINED;
+  }
+}
+
+LayoutType ConvertLayoutFromPhi(phi::DataLayout layout) {
+  switch (layout) {
+    case phi::DataLayout::NCHW:
+      return LayoutType::NCHW;
+    case phi::DataLayout::NHWC:
+      return LayoutType::NHWC;
+    case phi::DataLayout::ANY:
+      return LayoutType::ANY;
+    default:
+      return LayoutType::UNK;
+  }
+}
+
+phi::KernelKey ConvertPlaceToPhi(const Place& place) {
+  return phi::KernelKey(ConvertTargetToPhi(place.target),
+                        ConvertLayoutToPhi(place.layout),
+                        ConvertPrecisionToPhi(place.precision));
+}
+
+Place ConvertPlaceFromPhi(phi::TensorArgDef tensor_arg) {
+  return Place(ConvertTargetFromPhi(tensor_arg.backend),
+               ConvertPrecisionFromPhi(tensor_arg.dtype),
+               ConvertLayoutFromPhi(tensor_arg.layout));
+}
+
+}  // namespace infrt
diff --git a/paddle/infrt/dialect/phi/data_type.h b/paddle/infrt/dialect/phi/data_type.h
new file mode 100644
index 0000000000000000000000000000000000000000..bd258cb1038792e52667b0ef39c65b16c6210eb3
--- /dev/null
+++ b/paddle/infrt/dialect/phi/data_type.h
@@ -0,0 +1,38 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/infrt/dialect/infrt/common/types.h"
+#include "paddle/phi/common/backend.h"
+#include "paddle/phi/common/data_type.h"
+#include "paddle/phi/common/layout.h"
+#include "paddle/phi/common/place.h"
+#include "paddle/phi/core/kernel_factory.h"
+
+namespace infrt {
+
+phi::Backend ConvertTargetToPhi(TargetType target);
+TargetType ConvertTargetFromPhi(phi::Backend backend);
+
+phi::DataType ConvertPrecisionToPhi(PrecisionType precision);
+PrecisionType ConvertPrecisionFromPhi(phi::DataType datatype);
+
+phi::DataLayout ConvertLayoutToPhi(LayoutType layout);
+LayoutType ConvertLayoutFromPhi(phi::DataLayout layout);
+
+phi::KernelKey ConvertPlaceToPhi(const Place& place);
+Place ConvertPlaceFromPhi(phi::TensorArgDef tensor_arg);
+
+}  // namespace infrt
diff --git a/paddle/infrt/dialect/phi/ir/infrt_phi_base.td b/paddle/infrt/dialect/phi/ir/infrt_phi_base.td
index 907f912d9e638ba76e5010d5442381d1aa053bc2..5d7338ec4292ed49112c3cce45a30816e686886d 100644
--- a/paddle/infrt/dialect/phi/ir/infrt_phi_base.td
+++ b/paddle/infrt/dialect/phi/ir/infrt_phi_base.td
@@ -2,6 +2,8 @@
 #define PHI_BASE
 
 include "mlir/IR/OpBase.td"
+include "paddle/infrt/dialect/infrt/ir/infrt_base.td"
+include "mlir/Interfaces/InferTypeOpInterface.td"
 
 def PHI_Dialect : Dialect {
   let name = "phi";
@@ -11,27 +13,28 @@ def PHI_Dialect : Dialect {
   }];
 
   let cppNamespace = "::infrt::phi";
-}
-
-class AllocatorTypeOf<string place, list<Trait> traits=[]>:
-    TypeDef<PHI_Dialect, place # "Allocator", traits> {
-    let summary = !strconcat("!phi.allocator_", place, " type");
-}
-
-class ContextTypeOf<string place, list<Trait> traits=[]>:
-    TypeDef<PHI_Dialect, place # "Context", traits> {
-    let summary = !strconcat("!phi.context_", place, " type");
+  let useDefaultTypePrinterParser = 1;
 }
 
 def PhiOpTrait : NativeOpTrait<"PhiOpTrait">;
 
-def CPU_Allocator : AllocatorTypeOf<"CPU">;
-def GPU_Allocator : AllocatorTypeOf<"GPU">;
-
-def CPU_Context : ContextTypeOf<"CPU">;
-def GPU_Context : ContextTypeOf<"GPU">;
-
-def Allocator : AnyTypeOf<[CPU_Allocator, GPU_Allocator], "Allocator type">;
-def Context : AnyTypeOf<[CPU_Context, GPU_Context], "Context type">;
+class PHI_Type<string type, list<Trait> traits = []>
+   : TypeDef<PHI_Dialect, type, !listconcat(traits, [PhiOpTrait, IsolatedFromAbove])> {}
+
+def Allocator : PHI_Type<"Allocator"> {
+   let mnemonic = "allocator";
+   let parameters = (ins
+     TargetParam:$target
+   );
+   let assemblyFormat = "`<` $target `>`";
+ }
+
+ def Context : PHI_Type<"Context"> {
+   let mnemonic = "context";
+   let parameters = (ins
+     TargetParam:$target
+   );
+   let assemblyFormat = "`<` $target `>`";
+ }
 
 #endif
diff --git a/paddle/infrt/dialect/phi/ir/infrt_phi_kernel.td b/paddle/infrt/dialect/phi/ir/infrt_phi_kernel.td
index ee23470fc754a56ef323c167613f7f32982eedd8..d2ff7acfba8b26f5c0ca1ec459d3b5e2f7fb3d93 100644
--- a/paddle/infrt/dialect/phi/ir/infrt_phi_kernel.td
+++ b/paddle/infrt/dialect/phi/ir/infrt_phi_kernel.td
@@ -3,7 +3,7 @@
 
 include "mlir/Interfaces/SideEffectInterfaces.td"
 include "mlir/IR/OpBase.td"
-include "paddle/infrt/dialect/infrt_base.td"
+include "paddle/infrt/dialect/infrt/ir/infrt_base.td"
 include "paddle/infrt/dialect/phi/ir/infrt_phi_base.td"
 
 def PHI_CPUKernelDialect : Dialect {
diff --git a/paddle/infrt/dialect/phi/ir/infrt_phi_tensor.td b/paddle/infrt/dialect/phi/ir/infrt_phi_tensor.td
index 39677871ff8fedcb9268066d8b10b8cf823ed5f9..8c3a79498d74d3b80e1590bbc2c0530c7af6411e 100644
--- a/paddle/infrt/dialect/phi/ir/infrt_phi_tensor.td
+++ b/paddle/infrt/dialect/phi/ir/infrt_phi_tensor.td
@@ -5,7 +5,7 @@
 include "paddle/infrt/dialect/phi/ir/infrt_phi_base.td"
 include "mlir/Interfaces/SideEffectInterfaces.td"
 include "mlir/IR/OpBase.td"
-include "paddle/infrt/dialect/infrt_base.td"
+include "paddle/infrt/dialect/infrt/ir/infrt_base.td"
 
 def PHI_DenseTensorDialect : Dialect {
   let name = "phi_dt";
@@ -18,12 +18,13 @@ def PHI_DenseTensorDialect : Dialect {
 }
 
 // PHI DenseTensor related Op.
-class PDT_Op<string mnemonic, list<OpTrait> traits = []> : Op<PHI_DenseTensorDialect, mnemonic, !listconcat(traits, [PhiOpTrait, IsolatedFromAbove])> {
-}
+class PDT_Op<string mnemonic, list<OpTrait> traits = []> : Op<PHI_DenseTensorDialect,
+  mnemonic, !listconcat(traits, [PhiOpTrait, IsolatedFromAbove])> {}
 
-class CreateDenseTensorOp<string place, string dtype, string layout> 
-      : PDT_Op<"create_dense_tensor." # place # "." # dtype # "." # layout, [NoSideEffect]> {
-  let arguments = (ins CPU_Allocator:$allocator, I64ArrayAttr:$dims, I64ArrayAttr:$lod);
+class CreateDenseTensorOp 
+      : PDT_Op<"create_dense_tensor", [NoSideEffect]> {
+  let arguments = (ins Context:$context, I64ArrayAttr:$dims, 
+    LayoutAttr:$layout, I64ArrayAttr:$lod, PrecisionAttr:$precision);
   let results = (outs DenseTensor:$output);
 }
 
@@ -44,26 +45,19 @@ class PrintDenseTensorOp:
   let assemblyFormat = "`(` $input `:` type($input) `)` attr-dict";
 }
 
-class CreateCPUAllocatorOp
-      : PDT_Op<"create_allocator." # "cpu", [NoSideEffect]> {
+class CreateContextOp<string target>
+      : PDT_Op<"create_context." # target, [NoSideEffect]> {
   let arguments = (ins);
-  let results = (outs CPU_Allocator:$output);
-}
-
-class CreateCPUContextOp
-      : PDT_Op<"create_context." # "cpu", [NoSideEffect]> {
-  let arguments = (ins CPU_Allocator:$input);
-  let results = (outs CPU_Context:$output);
+  let results = (outs Context:$output);
 }
 
-def PDT_CreateDenseTensorOp_cpu_f32_nchw : CreateDenseTensorOp<"cpu", "f32", "nchw">;
+def PDT_CreateDenseTensorOp : CreateDenseTensorOp;
 def PDT_FillDenseTensorOp_f32 : FillDenseTensorOp<F32ArrayAttr, "f32">;
-def PDT_CreateAllocatorOp_cpu : CreateCPUAllocatorOp;
-def PDT_CreateContextOp_cpu : CreateCPUContextOp;
-def PDT_PrintDenseTensor_cpu : PrintDenseTensorOp;
+def PDT_CreateCPUContextOp : CreateContextOp<"cpu">;
+def PDT_PrintDenseTensor : PrintDenseTensorOp;
 
 def FakeKernelOp : PDT_Op<"fake_phi_kernel"> {
-  let arguments = (ins CPU_Context:$dev_ctx, DenseTensor:$x, DenseTensor:$y, BoolAttr:$transpose_x, BoolAttr:$transpose_y);
+  let arguments = (ins Context:$dev_ctx, DenseTensor:$x, DenseTensor:$y, BoolAttr:$transpose_x, BoolAttr:$transpose_y);
   let results = (outs DenseTensor:$output);
 }
 
diff --git a/paddle/infrt/dialect/phi/ir/phi_base.cc b/paddle/infrt/dialect/phi/ir/phi_base.cc
index 7a6b3f3f0a404043f49a6df3e5bdcb873dd442c9..d8095d7f3f13fcfbf9b2ccab6db182850633d632 100644
--- a/paddle/infrt/dialect/phi/ir/phi_base.cc
+++ b/paddle/infrt/dialect/phi/ir/phi_base.cc
@@ -14,6 +14,7 @@
 
 #include "paddle/infrt/dialect/phi/ir/phi_base.h"
 
+#include <llvm/include/llvm/ADT/TypeSwitch.h>
 #include <mlir/IR/Builders.h>
 #include <mlir/IR/Dialect.h>
 #include <mlir/IR/DialectImplementation.h>
@@ -27,27 +28,6 @@
 namespace infrt {
 namespace phi {
 
-void PHIDialect::printType(::mlir::Type type,
-                           mlir::DialectAsmPrinter& os) const {
-  if (type.isa<CPUAllocatorType>()) {
-    os << "CPU_Allocator";
-    return;
-  }
-  if (type.isa<GPUAllocatorType>()) {
-    os << "GPU_Allocator";
-    return;
-  }
-  if (type.isa<CPUContextType>()) {
-    os << "CPU_Context";
-    return;
-  }
-  if (type.isa<GPUContextType>()) {
-    os << "GPU_Context";
-    return;
-  }
-  llvm_unreachable("unexpected 'allocator/context' type kind");
-}
-
 void PHIDialect::initialize() {
   addOperations<
 #define GET_OP_LIST
@@ -59,24 +39,6 @@ void PHIDialect::initialize() {
       >();
 }
 
-mlir::Type PHIDialect::parseType(mlir::DialectAsmParser& parser) const {
-  llvm::StringRef keyword;
-  if (parser.parseKeyword(&keyword)) return mlir::Type();
-  if (keyword == "CPU_allocator") {
-    return CPUAllocatorType::get(parser.getContext());
-  } else if (keyword == "GPU_allocator") {
-    return GPUAllocatorType::get(parser.getContext());
-  } else if (keyword == "CPU_context") {
-    return CPUContextType::get(parser.getContext());
-  } else if (keyword == "GPU_context") {
-    return GPUContextType::get(parser.getContext());
-  } else {
-    llvm_unreachable("unexpected 'allocator/context' type kind");
-  }
-
-  return mlir::Type();
-}
-
 }  // namespace phi
 }  // namespace infrt
 
diff --git a/paddle/infrt/dialect/phi/ir/phi_base.h b/paddle/infrt/dialect/phi/ir/phi_base.h
index a08d8229fccf53225311b451e941f99e8a3d0e8a..64cd08cc05ed42fe8d53b8c5b8a5bc994bae8824 100644
--- a/paddle/infrt/dialect/phi/ir/phi_base.h
+++ b/paddle/infrt/dialect/phi/ir/phi_base.h
@@ -18,12 +18,10 @@
 #include <mlir/Interfaces/SideEffectInterfaces.h>
 
 #include <string>
+#include "paddle/infrt/dialect/infrt/common/types.h"
 
 #include "paddle/infrt/dialect/phi/ir/infrt_phi_baseDialect.h.inc"
 
-#define GET_TYPEDEF_CLASSES
-#include "paddle/infrt/dialect/phi/ir/infrt_phi_baseTypes.h.inc"
-
 #define GET_OP_CLASSES
 #include "paddle/infrt/dialect/phi/ir/infrt_phi_base.h.inc"
 
@@ -41,6 +39,9 @@ class PhiOpTrait : public OpTrait::TraitBase<ConcreteType, PhiOpTrait> {
 }  // namespace OpTrait
 }  // namespace mlir
 
+#define GET_TYPEDEF_CLASSES
+#include "paddle/infrt/dialect/phi/ir/infrt_phi_baseTypes.h.inc"
+
 namespace infrt {
 namespace phi {}  // namespace phi
 }  // namespace infrt
diff --git a/paddle/infrt/dialect/phi/ir/phi_kernels.h b/paddle/infrt/dialect/phi/ir/phi_kernels.h
index b84d1b2b7294baf789fe4e1f3911edede8172cf7..4f8b41852cc67e32c510c247e907092046731452 100644
--- a/paddle/infrt/dialect/phi/ir/phi_kernels.h
+++ b/paddle/infrt/dialect/phi/ir/phi_kernels.h
@@ -30,7 +30,7 @@
 #include <mlir/Interfaces/SideEffectInterfaces.h>
 
 #include "paddle/infrt/dialect/dense_tensor.h"
-#include "paddle/infrt/dialect/infrt/infrt_dialect.h"
+#include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h"
 #include "paddle/infrt/dialect/phi/ir/phi_base.h"
 
 #include "paddle/infrt/dialect/phi/ir/phi_cpu_kernelsDialect.h.inc"
diff --git a/paddle/infrt/dialect/phi/pass/kernel_op_desc.cc b/paddle/infrt/dialect/phi/pass/kernel_op_desc.cc
index 6c0f6df892100b5a4d3c429d1248eb7851db4609..353b1054e71374987207e1055289258915e0774d 100644
--- a/paddle/infrt/dialect/phi/pass/kernel_op_desc.cc
+++ b/paddle/infrt/dialect/phi/pass/kernel_op_desc.cc
@@ -14,129 +14,79 @@
 
 #include "paddle/infrt/dialect/phi/pass/kernel_op_desc.h"
 #include <glog/logging.h>
-#include "paddle/phi/core/kernel_factory.h"
-#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/infrt/dialect/phi/data_type.h"
+#include "paddle/phi/kernels/declarations.h"
+
 namespace infrt {
 
-phi::Backend cvtTarget2Phi(TargetType target) {
+std::string getPhiTargetPrefix(TargetType target) {
   switch (target) {
     case TargetType::CPU:
-      return phi::Backend::CPU;
+      return "phi_cpu.";
     case TargetType::GPU:
-      return phi::Backend::GPU;
-    default:
-      return phi::Backend::UNDEFINED;
-  }
-}
-
-TargetType cvtTargetFromPhi(phi::Backend backend) {
-  switch (backend) {
-    case phi::Backend::CPU:
-      return TargetType::CPU;
-    case phi::Backend::GPU:
-      return TargetType::GPU;
+      return "phi_gpu.";
     default:
-      return TargetType::UNK;
+      LOG(FATAL) << "UnSupported target type !";
+      return std::string();
   }
 }
-
-phi::DataType cvtPrecision2Phi(PrecisionType precision) {
-#define CONVERT_PRECISION_TO_PHI(Precision) \
-  case PrecisionType::Precision:            \
-    return phi::DataType::Precision;
-
+std::string getPhiPrecisionSuffix(PrecisionType precision) {
   switch (precision) {
-    CONVERT_PRECISION_TO_PHI(FLOAT32)
-    CONVERT_PRECISION_TO_PHI(FLOAT16)
-    CONVERT_PRECISION_TO_PHI(FLOAT64)
-    CONVERT_PRECISION_TO_PHI(UINT8)
-    CONVERT_PRECISION_TO_PHI(INT8)
-    CONVERT_PRECISION_TO_PHI(INT16)
-    CONVERT_PRECISION_TO_PHI(INT32)
-    CONVERT_PRECISION_TO_PHI(INT64)
-    CONVERT_PRECISION_TO_PHI(COMPLEX64)
-    CONVERT_PRECISION_TO_PHI(COMPLEX128)
-    CONVERT_PRECISION_TO_PHI(BOOL)
-    default:
-      return phi::DataType::UNDEFINED;
-  }
-#undef CONVERT_PRECISION_TO_PHI
-}
-
-PrecisionType cvtPrecisionFromPhi(phi::DataType datatype) {
-#define CONVERT_PRECISION_FROM_PHI(Precision) \
-  case phi::DataType::Precision:              \
-    return PrecisionType::Precision;
-
-  switch (datatype) {
-    CONVERT_PRECISION_FROM_PHI(FLOAT32)
-    CONVERT_PRECISION_FROM_PHI(FLOAT16)
-    CONVERT_PRECISION_FROM_PHI(FLOAT64)
-    CONVERT_PRECISION_FROM_PHI(UINT8)
-    CONVERT_PRECISION_FROM_PHI(INT8)
-    CONVERT_PRECISION_FROM_PHI(INT16)
-    CONVERT_PRECISION_FROM_PHI(INT32)
-    CONVERT_PRECISION_FROM_PHI(INT64)
-    CONVERT_PRECISION_FROM_PHI(COMPLEX64)
-    CONVERT_PRECISION_FROM_PHI(COMPLEX128)
-    CONVERT_PRECISION_FROM_PHI(BOOL)
+    case PrecisionType::FLOAT32:
+      return ".float32";
+    case PrecisionType::FLOAT16:
+      return ".float16";
+    case PrecisionType::FLOAT64:
+      return ".float64";
+    case PrecisionType::UINT8:
+      return ".uint8";
+    case PrecisionType::INT8:
+      return ".int8";
+    case PrecisionType::INT16:
+      return ".int16";
+    case PrecisionType::INT32:
+      return ".int32";
+    case PrecisionType::INT64:
+      return ".int64";
+    case PrecisionType::COMPLEX64:
+      return ".complex64";
+    case PrecisionType::COMPLEX128:
+      return ".complex128";
+    case PrecisionType::BOOL:
+      return ".bool";
     default:
-      return PrecisionType::UNK;
+      LOG(FATAL) << "UnSupported precision type !";
+      return std::string();
   }
-#undef CONVERT_PRECISION_FROM_PHI
 }
-
-phi::DataLayout cvtLayout2Phi(LayoutType layout) {
+std::string getPhiLayoutSuffix(LayoutType layout) {
   switch (layout) {
     case LayoutType::NCHW:
-      return phi::DataLayout::NCHW;
+      return ".nchw";
     case LayoutType::NHWC:
-      return phi::DataLayout::NHWC;
+      return ".nhwc";
     case LayoutType::ANY:
-      return phi::DataLayout::ANY;
+      return ".any";
     default:
-      return phi::DataLayout::UNDEFINED;
+      LOG(FATAL) << "UnSupported layout type !";
+      return std::string();
   }
 }
 
-LayoutType cvtLayoutFromPhi(phi::DataLayout layout) {
-  switch (layout) {
-    case phi::DataLayout::NCHW:
-      return LayoutType::NCHW;
-    case phi::DataLayout::NHWC:
-      return LayoutType::NHWC;
-    case phi::DataLayout::ANY:
-      return LayoutType::ANY;
-    default:
-      return LayoutType::UNK;
-  }
-}
-
-phi::KernelKey cvtPlace2Phi(const Place& place) {
-  return phi::KernelKey(cvtTarget2Phi(place.target),
-                        cvtLayout2Phi(place.layout),
-                        cvtPrecision2Phi(place.precision));
-}
-
-Place cvtPlaceFromPhi(phi::TensorArgDef tensor_arg) {
-  return Place(cvtTargetFromPhi(tensor_arg.backend),
-               cvtPrecisionFromPhi(tensor_arg.dtype),
-               cvtLayoutFromPhi(tensor_arg.layout));
-}
-
 std::vector<PhiKernelDesc> getCandidateKernels(
     std::string name, const std::vector<Place>& valid_palces) {
   std::vector<PhiKernelDesc> candidate_kernels;
   PhiKernelDesc phi_kernel_desc;
   phi::KernelKeyMap kernel_key_map =
       phi::KernelFactory::Instance().SelectKernelMap(name);
-  for (const Place& place : valid_palces) {
-    phi::KernelKey kernel_key = cvtPlace2Phi(place);
+  for (Place place : valid_palces) {
+    phi::KernelKey kernel_key = ConvertPlaceToPhi(place);
     if (kernel_key_map.find(kernel_key) == kernel_key_map.end()) {
       kernel_key = phi::KernelKey(kernel_key.backend(),
                                   phi::DataLayout::ALL_LAYOUT,
                                   kernel_key.dtype());
       if (kernel_key_map.find(kernel_key) == kernel_key_map.end()) continue;
+      place.layout = LayoutType::ANY;
     }
     phi_kernel_desc.kernelType = place;
     phi_kernel_desc.inputsType.clear();
@@ -147,10 +97,10 @@ std::vector<PhiKernelDesc> getCandidateKernels(
     const paddle::SmallVector<phi::TensorArgDef>& output_arg =
         args_def.output_defs();
     for (auto tensor_arg : input_arg) {
-      phi_kernel_desc.inputsType.emplace_back(cvtPlaceFromPhi(tensor_arg));
+      phi_kernel_desc.inputsType.emplace_back(ConvertPlaceFromPhi(tensor_arg));
     }
     for (auto tensor_arg : output_arg) {
-      phi_kernel_desc.outputsType.emplace_back(cvtPlaceFromPhi(tensor_arg));
+      phi_kernel_desc.outputsType.emplace_back(ConvertPlaceFromPhi(tensor_arg));
     }
     candidate_kernels.emplace_back(phi_kernel_desc);
   }
diff --git a/paddle/infrt/dialect/phi/pass/kernel_op_desc.h b/paddle/infrt/dialect/phi/pass/kernel_op_desc.h
index b74107f674e51f6ca09c864d197d9334a08666ac..b1f7c6c0811def9141e8012518fff5f504934149 100644
--- a/paddle/infrt/dialect/phi/pass/kernel_op_desc.h
+++ b/paddle/infrt/dialect/phi/pass/kernel_op_desc.h
@@ -16,7 +16,7 @@
 
 #include <string>
 #include <vector>
-#include "paddle/infrt/dialect/infrt/common_type.h"
+#include "paddle/infrt/dialect/infrt/common/types.h"
 
 namespace infrt {
 
@@ -26,6 +26,10 @@ struct PhiKernelDesc {
   Place kernelType;                // kernel place
 };
 
+std::string getPhiTargetPrefix(TargetType target);
+std::string getPhiPrecisionSuffix(PrecisionType precision);
+std::string getPhiLayoutSuffix(LayoutType layout);
+
 std::vector<PhiKernelDesc> getCandidateKernels(
     std::string name, const std::vector<Place>& valid_palces);
 
diff --git a/paddle/infrt/dialect/phi/pass/phi_op_cvt_pass.cc b/paddle/infrt/dialect/phi/pass/phi_op_cvt_pass.cc
index df3472aa01dfb8bfa0e7f6122410c1b4788cd359..485bf2a75d890aa0df5e888e7284ae3451aa514c 100644
--- a/paddle/infrt/dialect/phi/pass/phi_op_cvt_pass.cc
+++ b/paddle/infrt/dialect/phi/pass/phi_op_cvt_pass.cc
@@ -18,16 +18,35 @@
 #include <llvm/ADT/SetVector.h>
 #include <mlir/Analysis/SliceAnalysis.h>
 #include <mlir/IR/Builders.h>
+#include <mlir/IR/Operation.h>
+#include <mlir/IR/OperationSupport.h>
 #include <list>
 #include <unordered_set>
 #include <vector>
 
-#include "paddle/infrt/dialect/infrt/infrt_dialect.h"
+#include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h"
+#include "paddle/infrt/dialect/phi/ir/infrt_phi_tensor.h"
 #include "paddle/infrt/dialect/phi/pass/kernel_op_desc.h"
 #include "paddle/infrt/dialect/phi/pass/proto_arg_map_context.h"
 #include "paddle/phi/core/compat/op_utils.h"
 #include "paddle/phi/ops/compat/signatures.h"
-namespace infrt {
+
+namespace {
+class phiOpCvtPass
+    : public mlir::PassWrapper<phiOpCvtPass, mlir::FunctionPass> {
+ public:
+  ::llvm::StringRef getName() const override { return "phiOpCvtPass"; }
+  void runOnFunction() override;
+  explicit phiOpCvtPass(
+      std::vector<infrt::Place> valid_places = std::vector<infrt::Place>())
+      : valid_places_(valid_places) {}
+
+ private:
+  void convertStage();
+  void diapatchStage();
+  std::vector<infrt::Place> valid_places_;
+};
+
 // Implementation of the phiOpCvtPass.
 void phiOpCvtPass::runOnFunction() {
   convertStage();
@@ -58,9 +77,9 @@ void phiOpCvtPass::convertStage() {
       continue;
     }
 
-    phi::KernelSignature kernel_sign =
-        phi::OpUtilsMap::Instance().GetArgumentMappingFn(op_name)(
-            ProtoArgumentMappingContext(op));
+    ::phi::KernelSignature kernel_sign =
+        ::phi::OpUtilsMap::Instance().GetArgumentMappingFn(op_name)(
+            infrt::ProtoArgumentMappingContext(op));
     // resort input&output according to kernel_sign
     ::llvm::SmallVector<mlir::Value, 4> inputs, ori_output;
     ::llvm::SmallVector<mlir::Type, 4> output_types;
@@ -104,13 +123,90 @@ void phiOpCvtPass::diapatchStage() {
     infrt::KernelOp kernel_op = ::llvm::dyn_cast_or_null<infrt::KernelOp>(&op);
     if (nullptr != kernel_op) worklist.push_back(kernel_op);
   }
-  // ToDo: implementation in the next PR
-  while (!worklist.empty()) {
-    // infrt::KernelOp kernel_op = worklist.back();
-    worklist.pop_back();
-    // std::string kernel_name = kernel_op.name().str();
-    // std::vector<PhiKernelDesc> candidates =
-    //     getCandidateKernels(kernel_name, valid_places_);
+
+  mlir::OpBuilder builder(&block, block.begin());
+  std::map<infrt::TargetType, mlir::Value> phi_context;
+  for (infrt::KernelOp kernel_op : worklist) {
+    std::string kernel_name = kernel_op.name().str();
+    std::vector<infrt::PhiKernelDesc> candidates =
+        getCandidateKernels(kernel_name, valid_places_);
+    if (candidates.empty()) {
+      LOG(FATAL) << "No candidate kernels for op:" << kernel_name;
+      continue;
+    }
+    builder.setInsertionPoint(kernel_op);
+
+    // Todo: Implimentation the concrete pass pick strategy
+    const infrt::PhiKernelDesc &phi_kernel_desc = candidates.front();
+
+    kernel_name =
+        infrt::getPhiTargetPrefix(phi_kernel_desc.kernelType.target) +
+        kernel_name +
+        infrt::getPhiPrecisionSuffix(phi_kernel_desc.kernelType.precision) +
+        infrt::getPhiLayoutSuffix(phi_kernel_desc.kernelType.layout);
+
+    mlir::OperationName operation_name(kernel_name, kernel_op.getContext());
+    mlir::OperationState operation_state(kernel_op.getLoc(), operation_name);
+
+    if (phi_context.find(phi_kernel_desc.kernelType.target) ==
+        phi_context.end()) {
+      switch (phi_kernel_desc.kernelType.target) {
+        case infrt::TargetType::CPU: {
+          auto context_value =
+              builder
+                  .create<infrt::phi::CreateCPUContextOp>(
+                      kernel_op.getLoc(),
+                      infrt::phi::ContextType::get(kernel_op.getContext(),
+                                                   infrt::TargetType::CPU))
+                  .output();
+          phi_context[infrt::TargetType::CPU] = context_value;
+        } break;
+        case infrt::TargetType::GPU:
+        case infrt::TargetType::UNK:
+        default:
+          LOG(FATAL) << "Unsupported TargetType";
+          break;
+      }
+    }
+    operation_state.addOperands(
+        phi_context.at(phi_kernel_desc.kernelType.target));
+    for (size_t index = 0; index < phi_kernel_desc.inputsType.size(); ++index) {
+      mlir::Value input = kernel_op.getOperand(index);
+      auto cvt_tensor_type_op = builder.create<infrt::CvtTensorOp>(
+          kernel_op.getLoc(),
+          infrt::DenseTensorType::get(
+              kernel_op.getContext(),
+              phi_kernel_desc.inputsType[index].target,
+              phi_kernel_desc.inputsType[index].precision,
+              phi_kernel_desc.inputsType[index].layout),
+          input);
+      operation_state.addOperands(cvt_tensor_type_op.output());
+    }
+    for (size_t index = 0; index < phi_kernel_desc.outputsType.size();
+         ++index) {
+      operation_state.addTypes(infrt::DenseTensorType::get(
+          kernel_op.getContext(),
+          phi_kernel_desc.outputsType[index].target,
+          phi_kernel_desc.outputsType[index].precision,
+          phi_kernel_desc.outputsType[index].layout));
+    }
+    operation_state.addAttributes(kernel_op.attrsAttr().getValue());
+    mlir::Operation *phi_operation = builder.createOperation(operation_state);
+    for (size_t index = 0; index < phi_kernel_desc.outputsType.size();
+         ++index) {
+      mlir::Value input = phi_operation->getResult(index);
+      auto cvt_tensor_type_op = builder.create<infrt::CvtTensorOp>(
+          kernel_op.getLoc(), kernel_op.getResultTypes()[index], input);
+      kernel_op.getResult(index).replaceAllUsesWith(
+          cvt_tensor_type_op.output());
+    }
+    kernel_op.erase();
   }
 }
-}  // namespace infrt
+
+}  // namespace
+
+std::unique_ptr<mlir::Pass> infrt::createPhiOpCvtPass(
+    std::vector<Place> valid_places) {
+  return std::make_unique<phiOpCvtPass>(valid_places);
+}
diff --git a/paddle/infrt/dialect/phi/pass/phi_op_cvt_pass.h b/paddle/infrt/dialect/phi/pass/phi_op_cvt_pass.h
index 051fee9b61a24772ff2295280fa1b0a1588d7bae..8b1944042aa7c42fef87786af0d0fa131c6f0535 100644
--- a/paddle/infrt/dialect/phi/pass/phi_op_cvt_pass.h
+++ b/paddle/infrt/dialect/phi/pass/phi_op_cvt_pass.h
@@ -14,44 +14,14 @@
 
 #pragma once
 #include <mlir/Pass/Pass.h>
-#include "paddle/infrt/dialect/infrt/common_type.h"
+#include "paddle/infrt/dialect/infrt/common/types.h"
 
 namespace infrt {
 /*
  * phiOpCvtPass.
- *
- * Convert the general operators in pd Dialect to a infrt.kernelOp.
- *
- * source func:
- *
- * func @main() -> tensor<?xf32> {
- *  %a = "pd.feed"()...
- *  %c = "pd.conv2d"(%a) ...
- *  %d = "pd.conv3d"(%c) ...
- *  %f = "pd.conv2d"(%a) ...
- *  "pd.fetch" (%d, %f)
- * }
- *
- * destination func:
- * func @main() -> tensor<?xf32> {
- *  %a = "pd.feed"()...
- *  %c = "infrt.kernel"(%a){name = "conv2d"} ...
- *  %d = "infrt.kernel"(%c){name = "conv3d"}...
- *  %f = "infrt.kernel"(%a){name = "conv2d"}...
- *  "pd.fetch" (%d, %f)
- * }
+ * Convert the general operators from pd Dialect to phi dialect.
  */
-class phiOpCvtPass
-    : public mlir::PassWrapper<phiOpCvtPass, mlir::FunctionPass> {
- public:
-  ::llvm::StringRef getName() const override { return "phiOpCvtPass"; }
-  void runOnFunction() override;
-  explicit phiOpCvtPass(std::vector<Place> valid_places = std::vector<Place>())
-      : valid_places_(valid_places) {}
+std::unique_ptr<mlir::Pass> createPhiOpCvtPass(
+    std::vector<Place> valid_places = std::vector<Place>());
 
- private:
-  void convertStage();
-  void diapatchStage();
-  std::vector<Place> valid_places_;
-};
 }  // namespace infrt
diff --git a/paddle/infrt/dialect/phi/pass/proto_arg_map_context.h b/paddle/infrt/dialect/phi/pass/proto_arg_map_context.h
index ca8a22a7e75d33de6e9f510aea5aab8c24255c36..e4e9b5c3ff8a15dbe00dc1bd57fdce1a087437d8 100644
--- a/paddle/infrt/dialect/phi/pass/proto_arg_map_context.h
+++ b/paddle/infrt/dialect/phi/pass/proto_arg_map_context.h
@@ -20,7 +20,7 @@ limitations under the License. */
 #include "paddle/phi/core/compat/arg_map_context.h"
 
 namespace infrt {
-class ProtoArgumentMappingContext : public phi::ArgumentMappingContext {
+class ProtoArgumentMappingContext : public ::phi::ArgumentMappingContext {
  public:
   // only support op in pd dialect
   explicit ProtoArgumentMappingContext(mlir::Operation* op)
diff --git a/paddle/infrt/dialect/phi/phi_exec.cc b/paddle/infrt/dialect/phi/phi_exec.cc
index 4e99661a6a20590e7d36c1cf3a0e1e5d334b2464..a2808a00cb67da582ce4fc8b995772725d79e47e 100644
--- a/paddle/infrt/dialect/phi/phi_exec.cc
+++ b/paddle/infrt/dialect/phi/phi_exec.cc
@@ -1,4 +1,4 @@
-// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -11,37 +11,46 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#include <llvm/Support/CommandLine.h>
-#include <mlir/Pass/PassManager.h>
-#include <iostream>
-#include <string>
-#include "paddle/infrt/common/global.h"
-#include "paddle/infrt/dialect/mlir_loader.h"
-#include "paddle/infrt/dialect/phi/pass/phi_op_cvt_pass.h"
 
-int main(int argc, char** argv) {
-  static llvm::cl::opt<std::string> input_file(
-      llvm::cl::Positional,
-      llvm::cl::desc("Specify input filename"),
-      llvm::cl::init("-"));
-
-  llvm::cl::ParseCommandLineOptions(argc, argv);
+#include "paddle/infrt/host_context/paddle_mlir.h"
 
-  mlir::MLIRContext* context = infrt::Global::getMLIRContext();
-  auto module = infrt::dialect::LoadMlirFile(input_file.c_str(), context);
+void print_usage() {
+  std::cout << "Error inputs format, two kinds of inputs are supported:\n";
+  std::cout << "    [1] ./paddle-mlir-convert $path_to_model_file "
+               "$path_to_params_file\n";
+  std::cout << "    [2] ./paddle-mlir-convert $path_to_model_dir(__model__ + "
+               "params)\n";
+}
 
-  module->dump();
-  mlir::PassManager pm(context);
+bool parse_inputs(int argc,
+                  char** argv,
+                  std::string* model_file_name,
+                  std::string* params_file_name) {
+  switch (argc) {
+    case 1: {
+      print_usage();
+      return false;
+    }
+    case 2: {
+      *model_file_name = std::string(argv[1]) + std::string("/__model__");
+      *params_file_name = std::string(argv[1]) + std::string("/params");
+      return true;
+    }
+    case 3: {
+      *model_file_name = argv[1];
+      *params_file_name = argv[2];
+      return true;
+    }
+    default: { return false; }
+  }
+}
 
-  mlir::OpPassManager& phi_pass_manager = pm.nest<mlir::FuncOp>();
-  std::vector<infrt::Place> valid_places = {{infrt::TargetType::CPU,
-                                             infrt::PrecisionType::FLOAT32,
-                                             infrt::LayoutType::NCHW}};
-  phi_pass_manager.addPass(std::make_unique<infrt::phiOpCvtPass>(valid_places));
-  if (mlir::failed(pm.run(*module))) {
-    std::cout << "\npass failed!\n" << std::endl;
-    return 4;
+int main(int argc, char** argv) {
+  std::string model_file_name;
+  std::string params_file_name;
+  if (parse_inputs(argc, argv, &model_file_name, &params_file_name)) {
+    MLIRModelGenImpl myGen;
+    auto module_ = myGen.ImportPaddleModel(model_file_name, params_file_name);
+    module_.dump();
   }
-  module->dump();
-  return 0;
 }
diff --git a/paddle/infrt/dialect/phi/phi_ir_exec.cc b/paddle/infrt/dialect/phi/phi_ir_exec.cc
new file mode 100644
index 0000000000000000000000000000000000000000..de61dba8e744c88f279761520ac1815bb265d875
--- /dev/null
+++ b/paddle/infrt/dialect/phi/phi_ir_exec.cc
@@ -0,0 +1,49 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include <llvm/Support/CommandLine.h>
+#include <mlir/Pass/PassManager.h>
+#include <iostream>
+#include <string>
+#include "paddle/infrt/common/global.h"
+#include "paddle/infrt/dialect/infrt/pass/infrt_op_fuse_pass.h"
+#include "paddle/infrt/dialect/mlir_loader.h"
+#include "paddle/infrt/dialect/phi/pass/phi_op_cvt_pass.h"
+
+int main(int argc, char** argv) {
+  static llvm::cl::opt<std::string> input_file(
+      llvm::cl::Positional,
+      llvm::cl::desc("Specify input filename"),
+      llvm::cl::init("-"));
+
+  llvm::cl::ParseCommandLineOptions(argc, argv);
+
+  mlir::MLIRContext* context = infrt::Global::getMLIRContext();
+  auto module = infrt::dialect::LoadMlirFile(input_file.c_str(), context);
+  context->loadAllAvailableDialects();
+  module->dump();
+  mlir::PassManager pm(context);
+
+  mlir::OpPassManager& phi_pass_manager = pm.nest<mlir::FuncOp>();
+  std::vector<infrt::Place> valid_places = {{infrt::TargetType::CPU,
+                                             infrt::PrecisionType::FLOAT32,
+                                             infrt::LayoutType::NCHW}};
+  phi_pass_manager.addPass(infrt::createPhiOpCvtPass(valid_places));
+  phi_pass_manager.addPass(infrt::createInfrtOpFusePass());
+  if (mlir::failed(pm.run(*module))) {
+    std::cout << "\npass failed!\n" << std::endl;
+    return 4;
+  }
+  module->dump();
+  return 0;
+}
diff --git a/paddle/infrt/dialect/print_ir.cc b/paddle/infrt/dialect/print_ir.cc
index a37df265955e70cdf735f251bc8853c7ad4fe831..b118a5f7a9caf42f4aa63dd0222e7a2647addac5 100644
--- a/paddle/infrt/dialect/print_ir.cc
+++ b/paddle/infrt/dialect/print_ir.cc
@@ -31,7 +31,7 @@
 #include <iostream>
 
 #include "paddle/infrt/common/global.h"
-#include "paddle/infrt/dialect/init_infrt_dialects.h"
+#include "paddle/infrt/dialect/init_dialects.h"
 
 namespace cl = llvm::cl;
 
diff --git a/paddle/infrt/dialect/rewrite.td b/paddle/infrt/dialect/rewrite.td
index 5e228fed4d57eb283705c725797c42c5da133c3f..62e7471a390dfeee1a9ddfc15033e85db0adca2e 100644
--- a/paddle/infrt/dialect/rewrite.td
+++ b/paddle/infrt/dialect/rewrite.td
@@ -1,7 +1,7 @@
 #ifndef INFRT_REWRITE
 #define INFRT_REWRITE
 
-include "paddle/infrt/dialect/infrt_base.td"
+include "paddle/infrt/dialect/infrt/ir/infrt_base.td"
 include "mlir/Interfaces/SideEffectInterfaces.td"
 include "paddle/infrt/dialect/pd_ops.td"
 include "paddle/infrt/dialect/pd_extra_ops.td"
diff --git a/paddle/infrt/dialect/tensor_shape.td b/paddle/infrt/dialect/tensor_shape.td
index d3714c8ed14d3f1aea50ec4c55a9c4c2fb85e958..2be21d6aa772020519e3d909c9bdf7232f7ff985 100644
--- a/paddle/infrt/dialect/tensor_shape.td
+++ b/paddle/infrt/dialect/tensor_shape.td
@@ -2,7 +2,7 @@
 #else
 #define INFRT_OPS
 
-include "paddle/infrt/dialect/infrt_base.td"
+include "paddle/infrt/dialect/infrt/ir/infrt_base.td"
 include "paddle/infrt/dialect/tensor_shape_base.td"
 include "mlir/Interfaces/SideEffectInterfaces.td"
 
diff --git a/paddle/infrt/dialect/tensorrt/pd_lower_to_trt.td b/paddle/infrt/dialect/tensorrt/pd_lower_to_trt.td
index 701391a750354938efe3703ef8642b21f8a878ea..46c250b05492cefe61d8e677a352a217718189b8 100644
--- a/paddle/infrt/dialect/tensorrt/pd_lower_to_trt.td
+++ b/paddle/infrt/dialect/tensorrt/pd_lower_to_trt.td
@@ -2,12 +2,12 @@
 #define PD_LOWER_TO_TRT
 
 include "mlir/Interfaces/SideEffectInterfaces.td"
-include "paddle/infrt/dialect/infrt_base.td"
+include "paddle/infrt/dialect/infrt/ir/infrt_base.td"
 include "paddle/infrt/dialect/pd_ops.td"
 include "paddle/infrt/dialect/tensorrt/trt_ops.td"
 
 def PD2TRT_Matmul_Lower : Pat<
-        (PD_MatmulOp $X, $Y, $transpose_X, $transpose_Y, ConstantAttr<F32Attr, "1.0">, ConstantAttr<SI32Attr, "1">),
+        (PD_MatmulOp $X, $Y, $transpose_X, $transpose_Y, ConstantAttr<F32Attr, "1.0">),
         (TRT_MatrixMultiplyOp $X, $transpose_X, $Y, $transpose_Y)>;
 
 //TO DO(shangzhizhou):replace '"INFRT_createI32Attr<"0">' to enum nvinfer1::ElementWiseOperation::kSUM
diff --git a/paddle/infrt/dialect/tensorrt/trt_dialect_types.h b/paddle/infrt/dialect/tensorrt/trt_dialect_types.h
new file mode 100644
index 0000000000000000000000000000000000000000..0c3edcec1edb2d7774038e638f8c20f94a3e7166
--- /dev/null
+++ b/paddle/infrt/dialect/tensorrt/trt_dialect_types.h
@@ -0,0 +1,31 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "mlir/IR/Types.h"
+
+namespace infrt {
+namespace trt {
+
+class EngineType
+    : public mlir::Type::TypeBase<EngineType, mlir::Type, mlir::TypeStorage> {
+ public:
+  using Base::Base;
+  static EngineType get();
+  static EngineType get(mlir::MLIRContext *context);
+};
+
+}  // namespace trt
+}  // namespace infrt
diff --git a/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.cc b/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.cc
index fa0095363c5fd34778162fb4e3204450ef1e7815..ad6b136463a71dcc2fcd9ce2b4e2da6f68e88dd2 100644
--- a/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.cc
+++ b/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.cc
@@ -53,9 +53,9 @@ bool reverseDfs(std::vector<mlir::Operation *> source,
 }
 
 // merge the first&second graph op to a new graph op.
-void mergeTwoAdjacentCreateEngineOp(mlir::OpBuilder &builder,  // NOLINT
-                                    CreateEngineOp first,
-                                    CreateEngineOp second) {
+void mergeTwoAdjacentGraphOp(mlir::OpBuilder &builder,  // NOLINT
+                             mlir::pd::GraphOp first,
+                             mlir::pd::GraphOp second) {
   // comput inputs and outputs
   ::llvm::SmallVector<mlir::Value, 4> inputs(first.getOperands()), outputs;
   for (mlir::Value input : second.getOperands()) {
@@ -84,8 +84,7 @@ void mergeTwoAdjacentCreateEngineOp(mlir::OpBuilder &builder,  // NOLINT
   // create the new graph op
   builder.setInsertionPoint(first);
   auto loc = first.getLoc();
-  auto graph_op =
-      builder.create<CreateEngineOp>(loc, return_types, inputs, true);
+  auto graph_op = builder.create<mlir::pd::GraphOp>(loc, return_types, inputs);
   mlir::Block *block = new mlir::Block;
   auto copy_range = second.getBody()->without_terminator();
   block->getOperations().splice(block->begin(),
@@ -98,7 +97,7 @@ void mergeTwoAdjacentCreateEngineOp(mlir::OpBuilder &builder,  // NOLINT
                                 copy_range.begin(),
                                 copy_range.end());
   builder.setInsertionPointToEnd(block);
-  builder.create<::infrt::dialect::ReturnOp>(loc, outputs);
+  builder.create<::infrt::ReturnOp>(loc, outputs);
   graph_op.body().push_back(block);
 
   // mapping the output
@@ -150,12 +149,13 @@ void TRTGraphFusePass::runOnFunction() {
   do {
     changed = false;
     for (auto &op : body) {
-      CreateEngineOp graph_op = ::llvm::dyn_cast_or_null<CreateEngineOp>(&op);
+      mlir::pd::GraphOp graph_op =
+          ::llvm::dyn_cast_or_null<mlir::pd::GraphOp>(&op);
       if (nullptr == graph_op) continue;
 
       for (auto user_op : op.getUsers()) {
-        CreateEngineOp user_graph_op =
-            ::llvm::dyn_cast_or_null<CreateEngineOp>(user_op);
+        mlir::pd::GraphOp user_graph_op =
+            ::llvm::dyn_cast_or_null<mlir::pd::GraphOp>(user_op);
         if (nullptr == user_graph_op) continue;
         // get all dst input nodes except src.
         std::vector<mlir::Operation *> source_nodes;
@@ -168,7 +168,7 @@ void TRTGraphFusePass::runOnFunction() {
         // Reverse DFS from the source_nodes.
         if (!reverseDfs(source_nodes,
                         [&op](const mlir::Operation *n) { return n == &op; })) {
-          mergeTwoAdjacentCreateEngineOp(builder, graph_op, user_graph_op);
+          mergeTwoAdjacentGraphOp(builder, graph_op, user_graph_op);
           changed = true;
           break;
         }
diff --git a/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.h b/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.h
index 350add905aac75c0ba8527aa6e9bc1510fab876c..18afba19e06189294078bcfc1a0b2bb341eb7126 100644
--- a/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.h
+++ b/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.h
@@ -14,8 +14,6 @@
 
 #pragma once
 #include <mlir/Pass/Pass.h>
-#include "paddle/infrt/dialect/infrt_base.h"
-#include "paddle/infrt/dialect/tensorrt/trt_ops.h"
 
 namespace infrt {
 namespace trt {
@@ -26,40 +24,37 @@ namespace trt {
  *
  * source func:
  *
- * func @main() -> tensor<?xf32> {
- *  %a = "pd.feed"()...
- *  %c = "trt.create_engine"(%a) {
+ * func @main(%a : tensor<?xf32>) -> tensor<?xf32> {
+ *  %c = "pd.graph"(%a) {
  *     %m = "pd.conv2d"(%a)...
- *     "Infrt.return" %m
+ *     infrt.return %m...
  *  } ...
- *  %d = "trt.create_engine"(%c) {
+ *  %d = "pd.graph"(%c) {
  *      %m = "pd.conv3d"(%c)...
- *      "Infrt.return" %m
+ *      infrt.return %m...
  *  } ...
- *  %f = "trt.create_engine"(%a) {
+ *  %f = "pd.graph"(%a) {
  *      %m = "pd.conv2d"(%a)...
- *      "Infrt.return" %m
+ *      infrt.return %m...
  *  } ...
- *  "pd.fetch" %d, %f
+ *  infrt.return %d, %f :...
+ * }
  *
  * destination func:
- * func @main() -> tensor<?xf32> {
- *  %a = "pd.feed"()...
- *  %d, %f = "trt.create_engine"(%a) {
+ * func @main(%a : tensor<?xf32>) -> tensor<?xf32> {
+ *  %d, %f = "pd.graph"(%a) {
  *     %m = "pd.conv2d"(%a)...
  *     %n = "pd.conv3d"(%m)...
  *     %s = "pd.conv2d"(%a)...
- *     "Infrt.return" %n, %s
+ *     infrt.return %n, %s:...
  *  } ...
- *  "pd.fetch" %d, %f
+ *  infrt.return %d, %f:...
  * }
  */
 class TRTGraphFusePass
     : public mlir::PassWrapper<TRTGraphFusePass, mlir::FunctionPass> {
  public:
-  void getDependentDialects(mlir::DialectRegistry &registry) const override {
-    registry.insert<TensorRTDialect, ::infrt::dialect::INFRTDialect>();
-  }
+  void getDependentDialects(mlir::DialectRegistry &registry) const override {}
   ::llvm::StringRef getName() const override { return "trtGraphFusePass"; }
   void runOnFunction() override;
 };
diff --git a/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.cc b/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.cc
index 5ee7b23213a0106d3491712c37d34940f7c15c58..e3a7b455024c65d40ccbafb28fba9e9b0ead0369 100644
--- a/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.cc
+++ b/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.cc
@@ -16,23 +16,23 @@
 
 #include <mlir/IR/Builders.h>
 #include "paddle/infrt/dialect/pd_ops.h"
-#include "paddle/infrt/dialect/tensorrt/trt_ops.h"
 
 namespace infrt {
 namespace trt {
 // Implementation of the trtGraphSplitPass。
 void TRTGraphSplitPass::runOnFunction() {
-  std::vector<CreateEngineOp> worklist;
+  std::vector<mlir::pd::GraphOp> worklist;
   mlir::Block& block = getFunction().front();
   for (auto& op : block) {
-    CreateEngineOp graph_op = ::llvm::dyn_cast_or_null<CreateEngineOp>(&op);
+    mlir::pd::GraphOp graph_op =
+        ::llvm::dyn_cast_or_null<mlir::pd::GraphOp>(&op);
     if (nullptr != graph_op &&
         graph_op.getBody()->getOperations().size() <= min_subgraph_size_) {
       worklist.push_back(graph_op);
     }
   }
   while (!worklist.empty()) {
-    CreateEngineOp graph_op = worklist.back();
+    mlir::pd::GraphOp graph_op = worklist.back();
     worklist.pop_back();
     mlir::Block* body = graph_op.getBody();
     auto return_op = body->getTerminator();
diff --git a/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.h b/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.h
index 28078e2bc2dbff46d6a9eaf5522b949f68785898..a5dd4f14b2946fe232b7b725f6ace7caf74ff4d4 100644
--- a/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.h
+++ b/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.h
@@ -14,8 +14,6 @@
 
 #pragma once
 #include <mlir/Pass/Pass.h>
-#include "paddle/infrt/dialect/infrt_base.h"
-#include "paddle/infrt/dialect/tensorrt/trt_ops.h"
 
 namespace infrt {
 namespace trt {
@@ -27,33 +25,29 @@ namespace trt {
  *
  * source func:
  *
- * func @main() -> tensor<?xf32> {
- *  %a = "pd.feed"()...
- *  %d, %f = "trt.create_engine"(%a) {
+ * func @main(%a : tensor<?xf32>) -> tensor<?xf32> {
+ *  %d, %f = "pd.graph"(%a) {
  *     %m = "pd.conv2d"(%a)...
  *     %n = "pd.conv3d"(%m)...
  *     %s = "pd.conv2d"(%a)...
- *     "Infrt.return" (%n, %s)
+ *     infrt.return %n, %s : ...
  *  } ...
- *  "pd.fetch" (%d, %f)
+ *  infrt.return %d, %f : ...
  * }
  *
  * destination func:
- * func @main() -> tensor<?xf32> {
- *  %a = "pd.feed"()...
+ * func @main(%a : tensor<?xf32>) -> tensor<?xf32> {
  *  %c = "pd.conv2d"(%a) ...
  *  %d = "pd.conv3d"(%c) ...
  *  %f = "pd.conv2d"(%a) ...
- *  "pd.fetch" (%d, %f)
+ *  infrt.return %d, %f:...
  * }
  */
 class TRTGraphSplitPass
     : public mlir::PassWrapper<TRTGraphSplitPass, mlir::FunctionPass> {
  public:
   ::llvm::StringRef getName() const override { return "trtGraphSplitPass"; }
-  void getDependentDialects(mlir::DialectRegistry &registry) const override {
-    registry.insert<TensorRTDialect, ::infrt::dialect::INFRTDialect>();
-  }
+  void getDependentDialects(mlir::DialectRegistry &registry) const override {}
   void runOnFunction() override;
   explicit TRTGraphSplitPass(size_t min_subgraph_size = 3)
       : min_subgraph_size_(min_subgraph_size) {}
diff --git a/paddle/infrt/dialect/tensorrt/trt_op_base.td b/paddle/infrt/dialect/tensorrt/trt_op_base.td
index 5722f17d597870c585328f236f72f8896b72835e..128960ee03e03029ac3681b0184e4359c0436dde 100755
--- a/paddle/infrt/dialect/tensorrt/trt_op_base.td
+++ b/paddle/infrt/dialect/tensorrt/trt_op_base.td
@@ -27,6 +27,9 @@ class TRT_PaddleAttr <string name, string description> :
       Attr<CPred<"$_self.isa<mlir::trt::" # name # "Attr>()">,
           "PaddlePaddle " # description # " attribute">;
 
+def TRT_EngineType :
+      Type<CPred<"$_self.isa<::infrt::trt::EngineType>()">, "!trt.engine">,
+      BuildableType<"getType<::infrt::trt::EngineType>()">;
 
 //===----------------------------------------------------------------------===//
 // PaddlePaddle type definitions
diff --git a/paddle/infrt/dialect/tensorrt/trt_op_converter_pass.cc b/paddle/infrt/dialect/tensorrt/trt_op_converter_pass.cc
index 8d81e739d9c72ebcaa57b927a360864db59d7e97..83bebdb6bf19bdf8f75d11d693813b8169e297a0 100644
--- a/paddle/infrt/dialect/tensorrt/trt_op_converter_pass.cc
+++ b/paddle/infrt/dialect/tensorrt/trt_op_converter_pass.cc
@@ -14,14 +14,65 @@
 #include "paddle/infrt/dialect/tensorrt/trt_op_converter_pass.h"
 #include <mlir/IR/Builders.h>
 #include <mlir/Transforms/DialectConversion.h>
-#include "paddle/infrt/dialect/infrt_base.h"
 #include "paddle/infrt/dialect/pd_ops.h"
+#include "paddle/infrt/dialect/tensorrt/trt_dialect_types.h"
 
 namespace infrt {
 namespace trt {
 
 #include "paddle/infrt/dialect/tensorrt/pd_lower_to_trt.cpp.inc"  // NOLINT
 
+struct PD2TRT_GraphLower : public ::mlir::RewritePattern {
+  explicit PD2TRT_GraphLower(::mlir::MLIRContext *context)
+      : ::mlir::RewritePattern("pd.graph", 1, context, {"trt.create_engine"}) {}
+  ::mlir::LogicalResult matchAndRewrite(
+      ::mlir::Operation *op, ::mlir::PatternRewriter &rewriter) const override {
+    auto casted_op = ::llvm::dyn_cast<mlir::pd::GraphOp>(op);
+    ::mlir::Operation::operand_range inputs = casted_op.inputs();
+    auto ods_loc = rewriter.getFusedLoc(op->getLoc());
+    CreateEngineOp create_engine_op;
+    // inputs
+    ::mlir::SmallVector<::mlir::Value, 4> trt_inputs;
+    for (auto v : inputs) {
+      trt_inputs.push_back(v);
+    }
+    create_engine_op = rewriter.create<CreateEngineOp>(
+        ods_loc,
+        ::llvm::SmallVector<mlir::Type, 4>(1, EngineType::get()),
+        trt_inputs,
+        true /*run_once*/);
+    ::mlir::Block *block = new ::mlir::Block;
+    block->getOperations().splice(block->begin(),
+                                  casted_op.getBody()->getOperations(),
+                                  casted_op.getBody()->begin(),
+                                  casted_op.getBody()->end());
+    create_engine_op.body().push_back(block);
+
+    // trt.execute
+    // outputs
+    ::llvm::SmallVector<::mlir::Type, 4> execute_outputs_types;
+    for (auto v : casted_op.getODSResults(0)) {
+      execute_outputs_types.push_back(v.getType());
+    }
+    // inputs
+    ::mlir::SmallVector<::mlir::Value, 4> execute_inputs(
+        create_engine_op.getODSResults(0));
+    for (auto v : inputs) {
+      execute_inputs.push_back(v);
+    }
+    auto execute_op = rewriter.create<ExecuteOp>(
+        ods_loc, execute_outputs_types, execute_inputs);
+
+    ::llvm::SmallVector<::mlir::Value, 4> replace_values;
+    for (auto v :
+         ::llvm::SmallVector<::mlir::Value, 4>{execute_op.getODSResults(0)}) {
+      replace_values.push_back(v);
+    }
+    rewriter.replaceOp(op, replace_values);
+    return ::mlir::success();
+  }
+};
+
 void TRTOpConverterPass::runOnOperation() {
   // The first thing to define is the conversion target. This will define the
   // final target for this lowering.
@@ -36,6 +87,7 @@ void TRTOpConverterPass::runOnOperation() {
   // the set of patterns that will lower the TensorRT operations.
   ::mlir::RewritePatternSet patterns(&getContext());
   populateWithGenerated(patterns);
+  patterns.add<PD2TRT_GraphLower>(&getContext());
 
   // With the target and rewrite patterns defined, we can now attempt the
   // conversion. The conversion will signal failure if any of our `illegal`
diff --git a/paddle/infrt/dialect/tensorrt/trt_op_converter_pass.h b/paddle/infrt/dialect/tensorrt/trt_op_converter_pass.h
index a8128a585ee82dc60811c65b1105beb33e8c3b18..ede64f8bcd556a73b779fc3b772bf3fa8f74eaf9 100644
--- a/paddle/infrt/dialect/tensorrt/trt_op_converter_pass.h
+++ b/paddle/infrt/dialect/tensorrt/trt_op_converter_pass.h
@@ -15,6 +15,7 @@
 #pragma once
 #include "mlir/IR/Dialect.h"
 #include "mlir/Pass/Pass.h"
+#include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h"
 #include "paddle/infrt/dialect/tensorrt/trt_ops.h"
 
 namespace infrt {
@@ -23,27 +24,26 @@ namespace trt {
  * trtOpConverterPass.
  *
  * source ir:
- * func @main() -> tensor<?xf32> {
- *   %a = "pd.feed"()...
- *   %d, %f = "trt.create_engine"(%a) {
+ * func @main(%a : tensor<?xf32>) -> tensor<?xf32> {
+ *   %d, %f = "pd.graph"(%a) {
  *     %m = "pd.conv2d"(%a)...
  *     %n = "pd.conv3d"(%m)...
  *     %s = "pd.conv2d"(%a)...
- *     "Infrt.return" %n, %s
+ *     infrt.return %n, %s:...
  *   } ...
- *   "pd.fetch" %d, %f
+ *   infrt.return %d, %f:...
  * }
  *
  * destination ir:
- * func @main() -> tensor<?xf32> {
- *   %a = "pd.feed"()...
- *   %d, %f = "trt.create_engine"(%a) {
+ * func @main(%a : tensor<?xf32>) -> tensor<?xf32> {
+ *   %engine = "trt.create_engine"(%a) ({
  *     %m = "trt.Convolution"(%a)...
  *     %n = "trt.Convolution"(%m)...
  *     %s = "trt.Convolution"(%a)...
- *     "Infrt.return" %n, %s
- *   } ...
- *   "pd.fetch" %d, %f
+ *     infrt.return %n, %s :...
+ *   }){run_once = true} ...
+ *   %d, %f = "trt.execute"(%engine, %a)...
+ *   infrt.return %d, %f :...
  * }
  */
 struct TRTOpConverterPass
diff --git a/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.cc b/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.cc
index 17e893a383a9cd3f893e80181858dc3cc2b0552b..9f348b4122fc74033703c92459e6cfa5b3a1f3a2 100644
--- a/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.cc
+++ b/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.cc
@@ -15,7 +15,8 @@
 #include "paddle/infrt/dialect/tensorrt/trt_op_teller_pass.h"
 
 #include <mlir/IR/Builders.h>
-#include "paddle/infrt/dialect/basic_kernels.h"
+#include "paddle/infrt/dialect/infrt/ir/basic_kernels.h"
+#include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h"
 #include "paddle/infrt/dialect/pd_ops.h"
 
 namespace infrt {
@@ -37,11 +38,11 @@ void TRTOpTellerPass::runOnFunction() {
     if (::llvm::dyn_cast_or_null<mlir::pd::FeedOp>(op)) continue;
     if (::llvm::dyn_cast_or_null<mlir::pd::FetchOp>(op)) continue;
     if (::llvm::dyn_cast_or_null<mlir::pd::GraphOp>(op)) continue;
-    if (::llvm::dyn_cast_or_null<CreateEngineOp>(op)) continue;
+    if (::llvm::dyn_cast_or_null<::infrt::ReturnOp>(op)) continue;
     builder.setInsertionPoint(op);
     auto loc = getFunction().getLoc();
-    auto graph_op = builder.create<CreateEngineOp>(
-        loc, op->getResultTypes(), op->getOperands(), true);
+    auto graph_op = builder.create<mlir::pd::GraphOp>(
+        loc, op->getResultTypes(), op->getOperands());
 
     ::llvm::SmallVector<mlir::Value, 4> tblgen_repl_values;
     for (auto v :
@@ -54,7 +55,7 @@ void TRTOpTellerPass::runOnFunction() {
     graph_op.body().push_back(block);
     op->moveBefore(block, block->begin());
     builder.setInsertionPointToEnd(block);
-    builder.create<::infrt::dialect::ReturnOp>(loc, op->getResults());
+    builder.create<::infrt::ReturnOp>(loc, op->getResults());
   }
 }
 }  // namespace trt
diff --git a/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.h b/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.h
index 471eafa9f9ba33dad4182ba7da55a607c2bf8f0d..1cb08dc0a2161eeb5720191bada52f9b54e94893 100644
--- a/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.h
+++ b/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.h
@@ -14,8 +14,6 @@
 
 #pragma once
 #include <mlir/Pass/Pass.h>
-#include "paddle/infrt/dialect/infrt_base.h"
-#include "paddle/infrt/dialect/tensorrt/trt_ops.h"
 
 namespace infrt {
 namespace trt {
@@ -26,30 +24,28 @@ namespace trt {
  *
  * source func:
  *
- * func @main() -> tensor<?xf32> {
- *  %a = "pd.feed"()...
+ * func @main(%a : tensor<?xf32>) -> tensor<?xf32> {
  *  %c = "pd.conv2d"(%a) ...
  *  %d = "pd.conv3d"(%c) ...
  *  %f = "pd.conv2d"(%a) ...
- *  "pd.fetch" (%d, %f)
+ *  infrt.return %d, %f: ...
  * }
  *
  * destination func:
- * func @main() -> tensor<?xf32> {
- *  %a = "pd.feed"()...
- *  %c = "trt.create_engine"(%a) {
+ * func @main(%a : tensor<?xf32>) -> tensor<?xf32> {
+ *  %c = "pd.graph"(%a) {
  *     %m = "pd.conv2d"(%a)...
- *     "Infrt.return" (%m)
+ *     infrt.return %m:...
  *  } ...
- *  %d = "trt.create_engine"(%c) {
+ *  %d = "pd.graph"(%c) {
  *      %m = "pd.conv3d"(%c)...
- *      "Infrt.return" (%m)
+ *      infrt.return %m:...
  *  } ...
- *  %f = "trt.create_engine"(%a) {
+ *  %f = "pd.graph"(%a) {
  *      %m = "pd.conv2d"(%a)...
- *      "Infrt.return" (%m)
+ *      infrt.return %m:...
  *  } ...
- *  "pd.fetch" (%d, %f)
+ *  infrt.return %d, %f:...
  * }
  * TODO(winter-wang): Supplementary how to judge the operators can be supported
  * by tensorrt.
@@ -57,9 +53,7 @@ namespace trt {
 class TRTOpTellerPass
     : public mlir::PassWrapper<TRTOpTellerPass, mlir::FunctionPass> {
  public:
-  void getDependentDialects(mlir::DialectRegistry &registry) const override {
-    registry.insert<TensorRTDialect, ::infrt::dialect::INFRTDialect>();
-  }
+  void getDependentDialects(mlir::DialectRegistry &registry) const override {}
   ::llvm::StringRef getName() const override { return "trtOpTellerPass"; }
   void runOnFunction() override;
 };
diff --git a/paddle/infrt/dialect/tensorrt/trt_ops.cc b/paddle/infrt/dialect/tensorrt/trt_ops.cc
index 35b7967892cafcea66c382e5681ee43480b02735..d5222976625a2adece9a87c8952dba10137ae9ba 100644
--- a/paddle/infrt/dialect/tensorrt/trt_ops.cc
+++ b/paddle/infrt/dialect/tensorrt/trt_ops.cc
@@ -11,25 +11,58 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
-
 #include "paddle/infrt/dialect/tensorrt/trt_ops.h"
+#include <mlir/IR/DialectImplementation.h>
 #include <mlir/IR/Matchers.h>
 #include <mlir/IR/OpImplementation.h>
 #include <mlir/IR/PatternMatch.h>
 #include <mlir/Interfaces/CallInterfaces.h>
 #include <mlir/Interfaces/SideEffectInterfaces.h>
+#include "paddle/infrt/common/global.h"
+#include "paddle/infrt/dialect/tensorrt/trt_dialect_types.h"
 
 namespace infrt {
 namespace trt {
 
+EngineType EngineType::get() {
+  return Base::get(::infrt::Global::getMLIRContext());
+}
+
+EngineType EngineType::get(mlir::MLIRContext *context) {
+  return Base::get(context);
+}
+
 TensorRTDialect::TensorRTDialect(mlir::MLIRContext *context)
     : mlir::Dialect("trt", context, mlir::TypeID::get<TensorRTDialect>()) {
+  addTypes<EngineType>();
   addOperations<
 #define GET_OP_LIST
 #include "paddle/infrt/dialect/tensorrt/trt_ops.cpp.inc"  // NOLINT
       >();
 }
 
+mlir::Type TensorRTDialect::parseType(mlir::DialectAsmParser &parser) const {
+  llvm::StringRef keyword;
+  if (parser.parseKeyword(&keyword)) return mlir::Type();
+  // parse trt dilaect types, for example: !trt.engine
+  if (keyword == "engine") {
+    return infrt::trt::EngineType::get(getContext());
+  }
+  parser.emitError(parser.getCurrentLocation(), "unknown infrt::trt type: ")
+      << keyword;
+  return mlir::Type();
+}
+
+void TensorRTDialect::printType(mlir::Type type,
+                                mlir::DialectAsmPrinter &printer) const {
+  // print trt dilaect types, for example: !trt.engien
+  if (type.isa<infrt::trt::EngineType>()) {
+    printer << "engine";
+    return;
+  }
+  llvm_unreachable("unknown infrt::trt type.");
+}
+
 }  // namespace trt
 }  // namespace infrt
 
diff --git a/paddle/infrt/dialect/tensorrt/trt_ops.h b/paddle/infrt/dialect/tensorrt/trt_ops.h
index 95b2ed41fdfe9c5fdc7832dba46427528aee1332..78d960b5120454bdd01b779abedbe2f7ec0d5853 100644
--- a/paddle/infrt/dialect/tensorrt/trt_ops.h
+++ b/paddle/infrt/dialect/tensorrt/trt_ops.h
@@ -28,15 +28,20 @@
 #include <mlir/Interfaces/InferTypeOpInterface.h>
 #include <mlir/Interfaces/LoopLikeInterface.h>
 #include <mlir/Interfaces/SideEffectInterfaces.h>
-#include "paddle/infrt/dialect/basic_kernels.h"
+#include "paddle/infrt/dialect/infrt/ir/basic_kernels.h"
+#include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h"
+#include "paddle/infrt/dialect/pd_ops.h"
 
 namespace infrt {
 namespace trt {
 
 class TensorRTDialect : public mlir::Dialect {
  public:
-  explicit TensorRTDialect(mlir::MLIRContext* context);
+  explicit TensorRTDialect(mlir::MLIRContext *context);
   static llvm::StringRef getDialectNamespace() { return "trt"; }
+  mlir::Type parseType(mlir::DialectAsmParser &parser) const;  // NOLINT
+  void printType(mlir::Type type,
+                 mlir::DialectAsmPrinter &printer) const;  // NOLINT
 };
 
 }  // namespace trt
diff --git a/paddle/infrt/dialect/tensorrt/trt_ops.td b/paddle/infrt/dialect/tensorrt/trt_ops.td
index 31142a5157bfcd544128671fbdf22a993f1cc646..132a1d7805bdb85af8716e384ec29357a6ff68ad 100755
--- a/paddle/infrt/dialect/tensorrt/trt_ops.td
+++ b/paddle/infrt/dialect/tensorrt/trt_ops.td
@@ -7,14 +7,24 @@ include "mlir/Interfaces/CallInterfaces.td"
 include "mlir/IR/OpBase.td"
 include "paddle/infrt/dialect/tensorrt/trt_op_base.td"
 
-def TRT_CreateEngineOp : TRT_Op<"create_engine", [SingleBlockImplicitTerminator<"::infrt::dialect::ReturnOp">]> {
-  let summary = "trt Graph Op";
+
+def TRT_CreateEngineOp : TRT_Op<"create_engine", [SingleBlockImplicitTerminator<"::infrt::ReturnOp">]> {
+  let summary = "trt CreateEngine Op";
   let description = [{
     Describe a tensorrt subgraph.
   }];
   let regions = (region SizedRegion<1>:$body);
   let arguments = (ins Variadic<TRT_Tensor>:$inputs, DefaultValuedAttr<BoolAttr, "true">:$run_once);
-  let results = (outs Variadic<TRT_Tensor>:$outputs);
+  let results = (outs TRT_EngineType:$output);
+}
+
+def TRT_ExecuteOp : TRT_Op<"execute", [NoSideEffect]> {
+  let summary = "trt execute Op";
+  let description = [{
+    Describe a tensorrt runtime.
+  }];
+  let arguments = (ins TRT_EngineType:$engine, Variadic<TRT_Tensor>:$inputs);
+  let results = (outs Variadic<TRT_Tensor>:$output);
 }
 
 def TRT_ActivationOp : TRT_Op<"Activation", [NoSideEffect]> {
diff --git a/paddle/infrt/external_kernels/basic.mlir b/paddle/infrt/external_kernels/basic.mlir
index 1a7ea854c9ce469ee5719743287b4ee1b5de9286..843b12ced21a982b18b5a63f7bbef1d4d24eea16 100644
--- a/paddle/infrt/external_kernels/basic.mlir
+++ b/paddle/infrt/external_kernels/basic.mlir
@@ -1,7 +1,7 @@
 // CHECK: basic
 func @basic() -> f32 {
-  %v0 = Infrt.constant.f32 1.0
-  %v1 = Infrt.constant.f32 2.0
+  %v0 = infrt.constant.f32 1.0
+  %v1 = infrt.constant.f32 2.0
   %v2 = "external.add.f32"(%v0, %v1) : (f32, f32) -> f32
 
   // CHECK: 1
@@ -17,5 +17,5 @@ func @basic() -> f32 {
   // CHECK: 6
   "external.print.f32"(%v3) : (f32) -> ()
 
-  Infrt.return %v3 : f32
+  infrt.return %v3 : f32
 }
diff --git a/paddle/infrt/external_kernels/fc.mlir b/paddle/infrt/external_kernels/fc.mlir
index b0cabddc3ebc4a9ede73d506ac58acaa140f03d5..26b2d24cace70455d4a0e21dddf23c9bd628ae81 100644
--- a/paddle/infrt/external_kernels/fc.mlir
+++ b/paddle/infrt/external_kernels/fc.mlir
@@ -1,43 +1,43 @@
 // CHECK-LABEL: @fc
-func @fc(%input : !Infrt.tensor<X86, NCHW, F32>,
-         %w : !Infrt.tensor<X86, NCHW, F32>,
-         %bias : !Infrt.tensor<X86, NCHW, F32>) -> !Infrt.tensor<X86, NCHW, F32>
+func @fc(%input : !infrt.dense_tensor<CPU, FP32, NCHW>,
+         %w : !infrt.dense_tensor<CPU, FP32, NCHW>,
+         %bias : !infrt.dense_tensor<CPU, FP32, NCHW>) -> !infrt.dense_tensor<CPU, FP32, NCHW>
 {
-  %out = dt.create_uninit_tensor.f32 [30, 50] -> !Infrt.tensor<X86, NCHW, F32>
-  // dt.fill_tensor_with_constant.f32 (%out : !Infrt.tensor<X86, NCHW, F32>) {value=0.0:f32}
+  %out = dt.create_uninit_tensor.f32 [30, 50] -> !infrt.dense_tensor<CPU, FP32, NCHW>
+  // dt.fill_tensor_with_constant.f32 (%out : !infrt.dense_tensor<CPU, FP32, NCHW>) {value=0.0:f32}
 
   // fc1
-  "external.matmul"(%input, %w, %out) {}: (!Infrt.tensor<X86, NCHW, F32>, !Infrt.tensor<X86, NCHW, F32>, !Infrt.tensor<X86, NCHW, F32>) -> ()
-  "external.elementwise_add"(%out, %bias, %out) {axis = -1}: (!Infrt.tensor<X86, NCHW, F32>, !Infrt.tensor<X86, NCHW, F32>, !Infrt.tensor<X86, NCHW, F32>) -> ()
-  "external.sigmoid"(%out, %out) {}: (!Infrt.tensor<X86, NCHW, F32>, !Infrt.tensor<X86, NCHW, F32>) -> ()
+  "external.matmul"(%input, %w, %out) {}: (!infrt.dense_tensor<CPU, FP32, NCHW>, !infrt.dense_tensor<CPU, FP32, NCHW>, !infrt.dense_tensor<CPU, FP32, NCHW>) -> ()
+  "external.elementwise_add"(%out, %bias, %out) {axis = -1}: (!infrt.dense_tensor<CPU, FP32, NCHW>, !infrt.dense_tensor<CPU, FP32, NCHW>, !infrt.dense_tensor<CPU, FP32, NCHW>) -> ()
+  "external.sigmoid"(%out, %out) {}: (!infrt.dense_tensor<CPU, FP32, NCHW>, !infrt.dense_tensor<CPU, FP32, NCHW>) -> ()
 
   // fc2
-  "external.matmul"(%out, %w, %out) {}: (!Infrt.tensor<X86, NCHW, F32>, !Infrt.tensor<X86, NCHW, F32>, !Infrt.tensor<X86, NCHW, F32>) -> ()
-  "external.elementwise_add"(%out, %bias, %out) {axis = -1}: (!Infrt.tensor<X86, NCHW, F32>, !Infrt.tensor<X86, NCHW, F32>, !Infrt.tensor<X86, NCHW, F32>) -> ()
-  "external.sigmoid"(%out, %out) {}: (!Infrt.tensor<X86, NCHW, F32>, !Infrt.tensor<X86, NCHW, F32>) -> ()
+  "external.matmul"(%out, %w, %out) {}: (!infrt.dense_tensor<CPU, FP32, NCHW>, !infrt.dense_tensor<CPU, FP32, NCHW>, !infrt.dense_tensor<CPU, FP32, NCHW>) -> ()
+  "external.elementwise_add"(%out, %bias, %out) {axis = -1}: (!infrt.dense_tensor<CPU, FP32, NCHW>, !infrt.dense_tensor<CPU, FP32, NCHW>, !infrt.dense_tensor<CPU, FP32, NCHW>) -> ()
+  "external.sigmoid"(%out, %out) {}: (!infrt.dense_tensor<CPU, FP32, NCHW>, !infrt.dense_tensor<CPU, FP32, NCHW>) -> ()
 
-  Infrt.return %out : !Infrt.tensor<X86, NCHW, F32>
+  infrt.return %out : !infrt.dense_tensor<CPU, FP32, NCHW>
 }
 
 // CHECK-LABEL: @benchmark
 func @benchmark() {
-  %input = dt.create_uninit_tensor.f32 [30, 50] -> !Infrt.tensor<X86, NCHW, F32>
-  dt.fill_tensor_with_constant.f32 (%input : !Infrt.tensor<X86, NCHW, F32>) {value=1.0:f32}
+  %input = dt.create_uninit_tensor.f32 [30, 50] -> !infrt.dense_tensor<CPU, FP32, NCHW>
+  dt.fill_tensor_with_constant.f32 (%input : !infrt.dense_tensor<CPU, FP32, NCHW>) {value=1.0:f32}
 
-  %w = dt.create_uninit_tensor.f32 [50, 50] -> !Infrt.tensor<X86, NCHW, F32>
-  dt.fill_tensor_with_constant.f32 (%w : !Infrt.tensor<X86, NCHW, F32>) {value=2.0:f32}
+  %w = dt.create_uninit_tensor.f32 [50, 50] -> !infrt.dense_tensor<CPU, FP32, NCHW>
+  dt.fill_tensor_with_constant.f32 (%w : !infrt.dense_tensor<CPU, FP32, NCHW>) {value=2.0:f32}
 
-  %bias = dt.create_uninit_tensor.f32 [30, 50] -> !Infrt.tensor<X86, NCHW, F32>
-  dt.fill_tensor_with_constant.f32 (%bias : !Infrt.tensor<X86, NCHW, F32>) {value=3.0:f32}
+  %bias = dt.create_uninit_tensor.f32 [30, 50] -> !infrt.dense_tensor<CPU, FP32, NCHW>
+  dt.fill_tensor_with_constant.f32 (%bias : !infrt.dense_tensor<CPU, FP32, NCHW>) {value=3.0:f32}
 
-  Infrt.benchmark "add.f32"(
-          %input:!Infrt.tensor<X86, NCHW, F32>,
-          %w:!Infrt.tensor<X86, NCHW, F32>,
-          %bias:!Infrt.tensor<X86, NCHW, F32>)
+  infrt.benchmark "add.f32"(
+          %input:!infrt.dense_tensor<CPU, FP32, NCHW>,
+          %w:!infrt.dense_tensor<CPU, FP32, NCHW>,
+          %bias:!infrt.dense_tensor<CPU, FP32, NCHW>)
           duration_secs = 100, max_count = 300000, num_warmup_runs = 3
   {
-    %res = Infrt.call @fc(%input, %w, %bias) : (!Infrt.tensor<X86, NCHW, F32>, !Infrt.tensor<X86, NCHW, F32>, !Infrt.tensor<X86, NCHW, F32>) -> (!Infrt.tensor<X86, NCHW, F32>)
-    Infrt.return %res : !Infrt.tensor<X86, NCHW, F32>
+    %res = infrt.call @fc(%input, %w, %bias) : (!infrt.dense_tensor<CPU, FP32, NCHW>, !infrt.dense_tensor<CPU, FP32, NCHW>, !infrt.dense_tensor<CPU, FP32, NCHW>) -> (!infrt.dense_tensor<CPU, FP32, NCHW>)
+    infrt.return %res : !infrt.dense_tensor<CPU, FP32, NCHW>
   }
-  Infrt.return
+  infrt.return
 }
diff --git a/paddle/infrt/external_kernels/paddle.mlir b/paddle/infrt/external_kernels/paddle.mlir
index d55d9904b5bc4e43388abacf9e4b62bf06db458b..97781e5c8c5e544bba53b561f2adcae16263886f 100644
--- a/paddle/infrt/external_kernels/paddle.mlir
+++ b/paddle/infrt/external_kernels/paddle.mlir
@@ -1,50 +1,50 @@
 // CHECK: paddle_func
 func @paddle_func() -> () {
-  %input = dt.create_uninit_tensor.f32 [3, 5] -> !Infrt.tensor<X86, NCHW, F32>
-  dt.fill_tensor_with_constant.f32 (%input : !Infrt.tensor<X86, NCHW, F32>) {value=1.0:f32}
+  %input = dt.create_uninit_tensor.f32 [3, 5] -> !infrt.dense_tensor<CPU, FP32, NCHW>
+  dt.fill_tensor_with_constant.f32 (%input : !infrt.dense_tensor<CPU, FP32, NCHW>) {value=1.0:f32}
 
-  %w = dt.create_uninit_tensor.f32 [5, 4] -> !Infrt.tensor<X86, NCHW, F32>
-  dt.fill_tensor_with_constant.f32 (%w : !Infrt.tensor<X86, NCHW, F32>) {value=2.0:f32}
+  %w = dt.create_uninit_tensor.f32 [5, 4] -> !infrt.dense_tensor<CPU, FP32, NCHW>
+  dt.fill_tensor_with_constant.f32 (%w : !infrt.dense_tensor<CPU, FP32, NCHW>) {value=2.0:f32}
 
-  %bias = dt.create_uninit_tensor.f32 [4] -> !Infrt.tensor<X86, NCHW, F32>
-  dt.fill_tensor_with_constant.f32 (%bias : !Infrt.tensor<X86, NCHW, F32>) {value=3.0:f32}
+  %bias = dt.create_uninit_tensor.f32 [4] -> !infrt.dense_tensor<CPU, FP32, NCHW>
+  dt.fill_tensor_with_constant.f32 (%bias : !infrt.dense_tensor<CPU, FP32, NCHW>) {value=3.0:f32}
 
-  %out = dt.create_uninit_tensor.f32 [3, 4] -> !Infrt.tensor<X86, NCHW, F32>
-  dt.fill_tensor_with_constant.f32 (%out : !Infrt.tensor<X86, NCHW, F32>) {value=0.0:f32}
+  %out = dt.create_uninit_tensor.f32 [3, 4] -> !infrt.dense_tensor<CPU, FP32, NCHW>
+  dt.fill_tensor_with_constant.f32 (%out : !infrt.dense_tensor<CPU, FP32, NCHW>) {value=0.0:f32}
 
-  "external.fc2"(%input, %w, %bias, %out) {in_num_col_dims=3:i32, test_attr=5:i32}: (!Infrt.tensor<X86, NCHW, F32>, !Infrt.tensor<X86, NCHW, F32>, !Infrt.tensor<X86, NCHW, F32>, !Infrt.tensor<X86, NCHW, F32>) -> ()
+  "external.fc2"(%input, %w, %bias, %out) {in_num_col_dims=3:i32, test_attr=5:i32}: (!infrt.dense_tensor<CPU, FP32, NCHW>, !infrt.dense_tensor<CPU, FP32, NCHW>, !infrt.dense_tensor<CPU, FP32, NCHW>, !infrt.dense_tensor<CPU, FP32, NCHW>) -> ()
   // CHECK-LABEL: tensor: shape=shape[3,5], values=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
-  dt.print_tensor (%input : !Infrt.tensor<X86, NCHW, F32>)
+  dt.print_tensor (%input : !infrt.dense_tensor<CPU, FP32, NCHW>)
   // CHECK-LABEL: tensor: shape=shape[5,4], values=[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
-  dt.print_tensor (%w : !Infrt.tensor<X86, NCHW, F32>)
-  dt.print_tensor (%bias : !Infrt.tensor<X86, NCHW, F32>)
-  dt.print_tensor (%out : !Infrt.tensor<X86, NCHW, F32>)
+  dt.print_tensor (%w : !infrt.dense_tensor<CPU, FP32, NCHW>)
+  dt.print_tensor (%bias : !infrt.dense_tensor<CPU, FP32, NCHW>)
+  dt.print_tensor (%out : !infrt.dense_tensor<CPU, FP32, NCHW>)
 
   // test external.matmul
-  %out1 = dt.create_uninit_tensor.f32 [3, 4] -> !Infrt.tensor<X86, NCHW, F32>
-  dt.fill_tensor_with_constant.f32 (%out1 : !Infrt.tensor<X86, NCHW, F32>) {value=0.0:f32}
-  "external.matmul"(%input, %w, %out1) {}: (!Infrt.tensor<X86, NCHW, F32>, !Infrt.tensor<X86, NCHW, F32>, !Infrt.tensor<X86, NCHW, F32>) -> ()
-  dt.print_tensor (%out1 : !Infrt.tensor<X86, NCHW, F32>)
+  %out1 = dt.create_uninit_tensor.f32 [3, 4] -> !infrt.dense_tensor<CPU, FP32, NCHW>
+  dt.fill_tensor_with_constant.f32 (%out1 : !infrt.dense_tensor<CPU, FP32, NCHW>) {value=0.0:f32}
+  "external.matmul"(%input, %w, %out1) {}: (!infrt.dense_tensor<CPU, FP32, NCHW>, !infrt.dense_tensor<CPU, FP32, NCHW>, !infrt.dense_tensor<CPU, FP32, NCHW>) -> ()
+  dt.print_tensor (%out1 : !infrt.dense_tensor<CPU, FP32, NCHW>)
 
   // test external.elementwise_add
-  %out2 = dt.create_uninit_tensor.f32 [3, 4] -> !Infrt.tensor<X86, NCHW, F32>
-  dt.fill_tensor_with_constant.f32 (%out2 : !Infrt.tensor<X86, NCHW, F32>) {value=0.0:f32}
-  %bias1 = dt.create_uninit_tensor.f32 [3, 4] -> !Infrt.tensor<X86, NCHW, F32>
-  dt.fill_tensor_with_constant.f32 (%bias1 : !Infrt.tensor<X86, NCHW, F32>) {value=3.0:f32}
-  "external.elementwise_add"(%out1, %bias1, %out2) {axis=-1}: (!Infrt.tensor<X86, NCHW, F32>, !Infrt.tensor<X86, NCHW, F32>, !Infrt.tensor<X86, NCHW, F32>) -> ()
-  dt.print_tensor (%out2 : !Infrt.tensor<X86, NCHW, F32>)
+  %out2 = dt.create_uninit_tensor.f32 [3, 4] -> !infrt.dense_tensor<CPU, FP32, NCHW>
+  dt.fill_tensor_with_constant.f32 (%out2 : !infrt.dense_tensor<CPU, FP32, NCHW>) {value=0.0:f32}
+  %bias1 = dt.create_uninit_tensor.f32 [3, 4] -> !infrt.dense_tensor<CPU, FP32, NCHW>
+  dt.fill_tensor_with_constant.f32 (%bias1 : !infrt.dense_tensor<CPU, FP32, NCHW>) {value=3.0:f32}
+  "external.elementwise_add"(%out1, %bias1, %out2) {axis=-1}: (!infrt.dense_tensor<CPU, FP32, NCHW>, !infrt.dense_tensor<CPU, FP32, NCHW>, !infrt.dense_tensor<CPU, FP32, NCHW>) -> ()
+  dt.print_tensor (%out2 : !infrt.dense_tensor<CPU, FP32, NCHW>)
 
   // test external.relu
-  %out3 = dt.create_uninit_tensor.f32 [3, 4] -> !Infrt.tensor<X86, NCHW, F32>
-  dt.fill_tensor_with_constant.f32 (%out3 : !Infrt.tensor<X86, NCHW, F32>) {value=0.0:f32}
-  "external.relu"(%out1, %out3) {}: (!Infrt.tensor<X86, NCHW, F32>, !Infrt.tensor<X86, NCHW, F32>) -> ()
-  dt.print_tensor (%out3 : !Infrt.tensor<X86, NCHW, F32>)
+  %out3 = dt.create_uninit_tensor.f32 [3, 4] -> !infrt.dense_tensor<CPU, FP32, NCHW>
+  dt.fill_tensor_with_constant.f32 (%out3 : !infrt.dense_tensor<CPU, FP32, NCHW>) {value=0.0:f32}
+  "external.relu"(%out1, %out3) {}: (!infrt.dense_tensor<CPU, FP32, NCHW>, !infrt.dense_tensor<CPU, FP32, NCHW>) -> ()
+  dt.print_tensor (%out3 : !infrt.dense_tensor<CPU, FP32, NCHW>)
 
   // test external.sigmoid
-  %out4 = dt.create_uninit_tensor.f32 [3, 4] -> !Infrt.tensor<X86, NCHW, F32>
-  dt.fill_tensor_with_constant.f32 (%out4 : !Infrt.tensor<X86, NCHW, F32>) {value=0.0:f32}
-  "external.sigmoid"(%out1, %out4) {}: (!Infrt.tensor<X86, NCHW, F32>, !Infrt.tensor<X86, NCHW, F32>) -> ()
-  dt.print_tensor (%out4 : !Infrt.tensor<X86, NCHW, F32>)
+  %out4 = dt.create_uninit_tensor.f32 [3, 4] -> !infrt.dense_tensor<CPU, FP32, NCHW>
+  dt.fill_tensor_with_constant.f32 (%out4 : !infrt.dense_tensor<CPU, FP32, NCHW>) {value=0.0:f32}
+  "external.sigmoid"(%out1, %out4) {}: (!infrt.dense_tensor<CPU, FP32, NCHW>, !infrt.dense_tensor<CPU, FP32, NCHW>) -> ()
+  dt.print_tensor (%out4 : !infrt.dense_tensor<CPU, FP32, NCHW>)
 
-  Infrt.return
+  infrt.return
 }
diff --git a/paddle/infrt/host_context/CMakeLists.txt b/paddle/infrt/host_context/CMakeLists.txt
index 11304742ecd41345aa45bb4e35e064d9745bc42f..14cbea70ca8415a2c53b0bbfb76750d8ad7354eb 100644
--- a/paddle/infrt/host_context/CMakeLists.txt
+++ b/paddle/infrt/host_context/CMakeLists.txt
@@ -12,6 +12,7 @@ gather_srcs(infrt_src SRCS
     function.cc
     mlir_function_executable.cc
     mlir_program_executor.cc
+    paddle_mlir.cc
     )
 
 cc_test_tiny(test_infrt_host_context_value SRCS value_test.cc DEPS infrt ${MLIR_IR_LIBS})
@@ -21,7 +22,7 @@ cc_test_tiny(test_infrt_op_executable SRCS op_executable_test.cc DEPS infrt ${ML
 cc_test_tiny(test_infrt_core_runtime SRCS core_runtime_test.cc DEPS infrt ${MLIR_IR_LIBS})
 cc_test_tiny(test_infrt_mlir_to_runtime_translate SRCS mlir_to_runtime_translate_test.cc DEPS infrt ${MLIR_IR_LIBS})
 
-add_executable(paddle-mlir-convert paddle_mlir.cc paddle_mlir_converter.cc)
+add_executable(paddle-mlir-convert paddle_mlir_converter.cc)
 target_link_libraries(paddle-mlir-convert infrt ${MLIR_IR_LIBS})
 add_executable(infrtexec mlir_exec.cc)
 target_link_libraries(infrtexec infrt ${MLIR_IR_LIBS})
diff --git a/paddle/infrt/host_context/kernel_frame.cc b/paddle/infrt/host_context/kernel_frame.cc
index 14e88be4b96bb58df87db3191db8bae444c4cc3d..266c145f47839afb31d708d0863e2e90905253ee 100644
--- a/paddle/infrt/host_context/kernel_frame.cc
+++ b/paddle/infrt/host_context/kernel_frame.cc
@@ -30,28 +30,21 @@ std::ostream& operator<<(std::ostream& os, const KernelFrame& frame) {
 std::string KernelFrame::DumpArgTypes() const {
   std::stringstream ss;
   for (auto* value : GetValues(0, GetNumElements())) {
-    if (value->is_type<bool>()) {
-      ss << "bool (" << &value->get<bool>() << "), ";
-    } else if (value->is_type<tensor::DenseHostTensor>()) {
-      ss << "DenseHostTensor(" << &value->get<tensor::DenseHostTensor>()
-         << "), ";
-    } else if (value->is_type<float>()) {
-      ss << "float(" << &value->get<float>() << "), ";
-    } else if (value->is_type<int>()) {
-      ss << "int(" << &value->get<int>() << "), ";
-    } else if (value->is_type<phi::DenseTensor>()) {
-      ss << "phi::DenseTensor(" << &value->get<phi::DenseTensor>() << "), ";
-    } else if (value->is_type<phi::MetaTensor>()) {
-      ss << "phi::MetaTensor(" << &value->get<phi::MetaTensor>() << "), ";
-    } else if (value->is_type<::phi::CPUContext>()) {
-      ss << "phi::CPUContext(" << &value->get<::phi::CPUContext>() << "), ";
-    } else if (value->is_type<host_context::None>()) {
-      ss << "none(" << &value->get<host_context::None>() << "), ";
-    } else if (value->is_type<backends::CpuPhiContext>()) {
-      ss << "CpuPhiContext(" << &value->get<backends::CpuPhiContext>() << "), ";
-    } else {
-      ss << "typeid: " << value->index() << ", ";
-    }
+#define DUMP(type_name)                                    \
+  if (value->is_type<type_name>()) {                       \
+    ss << #type_name << &value->get<type_name>() << "), "; \
+  }
+    DUMP(bool);
+    DUMP(tensor::DenseHostTensor);
+    DUMP(float);
+    DUMP(int);
+    DUMP(::phi::DenseTensor);
+    DUMP(::phi::MetaTensor);
+    DUMP(::phi::CPUContext);
+    DUMP(host_context::None);
+    DUMP(backends::CpuPhiContext);
+#undef DUMP
+    ss << "typeid: " << value->index() << ", ";
   }
   return ss.str();
 }
diff --git a/paddle/infrt/host_context/kernel_registry.cc b/paddle/infrt/host_context/kernel_registry.cc
index f343dfc71b040e77308b30c2963fb4014221e29c..4209b2a9648d8be0a9a3897c27c7a35113cba424 100644
--- a/paddle/infrt/host_context/kernel_registry.cc
+++ b/paddle/infrt/host_context/kernel_registry.cc
@@ -23,8 +23,9 @@ namespace infrt {
 namespace host_context {
 
 struct KernelRegistry::Impl {
-  std::unordered_map<std::string, KernelImplementation> data;
-  std::unordered_map<std::string, llvm::SmallVector<std::string, 4>> attr_names;
+  std::unordered_map<std::string,
+                     std::pair<KernelImplementation, std::vector<const char *>>>
+      data;
 };
 
 KernelRegistry::KernelRegistry() : impl_(std::make_unique<Impl>()) {}
@@ -33,20 +34,29 @@ void KernelRegistry::AddKernel(const std::string &key,
                                KernelImplementation fn) {
   CHECK(!impl_->data.count(key)) << "kernel [" << key
                                  << "] is registered twice";
-  impl_->data.emplace(key, fn);
+  impl_->data.emplace(
+      key, std::make_pair(std::move(fn), std::vector<const char *>{}));
 }
 
-void KernelRegistry::AddKernelAttrNameList(
-    const std::string &key, const std::vector<std::string> &names) {
-  CHECK(!impl_->attr_names.count(key))
-      << "kernel [" << key << "] is registered twice in attribute names";
-  impl_->attr_names.emplace(
-      key, llvm::SmallVector<std::string, 4>(names.begin(), names.end()));
+const std::vector<const char *> &KernelRegistry::GetAttrNameList(
+    const std::string &key) const {
+  CHECK(impl_->data.count(key));
+  return impl_->data[key].second;
+}
+
+void KernelRegistry::AddKernelWithAttrs(
+    const std::string &key,
+    KernelImplementation fn,
+    std::vector<const char *> &&attr_order) {
+  CHECK(!impl_->data.count(key)) << "kernel [" << key
+                                 << "] is registered twice";
+  impl_->data.emplace(key,
+                      std::make_pair(std::move(fn), std::move(attr_order)));
 }
 
 KernelImplementation KernelRegistry::GetKernel(const std::string &key) const {
   auto it = impl_->data.find(key);
-  return it != impl_->data.end() ? it->second : KernelImplementation{};
+  return it != impl_->data.end() ? it->second.first : KernelImplementation{};
 }
 
 std::vector<std::string> KernelRegistry::GetKernelList() const {
diff --git a/paddle/infrt/host_context/kernel_registry.h b/paddle/infrt/host_context/kernel_registry.h
index a813f690efb0b3d36b7575d0889652f0868a2d85..a146b2b3c4c1e1090b5ac1843466b93a31b0bb0b 100644
--- a/paddle/infrt/host_context/kernel_registry.h
+++ b/paddle/infrt/host_context/kernel_registry.h
@@ -34,10 +34,14 @@ class KernelRegistry {
   KernelRegistry();
 
   void AddKernel(const std::string &key, KernelImplementation fn);
-  void AddKernelAttrNameList(const std::string &key,
-                             const std::vector<std::string> &names);
+  void AddKernelWithAttrs(const std::string &key,
+                          KernelImplementation fn,
+                          std::vector<const char *> &&attrs_order);
 
   KernelImplementation GetKernel(const std::string &key) const;
+  const std::vector<const char *> &GetAttrNameList(
+      const std::string &key) const;
+
   std::vector<std::string> GetKernelList() const;
 
   size_t size() const;
diff --git a/paddle/infrt/host_context/mlir_exec.cc b/paddle/infrt/host_context/mlir_exec.cc
index 7823681079f672b61a53bd1c1a02e1a209ddaffe..1506282f6268191a2eece5540d30fbe90d8eeb52 100644
--- a/paddle/infrt/host_context/mlir_exec.cc
+++ b/paddle/infrt/host_context/mlir_exec.cc
@@ -13,7 +13,7 @@
 // limitations under the License.
 
 #include <llvm/Support/CommandLine.h>
-
+#include <mlir/Pass/PassManager.h>
 #include <iostream>
 #include <string>
 
@@ -29,6 +29,8 @@
 #include "paddle/infrt/kernel/tensor_shape_kernels.h"
 #include "paddle/infrt/kernel/test_kernels.h"
 #ifdef INFRT_WITH_PHI
+#include "paddle/infrt/dialect/infrt/pass/infrt_op_fuse_pass.h"
+#include "paddle/infrt/dialect/phi/pass/phi_op_cvt_pass.h"
 #include "paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launchers.h"
 #include "paddle/infrt/kernel/phi/registry.h"
 #endif
@@ -81,6 +83,24 @@ int main(int argc, char** argv) {
     }
   }
 
+  context->loadAllAvailableDialects();
+  mlir::PassManager pm(context);
+
+#ifdef INFRT_WITH_PHI
+  mlir::OpPassManager& phi_pass_manager = pm.nest<mlir::FuncOp>();
+
+  std::vector<infrt::Place> valid_places = {{infrt::TargetType::CPU,
+                                             infrt::PrecisionType::FLOAT32,
+                                             infrt::LayoutType::NCHW}};
+  phi_pass_manager.addPass(infrt::createPhiOpCvtPass(valid_places));
+  phi_pass_manager.addPass(infrt::createInfrtOpFusePass());
+#endif
+
+  if (mlir::failed(pm.run(*module))) {
+    std::cout << "\npass failed!\n" << std::endl;
+    return 4;
+  }
+
   host_context::TestMlir(module.get(), &registry);
 
   std::cout << std::endl;
diff --git a/paddle/infrt/host_context/mlir_function_executable.cc b/paddle/infrt/host_context/mlir_function_executable.cc
index 47ec27ebec300f1cedd57b11e0dd1e6b37611141..ec8d43f99bae770f28cbf1b1bdc269536b4e7100 100644
--- a/paddle/infrt/host_context/mlir_function_executable.cc
+++ b/paddle/infrt/host_context/mlir_function_executable.cc
@@ -43,6 +43,7 @@ MlirFunctionExecutable::MlirFunctionExecutable(
                func_op.getNumResults()),
       MlirToRuntimeTranslator(&core_runtime_builder_),
       region_(&func_op.getRegion()),
+      kernel_registry_(kernel_registry),
       core_runtime_builder_(kernel_registry),
       function_table_(function_table) {}
 
@@ -54,6 +55,7 @@ MlirFunctionExecutable::MlirFunctionExecutable(
     : Function("", func_type.getNumInputs(), func_type.getNumResults()),
       MlirToRuntimeTranslator(&core_runtime_builder_),
       region_(region),
+      kernel_registry_(kernel_registry),
       core_runtime_builder_(kernel_registry),
       function_table_(function_table) {}
 
@@ -90,7 +92,7 @@ void MlirFunctionExecutable::BuildExecutables(
 
     if (EmitCallOp(&op, &function_table_)) continue;
 
-    if (EmitGeneralOp(&op)) continue;
+    if (EmitGeneralOp(&op, *kernel_registry_)) continue;
     LOG(FATAL) << "Not supported op: " << DumpToString(op);
   }
 
diff --git a/paddle/infrt/host_context/mlir_function_executable.h b/paddle/infrt/host_context/mlir_function_executable.h
index a6428df86e6b27061d92856970682bc29499d825..cd9161d01bbf648c344ec2a82747d997b810856a 100644
--- a/paddle/infrt/host_context/mlir_function_executable.h
+++ b/paddle/infrt/host_context/mlir_function_executable.h
@@ -70,6 +70,7 @@ class MlirFunctionExecutable : public Function, public MlirToRuntimeTranslator {
 
  private:
   mlir::Region* region_{};
+  KernelRegistry* kernel_registry_{};
   CoreRuntimeBuilder core_runtime_builder_;
   MlirToRuntimeTranslator::function_defs_t& function_table_;
   std::function<void()> copy_res_fn_;
diff --git a/paddle/infrt/host_context/mlir_tests/basic.mlir b/paddle/infrt/host_context/mlir_tests/basic.mlir
index 1b55b408f2b082c09d06d51037e8c9d967a171f4..263d5884134b143aa8d3403c5cd05672df39636f 100644
--- a/paddle/infrt/host_context/mlir_tests/basic.mlir
+++ b/paddle/infrt/host_context/mlir_tests/basic.mlir
@@ -1,30 +1,30 @@
 // CHECK-LABEL: basic
 func @basic() -> f32 {
-  %v0 = Infrt.constant.f32 1.0
-  %v1 = Infrt.constant.f32 2.0
-  %v2 = "Infrt.add.f32"(%v0, %v1) : (f32, f32) -> f32
+  %v0 = infrt.constant.f32 1.0
+  %v1 = infrt.constant.f32 2.0
+  %v2 = "infrt.add.f32"(%v0, %v1) : (f32, f32) -> f32
 
   // CHECK: 1
-  "Infrt.print.f32"(%v0) : (f32) -> ()
+  "infrt.print.f32"(%v0) : (f32) -> ()
   // CHECK: 2
-  "Infrt.print.f32"(%v1) : (f32) -> ()
+  "infrt.print.f32"(%v1) : (f32) -> ()
 
   // CHECK: 3
-  "Infrt.print.f32"(%v2) : (f32) -> ()
+  "infrt.print.f32"(%v2) : (f32) -> ()
 
-  %v3 = "Infrt.mul.f32"(%v2, %v1) : (f32, f32) -> f32
+  %v3 = "infrt.mul.f32"(%v2, %v1) : (f32, f32) -> f32
 
   // CHECK: 6
-  "Infrt.print.f32"(%v3) : (f32) -> ()
+  "infrt.print.f32"(%v3) : (f32) -> ()
 
-  Infrt.return %v3 : f32
+  infrt.return %v3 : f32
 }
 
 // CHECK-LABEL: basic1
 // Check the mlir executor can work with more than one function in a file.
 func @basic1() -> () {
-  %v0 = Infrt.constant.f32 1.0
-  "Infrt.print.f32"(%v0) : (f32) -> ()
+  %v0 = infrt.constant.f32 1.0
+  "infrt.print.f32"(%v0) : (f32) -> ()
   // CHECK: 1
-  Infrt.return
+  infrt.return
 }
\ No newline at end of file
diff --git a/paddle/infrt/host_context/mlir_tests/dense_tensor.mlir b/paddle/infrt/host_context/mlir_tests/dense_tensor.mlir
index 5a973a3eb23e6015ede2d69d83ab8c26de669908..1a7fa28f1e58bd400671099f5af7bedbb3c04e4d 100644
--- a/paddle/infrt/host_context/mlir_tests/dense_tensor.mlir
+++ b/paddle/infrt/host_context/mlir_tests/dense_tensor.mlir
@@ -1,9 +1,9 @@
 // CHECK-LABEL: build_tensor1
 func @build_tensor1() {
-  %a = dt.create_uninit_tensor.f32 [3, 4] -> !Infrt.tensor<X86, NCHW, F32>
-  dt.fill_tensor_with_constant.f32 (%a : !Infrt.tensor<X86, NCHW, F32>) {value=1.0:f32}
+  %a = dt.create_uninit_tensor.f32 [3, 4] -> !infrt.dense_tensor<CPU, FP32, NCHW>
+  dt.fill_tensor_with_constant.f32 (%a : !infrt.dense_tensor<CPU, FP32, NCHW>) {value=1.0:f32}
   // CHECK: tensor: shape=shape[3,4], values=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
-  dt.print_tensor (%a : !Infrt.tensor<X86, NCHW, F32>)
+  dt.print_tensor (%a : !infrt.dense_tensor<CPU, FP32, NCHW>)
 
-  Infrt.return
+  infrt.return
 }
diff --git a/paddle/infrt/host_context/mlir_tests/shape.mlir b/paddle/infrt/host_context/mlir_tests/shape.mlir
index 22df1c8010d8dbd6a4b8e332e01602b4421ebcdd..691ce62cbf82ad4dc0d3b0199a9c1d1127213de5 100644
--- a/paddle/infrt/host_context/mlir_tests/shape.mlir
+++ b/paddle/infrt/host_context/mlir_tests/shape.mlir
@@ -3,5 +3,5 @@ func @build_tensor1() {
   %a = ts.build_shape [1:i64, 57:i64, 92:i64]
   // CHECK: shape[1,57,92]
   ts.print_shape %a
-  Infrt.return
-}
\ No newline at end of file
+  infrt.return
+}
diff --git a/paddle/infrt/host_context/mlir_to_runtime_translate.cc b/paddle/infrt/host_context/mlir_to_runtime_translate.cc
index 17e6f7cb563d25186f9a76de8fe67af2ddb90e7b..c613843cd1779599fbac5aea6042b26b151534e8 100644
--- a/paddle/infrt/host_context/mlir_to_runtime_translate.cc
+++ b/paddle/infrt/host_context/mlir_to_runtime_translate.cc
@@ -75,7 +75,7 @@ struct MlirToRuntimeTranslator::Impl {
 };
 
 bool MlirToRuntimeTranslator::EmitConstantOp(mlir::Operation* op) {
-  if (!infrt::Startswith(op->getName().getStringRef().str(), "Infrt.constant"))
+  if (!infrt::Startswith(op->getName().getStringRef().str(), "infrt.constant"))
     return false;
   VLOG(3) << "Emitting constant op [" << op->getName().getStringRef().str()
           << "]";
@@ -173,6 +173,36 @@ boost::optional<double> MlirToRuntimeTranslator::EmitAttribute(
   return boost::none;
 }
 
+template <>
+boost::optional<::infrt::TargetType> MlirToRuntimeTranslator::EmitAttribute(
+    const mlir::Attribute& attr) {
+  if (!attr.isa<::infrt::TargetAttr>()) return boost::none;
+  if (attr.isa<::infrt::TargetAttr>()) {
+    return attr.cast<::infrt::TargetAttr>().getTarget();
+  }
+  return boost::none;
+}
+
+template <>
+boost::optional<::infrt::LayoutType> MlirToRuntimeTranslator::EmitAttribute(
+    const mlir::Attribute& attr) {
+  if (!attr.isa<::infrt::LayoutAttr>()) return boost::none;
+  if (attr.isa<::infrt::LayoutAttr>()) {
+    return attr.cast<::infrt::LayoutAttr>().getLayout();
+  }
+  return boost::none;
+}
+
+template <>
+boost::optional<::infrt::PrecisionType> MlirToRuntimeTranslator::EmitAttribute(
+    const mlir::Attribute& attr) {
+  if (!attr.isa<::infrt::PrecisionAttr>()) return boost::none;
+  if (attr.isa<::infrt::PrecisionAttr>()) {
+    return attr.cast<::infrt::PrecisionAttr>().getPrecision();
+  }
+  return boost::none;
+}
+
 template <>
 boost::optional<std::string> MlirToRuntimeTranslator::EmitAttribute(
     const mlir::Attribute& attr) {
@@ -237,10 +267,11 @@ boost::optional<std::vector<double>> MlirToRuntimeTranslator::EmitAttribute(
 }
 
 static bool IsReturn(mlir::Operation* op) {
-  return op->getName().getStringRef() == "Infrt.return";
+  return op->getName().getStringRef() == "infrt.return";
 }
 
-bool MlirToRuntimeTranslator::EmitGeneralOp(mlir::Operation* op) {
+bool MlirToRuntimeTranslator::EmitGeneralOp(
+    mlir::Operation* op, const KernelRegistry& kernel_registry) {
   CHECK(impl_->runtime);
   impl_->cur_op =
       impl_->runtime->NewOpExecutable(op->getName().getStringRef().str());
@@ -278,35 +309,80 @@ bool MlirToRuntimeTranslator::EmitGeneralOp(mlir::Operation* op) {
   // process attributes
   auto attrs = op->getAttrs();
 
+  // MLIR's underlying attr storage type is `Builtin_Dictionary`, and its
+  // elements
+  // are sorted by name. The following code adapts the order of function
+  // signatures
+  // of the phi operator library.
+  llvm::SmallVector<Value*, 4> tmp;
+  tmp.resize(attrs.size());
+  const std::string& kernel_name = op->getName().getStringRef().str();
+  const auto& attr_names = kernel_registry.GetAttrNameList(kernel_name);
+  if (attrs.size() && attr_names.empty()) {
+    LOG(WARNING) << "The kernel `" << kernel_name
+                 << "` has no specified attr order.";
+  }
+  auto get_offset = [](const char* attr,
+                       const std::vector<const char*>& names,
+                       const std::string& kernel_name) -> int {
+    for (size_t i = 0; i < names.size(); ++i) {
+      if (!std::strcmp(attr, names[i])) {
+        return i;
+      }
+    }
+    LOG(WARNING) << "The attribute `" << attr << "` of kernel `" << kernel_name
+                 << "` is not properly registered with "
+                    "`KernelRegistry::AddKernelWithAttrs()`.";
+    return -1;
+  };
+
   for (size_t i = 0; i < attrs.size(); i++) {
     auto& attr = attrs[i];
+    int offset{};
+    if (attr_names.size()) {
+      offset = get_offset(attr.getName().data(), attr_names, kernel_name);
+    } else {
+      offset = i;
+    }
+    CHECK_NE(offset, -1);
     if (auto v = EmitAttribute<int32_t>(attr.getValue())) {
-      impl_->cur_op->AppendAttribute(new Value(*v));
+      tmp[offset] = new Value(*v);
     } else if (auto v = EmitAttribute<int64_t>(attr.getValue())) {
-      impl_->cur_op->AppendAttribute(new Value(*v));
+      tmp[offset] = new Value(*v);
     } else if (auto v = EmitAttribute<float>(attr.getValue())) {
-      impl_->cur_op->AppendAttribute(new Value(*v));
+      tmp[offset] = new Value(*v);
     } else if (auto v = EmitAttribute<double>(attr.getValue())) {
-      impl_->cur_op->AppendAttribute(new Value(*v));
+      tmp[offset] = new Value(*v);
     } else if (auto v = EmitAttribute<std::string>(attr.getValue())) {
-      impl_->cur_op->AppendAttribute(new Value(std::move(*v)));
+      tmp[offset] = new Value(std::move(*v));
     } else if (auto v = EmitAttribute<bool>(attr.getValue())) {
-      impl_->cur_op->AppendAttribute(new Value(*v));
+      tmp[offset] = new Value(*v);
+    } else if (auto v = EmitAttribute<::infrt::TargetType>(attr.getValue())) {
+      tmp[offset] = new Value(*v);
+    } else if (auto v =
+                   EmitAttribute<::infrt::PrecisionType>(attr.getValue())) {
+      tmp[offset] = new Value(*v);
+    } else if (auto v = EmitAttribute<::infrt::LayoutType>(attr.getValue())) {
+      tmp[offset] = new Value(*v);
     } else if (auto v = EmitAttribute<std::vector<int16_t>>(attr.getValue())) {
-      impl_->cur_op->AppendAttribute(new Value(std::move(*v)));
+      tmp[offset] = new Value(std::move(*v));
     } else if (auto v = EmitAttribute<std::vector<int32_t>>(attr.getValue())) {
-      impl_->cur_op->AppendAttribute(new Value(std::move(*v)));
+      tmp[offset] = new Value(std::move(*v));
     } else if (auto v = EmitAttribute<std::vector<int64_t>>(attr.getValue())) {
-      impl_->cur_op->AppendAttribute(new Value(std::move(*v)));
+      tmp[offset] = new Value(std::move(*v));
     } else if (auto v = EmitAttribute<std::vector<float>>(attr.getValue())) {
-      impl_->cur_op->AppendAttribute(new Value(std::move(*v)));
+      tmp[offset] = new Value(std::move(*v));
     } else if (auto v = EmitAttribute<std::vector<double>>(attr.getValue())) {
-      impl_->cur_op->AppendAttribute(new Value(std::move(*v)));
+      tmp[offset] = new Value(std::move(*v));
     } else {
       LOG(FATAL) << "Not supported attribute type";
     }
   }
 
+  for (size_t i = 0; i < tmp.size(); i++) {
+    impl_->cur_op->AppendAttribute(tmp[i]);
+  }
+
   // process results
   llvm::SmallVector<Value*, 4> res_values;
   for (int i = 0, e = op->getNumResults(); i < e; i++) {
@@ -368,7 +444,7 @@ bool MlirToRuntimeTranslator::EmitGeneralOp(mlir::Operation* op) {
 bool MlirToRuntimeTranslator::EmitReturnOp(
     mlir::Operation* op, llvm::SmallVectorImpl<mlir::Value>* results) {
   CHECK(results);
-  if (op->getName().getStringRef() == "Infrt.return") {
+  if (op->getName().getStringRef() == "infrt.return") {
     for (size_t i = 0; i < op->getNumOperands(); i++) {
       results->push_back(op->getOperand(i));
     }
@@ -441,7 +517,7 @@ bool MlirToRuntimeTranslator::EmitCallOp(mlir::Operation* op,
                                          function_defs_t* function_table) {
   CHECK(op);
   CHECK(function_table);
-  if (op->getName().getStringRef() != "Infrt.call") return false;
+  if (op->getName().getStringRef() != "infrt.call") return false;
 
   impl_->cur_op =
       impl_->runtime->NewOpExecutable(op->getName().getStringRef().str());
@@ -561,7 +637,7 @@ class MlirProgramTestExecutor : public MlirToRuntimeTranslator {
         llvm::SmallVector<mlir::Value, 3> results;
         if (EmitReturnOp(&op, &results)) continue;
         if (EmitCallOp(&op, &impl_->func_defs)) continue;
-        if (EmitGeneralOp(&op)) continue;
+        if (EmitGeneralOp(&op, *registry)) continue;
         LOG(FATAL) << "Not supported op: " << DumpToString(op);
       }
 
diff --git a/paddle/infrt/host_context/mlir_to_runtime_translate.h b/paddle/infrt/host_context/mlir_to_runtime_translate.h
index 0c453651d9e6dc44adaf108ec6a1b0df984fe8be..27a7f20168667daddd353e902d49479aa612e38f 100644
--- a/paddle/infrt/host_context/mlir_to_runtime_translate.h
+++ b/paddle/infrt/host_context/mlir_to_runtime_translate.h
@@ -57,13 +57,14 @@ class MlirToRuntimeTranslator {
  protected:
   //! Emit a "infrt.constant.*" operation, return true if succeed.
   bool EmitConstantOp(mlir::Operation* op);
-  //! Emit a "Infrt.return" operation.
+  //! Emit a "infrt.return" operation.
   bool EmitReturnOp(mlir::Operation* op,
                     llvm::SmallVectorImpl<mlir::Value>* results);
   //! Emit a "ts.build_shape" operation.
   bool EmitBuildShapeOp(mlir::Operation* op);
   //! Emit an operation other than the special cases above.
-  bool EmitGeneralOp(mlir::Operation* op);
+  bool EmitGeneralOp(mlir::Operation* op,
+                     const KernelRegistry& kernel_registry);
   //! Emit all the functions.
   bool EmitFunctions();
 
diff --git a/paddle/infrt/host_context/mlir_to_runtime_translate_test.cc b/paddle/infrt/host_context/mlir_to_runtime_translate_test.cc
index 5824e40abf97a4d63543948d056e815bbeebce3a..31615fbc3f6e46f55ddc5f56641750feb0972772 100644
--- a/paddle/infrt/host_context/mlir_to_runtime_translate_test.cc
+++ b/paddle/infrt/host_context/mlir_to_runtime_translate_test.cc
@@ -37,14 +37,14 @@ TEST(MlirToRuntimeTranslate, basic) {
 
   auto source = R"ROC(
 func @main() -> () {
-  %v0 = Infrt.constant.f32 1.0
-  %v1 = Infrt.constant.f32 2.0
-  %v2 = "Infrt.add.f32"(%v0, %v1) : (f32, f32) -> f32
-  %v3 = "Infrt.mul.f32"(%v2, %v1) : (f32, f32) -> f32
+  %v0 = infrt.constant.f32 1.0
+  %v1 = infrt.constant.f32 2.0
+  %v2 = "infrt.add.f32"(%v0, %v1) : (f32, f32) -> f32
+  %v3 = "infrt.mul.f32"(%v2, %v1) : (f32, f32) -> f32
 
-  "Infrt.print.f32"(%v1) : (f32) -> ()
+  "infrt.print.f32"(%v1) : (f32) -> ()
 
-  Infrt.return
+  infrt.return
 }
 )ROC";
 
@@ -63,14 +63,14 @@ TEST(TestMlir, basic) {
 
   auto source = R"ROC(
 func @main() -> () {
-  %v0 = Infrt.constant.f32 1.0
-  %v1 = Infrt.constant.f32 2.0
-  %v2 = "Infrt.add.f32"(%v0, %v1) : (f32, f32) -> f32
-  %v3 = "Infrt.mul.f32"(%v2, %v1) : (f32, f32) -> f32
+  %v0 = infrt.constant.f32 1.0
+  %v1 = infrt.constant.f32 2.0
+  %v2 = "infrt.add.f32"(%v0, %v1) : (f32, f32) -> f32
+  %v3 = "infrt.mul.f32"(%v2, %v1) : (f32, f32) -> f32
 
-  "Infrt.print.f32"(%v1) : (f32) -> ()
+  "infrt.print.f32"(%v1) : (f32) -> ()
 
-  Infrt.return
+  infrt.return
 }
 )ROC";
 
@@ -101,7 +101,7 @@ func @predict(%a: !infrt.dense_tensor<CPU, FP32, NCHW>, %b: !infrt.dense_tensor<
       "!infrt.dense_tensor<CPU, FP32, NCHW>";
 
   auto end = R"ROC(
-Infrt.return %a0, %b0: !infrt.dense_tensor<CPU, FP32, NCHW>, !infrt.dense_tensor<CPU, FP32, NCHW>
+infrt.return %a0, %b0: !infrt.dense_tensor<CPU, FP32, NCHW>, !infrt.dense_tensor<CPU, FP32, NCHW>
 }
   )ROC";
 
diff --git a/paddle/infrt/host_context/paddle_mlir.cc b/paddle/infrt/host_context/paddle_mlir.cc
index 1c36b04f366bf5d993cc686804f9bb58aeb36f2b..18c25827b8ec5a71907e694cea4e7680b598e883 100644
--- a/paddle/infrt/host_context/paddle_mlir.cc
+++ b/paddle/infrt/host_context/paddle_mlir.cc
@@ -19,7 +19,6 @@ MLIRModelGenImpl::MLIRModelGenImpl()
     : context_(infrt::Global::getMLIRContext()), builder_(context_) {
   context_->allowUnregisteredDialects();
   context_->getOrLoadDialect<mlir::StandardOpsDialect>();
-  context_->getOrLoadDialect<infrt::dialect::INFRTDialect>();
   context_->getOrLoadDialect<infrt::ts::TensorShapeDialect>();
   context_->getOrLoadDialect<infrt::dt::DTDialect>();
   context_->getOrLoadDialect<mlir::pd::PaddleDialect>();
@@ -56,6 +55,7 @@ mlir::ModuleOp MLIRModelGenImpl::ImportPaddleModel(
   UpdateModelParams(program, &mainFunc);
   UpdateModelOps(program);
   UpdateModelOutputs(program);
+
   return module_;
 }
 
@@ -78,7 +78,7 @@ mlir::FuncOp MLIRModelGenImpl::UpdateModelModule(
 llvm::SmallVector<mlir::Type, 4> MLIRModelGenImpl::GetModelInputsType(
     const infrt::paddle::framework_proto::ProgramDesc &program) {
   llvm::SmallVector<mlir::Type, 4> operandTypes;
-  operandTypes.push_back(infrt::dt::TensorMapType::get(context_));
+  operandTypes.push_back(infrt::DenseTensorMapType::get(context_));
   for (auto &op_desc : main_block_.ops()) {
     if (op_desc.type() != "feed") continue;
     for (int var_idx = 0; var_idx < op_desc.outputs_size(); ++var_idx) {
@@ -143,13 +143,14 @@ void MLIRModelGenImpl::UpdateModelParams(
     const infrt::paddle::framework_proto::ProgramDesc &program,
     mlir::FuncOp *mainFunc) {
   // update input vars
+  int input_index = 1;
   for (auto &op_desc : main_block_.ops()) {
     if (op_desc.type() == "feed") {
       for (int var_idx = 0; var_idx < op_desc.outputs_size(); ++var_idx) {
         // update input variables
         auto &in = op_desc.outputs()[var_idx];
         std::string input_var_name = in.arguments(0);
-        ::mlir::Value input_ = mainFunc->getArgument(1);
+        ::mlir::Value input_ = mainFunc->getArgument(input_index++);
         params_map_.insert(
             std::pair<std::string, mlir::Value>(input_var_name, input_));
       }
@@ -211,7 +212,6 @@ void MLIRModelGenImpl::buildOperation(
     const infrt::paddle::framework_proto::OpDesc &op_) {
   const std::string &op_name = "pd." + op_.type();
   mlir::Location loc = mlir::UnknownLoc::get(context_);
-
   llvm::SmallVector<mlir::Value, 4> operands = GetOpInputValue(op_);
   llvm::SmallVector<mlir::Type, 4> resultTypes = GetOpOutputType(op_);
   llvm::SmallVector<mlir::NamedAttribute, 4> attrs = GetOpAttributes(op_);
@@ -227,7 +227,6 @@ llvm::SmallVector<mlir::Value, 4> MLIRModelGenImpl::GetOpInputValue(
   std::unordered_map<std::string, uint8_t> inputs_info = {};
   if (pd_dialect_inputs_info_map_.count(op_.type()))
     inputs_info = pd_dialect_inputs_info_map_.at(op_.type());
-
   for (int var_idx = 0; var_idx < op_.inputs_size(); ++var_idx) {
     auto &var = op_.inputs(var_idx);
     if (!var.arguments().empty()) {
@@ -249,10 +248,8 @@ llvm::SmallVector<mlir::Type, 4> MLIRModelGenImpl::GetOpOutputType(
   // update op outputs info
   for (int var_idx = 0; var_idx < op_.outputs_size(); ++var_idx) {
     auto &var_name = op_.outputs(var_idx).arguments()[0];
-
     if (!pd_dialect_outputs_info.count(op_.outputs(var_idx).parameter()))
       continue;
-
     // update persistable tensors
     for (int i = 0; i < main_block_.vars_size(); i++) {
       auto var_desc = main_block_.vars(i);
@@ -315,7 +312,6 @@ llvm::SmallVector<mlir::NamedAttribute, 4> MLIRModelGenImpl::GetOpAttributes(
   llvm::ArrayRef<mlir::StringAttr> attr_names_ =
       registered_op_name_.getAttributeNames();
   std::vector<mlir::StringAttr> attr_names_vec_ = attr_names_.vec();
-
   // update attrs
   for (int attrs_num = 0; attrs_num < op_.attrs_size(); attrs_num++) {
     auto attr_name_ = op_.attrs(attrs_num).name();
@@ -351,11 +347,17 @@ llvm::SmallVector<mlir::NamedAttribute, 4> MLIRModelGenImpl::GetOpAttributes(
 void MLIRModelGenImpl::RegisterOpOutputVars(
     const infrt::paddle::framework_proto::OpDesc &op_,
     mlir::Operation *mlir_op_) {
+  std::unordered_map<std::string, uint8_t> pd_dialect_outputs_info =
+      pd_dialect_outputs_info_map_.at(op_.type());
+
   // op outputs
   for (int var_idx = 0; var_idx < op_.outputs_size(); ++var_idx) {
+    if (!pd_dialect_outputs_info.count(op_.outputs(var_idx).parameter()))
+      continue;
     auto &var_name = op_.outputs(var_idx).arguments()[0];
+    int index = pd_dialect_outputs_info[op_.outputs(var_idx).parameter()];
     // output name
-    auto var_ = mlir_op_->getResult(var_idx);
+    auto var_ = mlir_op_->getResult(index);
     params_map_.insert(std::pair<std::string, mlir::Value>(var_name, var_));
   }
 }
diff --git a/paddle/infrt/host_context/paddle_mlir.h b/paddle/infrt/host_context/paddle_mlir.h
index 78dfefcfda2c83760492766507999322152187eb..e825cbb5a11ea0dfcacfc2b1bbb63bf201219c9d 100644
--- a/paddle/infrt/host_context/paddle_mlir.h
+++ b/paddle/infrt/host_context/paddle_mlir.h
@@ -25,10 +25,10 @@
 #include "mlir/IR/MLIRContext.h"
 #include "paddle/infrt/common/global.h"
 #include "paddle/infrt/common/string.h"
-#include "paddle/infrt/dialect/basic_kernels.h"
 #include "paddle/infrt/dialect/dense_tensor.h"
-#include "paddle/infrt/dialect/infrt_base.h"
-#include "paddle/infrt/dialect/init_infrt_dialects.h"
+#include "paddle/infrt/dialect/infrt/ir/basic_kernels.h"
+
+#include "paddle/infrt/dialect/init_dialects.h"
 #include "paddle/infrt/dialect/pd_ops.h"
 #include "paddle/infrt/dialect/tensor_shape.h"
 #include "paddle/infrt/paddle/model_parser.h"
diff --git a/paddle/infrt/host_context/value.cc b/paddle/infrt/host_context/value.cc
index 3f40490557290fcc34a188882c4d4d251f4ba16e..822ee108c897cb9b221ada4bd1d5e6cf04e13e98 100644
--- a/paddle/infrt/host_context/value.cc
+++ b/paddle/infrt/host_context/value.cc
@@ -24,14 +24,6 @@ ValueRef::ValueRef(int64_t val) : Shared<Value>(new Value(val)) {}
 ValueRef::ValueRef(float val) : Shared<Value>(new Value(val)) {}
 ValueRef::ValueRef(double val) : Shared<Value>(new Value(val)) {}
 ValueRef::ValueRef(bool val) : Shared<Value>(new Value(val)) {}
-ValueRef::ValueRef(backends::CpuPhiContext&& val)
-    : Shared<Value>(new Value(std::move(val))) {}
-ValueRef::ValueRef(::phi::CPUContext&& val)
-    : Shared<Value>(new Value(std::move(val))) {}
-ValueRef::ValueRef(::phi::DenseTensor&& val)
-    : Shared<Value>(new Value(std::move(val))) {}
-ValueRef::ValueRef(::phi::MetaTensor&& val)
-    : Shared<Value>(new Value(std::move(val))) {}
 
 const char* Value::type_info() const { return __type_info__; }
 
@@ -67,6 +59,10 @@ void CopyTo(const Value& from, Value* to) {
           to->data = reinterpret_cast<std::vector<int64_t> const&>(arg);
         else if (std::is_same<T, tensor::TensorMap>::value)
           to->data = reinterpret_cast<tensor::TensorMap const&>(arg);
+#ifdef INFRT_WITH_PHI
+        else if (std::is_same<T, ::phi::DenseTensor>::value)
+          to->data = reinterpret_cast<::phi::DenseTensor const&>(arg);
+#endif
         else
           LOG(FATAL) << "Not supported Value copy: " << typeid(T).name();
       },
diff --git a/paddle/infrt/host_context/value.h b/paddle/infrt/host_context/value.h
index 7e7d77d3af741443d490dcfdd5b9ee6677b557ef..957d852442b10620244e230a2f7704eb7fa0a33e 100644
--- a/paddle/infrt/host_context/value.h
+++ b/paddle/infrt/host_context/value.h
@@ -22,6 +22,7 @@
 
 #include "paddle/infrt/common/object.h"
 #include "paddle/infrt/common/shared.h"
+#include "paddle/infrt/dialect/infrt/common/types.h"
 #include "paddle/infrt/host_context/function.h"
 #include "paddle/infrt/support/variant.h"
 #include "paddle/infrt/tensor/dense_host_tensor.h"
@@ -64,13 +65,15 @@ using ValueVariantType =
             tensor::DenseHostTensor,
             MlirFunctionExecutable*,
             tensor::TensorMap,
+            ::infrt::PrecisionType,
+            ::infrt::LayoutType,
+            ::infrt::TargetType,
 #ifdef INFRT_WITH_PHI
             ::phi::MetaTensor,
             ::phi::DenseTensor,
-            backends::CpuPhiAllocator,
             backends::CpuPhiContext,
             ::phi::CPUContext,
-            std::vector<phi::DenseTensor>,
+            std::vector<const phi::DenseTensor*>,
             paddle::experimental::ScalarBase<phi::DenseTensor>,
             paddle::experimental::ScalarArrayBase<phi::DenseTensor>,
             std::vector<phi::MetaTensor*>,
@@ -101,6 +104,9 @@ class Value : public common::Object {
   explicit Value(float x) : data(x) {}
   explicit Value(double x) : data(x) {}
   explicit Value(bool x) : data(x) {}
+  explicit Value(::infrt::TargetType x) : data(x) {}
+  explicit Value(::infrt::LayoutType x) : data(x) {}
+  explicit Value(::infrt::PrecisionType x) : data(x) {}
   explicit Value(std::string x) : data(x) {}
   explicit Value(tensor::TensorMap&& x) : data(x) {}
   explicit Value(std::vector<int16_t>&& x) : data(x) {}
@@ -112,11 +118,10 @@ class Value : public common::Object {
   explicit Value(tensor::DenseHostTensor&& x) : data(std::move(x)) {}
   explicit Value(MlirFunctionExecutable* x) : data(x) {}
 #ifdef INFRT_WITH_PHI
-  explicit Value(backends::CpuPhiContext&& x) : data(std::move(x)) {}
   explicit Value(::phi::CPUContext&& x) : data(std::move(x)) {}
+  explicit Value(backends::CpuPhiContext&& x) : data(std::move(x)) {}
   explicit Value(::phi::DenseTensor&& x) : data(std::move(x)) {}
   explicit Value(::phi::MetaTensor&& x) : data(std::move(x)) {}
-  explicit Value(backends::CpuPhiAllocator&& x) : data(std::move(x)) {}
 #endif
 
   template <typename T>
@@ -179,10 +184,6 @@ class ValueRef : common::Shared<Value> {
   explicit ValueRef(float val);
   explicit ValueRef(double val);
   explicit ValueRef(bool val);
-  explicit ValueRef(::phi::MetaTensor&& val);
-  explicit ValueRef(backends::CpuPhiContext&& x);
-  explicit ValueRef(::phi::CPUContext&& x);
-  explicit ValueRef(::phi::DenseTensor&& x);
 
   using common::Shared<Value>::get;
   using common::Shared<Value>::Reset;
diff --git a/paddle/infrt/kernel/basic_kernels.cc b/paddle/infrt/kernel/basic_kernels.cc
index 23e50a5ddc87427bbf0f49c559f185084e42c8ec..b186cfcfd2b355f97711ecc916e497c2916d4060 100644
--- a/paddle/infrt/kernel/basic_kernels.cc
+++ b/paddle/infrt/kernel/basic_kernels.cc
@@ -63,24 +63,24 @@ static void PrintString(const std::string &str) {
 void RegisterBasicKernels(host_context::KernelRegistry *registry) {
   RegisterIntBasicKernels(registry);
   RegisterFloatBasicKernels(registry);
-  registry->AddKernel("Infrt.get_string", INFRT_KERNEL(GetString));
-  registry->AddKernel("Infrt.print_string", INFRT_KERNEL(PrintString));
+  registry->AddKernel("infrt.get_string", INFRT_KERNEL(GetString));
+  registry->AddKernel("infrt.print_string", INFRT_KERNEL(PrintString));
 }
 
 void RegisterIntBasicKernels(host_context::KernelRegistry *registry) {
-  registry->AddKernel("Infrt.add.i32", INFRT_KERNEL(add<int32_t>));
-  registry->AddKernel("Infrt.sub.i32", INFRT_KERNEL(sub<int32_t>));
-  registry->AddKernel("Infrt.mul.i32", INFRT_KERNEL(mul<int32_t>));
-  registry->AddKernel("Infrt.div.i32", INFRT_KERNEL(div<int32_t>));
-  registry->AddKernel("Infrt.print.i32", INFRT_KERNEL(print<int32_t>));
+  registry->AddKernel("infrt.add.i32", INFRT_KERNEL(add<int32_t>));
+  registry->AddKernel("infrt.sub.i32", INFRT_KERNEL(sub<int32_t>));
+  registry->AddKernel("infrt.mul.i32", INFRT_KERNEL(mul<int32_t>));
+  registry->AddKernel("infrt.div.i32", INFRT_KERNEL(div<int32_t>));
+  registry->AddKernel("infrt.print.i32", INFRT_KERNEL(print<int32_t>));
 }
 
 void RegisterFloatBasicKernels(host_context::KernelRegistry *registry) {
-  registry->AddKernel("Infrt.add.f32", INFRT_KERNEL(add<float>));
-  registry->AddKernel("Infrt.sub.f32", INFRT_KERNEL(sub<float>));
-  registry->AddKernel("Infrt.mul.f32", INFRT_KERNEL(mul<float>));
-  registry->AddKernel("Infrt.div.f32", INFRT_KERNEL(div<float>));
-  registry->AddKernel("Infrt.print.f32", INFRT_KERNEL(print<float>));
+  registry->AddKernel("infrt.add.f32", INFRT_KERNEL(add<float>));
+  registry->AddKernel("infrt.sub.f32", INFRT_KERNEL(sub<float>));
+  registry->AddKernel("infrt.mul.f32", INFRT_KERNEL(mul<float>));
+  registry->AddKernel("infrt.div.f32", INFRT_KERNEL(div<float>));
+  registry->AddKernel("infrt.print.f32", INFRT_KERNEL(print<float>));
 }
 
 }  // namespace kernel
diff --git a/paddle/infrt/kernel/control_flow_kernels.cc b/paddle/infrt/kernel/control_flow_kernels.cc
index 8b18aca0210860f4ae688f2133ffa022fda3195d..6cc94dbcce0775cb6b74f993bfdd262fd6a47e6f 100644
--- a/paddle/infrt/kernel/control_flow_kernels.cc
+++ b/paddle/infrt/kernel/control_flow_kernels.cc
@@ -37,7 +37,7 @@ static void INFRTCall(
 }
 
 void RegisterControlFlowKernels(host_context::KernelRegistry* registry) {
-  registry->AddKernel("Infrt.call", INFRT_KERNEL(INFRTCall));
+  registry->AddKernel("infrt.call", INFRT_KERNEL(INFRTCall));
 }
 
 }  // namespace kernel
diff --git a/paddle/infrt/kernel/phi/CMakeLists.txt b/paddle/infrt/kernel/phi/CMakeLists.txt
index 7055c0c06d5905fa738d8df72c7110fdd82a30d2..15882d23743b020c289269ef09ae5e05999201a8 100644
--- a/paddle/infrt/kernel/phi/CMakeLists.txt
+++ b/paddle/infrt/kernel/phi/CMakeLists.txt
@@ -8,7 +8,6 @@ gather_srcs(infrt_src SRCS
     registry.cc
     dense_tensor_kernels.cc
     context_kernels.cc
-    allocator_kernels.cc
 )
 
 set(infrt_register_phi_kernels_gen_source_file ${CMAKE_SOURCE_DIR}/paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launchers.cc)
diff --git a/paddle/infrt/kernel/phi/context_kernels.cc b/paddle/infrt/kernel/phi/context_kernels.cc
index 3caaf1788e3f8ada1fdbf7a69b2bc435b89a8eeb..39ef172fadef9e0f6317dec192c251c6a1df6828 100644
--- a/paddle/infrt/kernel/phi/context_kernels.cc
+++ b/paddle/infrt/kernel/phi/context_kernels.cc
@@ -18,12 +18,11 @@ namespace infrt {
 namespace kernel {
 namespace phi {
 
-::phi::CPUContext CreateCpuContext(
-    infrt::backends::CpuPhiAllocator* allocator) {
-  ::phi::CPUContext context;
-  context.SetAllocator(allocator);
-  context.Init();
-  return context;
+::phi::CPUContext CreateCPUContext() {
+  ::phi::CPUContext ctx{};
+  ctx.Init();
+  ctx.SetAllocator(new backends::CpuPhiAllocator{});
+  return ctx;
 }
 
 }  // namespace phi
diff --git a/paddle/infrt/kernel/phi/context_kernels.h b/paddle/infrt/kernel/phi/context_kernels.h
index 7f1e7ef6cd356ce2c2e1c3cb3378d990b59cde36..3e9580b91da5724b42c72224847e45715f47dbb7 100644
--- a/paddle/infrt/kernel/phi/context_kernels.h
+++ b/paddle/infrt/kernel/phi/context_kernels.h
@@ -16,13 +16,14 @@
 
 #include "paddle/infrt/backends/host/phi_allocator.h"
 #include "paddle/infrt/backends/host/phi_context.h"
+#include "paddle/infrt/host_context/kernel_utils.h"
 #include "paddle/phi/core/dense_tensor.h"
 
 namespace infrt {
 namespace kernel {
 namespace phi {
 
-::phi::CPUContext CreateCpuContext(::infrt::backends::CpuPhiAllocator*);
+::phi::CPUContext CreateCPUContext();
 
 }  // namespace phi
 }  // namespace kernel
diff --git a/paddle/infrt/kernel/phi/dense_tensor_kernels.cc b/paddle/infrt/kernel/phi/dense_tensor_kernels.cc
index 871336e8762e87577486f3422ed4de06d3cc03ba..777fb29ac60d9c7125898752747bbdf553f370c0 100644
--- a/paddle/infrt/kernel/phi/dense_tensor_kernels.cc
+++ b/paddle/infrt/kernel/phi/dense_tensor_kernels.cc
@@ -13,28 +13,33 @@
 // limitations under the License.
 
 #include "paddle/infrt/kernel/phi/dense_tensor_kernels.h"
-#include <iostream>
+#include "paddle/infrt/dialect/phi/data_type.h"
+#include "paddle/infrt/kernel/phi/context_kernels.h"
+
 namespace infrt {
 namespace kernel {
 namespace phi {
 
-::phi::DenseTensor CreateDenseTensorCpuF32Nchw(
-    backends::CpuPhiAllocator* allocator,
+::phi::DenseTensor CreateDenseTensor(
+    const ::phi::CPUContext& context,
     host_context::Attribute<std::vector<int64_t>> dims,
-    host_context::Attribute<std::vector<int64_t>> lod) {
-  return ::phi::DenseTensor(allocator,
-                            ::phi::DenseTensorMeta(::phi::DataType::FLOAT32,
-                                                   ::phi::make_ddim(dims.get()),
-                                                   ::phi::DataLayout::NCHW,
-                                                   {}));
+    host_context::Attribute<std::vector<int64_t>> lod,
+    host_context::Attribute<::infrt::LayoutType> layout,
+    host_context::Attribute<::infrt::PrecisionType> precision) {
+  return ::phi::DenseTensor(
+      const_cast<::phi::Allocator*>(&context.GetAllocator()),
+      ::phi::DenseTensorMeta(ConvertPrecisionToPhi(precision.get()),
+                             ::phi::make_ddim(dims.get()),
+                             ConvertLayoutToPhi(layout.get()),
+                             {}));
 }
 
 void FillDenseTensorF32(::phi::DenseTensor* dense_tensor,
-                        host_context::Attribute<std::vector<float>> values) {
+                        host_context::Attribute<std::vector<float>> value) {
   auto place = ::phi::CPUPlace();
   float* a_data = dense_tensor->mutable_data<float>(place);
   for (int64_t i = 0; i < dense_tensor->numel(); ++i) {
-    a_data[i] = (values.get())[i];
+    a_data[i] = (value.get())[i];
   }
 }
 
@@ -52,7 +57,7 @@ void PrintDenseTensor(::phi::DenseTensor* dense_tensor) {
 
   ::phi::DDim dims = dense_tensor->dims();
   std::cout << "dense_tensor: shape=shape" << dims.to_str() << ","
-            << " values=[";
+            << " value=[";
   switch (dense_tensor->dtype()) {
     PRINT_META_DATA(FLOAT32, float);
     PRINT_META_DATA(INT32, int32_t);
diff --git a/paddle/infrt/kernel/phi/dense_tensor_kernels.h b/paddle/infrt/kernel/phi/dense_tensor_kernels.h
index 920c0b1c8af427e2ecbfef25ab56f96c8a19daf1..8cc0e39e0e4431f073ac37a7f0557f2c837dc753 100644
--- a/paddle/infrt/kernel/phi/dense_tensor_kernels.h
+++ b/paddle/infrt/kernel/phi/dense_tensor_kernels.h
@@ -15,6 +15,7 @@
 #pragma once
 
 #include "paddle/infrt/backends/host/phi_allocator.h"
+#include "paddle/infrt/dialect/infrt/common/types.h"
 #include "paddle/infrt/host_context/kernel_utils.h"
 #include "paddle/phi/core/dense_tensor.h"
 
@@ -22,10 +23,12 @@ namespace infrt {
 namespace kernel {
 namespace phi {
 
-::phi::DenseTensor CreateDenseTensorCpuF32Nchw(
-    backends::CpuPhiAllocator* allocator,
+::phi::DenseTensor CreateDenseTensor(
+    const ::phi::CPUContext& context,
     host_context::Attribute<std::vector<int64_t>> dims,
-    host_context::Attribute<std::vector<int64_t>> lod);
+    host_context::Attribute<std::vector<int64_t>> lod,
+    host_context::Attribute<::infrt::LayoutType> layout,
+    host_context::Attribute<::infrt::PrecisionType> precision);
 
 void FillDenseTensorF32(::phi::DenseTensor* dense_tensor,
                         host_context::Attribute<std::vector<float>> values);
diff --git a/paddle/infrt/kernel/phi/infershaped/infershape_launchers_test.cc b/paddle/infrt/kernel/phi/infershaped/infershape_launchers_test.cc
index 37f9197edb728b599c3846ea83dd06d2c7e79865..08c2e19deddfe480faec6d5468b3f222abee7e03 100644
--- a/paddle/infrt/kernel/phi/infershaped/infershape_launchers_test.cc
+++ b/paddle/infrt/kernel/phi/infershaped/infershape_launchers_test.cc
@@ -54,7 +54,7 @@ TEST(ElementwiseAdd, launcher_registry) {
   host_context::KernelRegistry registry;
   RegisterInferShapeLaunchers(&registry);
   ASSERT_GE(registry.size(), 1UL);
-  auto creator = registry.GetKernel("phi_cpu.add.any.float32");
+  auto creator = registry.GetKernel("phi_cpu.add.float32.any");
 
   const phi::DDim dims({1, 2});
   const phi::DataType dtype{phi::DataType::FLOAT32};
diff --git a/paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launcher.cc b/paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launcher.cc
index 165f7f7c94377f8b9c1f9c240ee1418cab922cdc..75e3ebbf00ca54ed3fb2d0ca22bb7819300d0b2b 100644
--- a/paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launcher.cc
+++ b/paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launcher.cc
@@ -24,7 +24,8 @@ void InferShapedKernelLauncher::CreateKernelFrameForInferShape(
        frame->GetValues(1, frame->GetNumElements() - 1)) {
     // TODO(Superjomn) To extend this.
     if (value->is_type<::phi::DenseTensor>()) {
-      values.emplace_back(::phi::MetaTensor{&value->get<::phi::DenseTensor>()});
+      values.emplace_back(new host_context::Value{
+          ::phi::MetaTensor{&value->get<::phi::DenseTensor>()}});
       infershape_kernel_frame_builder.AddArgument(values.back().get());
     } else {
       infershape_kernel_frame_builder.AddArgument(value);
diff --git a/paddle/infrt/kernel/phi/infershaped/phi_kernel_launcher.h b/paddle/infrt/kernel/phi/infershaped/phi_kernel_launcher.h
index a0a5b391ea669b1358b14098e32750d709e52fe2..75c9e554778dcf1488289c6e9e46fb9652f677dd 100644
--- a/paddle/infrt/kernel/phi/infershaped/phi_kernel_launcher.h
+++ b/paddle/infrt/kernel/phi/infershaped/phi_kernel_launcher.h
@@ -24,26 +24,6 @@
 namespace infrt {
 namespace kernel {
 
-static void FakePhiInferShape(const ::phi::MetaTensor& a,
-                              const ::phi::MetaTensor& b,
-                              bool arg_0,
-                              bool arg_1,
-                              ::phi::MetaTensor* c) {
-  LOG(INFO) << "the ptr of c: " << c;
-  LOG(INFO) << "c->numel(): " << c->numel();
-}
-
-static void FakePhiKernel(const ::phi::CPUContext& /*Context*/,
-                          const ::phi::DenseTensor& a,
-                          const ::phi::DenseTensor& b,
-                          bool arg_0,
-                          bool arg_1,
-                          ::phi::DenseTensor* c) {
-  std::cout << "@FakePhiKernel@" << std::endl;
-  LOG(INFO) << "the ptr of c: " << c;
-  LOG(INFO) << "c->numel(): " << c->numel();
-}
-
 template <typename KernelFunc,
           KernelFunc kernel,
           typename InferShapedFunc,
diff --git a/paddle/infrt/kernel/phi/registry.cc b/paddle/infrt/kernel/phi/registry.cc
index 15e2d21005e03caefa67a5a336ee0b287abc5aaa..0e071418603f8390ca3283f617b06cf1fa91b94c 100644
--- a/paddle/infrt/kernel/phi/registry.cc
+++ b/paddle/infrt/kernel/phi/registry.cc
@@ -19,7 +19,6 @@
 
 #include "paddle/infrt/host_context/kernel_registry.h"
 #include "paddle/infrt/host_context/kernel_utils.h"
-#include "paddle/infrt/kernel/phi/allocator_kernels.h"
 #include "paddle/infrt/kernel/phi/context_kernels.h"
 #include "paddle/infrt/kernel/phi/dense_tensor_kernels.h"
 #include "paddle/infrt/kernel/phi/infershaped/phi_kernel_launcher.h"
@@ -33,28 +32,18 @@ namespace infrt {
 namespace kernel {
 
 void RegisterPhiKernels(host_context::KernelRegistry* registry) {
-  registry->AddKernel("phi_dt.create_allocator.cpu",
-                      INFRT_KERNEL(infrt::kernel::phi::CreateCpuAllocator));
   registry->AddKernel("phi_dt.create_context.cpu",
-                      INFRT_KERNEL(infrt::kernel::phi::CreateCpuContext));
-  registry->AddKernel(
-      "phi_dt.create_dense_tensor.cpu.f32.nchw",
-      INFRT_KERNEL(infrt::kernel::phi::CreateDenseTensorCpuF32Nchw));
-  registry->AddKernel("phi_dt.fill_dense_tensor.f32",
-                      INFRT_KERNEL(infrt::kernel::phi::FillDenseTensorF32));
+                      INFRT_KERNEL(infrt::kernel::phi::CreateCPUContext));
+  registry->AddKernelWithAttrs(
+      "phi_dt.create_dense_tensor",
+      INFRT_KERNEL(infrt::kernel::phi::CreateDenseTensor),
+      {"dims", "lod", "layout", "precision"});
+  registry->AddKernelWithAttrs(
+      "phi_dt.fill_dense_tensor.f32",
+      INFRT_KERNEL(infrt::kernel::phi::FillDenseTensorF32),
+      {"value"});
   registry->AddKernel("phi_dt.print_tensor",
                       INFRT_KERNEL(infrt::kernel::phi::PrintDenseTensor));
-  registry->AddKernel(
-      "phi_dt.fake_phi_kernel",
-      std::bind(&KernelLauncherFunc<decltype(&FakePhiKernel),
-                                    &FakePhiKernel,
-                                    decltype(&FakePhiInferShape),
-                                    &FakePhiInferShape>,
-                KernelLauncher<decltype(&FakePhiKernel),
-                               &FakePhiKernel,
-                               decltype(&FakePhiInferShape),
-                               &FakePhiInferShape>(),
-                std::placeholders::_1));
 }
 
 }  // namespace kernel
diff --git a/paddle/infrt/kernel/tensor_kernels.cc b/paddle/infrt/kernel/tensor_kernels.cc
index 9de1350e97d1af31dc18a116ed7cb38bf0d2f4ef..b7503aa4ef35894dda514fdb7fa4336485323094 100644
--- a/paddle/infrt/kernel/tensor_kernels.cc
+++ b/paddle/infrt/kernel/tensor_kernels.cc
@@ -49,8 +49,8 @@ void FillTensorWithConstant(Attribute<T> v, DenseHostTensor *tensor) {
   MutableDTArrayView<T>(tensor).Fill(v.get());
 }
 
-TensorMap LoadParams(const std::string &path) {
-  return *(infrt::tensor::LoadParams(path));
+TensorMap LoadParams(Attribute<std::string> path) {
+  return *(infrt::tensor::LoadParams(path.get()));
 }
 
 DenseHostTensor TensorMapGetTensor(TensorMap map, Attribute<std::string> name) {
@@ -111,9 +111,9 @@ void NaiveMatmul(const DenseHostTensor &x,
 /// ===== Kernel end ====
 
 void RegisterTensorKernels(host_context::KernelRegistry *registry) {
-  registry->AddKernel("dt.create_uninit_tensor.f32",
-                      INFRT_KERNEL(CreateUninitTensor<float>));
-  registry->AddKernelAttrNameList("dt.create_uninit_tensor.f32", {"shape"});
+  registry->AddKernelWithAttrs("dt.create_uninit_tensor.f32",
+                               INFRT_KERNEL(CreateUninitTensor<float>),
+                               {"shape"});
   registry->AddKernel("dt.print_tensor", INFRT_KERNEL(PrintTensor));
   registry->AddKernel("dt.fill_tensor_with_constant.f32",
                       INFRT_KERNEL(FillTensorWithConstant<float>));
diff --git a/paddle/infrt/kernel/test_kernels.cc b/paddle/infrt/kernel/test_kernels.cc
index d15bbe221f91a87b047863121f32699175183c54..bcf475d1bc09dab8be1b7a23359e1eb935ee02e0 100644
--- a/paddle/infrt/kernel/test_kernels.cc
+++ b/paddle/infrt/kernel/test_kernels.cc
@@ -193,7 +193,7 @@ tensor::DenseHostTensor ShadowCopyTensor(tensor::DenseHostTensor src) {
 }
 
 void RegisterTestKernels(host_context::KernelRegistry *registry) {
-  registry->AddKernel("Infrt.benchmark", INFRT_KERNEL(benchmark));
+  registry->AddKernel("infrt.benchmark", INFRT_KERNEL(benchmark));
   registry->AddKernel("Infrt.test.shadow_copy_tensor",
                       INFRT_KERNEL(ShadowCopyTensor));
 }
diff --git a/paddle/infrt/pass/CMakeLists.txt b/paddle/infrt/pass/CMakeLists.txt
deleted file mode 100755
index 51fecdf907798eb7280a17b294a263fe40993fe2..0000000000000000000000000000000000000000
--- a/paddle/infrt/pass/CMakeLists.txt
+++ /dev/null
@@ -1 +0,0 @@
-add_subdirectory(phi)
diff --git a/paddle/infrt/tests/CMakeLists.txt b/paddle/infrt/tests/CMakeLists.txt
index e5cc1ec1121fb7bbff2fad7856151916d8ea0924..5ce6d8673421ba3c53c9dad6d2fd1f20298f837a 100644
--- a/paddle/infrt/tests/CMakeLists.txt
+++ b/paddle/infrt/tests/CMakeLists.txt
@@ -1,6 +1,6 @@
 configure_file(lit.cfg.py.in "${CMAKE_SOURCE_DIR}/paddle/infrt/tests/lit.cfg.py")
 
 add_test(NAME test_infrt_by_lit COMMAND sh -c "lit -v ${CMAKE_SOURCE_DIR}/paddle/infrt/tests --filter-out \"disabled_*\""
-    DEPENDS infrtopt infrtexec)
+    DEPENDS infrtopt infrtexec phi-ir-exec)
 
 configure_file(${CMAKE_CURRENT_SOURCE_DIR}/dialect/tensor/tensor_map.mlir.in ${CMAKE_CURRENT_SOURCE_DIR}/dialect/tensor/tensor_map.mlir)
diff --git a/paddle/infrt/tests/dialect/basic.mlir b/paddle/infrt/tests/dialect/basic.mlir
index 2d4d6f2629ec7df989499f0a2e9649c01ae8428a..f534a3aa44aac964c262465da199ac926fa0904e 100644
--- a/paddle/infrt/tests/dialect/basic.mlir
+++ b/paddle/infrt/tests/dialect/basic.mlir
@@ -1,33 +1,33 @@
 // RUN: infrtexec -i %s | FileCheck %s
 // CHECK-LABEL: @basic_f32
 func @basic_f32() -> f32 {
-  %v0 = Infrt.constant.f32 1.0
-  %v1 = Infrt.constant.f32 2.0
-  %value = "Infrt.add.f32"(%v0, %v1) : (f32, f32) -> f32
+  %v0 = infrt.constant.f32 1.0
+  %v1 = infrt.constant.f32 2.0
+  %value = "infrt.add.f32"(%v0, %v1) : (f32, f32) -> f32
 
   // CHECK-NEXT: 3
-  "Infrt.print.f32"(%value) : (f32) -> ()
+  "infrt.print.f32"(%value) : (f32) -> ()
 
-  Infrt.return %value : f32
+  infrt.return %value : f32
 }
 
 /// ================================================================
 /// @caller call the other function @callee
 func @callee.add.f32(%x : f32, %y : f32, %y1 : f32) -> f32 {
-  %z = "Infrt.add.f32"(%x, %y) : (f32, f32) -> f32
-  %z1 = "Infrt.add.f32"(%z, %y1) : (f32, f32) -> f32
-  Infrt.return %z1 : f32
+  %z = "infrt.add.f32"(%x, %y) : (f32, f32) -> f32
+  %z1 = "infrt.add.f32"(%z, %y1) : (f32, f32) -> f32
+  infrt.return %z1 : f32
 }
 
 // CHECK-LABEL: @caller.add.f32
 func @caller.add.f32() -> f32 {
-  %x = Infrt.constant.f32 1.0
-  %y = Infrt.constant.f32 2.0
-  %y1 = Infrt.constant.f32 3.0
-  %z = Infrt.call @callee.add.f32(%x, %y, %y1) : (f32, f32, f32) -> f32
+  %x = infrt.constant.f32 1.0
+  %y = infrt.constant.f32 2.0
+  %y1 = infrt.constant.f32 3.0
+  %z = infrt.call @callee.add.f32(%x, %y, %y1) : (f32, f32, f32) -> f32
 
   // CHECK-NEXT: 6
-  "Infrt.print.f32"(%z) : (f32) -> ()
-  Infrt.return %z : f32
+  "infrt.print.f32"(%z) : (f32) -> ()
+  infrt.return %z : f32
 }
 /// <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
diff --git a/paddle/infrt/tests/dialect/benchmark.mlir b/paddle/infrt/tests/dialect/benchmark.mlir
index 381fd534f6a5a09e3091203de88ebf00101074af..1a57b43499062410b346b38412a533d3edd6fbcc 100644
--- a/paddle/infrt/tests/dialect/benchmark.mlir
+++ b/paddle/infrt/tests/dialect/benchmark.mlir
@@ -12,13 +12,13 @@ func @benchmark() {
   // CHECK-LABEL: BM:add.f32:CPU 95%(ns)
   // CHECK-LABEL: BM:add.f32:CPU 99%(ns)
   // CHECK-LABEL: BM:add.f32:CPU utilization(percent)
-  Infrt.benchmark "add.f32"() duration_secs = 1, max_count = 3, num_warmup_runs = 3
+  infrt.benchmark "add.f32"() duration_secs = 1, max_count = 3, num_warmup_runs = 3
   {
-    %0 = Infrt.constant.f32 1.0
-    %1 = Infrt.constant.f32 2.0
-    %res = "Infrt.add.f32"(%0, %1) : (f32, f32) -> f32
-    "Infrt.print.f32"(%res) : (f32) -> ()
-    Infrt.return %res : f32
+    %0 = infrt.constant.f32 1.0
+    %1 = infrt.constant.f32 2.0
+    %res = "infrt.add.f32"(%0, %1) : (f32, f32) -> f32
+    "infrt.print.f32"(%res) : (f32) -> ()
+    infrt.return %res : f32
   }
-  Infrt.return
+  infrt.return
 }
diff --git a/paddle/infrt/tests/dialect/dense_tensor.mlir b/paddle/infrt/tests/dialect/dense_tensor.mlir
index faade62d35063b1d85c4c1d3ddad98b085a7726c..6dc9904610477139b6c254d0f9f7b754041a83cc 100644
--- a/paddle/infrt/tests/dialect/dense_tensor.mlir
+++ b/paddle/infrt/tests/dialect/dense_tensor.mlir
@@ -4,14 +4,14 @@ func @dense_shape0() {
   %shape = ts.build_shape [1:i64, 57:i64]
   %a = dt.create_uninit_tensor.f32 [12:i64, 23:i64] -> !infrt.dense_tensor<CPU, FP32, NCHW>
 
-  Infrt.return
+  infrt.return
 }
 
 func @predict(%a: !infrt.dense_tensor<CPU, FP32, NCHW>, %b: !infrt.dense_tensor<CPU, FP32, NCHW>) -> (!infrt.dense_tensor<CPU, FP32, NCHW>, !infrt.dense_tensor<CPU, FP32, NCHW>) {
   %a0 = dt.shallow_copy_tensor %a : !infrt.dense_tensor<CPU, FP32, NCHW> -> !infrt.dense_tensor<CPU, FP32, NCHW>
   %b0 = dt.shallow_copy_tensor %b : !infrt.dense_tensor<CPU, FP32, NCHW> -> !infrt.dense_tensor<CPU, FP32, NCHW>
 
-  Infrt.return %a0, %b0: !infrt.dense_tensor<CPU, FP32, NCHW>, !infrt.dense_tensor<CPU, FP32, NCHW>
+  infrt.return %a0, %b0: !infrt.dense_tensor<CPU, FP32, NCHW>, !infrt.dense_tensor<CPU, FP32, NCHW>
 }
 
 
@@ -19,6 +19,6 @@ func @main() {
   %shape = ts.build_shape [1:i64, 57:i64]
   %a = dt.create_uninit_tensor.f32 [12:i64, 23:i64] -> !infrt.dense_tensor<CPU, FP32, NCHW>
 
-  %b, %c = Infrt.call @predict(%a, %a) : (!infrt.dense_tensor<CPU, FP32, NCHW>, !infrt.dense_tensor<CPU, FP32, NCHW>) -> (!infrt.dense_tensor<CPU, FP32, NCHW>, !infrt.dense_tensor<CPU, FP32, NCHW>)
-  Infrt.return
+  %b, %c = infrt.call @predict(%a, %a) : (!infrt.dense_tensor<CPU, FP32, NCHW>, !infrt.dense_tensor<CPU, FP32, NCHW>) -> (!infrt.dense_tensor<CPU, FP32, NCHW>, !infrt.dense_tensor<CPU, FP32, NCHW>)
+  infrt.return
 }
diff --git a/paddle/infrt/tests/dialect/disabled_tensor_map.mlir b/paddle/infrt/tests/dialect/disabled_tensor_map.mlir
index 8e2d3bc49b96c645fc72e33af6300307d855e5a4..936c8f32c01521817e185fa80e836018e7b02aa8 100644
--- a/paddle/infrt/tests/dialect/disabled_tensor_map.mlir
+++ b/paddle/infrt/tests/dialect/disabled_tensor_map.mlir
@@ -1,31 +1,30 @@
 // CHECK-LABEL: @predict
-func @predict(%input:!Infrt.tensor<X86, NCHW, F32>, %map: !Infrt.tensor_map) -> (!Infrt.tensor<X86, NCHW, F32>) {
-  %w = dt.get_param(%map, "create_parameter_0.w_0") -> !Infrt.tensor<X86, NCHW, F32>
-  %bias = dt.get_param(%map, "create_parameter_1.w_0") -> !Infrt.tensor<X86, NCHW, F32>
+func @predict(%input:!infrt.dense_tensor<CPU, FP32, NCHW>, %map: !infrt.dense_tensor_map) -> (!infrt.dense_tensor<CPU, FP32, NCHW>) {
+  %w = dt.get_param(%map, "create_parameter_0.w_0") -> !infrt.dense_tensor<CPU, FP32, NCHW>
+  %bias = dt.get_param(%map, "create_parameter_1.w_0") -> !infrt.dense_tensor<CPU, FP32, NCHW>
 
-  %out = dt.create_uninit_tensor.f32 [3, 3] -> !Infrt.tensor<X86, NCHW, F32>
+  %out = dt.create_uninit_tensor.f32 [3, 3] -> !infrt.dense_tensor<CPU, FP32, NCHW>
 
   // fc
-  "external.matmul"(%input, %w, %out) {}: (!Infrt.tensor<X86, NCHW, F32>, !Infrt.tensor<X86, NCHW, F32>, !Infrt.tensor<X86, NCHW, F32>) -> ()
-  "external.elementwise_add"(%out, %bias, %out) {axis = -1}: (!Infrt.tensor<X86, NCHW, F32>, !Infrt.tensor<X86, NCHW, F32>, !Infrt.tensor<X86, NCHW, F32>) -> ()
-  "external.sigmoid"(%out, %out) {}: (!Infrt.tensor<X86, NCHW, F32>, !Infrt.tensor<X86, NCHW, F32>) -> ()
-  //dt.print_tensor (%out : !Infrt.tensor<X86, NCHW, F32>)
+  "external.matmul"(%input, %w, %out) {}: (!infrt.dense_tensor<CPU, FP32, NCHW>, !infrt.dense_tensor<CPU, FP32, NCHW>, !infrt.dense_tensor<CPU, FP32, NCHW>) -> ()
+  "external.elementwise_add"(%out, %bias, %out) {axis = -1}: (!infrt.dense_tensor<CPU, FP32, NCHW>, !infrt.dense_tensor<CPU, FP32, NCHW>, !infrt.dense_tensor<CPU, FP32, NCHW>) -> ()
+  "external.sigmoid"(%out, %out) {}: (!infrt.dense_tensor<CPU, FP32, NCHW>, !infrt.dense_tensor<CPU, FP32, NCHW>) -> ()
+  //dt.print_tensor (%out : !infrt.dense_tensor<CPU, FP32, NCHW>)
 
-  Infrt.return %out : !Infrt.tensor<X86, NCHW, F32>
+  infrt.return %out : !infrt.dense_tensor<CPU, FP32, NCHW>
 }
 
 // CHECK-LABEL: @main
 func @main() {
-  %input = dt.create_uninit_tensor.f32 [3, 3] -> !Infrt.tensor<X86, NCHW, F32>
-  dt.fill_tensor_with_constant.f32 (%input : !Infrt.tensor<X86, NCHW, F32>) {value=1.0:f32}
+  %input = dt.create_uninit_tensor.f32 [3, 3] -> !infrt.dense_tensor<CPU, FP32, NCHW>
+  dt.fill_tensor_with_constant.f32 (%input : !infrt.dense_tensor<CPU, FP32, NCHW>) {value=1.0:f32}
 
-  %path = Infrt.get_string("/Infrt/build/paddle/paddle_1.8_fc_model")
   // CHECK-LABEL: loading params
-  %map = dt.load_params(%path)
+  %map = dt.load_params() {path="/Infrt/build/paddle/paddle_1.8_fc_model"}
 
-  %out = Infrt.call @predict(%input, %map): (!Infrt.tensor<X86, NCHW, F32>, !Infrt.tensor_map) -> (!Infrt.tensor<X86, NCHW, F32>)
-  dt.print_tensor (%out : !Infrt.tensor<X86, NCHW, F32>)
+  %out = infrt.call @predict(%input, %map): (!infrt.dense_tensor<CPU, FP32, NCHW>, !infrt.dense_tensor_map) -> (!infrt.dense_tensor<CPU, FP32, NCHW>)
+  dt.print_tensor (%out : !infrt.dense_tensor<CPU, FP32, NCHW>)
 
-  Infrt.return
+  infrt.return
 }
 
diff --git a/paddle/infrt/tests/dialect/paddle_ops.mlir b/paddle/infrt/tests/dialect/paddle_ops.mlir
index 48ee4b9d725c0aa36d4849c2842c99997de5c8ee..4b8055514936417dd83a6bb23afaea31eb2d1013 100644
--- a/paddle/infrt/tests/dialect/paddle_ops.mlir
+++ b/paddle/infrt/tests/dialect/paddle_ops.mlir
@@ -5,5 +5,5 @@ func @ops() {
   %b = pd.feed() {name="input1"}: tensor<?xf32>
   %d = pd.feed() {name="input3"}: !infrt.lod_tensor<3x4x9xf32, 0>
   %c = "pd.matmul"(%a, %b) {transpose_x=true, transpose_y=false} : (tensor<?xf32>, tensor<?xf32>) -> tensor<?xf32>
-  Infrt.return
+  infrt.return
 }
diff --git a/paddle/infrt/tests/dialect/phi/dense_tensor.mlir b/paddle/infrt/tests/dialect/phi/dense_tensor.mlir
new file mode 100644
index 0000000000000000000000000000000000000000..3657777a5b0bce1c5a5e4df8d59695f8b122da56
--- /dev/null
+++ b/paddle/infrt/tests/dialect/phi/dense_tensor.mlir
@@ -0,0 +1,16 @@
+// RUN: infrtexec -i %s | FileCheck %s
+
+// CHECK-LABEL: @sign_any_float32_execute
+func @sign_any_float32_execute() {
+  %ctx = "phi_dt.create_context.cpu" (): () -> !phi.context<CPU>
+  %t = "phi_dt.create_dense_tensor" (%ctx) {
+    precision=#infrt.precision<FP32>, 
+    layout=#infrt.layout<NCHW>, lod=[1:i64], dims=[1:i64]}: (!phi.context<CPU>) -> (!infrt.dense_tensor<CPU, FP32, NCHW>)
+  "phi_dt.fill_dense_tensor.f32"(%t) {value=[3.8:f32]} : (!infrt.dense_tensor<CPU, FP32, NCHW>) -> ()
+  %e = "phi_cpu.sign.float32.any"(%ctx, %t) : (!phi.context<CPU>, !infrt.dense_tensor<CPU, FP32, NCHW>) -> (!infrt.dense_tensor<CPU, FP32, NCHW>)
+
+  // CHECK: dense_tensor: shape=shape[1], value=[1]
+  "phi_dt.print_tensor" (%e) : (!infrt.dense_tensor<CPU, FP32, NCHW>) -> ()
+  infrt.return
+}
+
diff --git a/paddle/infrt/tests/dialect/pten/pten_pass.mlir b/paddle/infrt/tests/dialect/phi/phi_pass.mlir
similarity index 92%
rename from paddle/infrt/tests/dialect/pten/pten_pass.mlir
rename to paddle/infrt/tests/dialect/phi/phi_pass.mlir
index 30ff2636ae5a41674883e63ff931629a0d140b84..61a66cb3d71a372bcd67cb96362abcb033768e4d 100644
--- a/paddle/infrt/tests/dialect/pten/pten_pass.mlir
+++ b/paddle/infrt/tests/dialect/phi/phi_pass.mlir
@@ -1,4 +1,4 @@
-// RUN: infrtopt %s | FileCheck %s
+// RUN: phi-ir-exec %s
 // CHECK-LABEL: @ops
 func @ops() {
   %a = pd.feed() {name="input0"} : !infrt.lod_tensor<?xf32,0>
diff --git a/paddle/infrt/tests/dialect/phi/phi_test.mlir b/paddle/infrt/tests/dialect/phi/phi_test.mlir
new file mode 100644
index 0000000000000000000000000000000000000000..5b0fa735897a31287bb6dea487e2f22eacd7b0aa
--- /dev/null
+++ b/paddle/infrt/tests/dialect/phi/phi_test.mlir
@@ -0,0 +1,15 @@
+// RUN: infrtexec -i %s
+module  {
+  func @predict(%arg0: !infrt.dense_tensor<CPU, FP32, NCHW>) -> !infrt.dense_tensor<CPU, FP32, NCHW> {
+    %2 = "pd.abs"(%arg0) : (!infrt.dense_tensor<CPU, FP32, NCHW>) -> !infrt.dense_tensor<CPU, FP32, NCHW>
+    infrt.return %2 : !infrt.dense_tensor<CPU, FP32, NCHW>
+  }
+  func @main() {
+    %ctx = "phi_dt.create_context.cpu" (): () -> !phi.context<CPU>
+    %t = "phi_dt.create_dense_tensor" (%ctx) {precision=#infrt.precision<FP32>, layout=#infrt.layout<NCHW>, lod=[1:i64], dims=[1:i64]}: (!phi.context<CPU>) -> (!infrt.dense_tensor<CPU, FP32, NCHW>)
+    "phi_dt.fill_dense_tensor.f32"(%t) {value=[3.8:f32]} : (!infrt.dense_tensor<CPU, FP32, NCHW>) -> ()
+    %2 = infrt.call@predict(%t) : (!infrt.dense_tensor<CPU, FP32, NCHW>) -> !infrt.dense_tensor<CPU, FP32, NCHW>
+    phi_dt.print_tensor(%2 : !infrt.dense_tensor<CPU, FP32, NCHW>)
+    infrt.return
+  }
+}
diff --git a/paddle/infrt/tests/dialect/pten/dense_tensor.mlir b/paddle/infrt/tests/dialect/pten/dense_tensor.mlir
deleted file mode 100644
index 695143c93b3cf13839429cc8d48fa9f7d255c033..0000000000000000000000000000000000000000
--- a/paddle/infrt/tests/dialect/pten/dense_tensor.mlir
+++ /dev/null
@@ -1,15 +0,0 @@
-// RUN: infrtexec -i %s | FileCheck %s
-
-// CHECK-LABEL: @sign_any_float32_execute
-func @sign_any_float32_execute() {
-  %allocator = "phi_dt.create_allocator.cpu" (): () -> !phi.CPU_allocator
-  %ctx = "phi_dt.create_context.cpu" (%allocator): (!phi.CPU_allocator) -> !phi.CPU_context
-  %t = "phi_dt.create_dense_tensor.cpu.f32.nchw" (%allocator) {dims=[1:i64], lod=[1:i64]}: (!phi.CPU_allocator) -> (!infrt.dense_tensor<CPU, FP32, NCHW>)
-  "phi_dt.fill_dense_tensor.f32"(%t) {value=[3.8:f32]} : (!infrt.dense_tensor<CPU, FP32, NCHW>) -> ()
-  %e = "phi_cpu.sign.any.float32"(%ctx, %t) : (!phi.CPU_context, !infrt.dense_tensor<CPU, FP32, NCHW>) -> (!infrt.dense_tensor<CPU, FP32, NCHW>)
-
-  // CHECK: dense_tensor: shape=shape[1], values=[1]
-  "phi_dt.print_tensor" (%e) : (!infrt.dense_tensor<CPU, FP32, NCHW>) -> ()
-  Infrt.return
-}
-
diff --git a/paddle/infrt/tests/dialect/tensor/dense_tensor.mlir b/paddle/infrt/tests/dialect/tensor/dense_tensor.mlir
index 76ae140dd6cbd741f992315ee35d3e94058d4674..47bc1f7833140c8a876660673fa11f148d42db90 100644
--- a/paddle/infrt/tests/dialect/tensor/dense_tensor.mlir
+++ b/paddle/infrt/tests/dialect/tensor/dense_tensor.mlir
@@ -3,14 +3,14 @@
 func @dense_shape0() {
   %a = dt.create_uninit_tensor.f32 [12:i64, 23:i64] -> !infrt.dense_tensor<CPU, FP32, NCHW>
 
-  Infrt.return
+  infrt.return
 }
 
 func @predict(%a: !infrt.dense_tensor<CPU, FP32, NCHW>, %b: !infrt.dense_tensor<CPU, FP32, NCHW>) -> (!infrt.dense_tensor<CPU, FP32, NCHW>, !infrt.dense_tensor<CPU, FP32, NCHW>) {
   %a0 = dt.shallow_copy_tensor %a : !infrt.dense_tensor<CPU, FP32, NCHW> -> !infrt.dense_tensor<CPU, FP32, NCHW>
   %b0 = dt.shallow_copy_tensor %b : !infrt.dense_tensor<CPU, FP32, NCHW> -> !infrt.dense_tensor<CPU, FP32, NCHW>
 
-  Infrt.return %a0, %b0: !infrt.dense_tensor<CPU, FP32, NCHW>, !infrt.dense_tensor<CPU, FP32, NCHW>
+  infrt.return %a0, %b0: !infrt.dense_tensor<CPU, FP32, NCHW>, !infrt.dense_tensor<CPU, FP32, NCHW>
 }
 
 
@@ -18,6 +18,6 @@ func @main() {
   %shape = ts.build_shape [1:i64, 57:i64]
   %a = dt.create_uninit_tensor.f32 [12:i64, 23:i64] -> !infrt.dense_tensor<CPU, FP32, NCHW>
 
-  %b, %c = Infrt.call @predict(%a, %a) : (!infrt.dense_tensor<CPU, FP32, NCHW>, !infrt.dense_tensor<CPU, FP32, NCHW>) -> (!infrt.dense_tensor<CPU, FP32, NCHW>, !infrt.dense_tensor<CPU, FP32, NCHW>)
-  Infrt.return
+  %b, %c = infrt.call @predict(%a, %a) : (!infrt.dense_tensor<CPU, FP32, NCHW>, !infrt.dense_tensor<CPU, FP32, NCHW>) -> (!infrt.dense_tensor<CPU, FP32, NCHW>, !infrt.dense_tensor<CPU, FP32, NCHW>)
+  infrt.return
 }
diff --git a/paddle/infrt/tests/dialect/tensor/naive_kernels.mlir b/paddle/infrt/tests/dialect/tensor/naive_kernels.mlir
index 52b296e06cd365fbaa1249108f877dc9f7480ff0..d6b69fdd595ea520f623e4b9651fc6e2b321c26f 100644
--- a/paddle/infrt/tests/dialect/tensor/naive_kernels.mlir
+++ b/paddle/infrt/tests/dialect/tensor/naive_kernels.mlir
@@ -13,7 +13,7 @@ func @naive_elementwise_add() {
   // CHECK: tensor: shape=shape[2,8], values=[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]
   dt.print_tensor (%c : !infrt.dense_tensor<CPU, FP32, NCHW>)
 
-  Infrt.return
+  infrt.return
 }
 
 // RUN: infrtexec -i %s | FileCheck %s
@@ -31,5 +31,5 @@ func @naive_matmul() {
   // CHECK: tensor: shape=shape[2,4], values=[16, 16, 16, 16, 16, 16, 16, 16]
   dt.print_tensor (%c : !infrt.dense_tensor<CPU, FP32, NCHW>)
 
-  Infrt.return
+  infrt.return
 }
diff --git a/paddle/infrt/tests/dialect/tensor/tensor_map.mlir.in b/paddle/infrt/tests/dialect/tensor/tensor_map.mlir.in
index 5c1396d47f551618bcdf95ef55c875aa2cb0d684..7aeb3f8a4d0513deaed6bda73a591790b633d0db 100644
--- a/paddle/infrt/tests/dialect/tensor/tensor_map.mlir.in
+++ b/paddle/infrt/tests/dialect/tensor/tensor_map.mlir.in
@@ -1,15 +1,14 @@
 // RUN: infrtexec -i %s | FileCheck %s
 
 func @load_tensor_map() {
-  %path = Infrt.get_string("@CMAKE_BINARY_DIR@/multi_fc_model")
-  %map = dt.load_params(%path)
+  %map = dt.load_params(){path="@CMAKE_BINARY_DIR@/multi_fc_model"}
   %size = dt.tensor_map_get_size(%map) -> i32
-  Infrt.print.i32 %size
+  infrt.print.i32 %size
 
   %a = dt.tensor_map_get_tensor(%map) {name="fc_bias"} -> !infrt.dense_tensor<CPU, FP32, NCHW>
 
   // CHECK: tensor: shape=shape[2], values=[0, 0]
   dt.print_tensor (%a : !infrt.dense_tensor<CPU, FP32, NCHW>)
 
-  Infrt.return
+  infrt.return
 }
diff --git a/paddle/infrt/tests/dialect/tensor/tensor_shape.mlir b/paddle/infrt/tests/dialect/tensor/tensor_shape.mlir
index 5623aef71aa2c33ff0bd3524855c56e9dcab5e9b..09210078b9d7d139f2bc2534acf07e83aa1146bb 100644
--- a/paddle/infrt/tests/dialect/tensor/tensor_shape.mlir
+++ b/paddle/infrt/tests/dialect/tensor/tensor_shape.mlir
@@ -4,5 +4,5 @@ func @build_tensor1() {
   %a = ts.build_shape [1:i64, 57:i64, 92:i64]
   // CHECK: shape[1,57,92]
   ts.print_shape %a
-  Infrt.return
+  infrt.return
 }
diff --git a/paddle/infrt/tests/dialect/tensor/tensor_type.mlir b/paddle/infrt/tests/dialect/tensor/tensor_type.mlir
index e580634055a72eae66196f67c8321c308599a1af..5847d567cf6b42a9404d33a938a67c6dc2f4aefc 100644
--- a/paddle/infrt/tests/dialect/tensor/tensor_type.mlir
+++ b/paddle/infrt/tests/dialect/tensor/tensor_type.mlir
@@ -6,5 +6,5 @@ func @test_tensor_type() {
   // CHECK: tensor: shape=shape[3,4], values=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
   dt.print_tensor (%a : !infrt.dense_tensor<CPU, FP32, NCHW>)
 
-  Infrt.return
+  infrt.return
 }
diff --git a/paddle/infrt/tests/dialect/tensor_shape.mlir b/paddle/infrt/tests/dialect/tensor_shape.mlir
index 5623aef71aa2c33ff0bd3524855c56e9dcab5e9b..09210078b9d7d139f2bc2534acf07e83aa1146bb 100644
--- a/paddle/infrt/tests/dialect/tensor_shape.mlir
+++ b/paddle/infrt/tests/dialect/tensor_shape.mlir
@@ -4,5 +4,5 @@ func @build_tensor1() {
   %a = ts.build_shape [1:i64, 57:i64, 92:i64]
   // CHECK: shape[1,57,92]
   ts.print_shape %a
-  Infrt.return
+  infrt.return
 }
diff --git a/paddle/infrt/tests/dialect/tensor_type.mlir b/paddle/infrt/tests/dialect/tensor_type.mlir
index e580634055a72eae66196f67c8321c308599a1af..5847d567cf6b42a9404d33a938a67c6dc2f4aefc 100644
--- a/paddle/infrt/tests/dialect/tensor_type.mlir
+++ b/paddle/infrt/tests/dialect/tensor_type.mlir
@@ -6,5 +6,5 @@ func @test_tensor_type() {
   // CHECK: tensor: shape=shape[3,4], values=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
   dt.print_tensor (%a : !infrt.dense_tensor<CPU, FP32, NCHW>)
 
-  Infrt.return
+  infrt.return
 }
diff --git a/paddle/infrt/tests/dialect/trt_ops.mlir b/paddle/infrt/tests/dialect/trt_ops.mlir
index 49510bc542dc0409067b5d61cb189dfab8b6601f..e3cb9670bec015e58e2a538bb55dfbe7c8b7f554 100644
--- a/paddle/infrt/tests/dialect/trt_ops.mlir
+++ b/paddle/infrt/tests/dialect/trt_ops.mlir
@@ -1,13 +1,6 @@
 // RUN: trt-exec %s
 // CHECK-LABEL: @main
-func @main() -> tensor<?xf32> {
-  %bias = "pd.feed"() {name="input0"} : () -> tensor<?xf32>
-  %c = "pd.feed"() {name="input1"} : () -> tensor<?xf32>
-  %b1 = "pd.feed"() {name="input2"} : () -> tensor<?xf32>
-  %b2 = "pd.feed"() {name="input3"} : () -> tensor<?xf32>
-  %bias1 = "pd.feed"() {name="input4"} : () -> tensor<?xf32>
-  %bias2 = "pd.feed"() {name="input5"} : () -> tensor<?xf32>
-
+func @main(%bias:tensor<?xf32>, %c:tensor<?xf32>, %b1:tensor<?xf32>, %b2:tensor<?xf32>, %bias1:tensor<?xf32>, %bias2:tensor<?xf32>) -> tensor<?xf32> {
   %d = "pd.elementwise_add"(%c, %bias) {axis=-1:si32} : (tensor<?xf32>, tensor<?xf32>) -> tensor<?xf32>
   %e = "pd.relu6"(%d) {} : (tensor<?xf32>) -> tensor<?xf32>
 
@@ -19,5 +12,5 @@ func @main() -> tensor<?xf32> {
   %d2 = "pd.elementwise_add"(%c2, %bias2) {axis=-1:si32} : (tensor<?xf32>, tensor<?xf32>) -> tensor<?xf32>
   %e2 = "pd.relu"(%d2) {} : (tensor<?xf32>) -> tensor<?xf32>
   
-  "pd.fetch"(%e2) {name="output"} :(tensor<?xf32>)->()
+  infrt.return %e2 : tensor<?xf32>
 }
diff --git a/paddle/infrt/tests/lit.cfg.py.in b/paddle/infrt/tests/lit.cfg.py.in
index d47957dac928409ad4b49884db9c70310b38d9ca..fe35dc4b8b3d436de5ec6893a8927eb49f1ab67d 100644
--- a/paddle/infrt/tests/lit.cfg.py.in
+++ b/paddle/infrt/tests/lit.cfg.py.in
@@ -23,9 +23,10 @@ config.llvm_tools_dir = os.path.join(build_dir, "/third_party/install/llvm/lib")
 infrtopt_bin = os.path.join(build_dir, "paddle/infrt/dialect/")
 trtexec_bin = os.path.join(build_dir, "paddle/infrt/dialect/tensorrt/")
 infrtexec_bin = os.path.join(build_dir, "paddle/infrt/host_context/")
+phi_ir_exec_bin = os.path.join(build_dir, "paddle/infrt/dialect/phi")
 
 llvm_bin = os.path.join(build_dir, "third_party/install/llvm/bin/")
 config.environment['PATH'] = os.path.pathsep.join(
-    (infrtopt_bin, infrtexec_bin, trtexec_bin, llvm_bin, config.environment['PATH']))
+    (infrtopt_bin, infrtexec_bin, trtexec_bin, phi_ir_exec_bin, llvm_bin, config.environment['PATH']))
 
 config.suffixes = ['.mlir']
diff --git a/paddle/phi/api/CMakeLists.txt b/paddle/phi/api/CMakeLists.txt
index d632db046d15ca73837292a5cb1e44479ab2c6ed..a1b0af609ca8d52f148b4ffa6016fdbc49862677 100644
--- a/paddle/phi/api/CMakeLists.txt
+++ b/paddle/phi/api/CMakeLists.txt
@@ -1,2 +1,2 @@
 add_subdirectory(lib)
-cc_library(phi_api SRCS all.cc DEPS phi_function_api phi_bw_function_api sparse_api)
+cc_library(phi_api SRCS all.cc DEPS phi_function_api phi_bw_function_api sparse_api sparse_bw_api)
diff --git a/paddle/phi/api/ext/op_meta_info.h b/paddle/phi/api/ext/op_meta_info.h
index 7601696293a66d626a7fd417b32544d035921467..88660449b6821ef4cda2d1859c5551d0a00d59a6 100644
--- a/paddle/phi/api/ext/op_meta_info.h
+++ b/paddle/phi/api/ext/op_meta_info.h
@@ -86,19 +86,28 @@ class PADDLE_API CustomOpKernelContext {
   CustomOpKernelContext() = default;
 
   void EmplaceBackInput(Tensor&& input);
-  void EmplaceBackInputs(std::vector<Tensor>&& inputs);
+  void EmplaceBackInputs(const std::vector<Tensor>& inputs);
   void EmplaceBackOutput(Tensor&& output);
-  void EmplaceBackOutputs(std::vector<Tensor>&& outputs);
+  void EmplaceBackOutputs(const std::vector<Tensor>& outputs);
   void EmplaceBackAttr(paddle::any attr);
-
+  void EmplaceBackAttrs(const std::vector<paddle::any>& attrs) {
+    attrs_ = std::move(attrs);
+  }
   const std::pair<size_t, size_t>& InputRangeAt(size_t idx) const;
   const std::pair<size_t, size_t>& OutputRangeAt(size_t idx) const;
 
   const Tensor& InputAt(size_t idx) const;
   std::vector<Tensor> InputsBetween(size_t start, size_t end) const;
-
+  const std::vector<paddle::any>& Attrs() const { return attrs_; }
+  const std::vector<std::pair<size_t, size_t>>& InputRange() {
+    return input_range_;
+  }
+  const std::vector<std::pair<size_t, size_t>>& OutputRange() {
+    return output_range_;
+  }
   Tensor* MutableOutputAt(size_t idx);
   std::vector<Tensor*> MutableOutputBetweeen(size_t start, size_t end);
+  std::vector<Tensor> OutputsBetweeen(size_t start, size_t end);
   std::vector<Tensor>* AllMutableOutput();
 
   template <typename AttrType>
@@ -552,7 +561,6 @@ class PADDLE_API OpMetaInfo {
   std::vector<std::string> inputs_;
   std::vector<std::string> outputs_;
   std::vector<std::string> attrs_;
-
   // 2. func info
   KernelFunc kernel_fn_{nullptr};
   InferShapeFunc infer_shape_fn_{nullptr};
diff --git a/paddle/phi/api/lib/CMakeLists.txt b/paddle/phi/api/lib/CMakeLists.txt
index 5edb83f8c3fc01d198d3f63b64047b9e45cd747b..42bf7a8103f837195775b33daf301a7d2e0f4c44 100644
--- a/paddle/phi/api/lib/CMakeLists.txt
+++ b/paddle/phi/api/lib/CMakeLists.txt
@@ -32,6 +32,30 @@ set(bw_api_source_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/lib/backward_api.cc)
 set(bw_api_header_file_tmp ${bw_api_header_file}.tmp)
 set(bw_api_source_file_tmp ${bw_api_source_file}.tmp)
 
+# sparse api file
+set(sparse_api_gen_file ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/sparse_api_gen.py)
+set(sparse_api_yaml_file ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/sparse_api.yaml)
+set(sparse_api_header_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/include/sparse_api.h)
+set(sparse_api_source_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/lib/sparse_api.cc)
+set(sparse_api_header_file_tmp ${sparse_api_header_file}.tmp)
+set(sparse_api_source_file_tmp ${sparse_api_source_file}.tmp)
+
+# sparse bw api file
+set(sparse_bw_api_gen_file ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/sparse_bw_api_gen.py)
+set(sparse_bw_api_yaml_file ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/sparse_bw_api.yaml)
+set(sparse_bw_api_header_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/backward/sparse_bw_api.h)
+set(sparse_bw_api_source_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/lib/sparse_bw_api.cc)
+set(sparse_bw_api_header_file_tmp ${sparse_bw_api_header_file}.tmp)
+set(sparse_bw_api_source_file_tmp ${sparse_bw_api_source_file}.tmp)
+
+# sparse bw api file
+set(sparse_bw_api_gen_file ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/sparse_bw_api_gen.py)
+set(sparse_bw_api_yaml_file ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/sparse_bw_api.yaml)
+set(sparse_bw_api_header_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/backward/sparse_bw_api.h)
+set(sparse_bw_api_source_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/lib/sparse_bw_api.cc)
+set(sparse_bw_api_header_file_tmp ${sparse_bw_api_header_file}.tmp)
+set(sparse_bw_api_source_file_tmp ${sparse_bw_api_source_file}.tmp)
+
 # wrapped infermeta file
 set(wrapped_infermeta_gen_file ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/wrapped_infermeta_gen.py)
 set(api_yaml_file ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/api.yaml)
@@ -73,6 +97,32 @@ add_custom_command(
   DEPENDS ${bw_api_yaml_file} ${bw_api_gen_file} ${api_gen_base}
   VERBATIM)
 
+# generate sparse api
+add_custom_command(
+  OUTPUT ${sparse_api_header_file} ${sparse_api_source_file}
+  COMMAND ${PYTHON_EXECUTABLE} ${sparse_api_gen_file}
+                 --api_yaml_path ${sparse_api_yaml_file}
+                 --api_header_path ${sparse_api_header_file_tmp}
+                 --api_source_path ${sparse_api_source_file_tmp}
+  COMMAND ${CMAKE_COMMAND} -E copy_if_different ${sparse_api_header_file_tmp} ${sparse_api_header_file}
+  COMMAND ${CMAKE_COMMAND} -E copy_if_different ${sparse_api_source_file_tmp} ${sparse_api_source_file}
+  COMMENT "copy_if_different ${sparse_api_header_file} ${sparse_sparse_api_source_file}"
+  DEPENDS ${sparse_api_yaml_file} ${sparse_api_gen_file} ${api_gen_base} ${api_gen_file}
+  VERBATIM)
+
+# generate backward sparse api
+add_custom_command(
+  OUTPUT ${sparse_bw_api_header_file} ${sparse_bw_api_source_file}
+  COMMAND ${PYTHON_EXECUTABLE} ${sparse_bw_api_gen_file}
+                 --api_yaml_path ${sparse_bw_api_yaml_file}
+                 --api_header_path ${sparse_bw_api_header_file_tmp}
+                 --api_source_path ${sparse_bw_api_source_file_tmp}
+  COMMAND ${CMAKE_COMMAND} -E copy_if_different ${sparse_bw_api_header_file_tmp} ${sparse_bw_api_header_file}
+  COMMAND ${CMAKE_COMMAND} -E copy_if_different ${sparse_bw_api_source_file_tmp} ${sparse_bw_api_source_file}
+  COMMENT "copy_if_different ${sparse_bw_api_header_file} ${sparse_bw_sparse_api_source_file}"
+  DEPENDS ${sparse_bw_api_yaml_file} ${sparse_bw_api_gen_file} ${api_gen_base} ${api_gen_file} ${sparse_api_gen_file} ${bw_api_gen_file}
+  VERBATIM)
+
 # generate wrapped infermeta
 add_custom_command(
   OUTPUT ${wrapped_infermeta_header_file} ${wrapped_infermeta_source_file}
@@ -87,12 +137,15 @@ cc_library(op_meta_info SRCS op_meta_info.cc DEPS phi_tensor_raw)
 cc_library(wrapped_infermeta SRCS ${wrapped_infermeta_source_file} DEPS phi)
 
 cc_library(kernel_dispatch SRCS kernel_dispatch.cc DEPS phi_tensor_raw phi_context kernel_factory)
+cc_library(api_gen_utils SRCS api_gen_utils.cc DEPS phi_tensor_raw selected_rows sparse_csr_tensor sparse_coo_tensor)
 cc_library(phi_data_transform SRCS data_transform.cc DEPS phi_tensor_raw transfer_layout_kernel cast_kernel data_device_transform)
-cc_library(api_custom_impl SRCS api_custom_impl.cc DEPS phi_tensor_raw phi kernel_dispatch phi_data_transform)
+cc_library(api_custom_impl SRCS api_custom_impl.cc DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils phi_data_transform)
+cc_library(sparse_api_custom_impl SRCS sparse_api_custom_impl.cc DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils phi_data_transform)
 
-cc_library(sparse_api SRCS sparse_api.cc DEPS phi_tensor_raw phi kernel_dispatch phi_data_transform)
-cc_library(phi_function_api SRCS ${api_source_file} DEPS phi_tensor_raw phi kernel_dispatch phi_data_transform api_custom_impl)
-cc_library(phi_dygraph_api SRCS ${dygraph_api_source_file} DEPS phi_tensor_raw phi kernel_dispatch phi_data_transform)
-cc_library(phi_bw_function_api SRCS ${bw_api_source_file} DEPS phi_tensor_raw phi kernel_dispatch backward_infermeta phi_data_transform phi_function_api api_custom_impl)
+cc_library(phi_function_api SRCS ${api_source_file} DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils phi_data_transform api_custom_impl)
+cc_library(phi_dygraph_api SRCS ${dygraph_api_source_file} DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils phi_data_transform)
+cc_library(phi_bw_function_api SRCS ${bw_api_source_file} DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils backward_infermeta phi_data_transform phi_function_api api_custom_impl)
+cc_library(sparse_api SRCS ${sparse_api_source_file} DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils sparse_api_custom_impl)
+cc_library(sparse_bw_api SRCS ${sparse_bw_api_source_file} DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils sparse_api sparse_api_custom_impl)
 
 cc_library(phi_tensor SRCS tensor_method.cc DEPS phi_tensor_raw phi_function_api)
diff --git a/paddle/phi/api/lib/api_custom_impl.cc b/paddle/phi/api/lib/api_custom_impl.cc
index 19b113838eab5403aca00d9d97b278646228c512..fc1afb26bf4143e5c75398b3dc1042581e1f1546 100644
--- a/paddle/phi/api/lib/api_custom_impl.cc
+++ b/paddle/phi/api/lib/api_custom_impl.cc
@@ -14,8 +14,8 @@ limitations under the License. */
 
 #include "paddle/phi/api/lib/api_custom_impl.h"
 
+#include "paddle/phi/api/lib/api_gen_utils.h"
 #include "paddle/phi/api/lib/api_registry.h"
-#include "paddle/phi/api/lib/api_utils.h"
 #include "paddle/phi/api/lib/data_transform.h"
 #include "paddle/phi/api/lib/kernel_dispatch.h"
 #include "paddle/phi/api/lib/utils/storage.h"
diff --git a/paddle/phi/api/lib/api_utils.h b/paddle/phi/api/lib/api_gen_utils.cc
similarity index 59%
rename from paddle/phi/api/lib/api_utils.h
rename to paddle/phi/api/lib/api_gen_utils.cc
index 6c1fa97c0f52a697383a3526220cc758d778823d..e1ebe8c6465cfdd7f8213c0a31416bc77412221c 100644
--- a/paddle/phi/api/lib/api_utils.h
+++ b/paddle/phi/api/lib/api_gen_utils.cc
@@ -12,26 +12,18 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#pragma once
-
-#include "paddle/phi/api/include/tensor.h"
-#include "paddle/phi/api/lib/utils/storage.h"
-#include "paddle/phi/core/compat/convert_utils.h"
-#include "paddle/phi/core/dense_tensor.h"
-#include "paddle/phi/core/meta_tensor.h"
-#include "paddle/phi/core/selected_rows.h"
+#include "paddle/phi/api/lib/api_gen_utils.h"
 
 namespace paddle {
 namespace experimental {
 
 /* ------------------ for input ----------------------- */
 
-inline std::shared_ptr<phi::DenseTensor> TensorToDenseTensor(
-    const Tensor& tensor) {
+std::shared_ptr<phi::DenseTensor> TensorToDenseTensor(const Tensor& tensor) {
   return std::dynamic_pointer_cast<phi::DenseTensor>(tensor.impl());
 }
 
-inline std::shared_ptr<phi::DenseTensor> TensorToDenseTensor(
+std::shared_ptr<phi::DenseTensor> TensorToDenseTensor(
     const paddle::optional<Tensor>& tensor) {
   if (tensor) {
     return std::dynamic_pointer_cast<phi::DenseTensor>(tensor->impl());
@@ -39,7 +31,7 @@ inline std::shared_ptr<phi::DenseTensor> TensorToDenseTensor(
   return nullptr;
 }
 
-inline std::unique_ptr<std::vector<phi::DenseTensor>> TensorToDenseTensor(
+std::unique_ptr<std::vector<phi::DenseTensor>> TensorToDenseTensor(
     const std::vector<Tensor>& tensors) {
   auto pt_tensors = std::make_unique<std::vector<phi::DenseTensor>>();
   pt_tensors->reserve(tensors.size());
@@ -52,12 +44,11 @@ inline std::unique_ptr<std::vector<phi::DenseTensor>> TensorToDenseTensor(
   return std::move(pt_tensors);
 }
 
-inline std::shared_ptr<phi::SelectedRows> TensorToSelectedRows(
-    const Tensor& tensor) {
+std::shared_ptr<phi::SelectedRows> TensorToSelectedRows(const Tensor& tensor) {
   return std::dynamic_pointer_cast<phi::SelectedRows>(tensor.impl());
 }
 
-inline std::shared_ptr<phi::SelectedRows> TensorToSelectedRows(
+std::shared_ptr<phi::SelectedRows> TensorToSelectedRows(
     const paddle::optional<Tensor>& tensor) {
   if (tensor) {
     return std::dynamic_pointer_cast<phi::SelectedRows>(tensor->impl());
@@ -67,11 +58,11 @@ inline std::shared_ptr<phi::SelectedRows> TensorToSelectedRows(
 
 /* ----------------- for infer_meta --------------------- */
 
-inline phi::MetaTensor MakeMetaTensor(const phi::DenseTensor& tensor) {
+phi::MetaTensor MakeMetaTensor(const phi::DenseTensor& tensor) {
   return phi::MetaTensor(tensor);
 }
 
-inline paddle::optional<phi::MetaTensor> MakeMetaTensor(
+paddle::optional<phi::MetaTensor> MakeMetaTensor(
     const paddle::optional<const phi::DenseTensor&>& tensor) {
   if (tensor) {
     return {phi::MetaTensor(*tensor)};
@@ -79,21 +70,21 @@ inline paddle::optional<phi::MetaTensor> MakeMetaTensor(
   return {paddle::none};
 }
 
-inline std::vector<phi::MetaTensor> MakeMetaTensor(
-    const std::vector<phi::DenseTensor>& tensors) {
+std::vector<phi::MetaTensor> MakeMetaTensor(
+    const std::vector<const phi::DenseTensor*>& tensors) {
   std::vector<phi::MetaTensor> meta_tensors;
   meta_tensors.reserve(tensors.size());
-  for (const auto& t : tensors) {
-    meta_tensors.emplace_back(t);
+  for (const auto* t : tensors) {
+    meta_tensors.emplace_back(*t);
   }
   return meta_tensors;
 }
 
-inline phi::MetaTensor MakeMetaTensor(const phi::SelectedRows& tensor) {
+phi::MetaTensor MakeMetaTensor(const phi::SelectedRows& tensor) {
   return phi::MetaTensor(tensor);
 }
 
-inline paddle::optional<phi::MetaTensor> MakeMetaTensor(
+paddle::optional<phi::MetaTensor> MakeMetaTensor(
     const paddle::optional<const phi::SelectedRows&>& tensor) {
   if (tensor) {
     return {phi::MetaTensor(*tensor)};
@@ -103,7 +94,7 @@ inline paddle::optional<phi::MetaTensor> MakeMetaTensor(
 
 /* ------------------ for output ----------------------- */
 
-inline phi::DenseTensor* SetKernelOutput(Backend backend, Tensor* out) {
+phi::DenseTensor* SetKernelOutput(Backend backend, Tensor* out) {
   if (!out->initialized()) {
     auto dense_tensor = std::make_shared<phi::DenseTensor>(
         phi::make_intrusive<SharedStorage>(phi::TransToPhiPlace(backend)),
@@ -114,8 +105,9 @@ inline phi::DenseTensor* SetKernelOutput(Backend backend, Tensor* out) {
   return static_cast<phi::DenseTensor*>(out->impl().get());
 }
 
-inline std::vector<phi::DenseTensor*> SetKernelOutput(
-    size_t out_size, Backend backend, std::vector<Tensor>* out) {
+std::vector<phi::DenseTensor*> SetKernelOutput(size_t out_size,
+                                               Backend backend,
+                                               std::vector<Tensor>* out) {
   out->reserve(out_size);
   std::vector<phi::DenseTensor*> results(out_size);
   for (size_t i = 0; i < out_size; ++i) {
@@ -129,8 +121,7 @@ inline std::vector<phi::DenseTensor*> SetKernelOutput(
   return results;
 }
 
-inline phi::SelectedRows* SetSelectedRowsKernelOutput(Backend backend,
-                                                      Tensor* out) {
+phi::SelectedRows* SetSelectedRowsKernelOutput(Backend backend, Tensor* out) {
   if (!out->initialized()) {
     auto select_rows = std::make_shared<phi::SelectedRows>();
     out->set_impl(select_rows);
@@ -139,5 +130,29 @@ inline phi::SelectedRows* SetSelectedRowsKernelOutput(Backend backend,
   return static_cast<phi::SelectedRows*>(out->impl().get());
 }
 
+phi::TensorBase* SetSparseKernelOutput(Tensor* out, TensorType type) {
+  if (!out->initialized()) {
+    if (type == TensorType::SPARSE_COO) {
+      auto sparse_tensor = std::make_shared<phi::SparseCooTensor>(
+          phi::DenseTensor(), phi::DenseTensor(), phi::DDim{-1});
+      out->set_impl(sparse_tensor);
+      return sparse_tensor.get();
+    } else if (type == TensorType::SPARSE_CSR) {
+      auto sparse_tensor =
+          std::make_shared<phi::SparseCsrTensor>(phi::DenseTensor(),
+                                                 phi::DenseTensor(),
+                                                 phi::DenseTensor(),
+                                                 phi::DDim{-1});
+      out->set_impl(sparse_tensor);
+      return sparse_tensor.get();
+    } else {
+      auto dense_tensor = std::make_shared<phi::DenseTensor>();
+      out->set_impl(dense_tensor);
+      return dense_tensor.get();
+    }
+  }
+  return out->impl().get();
+}
+
 }  // namespace experimental
 }  // namespace paddle
diff --git a/paddle/phi/api/lib/api_gen_utils.h b/paddle/phi/api/lib/api_gen_utils.h
new file mode 100644
index 0000000000000000000000000000000000000000..01625f651c3bd1deaae43f735ac03fb2bc3f4e25
--- /dev/null
+++ b/paddle/phi/api/lib/api_gen_utils.h
@@ -0,0 +1,74 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/phi/api/include/tensor.h"
+#include "paddle/phi/api/lib/utils/storage.h"
+#include "paddle/phi/core/compat/convert_utils.h"
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/core/meta_tensor.h"
+#include "paddle/phi/core/selected_rows.h"
+#include "paddle/phi/core/sparse_coo_tensor.h"
+#include "paddle/phi/core/sparse_csr_tensor.h"
+
+namespace paddle {
+namespace experimental {
+
+enum class TensorType { DENSE_TENSOR, SPARSE_CSR, SPARSE_COO };
+
+/* ------------------ for input ----------------------- */
+
+std::shared_ptr<phi::DenseTensor> TensorToDenseTensor(const Tensor& tensor);
+
+std::shared_ptr<phi::DenseTensor> TensorToDenseTensor(
+    const paddle::optional<Tensor>& tensor);
+
+std::unique_ptr<std::vector<phi::DenseTensor>> TensorToDenseTensor(
+    const std::vector<Tensor>& tensors);
+
+std::shared_ptr<phi::SelectedRows> TensorToSelectedRows(const Tensor& tensor);
+
+std::shared_ptr<phi::SelectedRows> TensorToSelectedRows(
+    const paddle::optional<Tensor>& tensor);
+
+/* ----------------- for infer_meta --------------------- */
+
+phi::MetaTensor MakeMetaTensor(const phi::DenseTensor& tensor);
+
+paddle::optional<phi::MetaTensor> MakeMetaTensor(
+    const paddle::optional<const phi::DenseTensor&>& tensor);
+
+std::vector<phi::MetaTensor> MakeMetaTensor(
+    const std::vector<const phi::DenseTensor*>& tensors);
+
+phi::MetaTensor MakeMetaTensor(const phi::SelectedRows& tensor);
+
+paddle::optional<phi::MetaTensor> MakeMetaTensor(
+    const paddle::optional<const phi::SelectedRows&>& tensor);
+
+/* ------------------ for output ----------------------- */
+
+phi::DenseTensor* SetKernelOutput(Backend backend, Tensor* out);
+
+std::vector<phi::DenseTensor*> SetKernelOutput(size_t out_size,
+                                               Backend backend,
+                                               std::vector<Tensor>* out);
+
+phi::SelectedRows* SetSelectedRowsKernelOutput(Backend backend, Tensor* out);
+
+phi::TensorBase* SetSparseKernelOutput(Tensor* out, TensorType type);
+
+}  // namespace experimental
+}  // namespace paddle
diff --git a/paddle/phi/api/lib/data_transform.cc b/paddle/phi/api/lib/data_transform.cc
index ae67e2ebb35ccef7fe07ee8c76db33a459b1dfce..79b8ac6d0b8352b2e817e6bdbefca74c835ad6b2 100644
--- a/paddle/phi/api/lib/data_transform.cc
+++ b/paddle/phi/api/lib/data_transform.cc
@@ -16,6 +16,7 @@ limitations under the License. */
 
 #include "paddle/phi/api/ext/dispatch.h"
 #include "paddle/phi/api/lib/kernel_dispatch.h"
+#include "paddle/phi/api/lib/utils/storage.h"
 #include "paddle/phi/backends/all_context.h"
 #include "paddle/phi/kernels/cast_kernel.h"
 #include "paddle/phi/kernels/transfer_layout_kernel.h"
diff --git a/paddle/phi/api/lib/op_meta_info.cc b/paddle/phi/api/lib/op_meta_info.cc
index 51d51c954de81d8e9116304e31374ce8d9934305..14dba664c41b3d7b138630c739bfe7b934d04e9f 100644
--- a/paddle/phi/api/lib/op_meta_info.cc
+++ b/paddle/phi/api/lib/op_meta_info.cc
@@ -51,7 +51,8 @@ void CustomOpKernelContext::EmplaceBackInput(Tensor&& input) {
   input_range_.emplace_back(std::make_pair(index, index + 1));
 }
 
-void CustomOpKernelContext::EmplaceBackInputs(std::vector<Tensor>&& inputs) {
+void CustomOpKernelContext::EmplaceBackInputs(
+    const std::vector<Tensor>& inputs) {
   size_t index = inputs_.size();
   input_range_.emplace_back(std::make_pair(index, index + inputs.size()));
   inputs_.insert(inputs_.end(),
@@ -65,7 +66,8 @@ void CustomOpKernelContext::EmplaceBackOutput(Tensor&& output) {
   output_range_.emplace_back(std::make_pair(index, index + 1));
 }
 
-void CustomOpKernelContext::EmplaceBackOutputs(std::vector<Tensor>&& outputs) {
+void CustomOpKernelContext::EmplaceBackOutputs(
+    const std::vector<Tensor>& outputs) {
   size_t index = outputs_.size();
   output_range_.emplace_back(std::make_pair(index, index + outputs.size()));
   outputs_.insert(outputs_.end(),
@@ -75,6 +77,8 @@ void CustomOpKernelContext::EmplaceBackOutputs(std::vector<Tensor>&& outputs) {
 
 void CustomOpKernelContext::EmplaceBackAttr(paddle::any attr) {
   attrs_.emplace_back(std::move(attr));
+  VLOG(7) << "attrs_ No." << attrs_.size() - 1
+          << " has value of type: " << attrs_[attrs_.size() - 1].type().name();
 }
 
 const Tensor& CustomOpKernelContext::InputAt(size_t idx) const {
@@ -102,6 +106,15 @@ std::vector<Tensor*> CustomOpKernelContext::MutableOutputBetweeen(size_t start,
   return rlt;
 }
 
+std::vector<Tensor> CustomOpKernelContext::OutputsBetweeen(size_t start,
+                                                           size_t end) {
+  std::vector<Tensor> rlt;
+  for (size_t i = start; i < end; ++i) {
+    rlt.emplace_back(outputs_.at(i));
+  }
+  return rlt;
+}
+
 std::vector<Tensor>* CustomOpKernelContext::AllMutableOutput() {
   return &outputs_;
 }
diff --git a/paddle/phi/api/lib/sparse_api.cc b/paddle/phi/api/lib/sparse_api_custom_impl.cc
similarity index 86%
rename from paddle/phi/api/lib/sparse_api.cc
rename to paddle/phi/api/lib/sparse_api_custom_impl.cc
index 9e1f59c0aa74329b15efcbff123b137fbf0b1360..832c19361e5eb03419fe988c9a30304b5993afdf 100644
--- a/paddle/phi/api/lib/sparse_api.cc
+++ b/paddle/phi/api/lib/sparse_api_custom_impl.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/phi/api/include/sparse_api.h"
+#include "paddle/phi/api/lib/sparse_api_custom_impl.h"
 
 #include <memory>
 #include "glog/logging.h"
@@ -20,31 +20,14 @@ limitations under the License. */
 #include "paddle/phi/api/lib/kernel_dispatch.h"
 #include "paddle/phi/api/lib/utils/storage.h"
 #include "paddle/phi/core/kernel_registry.h"
-#include "paddle/phi/infermeta/unary.h"
-
-PD_DECLARE_KERNEL(dense_to_sparse_coo, CPU, ALL_LAYOUT);
-PD_DECLARE_KERNEL(sparse_csr_to_coo, CPU, ALL_LAYOUT);
-PD_DECLARE_KERNEL(dense_to_sparse_csr, CPU, ALL_LAYOUT);
-PD_DECLARE_KERNEL(sparse_coo_to_csr, CPU, ALL_LAYOUT);
-PD_DECLARE_KERNEL(sparse_coo_to_dense, CPU, ALL_LAYOUT);
-PD_DECLARE_KERNEL(sparse_csr_to_dense, CPU, ALL_LAYOUT);
-
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-PD_DECLARE_KERNEL(dense_to_sparse_coo, GPU, ALL_LAYOUT);
-PD_DECLARE_KERNEL(sparse_csr_to_coo, GPU, ALL_LAYOUT);
-PD_DECLARE_KERNEL(dense_to_sparse_csr, GPU, ALL_LAYOUT);
-PD_DECLARE_KERNEL(sparse_coo_to_csr, GPU, ALL_LAYOUT);
-PD_DECLARE_KERNEL(sparse_coo_to_dense, GPU, ALL_LAYOUT);
-PD_DECLARE_KERNEL(sparse_csr_to_dense, GPU, ALL_LAYOUT);
-#endif
 
 namespace paddle {
 namespace experimental {
 namespace sparse {
 
-PADDLE_API Tensor to_sparse_coo(const Tensor& x,
-                                Backend backend,
-                                const int64_t sparse_dim) {
+Tensor to_sparse_coo_impl(const Tensor& x,
+                          Backend backend,
+                          const int64_t sparse_dim) {
   if (x.layout() == phi::DataLayout::SPARSE_COO) {
     return x;
   }
@@ -105,7 +88,7 @@ PADDLE_API Tensor to_sparse_coo(const Tensor& x,
   return out;
 }
 
-PADDLE_API Tensor to_sparse_csr(const Tensor& x, Backend backend) {
+Tensor to_sparse_csr_impl(const Tensor& x, Backend backend) {
   if (x.layout() == phi::DataLayout::SPARSE_CSR) {
     return x;
   }
@@ -171,7 +154,7 @@ PADDLE_API Tensor to_sparse_csr(const Tensor& x, Backend backend) {
   return out;
 }
 
-PADDLE_API Tensor to_dense(const Tensor& x, Backend backend) {
+Tensor to_dense_impl(const Tensor& x, Backend backend) {
   if (x.layout() != phi::DataLayout::SPARSE_CSR &&
       x.layout() != phi::DataLayout::SPARSE_COO) {
     return x;
diff --git a/paddle/phi/api/include/sparse_api.h b/paddle/phi/api/lib/sparse_api_custom_impl.h
similarity index 74%
rename from paddle/phi/api/include/sparse_api.h
rename to paddle/phi/api/lib/sparse_api_custom_impl.h
index a131804cd6f582c01586671a21851066910b21d4..293b2cfa3d33480ccccd0f601f8e15c639b93e1e 100644
--- a/paddle/phi/api/include/sparse_api.h
+++ b/paddle/phi/api/lib/sparse_api_custom_impl.h
@@ -21,13 +21,13 @@ namespace paddle {
 namespace experimental {
 namespace sparse {
 
-PADDLE_API Tensor to_sparse_coo(const Tensor& x,
-                                Backend backend,
-                                const int64_t sparse_dim);
+Tensor to_dense_impl(const Tensor& x, Backend backend);
 
-PADDLE_API Tensor to_sparse_csr(const Tensor& x, Backend backend);
+Tensor to_sparse_coo_impl(const Tensor& x,
+                          Backend backend,
+                          const int64_t sparse_dim);
 
-PADDLE_API Tensor to_dense(const Tensor& x, Backend backend);
+Tensor to_sparse_csr_impl(const Tensor& x, Backend backend);
 
 }  // namespace sparse
 }  // namespace experimental
diff --git a/paddle/phi/api/lib/tensor.cc b/paddle/phi/api/lib/tensor.cc
index 311dd0fc30941d2afb9f1bc1e7ae57f3a449a254..40174a505dcc9b3d475254b7cec7691300c7aecf 100644
--- a/paddle/phi/api/lib/tensor.cc
+++ b/paddle/phi/api/lib/tensor.cc
@@ -111,8 +111,8 @@ void Tensor::reshape(const std::vector<int64_t> &shape) {
                   "touching underlying data, this requires the total size of "
                   "the tensor to remain constant.";
   if (is_dense_tensor()) {
-    std::dynamic_pointer_cast<phi::DenseTensor>(impl_)->set_meta(
-        phi::DenseTensorMeta(dtype(), phi::make_ddim(shape)));
+    std::dynamic_pointer_cast<phi::DenseTensor>(impl_)->Resize(
+        phi::make_ddim(shape));
   } else {
     PADDLE_THROW(phi::errors::Unimplemented(
         "Only support reshape operation on DenseTensor now."));
diff --git a/paddle/phi/api/lib/tensor_method.cc b/paddle/phi/api/lib/tensor_method.cc
index aefa26952d1e5f224112576bfbd74be80cca72cc..885e29b27fa8e723ad0e89f9a99c2accd3c172f6 100644
--- a/paddle/phi/api/lib/tensor_method.cc
+++ b/paddle/phi/api/lib/tensor_method.cc
@@ -15,6 +15,7 @@ limitations under the License. */
 #include "paddle/phi/api/include/tensor.h"
 
 #include "paddle/phi/api/lib/ext_compat_utils.h"
+#include "paddle/phi/common/scalar_array.h"
 #include "paddle/phi/core/compat/convert_utils.h"
 #include "paddle/phi/core/tensor_base.h"
 
diff --git a/paddle/phi/api/lib/utils/CMakeLists.txt b/paddle/phi/api/lib/utils/CMakeLists.txt
index 6d056b54b70058e33501083d9754aa27466c0f59..271a58222f0c0f6b60642482691bed635e4d5f3c 100644
--- a/paddle/phi/api/lib/utils/CMakeLists.txt
+++ b/paddle/phi/api/lib/utils/CMakeLists.txt
@@ -1,2 +1,2 @@
 cc_library(phi_api_utils SRCS storage.cc tensor_utils.cc DEPS
-tensor_base convert_utils dense_tensor lod_tensor selected_rows_utils place var_type_traits)
+tensor_base convert_utils dense_tensor lod_tensor selected_rows_utils place var_type_traits scalar)
diff --git a/paddle/phi/api/lib/utils/tensor_utils.cc b/paddle/phi/api/lib/utils/tensor_utils.cc
index 1c9f7c3a8683daaf26cb87b23e50284d0329c4a8..3d183ea7fee8b17a7037a3fd9a6b2999605d8e25 100644
--- a/paddle/phi/api/lib/utils/tensor_utils.cc
+++ b/paddle/phi/api/lib/utils/tensor_utils.cc
@@ -40,6 +40,13 @@ phi::Scalar MakePhiScalarFromVar(const framework::Variable& variable) {
   auto expected_place = phi::TransToPhiPlace(phi::Backend::CPU);
   if (variable.IsType<framework::LoDTensor>()) {
     const auto& tensor = variable.Get<framework::LoDTensor>();
+    PADDLE_ENFORCE_EQ(
+        tensor.numel(),
+        1UL,
+        platform::errors::InvalidArgument("The DenseTensor used to construct "
+                                          "the Scalar contains more than 1 "
+                                          "value, it contains `%d` values.",
+                                          tensor.numel()));
     if (!platform::is_same_place(tensor.place(), expected_place)) {
       framework::LoDTensor tmp_tensor;
       framework::TensorCopySync(tensor, expected_place, &tmp_tensor);
diff --git a/paddle/phi/backends/CMakeLists.txt b/paddle/phi/backends/CMakeLists.txt
index 43e477ef32e9c2a3d914447d610cd6f07b73a92a..5f616155546450b7f51c072307d700bd23920636 100644
--- a/paddle/phi/backends/CMakeLists.txt
+++ b/paddle/phi/backends/CMakeLists.txt
@@ -24,4 +24,11 @@ endif()
 
 if(WITH_CUSTOM_DEVICE)
   add_dependencies(phi_context custom_context)
+  cc_library(callback_manager SRCS callback_manager.cc DEPS enforce place)
+  cc_library(device_guard SRCS device_guard.cc DEPS enforce place)
+  cc_library(stream SRCS stream.cc DEPS callback_manager)
+  cc_library(event SRCS event.cc DEPS enforce place)
+  cc_library(device_base SRCS device_base.cc DEPS stream event callback_manager device_guard device_context flags)
+  cc_library(device_manager SRCS device_manager.cc DEPS custom_device)
+  set(GLOB_DEV_LIB device_manager custom_device CACHE INTERNAL "Global DEV library")
 endif()
diff --git a/paddle/fluid/platform/device/callback_manager.cc b/paddle/phi/backends/callback_manager.cc
similarity index 84%
rename from paddle/fluid/platform/device/callback_manager.cc
rename to paddle/phi/backends/callback_manager.cc
index c677bc0262f0cfba0a5995afbde9e04f4bb0337e..e21e8502d8f8c43e7484982354c4ea69253a195f 100644
--- a/paddle/fluid/platform/device/callback_manager.cc
+++ b/paddle/phi/backends/callback_manager.cc
@@ -12,12 +12,11 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/platform/device/callback_manager.h"
+#include "paddle/phi/backends/callback_manager.h"
 #include "paddle/fluid/platform/device/device_wrapper.h"
 #include "paddle/fluid/platform/enforce.h"
 
-namespace paddle {
-namespace platform {
+namespace phi {
 
 CallbackManager::CallbackManager(stream::Stream *stream)
     : stream_(stream), thread_pool_(1) {}
@@ -32,12 +31,12 @@ void CallbackManager::AddCallback(std::function<void()> callback) const {
     });
   });
 
-  platform::DeviceManager::GetDeviceWithPlace(stream_->GetPlace())
+  phi::DeviceManager::GetDeviceWithPlace(stream_->GetPlace())
       ->AddCallback(stream_, func);
 }
 
 void CallbackManager::Wait() const {
-  platform::DeviceManager::GetDeviceWithPlace(stream_->GetPlace())
+  phi::DeviceManager::GetDeviceWithPlace(stream_->GetPlace())
       ->SynchronizeStream(stream_);
 
   {
@@ -48,5 +47,4 @@ void CallbackManager::Wait() const {
   }
 }
 
-}  // namespace platform
-}  // namespace paddle
+}  // namespace phi
diff --git a/paddle/fluid/platform/device/callback_manager.h b/paddle/phi/backends/callback_manager.h
similarity index 91%
rename from paddle/fluid/platform/device/callback_manager.h
rename to paddle/phi/backends/callback_manager.h
index 0edc694c94bb7846ac6081bccc0dc7fecd61adcb..359958b7c93e2c4041532a377f35836ca8ae89bc 100644
--- a/paddle/fluid/platform/device/callback_manager.h
+++ b/paddle/phi/backends/callback_manager.h
@@ -30,10 +30,7 @@
 #include <memory>
 #include <mutex>  // NOLINT
 
-#include "paddle/fluid/platform/enforce.h"
-
-namespace paddle {
-namespace platform {
+namespace phi {
 
 namespace stream {
 class Stream;
@@ -58,5 +55,4 @@ class CallbackManager {
   mutable std::future<void> last_future_;
 };
 
-}  // namespace platform
-}  // namespace paddle
+}  // namespace phi
diff --git a/paddle/phi/backends/custom/CMakeLists.txt b/paddle/phi/backends/custom/CMakeLists.txt
index cb54d3675687d9ae7145c9ac01bc874e811b08f7..5b46afb4ce9ee942fa00910b247a86b2aa6a3df7 100644
--- a/paddle/phi/backends/custom/CMakeLists.txt
+++ b/paddle/phi/backends/custom/CMakeLists.txt
@@ -1,3 +1,5 @@
 if (WITH_CUSTOM_DEVICE)
   cc_library(custom_context SRCS custom_context.cc DEPS phi_device_context device_manager)
+  cc_library(custom_device SRCS custom_device.cc DEPS device_base device_context)
+  cc_test(custom_device_test SRCS custom_device_test.cc DEPS device_manager device_context)
 endif()
diff --git a/paddle/phi/backends/custom/custom_context.cc b/paddle/phi/backends/custom/custom_context.cc
index bde3b6a08539b51e06442ef6090f99cbea7e9de9..e34e0f94b7067cecc59aacfdde7bb10f2c3b5b59 100644
--- a/paddle/phi/backends/custom/custom_context.cc
+++ b/paddle/phi/backends/custom/custom_context.cc
@@ -14,8 +14,8 @@ limitations under the License. */
 
 #include "paddle/phi/backends/custom/custom_context.h"
 
-#include "paddle/fluid/platform/device/device_guard.h"
-#include "paddle/fluid/platform/device/stream.h"
+#include "paddle/phi/backends/device_guard.h"
+#include "paddle/phi/backends/stream.h"
 
 namespace phi {
 
@@ -25,8 +25,8 @@ struct CustomContext::Impl {
   ~Impl() {}
 
   void Init() {
-    paddle::platform::DeviceGuard guard(place_);
-    stream_.reset(new paddle::platform::stream::Stream());
+    phi::DeviceGuard guard(place_);
+    stream_.reset(new phi::stream::Stream());
     stream_->Init(place_);
   }
 
@@ -40,7 +40,7 @@ struct CustomContext::Impl {
 
   Place place_;
 
-  std::shared_ptr<paddle::platform::stream::Stream> stream_;
+  std::shared_ptr<phi::stream::Stream> stream_;
 };
 
 void CustomContext::Init() { impl_->Init(); }
diff --git a/paddle/fluid/platform/device/custom/custom_device.cc b/paddle/phi/backends/custom/custom_device.cc
similarity index 81%
rename from paddle/fluid/platform/device/custom/custom_device.cc
rename to paddle/phi/backends/custom/custom_device.cc
index 09f0421a878ad8f1e3b4292515b19dde232868dc..df757b286a6b18190f314b0a908cb04a23afcbdb 100644
--- a/paddle/fluid/platform/device/custom/custom_device.cc
+++ b/paddle/phi/backends/custom/custom_device.cc
@@ -12,23 +12,28 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/platform/device/device_base.h"
-#include "paddle/fluid/platform/device/device_wrapper.h"
-#include "paddle/fluid/platform/device/event.h"
-#include "paddle/fluid/platform/device/stream.h"
+#include "paddle/fluid/platform/device/custom/enforce_custom.h"
 #include "paddle/fluid/platform/device_context.h"
+#include "paddle/phi/backends/callback_manager.h"
+#include "paddle/phi/backends/device_base.h"
+#include "paddle/phi/backends/device_guard.h"
+#include "paddle/phi/backends/device_manager.h"
+#include "paddle/phi/backends/event.h"
+#include "paddle/phi/backends/stream.h"
 
 static bool operator==(const C_Device_st& d1, const C_Device_st& d2) {
   return d1.id == d2.id;
 }
 
-namespace paddle {
-namespace platform {
+namespace phi {
 
 class CustomDevice : public DeviceInterface {
  public:
-  CustomDevice(const std::string& type, int priority, bool is_custom,
-               std::unique_ptr<C_DeviceInterface> pimpl, void* dso_handle)
+  CustomDevice(const std::string& type,
+               int priority,
+               bool is_custom,
+               std::unique_ptr<C_DeviceInterface> pimpl,
+               void* dso_handle)
       : DeviceInterface(type, priority, is_custom),
         pimpl_(std::move(pimpl)),
         dso_handle_(dso_handle) {
@@ -122,14 +127,15 @@ class CustomDevice : public DeviceInterface {
     return device.id;
   }
 
-  void CreateStream(size_t dev_id, stream::Stream* stream,
+  void CreateStream(size_t dev_id,
+                    stream::Stream* stream,
                     const stream::Stream::Priority& priority =
                         stream::Stream::Priority::kNormal,
                     const stream::Stream::Flag& flag =
                         stream::Stream::Flag::kDefaultFlag) override {
     if (priority != stream::Stream::Priority::kNormal ||
         flag != stream::Stream::Flag::kDefaultFlag) {
-      PADDLE_THROW(platform::errors::Unavailable(
+      PADDLE_THROW(phi::errors::Unavailable(
           "priority != stream::Stream::Priority::kNormal || flag != "
           "stream::Stream::Flag::kDefaultFlag is not allowed on "
           "CustomDevice."));
@@ -162,23 +168,28 @@ class CustomDevice : public DeviceInterface {
       SynchronizeStream(dev_id, stream);
       return true;
     }
-    if (pimpl_->query_stream(device, reinterpret_cast<C_Stream>(
-                                         stream->raw_stream())) == C_SUCCESS) {
+    if (pimpl_->query_stream(
+            device, reinterpret_cast<C_Stream>(stream->raw_stream())) ==
+        C_SUCCESS) {
       return true;
     }
     return false;
   }
 
-  void AddCallback(size_t dev_id, stream::Stream* stream,
+  void AddCallback(size_t dev_id,
+                   stream::Stream* stream,
                    stream::Stream::Callback* callback) override {
     if (!pimpl_->stream_add_callback) {
-      PADDLE_THROW(platform::errors::Unavailable(
+      PADDLE_THROW(phi::errors::Unavailable(
           "AddCallback is not supported on %s.", Type()));
     } else {
       const auto device = &devices_pool[dev_id];
       PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(pimpl_->stream_add_callback(
-          device, reinterpret_cast<C_Stream>(stream->raw_stream()),
-          [](C_Device device, C_Stream stream, void* user_data,
+          device,
+          reinterpret_cast<C_Stream>(stream->raw_stream()),
+          [](C_Device device,
+             C_Stream stream,
+             void* user_data,
              C_Status* status) {
             std::unique_ptr<std::function<void()>> func(
                 reinterpret_cast<std::function<void()>*>(user_data));
@@ -188,7 +199,8 @@ class CustomDevice : public DeviceInterface {
     }
   }
 
-  void CreateEvent(size_t dev_id, event::Event* event,
+  void CreateEvent(size_t dev_id,
+                   event::Event* event,
                    event::Event::Flag flags) override {
     const auto device = &devices_pool[dev_id];
     C_Event c_event;
@@ -205,13 +217,15 @@ class CustomDevice : public DeviceInterface {
         device, reinterpret_cast<C_Event>(event->raw_event())));
   }
 
-  void RecordEvent(size_t dev_id, const event::Event* event,
+  void RecordEvent(size_t dev_id,
+                   const event::Event* event,
                    const stream::Stream* stream) override {
     const auto device = &devices_pool[dev_id];
 
-    PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(pimpl_->record_event(
-        device, reinterpret_cast<C_Stream>(stream->raw_stream()),
-        reinterpret_cast<C_Event>(event->raw_event())));
+    PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(
+        pimpl_->record_event(device,
+                             reinterpret_cast<C_Stream>(stream->raw_stream()),
+                             reinterpret_cast<C_Event>(event->raw_event())));
   }
 
   void SynchronizeEvent(size_t dev_id, const event::Event* event) override {
@@ -228,78 +242,93 @@ class CustomDevice : public DeviceInterface {
       SynchronizeEvent(dev_id, event);
       return true;
     }
-    if (pimpl_->query_event(device, reinterpret_cast<C_Event>(
-                                        event->raw_event())) == C_SUCCESS) {
+    if (pimpl_->query_event(device,
+                            reinterpret_cast<C_Event>(event->raw_event())) ==
+        C_SUCCESS) {
       return true;
     }
     return false;
   }
 
-  void StreamWaitEvent(size_t dev_id, const stream::Stream* stream,
+  void StreamWaitEvent(size_t dev_id,
+                       const stream::Stream* stream,
                        const event::Event* event) override {
     const auto device = &devices_pool[dev_id];
 
     PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(pimpl_->stream_wait_event(
-        device, reinterpret_cast<C_Stream>(stream->raw_stream()),
+        device,
+        reinterpret_cast<C_Stream>(stream->raw_stream()),
         reinterpret_cast<C_Event>(event->raw_event())));
   }
 
-  void MemoryCopyH2D(size_t dev_id, void* dst, const void* src, size_t size,
+  void MemoryCopyH2D(size_t dev_id,
+                     void* dst,
+                     const void* src,
+                     size_t size,
                      const stream::Stream* stream = nullptr) override {
     const auto device = &devices_pool[dev_id];
-    auto place = platform::CustomPlace(Type(), dev_id);
+    auto place = CustomPlace(Type(), dev_id);
 
     if (stream && stream->raw_stream() && pimpl_->async_memory_copy_h2d) {
       C_Stream c_stream = reinterpret_cast<C_Stream>(stream->raw_stream());
       PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(
           pimpl_->async_memory_copy_h2d(device, c_stream, dst, src, size));
     } else {
-      platform::DeviceContextPool& pool =
-          platform::DeviceContextPool::Instance();
+      paddle::platform::DeviceContextPool& pool =
+          paddle::platform::DeviceContextPool::Instance();
       pool.Get(place)->Wait();
       PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(
           pimpl_->memory_copy_h2d(device, dst, src, size));
     }
   }
 
-  void MemoryCopyD2H(size_t dev_id, void* dst, const void* src, size_t size,
+  void MemoryCopyD2H(size_t dev_id,
+                     void* dst,
+                     const void* src,
+                     size_t size,
                      const stream::Stream* stream = nullptr) override {
     const auto device = &devices_pool[dev_id];
-    auto place = platform::CustomPlace(Type(), dev_id);
+    auto place = CustomPlace(Type(), dev_id);
 
     if (stream && stream->raw_stream() && pimpl_->async_memory_copy_d2h) {
       C_Stream c_stream = reinterpret_cast<C_Stream>(stream->raw_stream());
       PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(
           pimpl_->async_memory_copy_d2h(device, c_stream, dst, src, size));
     } else {
-      platform::DeviceContextPool& pool =
-          platform::DeviceContextPool::Instance();
+      paddle::platform::DeviceContextPool& pool =
+          paddle::platform::DeviceContextPool::Instance();
       pool.Get(place)->Wait();
       PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(
           pimpl_->memory_copy_d2h(device, dst, src, size));
     }
   }
 
-  void MemoryCopyD2D(size_t dev_id, void* dst, const void* src, size_t size,
+  void MemoryCopyD2D(size_t dev_id,
+                     void* dst,
+                     const void* src,
+                     size_t size,
                      const stream::Stream* stream = nullptr) override {
     const auto device = &devices_pool[dev_id];
-    auto place = platform::CustomPlace(Type(), dev_id);
+    auto place = CustomPlace(Type(), dev_id);
 
     if (stream && stream->raw_stream() && pimpl_->async_memory_copy_d2d) {
       C_Stream c_stream = reinterpret_cast<C_Stream>(stream->raw_stream());
       PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(
           pimpl_->async_memory_copy_d2d(device, c_stream, dst, src, size));
     } else {
-      platform::DeviceContextPool& pool =
-          platform::DeviceContextPool::Instance();
+      paddle::platform::DeviceContextPool& pool =
+          paddle::platform::DeviceContextPool::Instance();
       pool.Get(place)->Wait();
       PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(
           pimpl_->memory_copy_d2d(device, dst, src, size));
     }
   }
 
-  void MemoryCopyP2P(const Place& dst_place, void* dst, size_t src_dev_id,
-                     const void* src, size_t size,
+  void MemoryCopyP2P(const Place& dst_place,
+                     void* dst,
+                     size_t src_dev_id,
+                     const void* src,
+                     size_t size,
                      const stream::Stream* stream = nullptr) override {
     int dst_dev_id = PlaceToId(dst_place);
     auto dst_device = &devices_pool[dst_dev_id];
@@ -310,8 +339,12 @@ class CustomDevice : public DeviceInterface {
         MemoryCopyP2P(dst_place, dst, src_dev_id, src, size);
       } else {
         PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(pimpl_->async_memory_copy_p2p(
-            dst_device, src_device,
-            reinterpret_cast<C_Stream>(stream->raw_stream()), dst, src, size));
+            dst_device,
+            src_device,
+            reinterpret_cast<C_Stream>(stream->raw_stream()),
+            dst,
+            src,
+            size));
       }
     } else {
       if (!pimpl_->memory_copy_p2p) {
@@ -319,9 +352,9 @@ class CustomDevice : public DeviceInterface {
         MemoryCopyD2H(src_dev_id, tmp.get(), src, size);
         MemoryCopyH2D(dst_dev_id, dst, tmp.get(), size);
       } else {
-        auto src_place = platform::CustomPlace(Type(), src_dev_id);
-        platform::DeviceContextPool& pool =
-            platform::DeviceContextPool::Instance();
+        auto src_place = CustomPlace(Type(), src_dev_id);
+        paddle::platform::DeviceContextPool& pool =
+            paddle::platform::DeviceContextPool::Instance();
         pool.Get(src_place)->Wait();
         PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(
             pimpl_->memory_copy_p2p(dst_device, src_device, dst, src, size));
@@ -350,8 +383,8 @@ class CustomDevice : public DeviceInterface {
     const auto device = &devices_pool[dev_id];
 
     if (!pimpl_->unified_memory_allocate) {
-      PADDLE_THROW(platform::errors::Unavailable(
-          "MemoryAllocKind::Host is not supported on %s.", Type()));
+      PADDLE_THROW(phi::errors::Unavailable(
+          "MemoryAllocateHost is not supported on %s.", Type()));
     } else {
       PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(
           pimpl_->host_memory_allocate(device, &ptr, size));
@@ -363,8 +396,8 @@ class CustomDevice : public DeviceInterface {
     const auto device = &devices_pool[dev_id];
 
     if (!pimpl_->host_memory_deallocate) {
-      PADDLE_THROW(platform::errors::Unavailable(
-          "MemoryAllocKind::Host is not supported on %s.", Type()));
+      PADDLE_THROW(phi::errors::Unavailable(
+          "MemoryDeallocateHost is not supported on %s.", Type()));
     } else {
       PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(
           pimpl_->host_memory_deallocate(device, ptr, size));
@@ -376,8 +409,8 @@ class CustomDevice : public DeviceInterface {
     const auto device = &devices_pool[dev_id];
 
     if (!pimpl_->unified_memory_allocate) {
-      PADDLE_THROW(platform::errors::Unavailable(
-          "MemoryAllocKind::Unified is not supported on %s.", Type()));
+      PADDLE_THROW(phi::errors::Unavailable(
+          "MemoryAllocateUnified is not supported on %s.", Type()));
     } else {
       PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(
           pimpl_->unified_memory_allocate(device, &ptr, size));
@@ -389,15 +422,17 @@ class CustomDevice : public DeviceInterface {
     const auto device = &devices_pool[dev_id];
 
     if (!pimpl_->unified_memory_deallocate) {
-      PADDLE_THROW(platform::errors::Unavailable(
-          "MemoryAllocKind::Host is not supported on %s.", Type()));
+      PADDLE_THROW(phi::errors::Unavailable(
+          "MemoryDeallocateUnified is not supported on %s.", Type()));
     } else {
       PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(
           pimpl_->unified_memory_deallocate(device, ptr, size));
     }
   }
 
-  void MemorySet(size_t dev_id, void* ptr, uint8_t value,
+  void MemorySet(size_t dev_id,
+                 void* ptr,
+                 uint8_t value,
                  size_t size) override {
     const auto device = &devices_pool[dev_id];
 
@@ -532,10 +567,12 @@ class CustomDevice : public DeviceInterface {
 
   inline int PlaceToId(const Place& place) {
     int dev_id = PlaceToIdNoCheck(place);
-    PADDLE_ENFORCE_NE(devices_pool.find(dev_id), devices_pool.end(),
-                      platform::errors::NotFound(
+    PADDLE_ENFORCE_NE(devices_pool.find(dev_id),
+                      devices_pool.end(),
+                      phi::errors::NotFound(
                           "Cannot found %s %d, please check visible devices",
-                          Type(), dev_id));
+                          Type(),
+                          dev_id));
     return dev_id;
   }
 
@@ -623,11 +660,14 @@ typedef bool (*RegisterDevicePluginFn)(CustomRuntimeParams* runtime_params);
 
 void LoadCustomRuntimeLib(const CustomRuntimeParams& runtime_params,
                           std::unique_ptr<C_DeviceInterface> device_interface,
-                          const std::string& dso_lib_path, void* dso_handle) {
+                          const std::string& dso_lib_path,
+                          void* dso_handle) {
   if (ValidCustomCustomRuntimeParams(&runtime_params)) {
-    auto device =
-        std::make_unique<CustomDevice>(runtime_params.device_type, 255, true,
-                                       std::move(device_interface), dso_handle);
+    auto device = std::make_unique<CustomDevice>(runtime_params.device_type,
+                                                 255,
+                                                 true,
+                                                 std::move(device_interface),
+                                                 dso_handle);
     if (false == DeviceManager::Register(std::move(device))) {
       LOG(WARNING) << "Skipped lib [" << dso_lib_path
                    << "]. Register failed!!! there may be a "
@@ -665,10 +705,9 @@ void LoadCustomRuntimeLib(const std::string& dso_lib_path, void* dso_handle) {
                     "compatibility between PaddlePaddle and Custom Runtime.";
     return;
   }
-  LoadCustomRuntimeLib(runtime_params, std::move(device_interface),
-                       dso_lib_path, dso_handle);
+  LoadCustomRuntimeLib(
+      runtime_params, std::move(device_interface), dso_lib_path, dso_handle);
   LOG(INFO) << "Successed in loading custom runtime in lib: " << dso_lib_path;
 }
 
-}  // namespace platform
-}  // namespace paddle
+}  // namespace phi
diff --git a/paddle/fluid/platform/device/custom/custom_device_test.cc b/paddle/phi/backends/custom/custom_device_test.cc
similarity index 86%
rename from paddle/fluid/platform/device/custom/custom_device_test.cc
rename to paddle/phi/backends/custom/custom_device_test.cc
index e42fbbb9448f970aaf3e3821aba061fae7693756..53b88f9b4ac7904b04e3786aae02719ce06f9204 100644
--- a/paddle/fluid/platform/device/custom/custom_device_test.cc
+++ b/paddle/phi/backends/custom/custom_device_test.cc
@@ -17,9 +17,9 @@
 
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/framework/tensor_util.h"
-#include "paddle/fluid/platform/device/custom/fake_cpu_device.h"
-#include "paddle/fluid/platform/device/device_manager.h"
 #include "paddle/fluid/platform/device_context.h"
+#include "paddle/phi/backends/custom/fake_cpu_device.h"
+#include "paddle/phi/backends/device_manager.h"
 
 void RegisterDevice() {
   CustomRuntimeParams runtime_params;
@@ -30,23 +30,22 @@ void RegisterDevice() {
   runtime_params.interface->size = sizeof(C_DeviceInterface);
 
   InitFakeCPUDevice(&runtime_params);
-  paddle::platform::LoadCustomRuntimeLib(
+  phi::LoadCustomRuntimeLib(
       runtime_params, std::move(device_interface), "", nullptr);
 }
 
 void InitDevice() {
   RegisterDevice();
-  EXPECT_GT(static_cast<int>(
-                paddle::platform::DeviceManager::GetAllDeviceTypes().size()),
+  EXPECT_GT(static_cast<int>(phi::DeviceManager::GetAllDeviceTypes().size()),
             0);
   auto place = paddle::platform::CustomPlace(DEVICE_TYPE, 0);
-  auto device = paddle::platform::DeviceManager::GetDeviceWithPlace(place);
+  auto device = phi::DeviceManager::GetDeviceWithPlace(place);
   EXPECT_NE(device, nullptr);
 
   std::vector<paddle::platform::Place> places;
-  auto device_types = paddle::platform::DeviceManager::GetAllDeviceTypes();
+  auto device_types = phi::DeviceManager::GetAllDeviceTypes();
   for (auto dev_type : device_types) {
-    auto devices = paddle::platform::DeviceManager::GetDeviceList(dev_type);
+    auto devices = phi::DeviceManager::GetDeviceList(dev_type);
     for (auto dev_id : devices) {
       places.push_back(
           paddle::platform::PlaceHelper::CreatePlace(dev_type, dev_id));
@@ -60,14 +59,14 @@ void InitDevice() {
 void TestDeviceInterface(const paddle::platform::Place& place) {
   std::cout << "TestDeviceInterface on " << place << std::endl;
   if (paddle::platform::is_custom_place(place)) {
-    auto device = paddle::platform::DeviceManager::GetDeviceWithPlace(place);
+    auto device = phi::DeviceManager::GetDeviceWithPlace(place);
     auto dev_type = paddle::platform::PlaceHelper::GetDeviceType(place);
-    auto p1 = device->MemoryAllocate(
-        paddle::platform::DeviceManager::GetMinChunkSize(place));
+    auto p1 =
+        device->MemoryAllocate(phi::DeviceManager::GetMinChunkSize(place));
     EXPECT_NE(p1, nullptr);
 
-    paddle::platform::DeviceManager::SetDevice(place);
-    auto dev_id = paddle::platform::DeviceManager::GetDevice(dev_type);
+    phi::DeviceManager::SetDevice(place);
+    auto dev_id = phi::DeviceManager::GetDevice(dev_type);
     EXPECT_EQ(dev_id, place.GetDeviceId());
   }
 }
@@ -168,11 +167,10 @@ void TestTensorUtils(const paddle::platform::Place& place) {
 
 TEST(CustomDevice, Tensor) {
   InitDevice();
-  auto dev_types = paddle::platform::DeviceManager::GetAllDeviceTypes();
+  auto dev_types = phi::DeviceManager::GetAllDeviceTypes();
   for (const auto& dev_type : dev_types) {
     std::cout << "Test on " << dev_type << std::endl;
-    EXPECT_GT(static_cast<int>(
-                  paddle::platform::DeviceManager::GetDeviceCount(dev_type)),
+    EXPECT_GT(static_cast<int>(phi::DeviceManager::GetDeviceCount(dev_type)),
               0);
     auto place = paddle::platform::PlaceHelper::CreatePlace(dev_type);
 
diff --git a/paddle/fluid/platform/device/custom/fake_cpu_device.h b/paddle/phi/backends/custom/fake_cpu_device.h
similarity index 90%
rename from paddle/fluid/platform/device/custom/fake_cpu_device.h
rename to paddle/phi/backends/custom/fake_cpu_device.h
index c6d8ade4b08597b2c17e5df9dc333c3c4f70d69e..22c344a0e0488f9126b8bcd2f3ce99490e9e70bd 100644
--- a/paddle/fluid/platform/device/custom/fake_cpu_device.h
+++ b/paddle/phi/backends/custom/fake_cpu_device.h
@@ -13,7 +13,7 @@
 // limitations under the License.
 
 #pragma once
-#include "paddle/fluid/platform/device/device_ext.h"
+#include "paddle/phi/backends/device_ext.h"
 
 constexpr size_t global_total_memory = 1024 * 1024UL;
 static size_t global_free_memory = global_total_memory;
@@ -43,14 +43,19 @@ C_Status GetDevicesList(size_t *device) {
   return C_SUCCESS;
 }
 
-C_Status MemCpy(const C_Device device, void *dst, const void *src,
+C_Status MemCpy(const C_Device device,
+                void *dst,
+                const void *src,
                 size_t size) {
   memcpy(dst, src, size);
   return C_SUCCESS;
 }
 
-C_Status AsyncMemCpy(const C_Device device, C_Stream stream, void *dst,
-                     const void *src, size_t size) {
+C_Status AsyncMemCpy(const C_Device device,
+                     C_Stream stream,
+                     void *dst,
+                     const void *src,
+                     size_t size) {
   memcpy(dst, src, size);
   return C_SUCCESS;
 }
@@ -100,14 +105,16 @@ C_Status SyncStream(const C_Device device, C_Stream stream) {
 
 C_Status SyncEvent(const C_Device device, C_Event event) { return C_SUCCESS; }
 
-C_Status StreamWaitEvent(const C_Device device, C_Stream stream,
+C_Status StreamWaitEvent(const C_Device device,
+                         C_Stream stream,
                          C_Event event) {
   return C_SUCCESS;
 }
 
 C_Status VisibleDevices(size_t *devices) { return C_SUCCESS; }
 
-C_Status DeviceMemStats(const C_Device device, size_t *total_memory,
+C_Status DeviceMemStats(const C_Device device,
+                        size_t *total_memory,
                         size_t *free_memory) {
   *total_memory = global_total_memory;
   *free_memory = global_free_memory;
@@ -139,7 +146,8 @@ void InitFakeCPUDevice(CustomRuntimeParams *params) {
   params->version.minor = PADDLE_CUSTOM_RUNTIME_MINOR_VERSION;
   params->version.patch = PADDLE_CUSTOM_RUNTIME_PATCH_VERSION;
 
-  memset(reinterpret_cast<void *>(params->interface), 0,
+  memset(reinterpret_cast<void *>(params->interface),
+         0,
          sizeof(C_DeviceInterface));
 
   params->interface->initialize = Init;
diff --git a/paddle/fluid/platform/device/device_base.cc b/paddle/phi/backends/device_base.cc
similarity index 68%
rename from paddle/fluid/platform/device/device_base.cc
rename to paddle/phi/backends/device_base.cc
index 6234c9612687e507acd2642ef1d39cc0f8da4539..14fe90192e5bcaedf50ba96a6a047d08f622a76d 100644
--- a/paddle/fluid/platform/device/device_base.cc
+++ b/paddle/phi/backends/device_base.cc
@@ -12,8 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/platform/device/device_base.h"
+#include "paddle/phi/backends/device_base.h"
 #include "gflags/gflags.h"
+#include "paddle/phi/core/enforce.h"
 
 DECLARE_double(fraction_of_gpu_memory_to_use);
 DECLARE_uint64(initial_gpu_memory_in_mb);
@@ -21,26 +22,25 @@ DECLARE_uint64(reallocate_gpu_memory_in_mb);
 
 constexpr static float fraction_reserve_gpu_memory = 0.05f;
 
-namespace paddle {
-namespace platform {
+namespace phi {
 
-#define INTERFACE_UNIMPLEMENT                   \
-  PADDLE_THROW(platform::errors::Unimplemented( \
+#define INTERFACE_UNIMPLEMENT              \
+  PADDLE_THROW(phi::errors::Unimplemented( \
       "%s is not implemented on %s device.", __func__, Type()));
 
 // info
 size_t DeviceInterface::GetComputeCapability() {
-  VLOG(10) << Type() + " get compute capability " << 0;
+  VLOG(10) << Type() << " get compute capability " << 0;
   return 0;
 }
 
 size_t DeviceInterface::GetRuntimeVersion() {
-  VLOG(10) << Type() + " get runtime version " << 0;
+  VLOG(10) << Type() << " get runtime version " << 0;
   return 0;
 }
 
 size_t DeviceInterface::GetDriverVersion() {
-  VLOG(10) << Type() + " get driver version " << 0;
+  VLOG(10) << Type() << " get driver version " << 0;
   return 0;
 }
 
@@ -62,7 +62,8 @@ void DeviceInterface::SetDevice(size_t dev_id) { INTERFACE_UNIMPLEMENT; }
 int DeviceInterface::GetDevice() { INTERFACE_UNIMPLEMENT; }
 
 // stream manage
-void DeviceInterface::CreateStream(size_t dev_id, stream::Stream* stream,
+void DeviceInterface::CreateStream(size_t dev_id,
+                                   stream::Stream* stream,
                                    const stream::Stream::Priority& priority,
                                    const stream::Stream::Flag& flag) {
   INTERFACE_UNIMPLEMENT;
@@ -82,7 +83,8 @@ bool DeviceInterface::QueryStream(size_t dev_id, const stream::Stream* stream) {
   return true;
 }
 
-void DeviceInterface::AddCallback(size_t dev_id, stream::Stream* stream,
+void DeviceInterface::AddCallback(size_t dev_id,
+                                  stream::Stream* stream,
                                   stream::Stream::Callback* callback) {
   INTERFACE_UNIMPLEMENT;
 }
@@ -94,7 +96,8 @@ void DeviceInterface::StreamWaitEvent(size_t dev_id,
 }
 
 // event manage
-void DeviceInterface::CreateEvent(size_t dev_id, event::Event* event,
+void DeviceInterface::CreateEvent(size_t dev_id,
+                                  event::Event* event,
                                   event::Event::Flag flags) {
   INTERFACE_UNIMPLEMENT;
 }
@@ -103,7 +106,8 @@ void DeviceInterface::DestroyEvent(size_t dev_id, event::Event* event) {
   INTERFACE_UNIMPLEMENT;
 }
 
-void DeviceInterface::RecordEvent(size_t dev_id, const event::Event* event,
+void DeviceInterface::RecordEvent(size_t dev_id,
+                                  const event::Event* event,
                                   const stream::Stream* stream) {
   INTERFACE_UNIMPLEMENT;
 }
@@ -119,23 +123,35 @@ bool DeviceInterface::QueryEvent(size_t dev_id, const event::Event* event) {
 }
 
 // memery manage
-void DeviceInterface::MemoryCopyH2D(size_t dev_id, void* dst, const void* src,
-                                    size_t size, const stream::Stream* stream) {
+void DeviceInterface::MemoryCopyH2D(size_t dev_id,
+                                    void* dst,
+                                    const void* src,
+                                    size_t size,
+                                    const stream::Stream* stream) {
   INTERFACE_UNIMPLEMENT;
 }
 
-void DeviceInterface::MemoryCopyD2H(size_t dev_id, void* dst, const void* src,
-                                    size_t size, const stream::Stream* stream) {
+void DeviceInterface::MemoryCopyD2H(size_t dev_id,
+                                    void* dst,
+                                    const void* src,
+                                    size_t size,
+                                    const stream::Stream* stream) {
   INTERFACE_UNIMPLEMENT;
 }
 
-void DeviceInterface::MemoryCopyD2D(size_t dev_id, void* dst, const void* src,
-                                    size_t size, const stream::Stream* stream) {
+void DeviceInterface::MemoryCopyD2D(size_t dev_id,
+                                    void* dst,
+                                    const void* src,
+                                    size_t size,
+                                    const stream::Stream* stream) {
   INTERFACE_UNIMPLEMENT;
 }
 
-void DeviceInterface::MemoryCopyP2P(const Place& dst_place, void* dst,
-                                    size_t src_id, const void* src, size_t size,
+void DeviceInterface::MemoryCopyP2P(const Place& dst_place,
+                                    void* dst,
+                                    size_t src_id,
+                                    const void* src,
+                                    size_t size,
                                     const stream::Stream* stream) {
   INTERFACE_UNIMPLEMENT;
 }
@@ -154,7 +170,8 @@ void* DeviceInterface::MemoryAllocateHost(size_t dev_id, size_t size) {
   return nullptr;
 }
 
-void DeviceInterface::MemoryDeallocateHost(size_t dev_id, void* ptr,
+void DeviceInterface::MemoryDeallocateHost(size_t dev_id,
+                                           void* ptr,
                                            size_t size) {
   INTERFACE_UNIMPLEMENT;
 }
@@ -164,12 +181,15 @@ void* DeviceInterface::MemoryAllocateUnified(size_t dev_id, size_t size) {
   return nullptr;
 }
 
-void DeviceInterface::MemoryDeallocateUnified(size_t dev_id, void* ptr,
+void DeviceInterface::MemoryDeallocateUnified(size_t dev_id,
+                                              void* ptr,
                                               size_t size) {
   INTERFACE_UNIMPLEMENT;
 }
 
-void DeviceInterface::MemorySet(size_t dev_id, void* ptr, uint8_t value,
+void DeviceInterface::MemorySet(size_t dev_id,
+                                void* ptr,
+                                uint8_t value,
                                 size_t size) {
   INTERFACE_UNIMPLEMENT;
 }
@@ -184,8 +204,9 @@ size_t DeviceInterface::GetMinChunkSize(size_t dev_id) {
 
 size_t DeviceInterface::AllocSize(size_t dev_id, bool realloc) {
   size_t available_to_alloc = AvailableAllocSize(dev_id);
-  PADDLE_ENFORCE_GT(available_to_alloc, 0,
-                    platform::errors::ResourceExhausted(
+  PADDLE_ENFORCE_GT(available_to_alloc,
+                    0,
+                    phi::errors::ResourceExhausted(
                         "Not enough available %s memory.", Type()));
   // If FLAGS_initial_gpu_memory_in_mb is 0, then initial memory will be
   // allocated by fraction
@@ -194,8 +215,9 @@ size_t DeviceInterface::AllocSize(size_t dev_id, bool realloc) {
   size_t alloc_bytes =
       (flag_mb > 0ul ? flag_mb << 20 : available_to_alloc *
                                            FLAGS_fraction_of_gpu_memory_to_use);
-  PADDLE_ENFORCE_GE(available_to_alloc, alloc_bytes,
-                    platform::errors::ResourceExhausted(
+  PADDLE_ENFORCE_GE(available_to_alloc,
+                    alloc_bytes,
+                    phi::errors::ResourceExhausted(
                         "Not enough available %s memory.", Type()));
   return alloc_bytes;
 }
@@ -217,33 +239,32 @@ size_t DeviceInterface::AvailableAllocSize(size_t dev_id) {
 
 size_t DeviceInterface::GetInitAllocSize(size_t dev_id) {
   size_t init_alloc_size = AllocSize(dev_id, false);
-  VLOG(10) << Type() + " init alloc size " << (init_alloc_size >> 20) << "M";
+  VLOG(10) << Type() << " init alloc size " << (init_alloc_size >> 20) << "M";
   return init_alloc_size;
 }
 
 size_t DeviceInterface::GetReallocSize(size_t dev_id) {
   size_t realloc_size = AllocSize(dev_id, true);
-  VLOG(10) << Type() + " realloc size " << (realloc_size >> 20) << "M";
+  VLOG(10) << Type() << " realloc size " << (realloc_size >> 20) << "M";
   return realloc_size;
 }
 
 size_t DeviceInterface::GetMaxAllocSize(size_t dev_id) {
   size_t max_alloc_size =
       std::max(GetInitAllocSize(dev_id), GetReallocSize(dev_id));
-  VLOG(10) << Type() + " max alloc size " << (max_alloc_size >> 20) << "M";
+  VLOG(10) << Type() << " max alloc size " << (max_alloc_size >> 20) << "M";
   return max_alloc_size;
 }
 
 size_t DeviceInterface::GetMaxChunkSize(size_t dev_id) {
   size_t max_chunk_size = GetMaxAllocSize(dev_id);
-  VLOG(10) << Type() + " max chunk size " << (max_chunk_size >> 20) << "M";
+  VLOG(10) << Type() << " max chunk size " << (max_chunk_size >> 20) << "M";
   return max_chunk_size;
 }
 
 size_t DeviceInterface::GetExtraPaddingSize(size_t dev_id) {
-  VLOG(10) << Type() + " extra padding size " << 0;
+  VLOG(10) << Type() << " extra padding size " << 0;
   return 0;
 }
 
-}  // namespace platform
-}  // namespace paddle
+}  // namespace phi
diff --git a/paddle/fluid/platform/device/device_base.h b/paddle/phi/backends/device_base.h
similarity index 80%
rename from paddle/fluid/platform/device/device_base.h
rename to paddle/phi/backends/device_base.h
index d70b02be80eacd9d492b8a8d40c0a074dfe9c6e3..b4964708dfb9797c75e6f69ccb8bae6853b424a9 100644
--- a/paddle/fluid/platform/device/device_base.h
+++ b/paddle/phi/backends/device_base.h
@@ -14,11 +14,10 @@
 
 #pragma once
 #ifdef PADDLE_WITH_CUSTOM_DEVICE
-#include "paddle/fluid/platform/device/event.h"
-#include "paddle/fluid/platform/device/stream.h"
+#include "paddle/phi/backends/event.h"
+#include "paddle/phi/backends/stream.h"
 
-namespace paddle {
-namespace platform {
+namespace phi {
 
 class DeviceInterface {  // Driver / Runtime
  public:
@@ -66,7 +65,8 @@ class DeviceInterface {  // Driver / Runtime
   // Stream
   // ! Create an asynchronous stream
   virtual void CreateStream(
-      size_t dev_id, stream::Stream* stream,
+      size_t dev_id,
+      stream::Stream* stream,
       const stream::Stream::Priority& priority =
           stream::Stream::Priority::kNormal,
       const stream::Stream::Flag& flag = stream::Stream::Flag::kDefaultFlag);
@@ -81,19 +81,22 @@ class DeviceInterface {  // Driver / Runtime
   virtual bool QueryStream(size_t dev_id, const stream::Stream* stream);
 
   // ! Add a callback to a compute stream.
-  virtual void AddCallback(size_t dev_id, stream::Stream* stream,
+  virtual void AddCallback(size_t dev_id,
+                           stream::Stream* stream,
                            stream::Stream::Callback* callback);
 
   // Event
   // ! Create an event.
-  virtual void CreateEvent(size_t dev_id, event::Event* event,
+  virtual void CreateEvent(size_t dev_id,
+                           event::Event* event,
                            event::Event::Flag flags);
 
   // ! Destroy an event.
   virtual void DestroyEvent(size_t dev_id, event::Event* event);
 
   // ! Records an event.
-  virtual void RecordEvent(size_t dev_id, const event::Event* event,
+  virtual void RecordEvent(size_t dev_id,
+                           const event::Event* event,
                            const stream::Stream* stream);
 
   // ! Waits for event to complete.
@@ -102,24 +105,34 @@ class DeviceInterface {  // Driver / Runtime
   virtual bool QueryEvent(size_t dev_id, const event::Event* event);
 
   // ! Make a compute stream wait on an event
-  virtual void StreamWaitEvent(size_t dev_id, const stream::Stream* stream,
+  virtual void StreamWaitEvent(size_t dev_id,
+                               const stream::Stream* stream,
                                const event::Event* event);
 
   // Memory
-  virtual void MemoryCopyH2D(size_t dev_id, void* dst, const void* src,
+  virtual void MemoryCopyH2D(size_t dev_id,
+                             void* dst,
+                             const void* src,
                              size_t size,
                              const stream::Stream* stream = nullptr);
 
-  virtual void MemoryCopyD2H(size_t dev_id, void* dst, const void* src,
+  virtual void MemoryCopyD2H(size_t dev_id,
+                             void* dst,
+                             const void* src,
                              size_t size,
                              const stream::Stream* stream = nullptr);
 
-  virtual void MemoryCopyD2D(size_t dev_id, void* dst, const void* src,
+  virtual void MemoryCopyD2D(size_t dev_id,
+                             void* dst,
+                             const void* src,
                              size_t size,
                              const stream::Stream* stream = nullptr);
 
-  virtual void MemoryCopyP2P(const Place& dst_place, void* dst, size_t src_id,
-                             const void* src, size_t size,
+  virtual void MemoryCopyP2P(const Place& dst_place,
+                             void* dst,
+                             size_t src_id,
+                             const void* src,
+                             size_t size,
                              const stream::Stream* stream = nullptr);
 
   virtual void* MemoryAllocate(size_t dev_id, size_t size);
@@ -160,7 +173,6 @@ class DeviceInterface {  // Driver / Runtime
   size_t AvailableAllocSize(size_t dev_id);
 };
 
-}  // namespace platform
-}  // namespace paddle
+}  // namespace phi
 
 #endif
diff --git a/paddle/fluid/platform/device/device_ext.h b/paddle/phi/backends/device_ext.h
similarity index 76%
rename from paddle/fluid/platform/device/device_ext.h
rename to paddle/phi/backends/device_ext.h
index d1e1340f74b7741f867b85d7ab0b1e42c9621a47..6315fe15afdf1ecd9c7657396468320eda7d88c1 100644
--- a/paddle/fluid/platform/device/device_ext.h
+++ b/paddle/phi/backends/device_ext.h
@@ -40,7 +40,9 @@ typedef struct C_Stream_st* C_Stream;
 
 typedef struct C_Event_st* C_Event;
 
-typedef void (*C_Callback)(C_Device device, C_Stream stream, void* user_data,
+typedef void (*C_Callback)(C_Device device,
+                           C_Stream stream,
+                           void* user_data,
                            C_Status* status);
 
 struct C_DeviceInterface {
@@ -124,8 +126,10 @@ struct C_DeviceInterface {
    * @param[C_Callback] callback
    * @param[void*]      user_data
    */
-  C_Status (*stream_add_callback)(const C_Device device, C_Stream stream,
-                                  C_Callback callback, void* user_data);
+  C_Status (*stream_add_callback)(const C_Device device,
+                                  C_Stream stream,
+                                  C_Callback callback,
+                                  void* user_data);
 
   /**
    * @brief Create an event
@@ -142,7 +146,8 @@ struct C_DeviceInterface {
    * @param[C_Stream]   stream
    * @param[C_Event]    event
    */
-  C_Status (*record_event)(const C_Device device, C_Stream stream,
+  C_Status (*record_event)(const C_Device device,
+                           C_Stream stream,
                            C_Event event);
 
   /**
@@ -191,7 +196,8 @@ struct C_DeviceInterface {
    * @param[C_Stream]   stream
    * @param[C_Event]    event
    */
-  C_Status (*stream_wait_event)(const C_Device device, C_Stream stream,
+  C_Status (*stream_wait_event)(const C_Device device,
+                                C_Stream stream,
                                 C_Event event);
 
   void* reserved_dev_api[8];
@@ -207,7 +213,8 @@ struct C_DeviceInterface {
    * @param[void**]     ptr        Plugin allocate an address and fill it
    * @param[size_t]     size
    */
-  C_Status (*device_memory_allocate)(const C_Device device, void** ptr,
+  C_Status (*device_memory_allocate)(const C_Device device,
+                                     void** ptr,
                                      size_t size);
 
   /**
@@ -217,7 +224,8 @@ struct C_DeviceInterface {
    * @param[void*]      ptr
    * @param[size_t]     size
    */
-  C_Status (*device_memory_deallocate)(const C_Device device, void* ptr,
+  C_Status (*device_memory_deallocate)(const C_Device device,
+                                       void* ptr,
                                        size_t size);
 
   /**
@@ -228,8 +236,10 @@ struct C_DeviceInterface {
    * @param[unsigned char] value
    * @param[size_t]     size
    */
-  C_Status (*device_memory_set)(const C_Device device, void* ptr,
-                                unsigned char value, size_t size);
+  C_Status (*device_memory_set)(const C_Device device,
+                                void* ptr,
+                                unsigned char value,
+                                size_t size);
 
   /**
    * @brief Host memory allocate
@@ -238,7 +248,8 @@ struct C_DeviceInterface {
    * @param[void**]     ptr        Plugin allocate an address and fill it
    * @param[size_t]     size
    */
-  C_Status (*host_memory_allocate)(const C_Device device, void** ptr,
+  C_Status (*host_memory_allocate)(const C_Device device,
+                                   void** ptr,
                                    size_t size);
 
   /**
@@ -248,7 +259,8 @@ struct C_DeviceInterface {
    * @param[void*]      ptr
    * @param[size_t]     size
    */
-  C_Status (*host_memory_deallocate)(const C_Device device, void* ptr,
+  C_Status (*host_memory_deallocate)(const C_Device device,
+                                     void* ptr,
                                      size_t size);
 
   /**
@@ -258,7 +270,8 @@ struct C_DeviceInterface {
    * @param[void**]     ptr        Plugin allocate an address and fill it
    * @param[size_t]     size
    */
-  C_Status (*unified_memory_allocate)(const C_Device device, void** ptr,
+  C_Status (*unified_memory_allocate)(const C_Device device,
+                                      void** ptr,
                                       size_t size);
 
   /**
@@ -268,7 +281,8 @@ struct C_DeviceInterface {
    * @param[void*]      ptr
    * @param[size_t]     size
    */
-  C_Status (*unified_memory_deallocate)(const C_Device device, void* ptr,
+  C_Status (*unified_memory_deallocate)(const C_Device device,
+                                        void* ptr,
                                         size_t size);
 
   /**
@@ -279,7 +293,9 @@ struct C_DeviceInterface {
    * @param[void*]      src
    * @param[size_t]     size
    */
-  C_Status (*memory_copy_h2d)(const C_Device device, void* dst, const void* src,
+  C_Status (*memory_copy_h2d)(const C_Device device,
+                              void* dst,
+                              const void* src,
                               size_t size);
 
   /**
@@ -290,7 +306,9 @@ struct C_DeviceInterface {
    * @param[void*]      src
    * @param[size_t]     size
    */
-  C_Status (*memory_copy_d2h)(const C_Device device, void* dst, const void* src,
+  C_Status (*memory_copy_d2h)(const C_Device device,
+                              void* dst,
+                              const void* src,
                               size_t size);
 
   /**
@@ -301,7 +319,9 @@ struct C_DeviceInterface {
    * @param[void*]      src
    * @param[size_t]     size
    */
-  C_Status (*memory_copy_d2d)(const C_Device device, void* dst, const void* src,
+  C_Status (*memory_copy_d2d)(const C_Device device,
+                              void* dst,
+                              const void* src,
                               size_t size);
 
   /**
@@ -314,8 +334,10 @@ struct C_DeviceInterface {
    * @param[size_t]     size
    */
   C_Status (*memory_copy_p2p)(const C_Device dst_device,
-                              const C_Device src_device, void* dst,
-                              const void* src, size_t size);
+                              const C_Device src_device,
+                              void* dst,
+                              const void* src,
+                              size_t size);
 
   /**
    * @brief Asynchonrize memory copy from host to device
@@ -326,8 +348,11 @@ struct C_DeviceInterface {
    * @param[void*]      src
    * @param[size_t]     size
    */
-  C_Status (*async_memory_copy_h2d)(const C_Device device, C_Stream stream,
-                                    void* dst, const void* src, size_t size);
+  C_Status (*async_memory_copy_h2d)(const C_Device device,
+                                    C_Stream stream,
+                                    void* dst,
+                                    const void* src,
+                                    size_t size);
 
   /**
    * @brief Asynchonrize memory copy from device to host
@@ -338,8 +363,11 @@ struct C_DeviceInterface {
    * @param[void*]      src
    * @param[size_t]     size
    */
-  C_Status (*async_memory_copy_d2h)(const C_Device device, C_Stream stream,
-                                    void* dst, const void* src, size_t size);
+  C_Status (*async_memory_copy_d2h)(const C_Device device,
+                                    C_Stream stream,
+                                    void* dst,
+                                    const void* src,
+                                    size_t size);
 
   /**
    * @brief Asynchonrize memory copy from device to device
@@ -350,8 +378,11 @@ struct C_DeviceInterface {
    * @param[void*]      src
    * @param[size_t]     size
    */
-  C_Status (*async_memory_copy_d2d)(const C_Device device, C_Stream stream,
-                                    void* dst, const void* src, size_t size);
+  C_Status (*async_memory_copy_d2d)(const C_Device device,
+                                    C_Stream stream,
+                                    void* dst,
+                                    const void* src,
+                                    size_t size);
 
   /**
    * @brief Peer asynchonrize memory copy from host to device
@@ -363,8 +394,11 @@ struct C_DeviceInterface {
    * @param[size_t]     size
    */
   C_Status (*async_memory_copy_p2p)(const C_Device dst_device,
-                                    const C_Device src_device, C_Stream stream,
-                                    void* dst, const void* src, size_t size);
+                                    const C_Device src_device,
+                                    C_Stream stream,
+                                    void* dst,
+                                    const void* src,
+                                    size_t size);
 
   void* reserved_mem_api[8];
 
@@ -394,7 +428,8 @@ struct C_DeviceInterface {
    * @param[size_t*]    free_memory
    * @param[size_t*]    used_memory
    */
-  C_Status (*device_memory_stats)(const C_Device device, size_t* total_memory,
+  C_Status (*device_memory_stats)(const C_Device device,
+                                  size_t* total_memory,
                                   size_t* free_memory);
 
   /**
@@ -488,6 +523,15 @@ struct CustomRuntimeParams {
   char reserved[32];
 };
 
+#define PADDLE_CUSTOM_RUNTIME_CHECK_VERSION(params)             \
+  if ((params)->size != sizeof(DevicePluginParams) &&           \
+      (params)->interface->size != sizeof(C_DeviceInterface)) { \
+    return;                                                     \
+  }                                                             \
+  (params)->version.major = PADDLE_DEVICE_PLUGIN_MAJOR_VERSION; \
+  (params)->version.minor = PADDLE_DEVICE_PLUGIN_MINOR_VERSION; \
+  (params)->version.patch = PADDLE_DEVICE_PLUGIN_PATCH_VERSION;
+
 // Plugin implement it and fill CustomRuntimeParams
 void InitPlugin(CustomRuntimeParams*);
 
diff --git a/paddle/fluid/platform/device/device_guard.cc b/paddle/phi/backends/device_guard.cc
similarity index 83%
rename from paddle/fluid/platform/device/device_guard.cc
rename to paddle/phi/backends/device_guard.cc
index 55d8b9dc6a9a58dda5ae8192709e6858da878da7..03eaac1fb1aa15f36c55d466e8a79cea53035e51 100644
--- a/paddle/fluid/platform/device/device_guard.cc
+++ b/paddle/phi/backends/device_guard.cc
@@ -12,11 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/platform/device/device_guard.h"
+#include "paddle/phi/backends/device_guard.h"
 
-namespace paddle {
-namespace platform {
+namespace phi {
 // Even this source file does not contains any code, it is better to keep this
 // source file for cmake dependency.
-}  // namespace platform
-}  // namespace paddle
+}  // namespace phi
diff --git a/paddle/fluid/platform/device/device_guard.h b/paddle/phi/backends/device_guard.h
similarity index 82%
rename from paddle/fluid/platform/device/device_guard.h
rename to paddle/phi/backends/device_guard.h
index 638e9c984b4d25e474fd5949e9fdc5df98a344ef..eb14236d251b34a83e22d56bc7406b21cd5a84c5 100644
--- a/paddle/fluid/platform/device/device_guard.h
+++ b/paddle/phi/backends/device_guard.h
@@ -13,17 +13,16 @@
 // limitations under the License.
 
 #pragma once
-#include "paddle/fluid/platform/device/device_manager.h"
+#include "paddle/phi/backends/device_manager.h"
 
-namespace paddle {
-namespace platform {
+namespace phi {
 
 class DeviceGuard {
  public:
   explicit inline DeviceGuard(const Place& place)
-      : dev_type_(PlaceHelper::GetDeviceType(place)) {
+      : dev_type_(place.GetDeviceType()) {
     prev_id = DeviceManager::GetDevice(dev_type_);
-    cur_id = PlaceHelper::GetDeviceId(place);
+    cur_id = place.GetDeviceId();
 
     if (cur_id != prev_id) {
       DeviceManager::SetDevice(dev_type_, cur_id);
@@ -44,5 +43,4 @@ class DeviceGuard {
   std::string dev_type_;
 };
 
-}  // namespace platform
-}  // namespace paddle
+}  // namespace phi
diff --git a/paddle/fluid/platform/device/device_manager.cc b/paddle/phi/backends/device_manager.cc
similarity index 83%
rename from paddle/fluid/platform/device/device_manager.cc
rename to paddle/phi/backends/device_manager.cc
index e0db97adde8e3500b0c6266464ceca62a6cf36ca..1ffe38d8e1f4ce59aa819a5eaa46c75d5fded5b0 100644
--- a/paddle/fluid/platform/device/device_manager.cc
+++ b/paddle/phi/backends/device_manager.cc
@@ -13,7 +13,7 @@
 // limitations under the License.
 
 #ifdef PADDLE_WITH_CUSTOM_DEVICE
-#include "paddle/fluid/platform/device/device_manager.h"
+#include "paddle/phi/backends/device_manager.h"
 
 #if !defined(_WIN32)
 #include <dirent.h>
@@ -24,8 +24,7 @@
 #include <functional>
 #include <regex>
 
-namespace paddle {
-namespace platform {
+namespace phi {
 
 void Device::CreateStream(stream::Stream* stream,
                           const stream::Stream::Priority& priority,
@@ -76,23 +75,32 @@ void Device::StreamWaitEvent(const stream::Stream* stream,
   impl_->StreamWaitEvent(dev_id_, stream, event);
 }
 
-void Device::MemoryCopyH2D(void* dst, const void* src, size_t size,
+void Device::MemoryCopyH2D(void* dst,
+                           const void* src,
+                           size_t size,
                            const stream::Stream* stream) {
   impl_->MemoryCopyH2D(dev_id_, dst, src, size, stream);
 }
 
-void Device::MemoryCopyD2H(void* dst, const void* src, size_t size,
+void Device::MemoryCopyD2H(void* dst,
+                           const void* src,
+                           size_t size,
                            const stream::Stream* stream) {
   impl_->MemoryCopyD2H(dev_id_, dst, src, size, stream);
 }
 
-void Device::MemoryCopyD2D(void* dst, const void* src, size_t size,
+void Device::MemoryCopyD2D(void* dst,
+                           const void* src,
+                           size_t size,
                            const stream::Stream* stream) {
   impl_->MemoryCopyD2D(dev_id_, dst, src, size, stream);
 }
 
-void Device::MemoryCopyP2P(const Place& dst_place, void* dst, const void* src,
-                           size_t size, const stream::Stream* stream) {
+void Device::MemoryCopyP2P(const Place& dst_place,
+                           void* dst,
+                           const void* src,
+                           size_t size,
+                           const stream::Stream* stream) {
   impl_->MemoryCopyP2P(dst_place, dst, dev_id_, src, size, stream);
 }
 
@@ -173,7 +181,7 @@ DeviceInterface* DeviceManager::GetDeviceInterfaceWithType(
   } else {
     LOG(ERROR) << "GetDeviceInterfaceWithType - " << device_type << " Failed\n";
     PADDLE_THROW(
-        platform::errors::Fatal("Unregistered device type %s.", device_type));
+        phi::errors::Fatal("Unregistered device type %s.", device_type));
     return nullptr;
   }
 }
@@ -182,17 +190,21 @@ Device* DeviceManager::GetDeviceWithPlace(const Place& place) {
   phi::AutoRDLock lock(&_global_device_manager_rw_lock);
 
   auto& dev_map = Instance().device_map_;
-  auto dev_type = PlaceHelper::GetDeviceType(place);
-  auto dev_id = PlaceHelper::GetDeviceId(place);
-  PADDLE_ENFORCE_NE(dev_map.find(dev_type), dev_map.end(),
-                    platform::errors::NotFound(
-                        "Unable to find Device with type %s.", dev_type));
+  auto dev_type = place.GetDeviceType();
+  auto dev_id = place.GetDeviceId();
+  PADDLE_ENFORCE_NE(
+      dev_map.find(dev_type),
+      dev_map.end(),
+      phi::errors::NotFound("Unable to find Device with type %s.", dev_type));
   auto& dev_vec = dev_map[dev_type];
   PADDLE_ENFORCE_LT(
-      dev_id, dev_vec.size(),
-      platform::errors::OutOfRange(
+      dev_id,
+      dev_vec.size(),
+      phi::errors::OutOfRange(
           "The visible devices count of type %s is %d, but dev_id is %d.",
-          dev_type, dev_vec.size(), dev_id));
+          dev_type,
+          dev_vec.size(),
+          dev_id));
   return dev_vec[dev_id].get();
 }
 
@@ -277,22 +289,22 @@ void DeviceManager::Finalize(const std::string& device_type) {
 }
 
 void DeviceManager::SynchronizeDevice(const Place& place) {
-  auto device_type = PlaceHelper::GetDeviceType(place);
-  auto device_id = PlaceHelper::GetDeviceId(place);
+  auto device_type = place.GetDeviceType();
+  auto device_id = place.GetDeviceId();
   auto dev_impl = GetDeviceInterfaceWithType(device_type);
   dev_impl->SynchronizeDevice(device_id);
 }
 
 void DeviceManager::InitDevice(const Place& place) {
-  auto device_type = PlaceHelper::GetDeviceType(place);
-  auto device_id = PlaceHelper::GetDeviceId(place);
+  auto device_type = place.GetDeviceType();
+  auto device_id = place.GetDeviceId();
   auto dev_impl = GetDeviceInterfaceWithType(device_type);
   dev_impl->InitDevice(device_id);
 }
 
 void DeviceManager::DeInitDevice(const Place& place) {
-  auto device_type = PlaceHelper::GetDeviceType(place);
-  auto device_id = PlaceHelper::GetDeviceId(place);
+  auto device_type = place.GetDeviceType();
+  auto device_id = place.GetDeviceId();
   auto dev_impl = GetDeviceInterfaceWithType(device_type);
   dev_impl->DeInitDevice(device_id);
 }
@@ -304,8 +316,8 @@ void DeviceManager::SetDevice(const std::string& device_type,
 }
 
 void DeviceManager::SetDevice(const Place& place) {
-  auto device_type = PlaceHelper::GetDeviceType(place);
-  auto device_id = PlaceHelper::GetDeviceId(place);
+  auto device_type = place.GetDeviceType();
+  auto device_id = place.GetDeviceId();
   DeviceManager::SetDevice(device_type, device_id);
 }
 
@@ -315,51 +327,52 @@ int DeviceManager::GetDevice(const std::string& device_type) {
 }
 
 size_t DeviceManager::GetMinChunkSize(const Place& place) {
-  auto device_type = PlaceHelper::GetDeviceType(place);
-  auto device_id = PlaceHelper::GetDeviceId(place);
+  auto device_type = place.GetDeviceType();
+  auto device_id = place.GetDeviceId();
   auto dev_impl = GetDeviceInterfaceWithType(device_type);
   return dev_impl->GetMinChunkSize(device_id);
 }
 
 size_t DeviceManager::GetMaxChunkSize(const Place& place) {
-  auto device_type = PlaceHelper::GetDeviceType(place);
-  auto device_id = PlaceHelper::GetDeviceId(place);
+  auto device_type = place.GetDeviceType();
+  auto device_id = place.GetDeviceId();
   auto dev_impl = GetDeviceInterfaceWithType(device_type);
   return dev_impl->GetMaxChunkSize(device_id);
 }
 
 size_t DeviceManager::GetMaxAllocSize(const Place& place) {
-  auto device_type = PlaceHelper::GetDeviceType(place);
-  auto device_id = PlaceHelper::GetDeviceId(place);
+  auto device_type = place.GetDeviceType();
+  auto device_id = place.GetDeviceId();
   auto dev_impl = GetDeviceInterfaceWithType(device_type);
   return dev_impl->GetMaxAllocSize(device_id);
 }
 
 size_t DeviceManager::GetInitAllocSize(const Place& place) {
-  auto device_type = PlaceHelper::GetDeviceType(place);
-  auto device_id = PlaceHelper::GetDeviceId(place);
+  auto device_type = place.GetDeviceType();
+  auto device_id = place.GetDeviceId();
   auto dev_impl = GetDeviceInterfaceWithType(device_type);
   return dev_impl->GetInitAllocSize(device_id);
 }
 
 size_t DeviceManager::GetReallocSize(const Place& place) {
-  auto device_type = PlaceHelper::GetDeviceType(place);
-  auto device_id = PlaceHelper::GetDeviceId(place);
+  auto device_type = place.GetDeviceType();
+  auto device_id = place.GetDeviceId();
   auto dev_impl = GetDeviceInterfaceWithType(device_type);
   return dev_impl->GetReallocSize(device_id);
 }
 
 size_t DeviceManager::GetExtraPaddingSize(const Place& place) {
-  auto device_type = PlaceHelper::GetDeviceType(place);
-  auto device_id = PlaceHelper::GetDeviceId(place);
+  auto device_type = place.GetDeviceType();
+  auto device_id = place.GetDeviceId();
   auto dev_impl = GetDeviceInterfaceWithType(device_type);
   return dev_impl->GetExtraPaddingSize(device_id);
 }
 
-void DeviceManager::MemoryStats(const Place& place, size_t* total,
+void DeviceManager::MemoryStats(const Place& place,
+                                size_t* total,
                                 size_t* free) {
-  auto device_type = PlaceHelper::GetDeviceType(place);
-  auto device_id = PlaceHelper::GetDeviceId(place);
+  auto device_type = place.GetDeviceType();
+  auto device_id = place.GetDeviceId();
   auto dev_impl = GetDeviceInterfaceWithType(device_type);
   dev_impl->MemoryStats(device_id, total, free);
 }
@@ -393,8 +406,8 @@ std::vector<std::string> ListAllLibraries(const std::string& library_dir) {
   } else {
     while ((ptr = readdir(dir)) != nullptr) {
       std::string filename(ptr->d_name);
-      if (std::regex_match(filename.begin(), filename.end(), results,
-                           express)) {
+      if (std::regex_match(
+              filename.begin(), filename.end(), results, express)) {
         libraries.push_back(library_dir + '/' + filename);
         VLOG(4) << "Found lib: " << libraries.back();
       }
@@ -405,6 +418,5 @@ std::vector<std::string> ListAllLibraries(const std::string& library_dir) {
   return libraries;
 }
 
-}  // namespace platform
-}  // namespace paddle
+}  // namespace phi
 #endif
diff --git a/paddle/fluid/platform/device/device_manager.h b/paddle/phi/backends/device_manager.h
similarity index 83%
rename from paddle/fluid/platform/device/device_manager.h
rename to paddle/phi/backends/device_manager.h
index d3aaafcddf7c40f2b2c8fe4ce9057a04b4d6f9a5..c0911a0f8d50c52697b748f3726faded5a428694 100644
--- a/paddle/fluid/platform/device/device_manager.h
+++ b/paddle/phi/backends/device_manager.h
@@ -15,17 +15,16 @@
 #pragma once
 #ifdef PADDLE_WITH_CUSTOM_DEVICE
 
-#include "paddle/fluid/platform/device/device_base.h"
-#include "paddle/fluid/platform/device/device_ext.h"
-#include "paddle/fluid/platform/device/event.h"
-#include "paddle/fluid/platform/device/stream.h"
-#include "paddle/fluid/platform/place.h"
+#include "paddle/phi/backends/device_base.h"
+#include "paddle/phi/backends/device_ext.h"
+#include "paddle/phi/backends/event.h"
+#include "paddle/phi/backends/stream.h"
+#include "paddle/phi/common/place.h"
 
 #include "paddle/phi/backends/dynload/port.h"
 #include "paddle/phi/core/utils/rw_lock.h"
 
-namespace paddle {
-namespace platform {
+namespace phi {
 class Device final {
  public:
   Device(size_t dev_id, DeviceInterface* impl) : dev_id_(dev_id), impl_(impl) {}
@@ -33,8 +32,9 @@ class Device final {
   // Stream
   // ! Create an asynchronous stream
   void CreateStream(
-      stream::Stream* stream, const stream::Stream::Priority& priority =
-                                  stream::Stream::Priority::kNormal,
+      stream::Stream* stream,
+      const stream::Stream::Priority& priority =
+          stream::Stream::Priority::kNormal,
       const stream::Stream::Flag& flag = stream::Stream::Flag::kDefaultFlag);
 
   // ! Destroys an asynchronous stream.
@@ -69,17 +69,26 @@ class Device final {
   void StreamWaitEvent(const stream::Stream* stream, const event::Event* event);
 
   // Memory
-  void MemoryCopyH2D(void* dst, const void* src, size_t size,
+  void MemoryCopyH2D(void* dst,
+                     const void* src,
+                     size_t size,
                      const stream::Stream* stream = nullptr);
 
-  void MemoryCopyD2H(void* dst, const void* src, size_t size,
+  void MemoryCopyD2H(void* dst,
+                     const void* src,
+                     size_t size,
                      const stream::Stream* stream = nullptr);
 
-  void MemoryCopyD2D(void* dst, const void* src, size_t size,
+  void MemoryCopyD2D(void* dst,
+                     const void* src,
+                     size_t size,
                      const stream::Stream* stream = nullptr);
 
-  void MemoryCopyP2P(const Place& dst_place, void* dst, const void* src,
-                     size_t size, const stream::Stream* stream = nullptr);
+  void MemoryCopyP2P(const Place& dst_place,
+                     void* dst,
+                     const void* src,
+                     size_t size,
+                     const stream::Stream* stream = nullptr);
 
   void* MemoryAllocate(size_t size);
 
@@ -168,7 +177,8 @@ void LoadCustomRuntimeLib(const std::string& dso_lib_path, void* dso_handle);
 
 void LoadCustomRuntimeLib(const CustomRuntimeParams& runtime_params,
                           std::unique_ptr<C_DeviceInterface> device_interface,
-                          const std::string& dso_lib_path, void* dso_handle);
+                          const std::string& dso_lib_path,
+                          void* dso_handle);
 
 class Registrar {
  public:
@@ -180,7 +190,6 @@ class Registrar {
   void Touch() {}
 };
 
-}  // namespace platform
-}  // namespace paddle
+}  // namespace phi
 
 #endif
diff --git a/paddle/phi/backends/dynload/lapack.h b/paddle/phi/backends/dynload/lapack.h
index 75fc8fd9a3c6e4f4eed24f49ea4332aa2cfdf0a0..c81c66c69282fac3c8bbf4ef5e30c50c8592b1b0 100644
--- a/paddle/phi/backends/dynload/lapack.h
+++ b/paddle/phi/backends/dynload/lapack.h
@@ -20,8 +20,8 @@ limitations under the License. */
 #include "paddle/phi/backends/dynload/dynamic_loader.h"
 #include "paddle/phi/backends/dynload/port.h"
 
-// Note(zhouwei): because lapack doesn't provide appropriate header file.
-// should expose API statement yourself.
+// Because lapack doesn't provide appropriate header file,
+// we should expose API statement yourself.
 
 // getrf_(For example)
 extern "C" void dgetrf_(
diff --git a/paddle/fluid/platform/device/event.cc b/paddle/phi/backends/event.cc
similarity index 84%
rename from paddle/fluid/platform/device/event.cc
rename to paddle/phi/backends/event.cc
index 6e6316ea16de020801a7afce6ad47f4b06eca022..a474536f865c16e9f808f49a2e24b26f102c75d1 100644
--- a/paddle/fluid/platform/device/event.cc
+++ b/paddle/phi/backends/event.cc
@@ -12,13 +12,12 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/platform/device/event.h"
-#include "paddle/fluid/platform/device/device_guard.h"
+#include "paddle/phi/backends/event.h"
 #include "paddle/fluid/platform/device/device_wrapper.h"
-#include "paddle/fluid/platform/device/stream.h"
+#include "paddle/phi/backends/device_guard.h"
+#include "paddle/phi/backends/stream.h"
 
-namespace paddle {
-namespace platform {
+namespace phi {
 namespace event {
 
 event_t Event::raw_event() const { return event_; }
@@ -27,7 +26,7 @@ void Event::set_event(event_t event) { event_ = event; }
 
 Event::Event(const Place& place, event_t event)
     : place_(place),
-      device_(platform::DeviceManager::GetDeviceWithPlace(place)),
+      device_(phi::DeviceManager::GetDeviceWithPlace(place)),
       event_(event),
       own_data_(false) {}
 
@@ -60,5 +59,4 @@ void Event::Synchonrize() const { device_->SynchronizeEvent(this); }
 const Place& Event::GetPlace() const { return place_; }
 
 }  // namespace event
-}  // namespace platform
-}  // namespace paddle
+}  // namespace phi
diff --git a/paddle/fluid/platform/device/event.h b/paddle/phi/backends/event.h
similarity index 91%
rename from paddle/fluid/platform/device/event.h
rename to paddle/phi/backends/event.h
index 376d73eb66660fdcdc0b2412d5d5e1371145e634..0866adcf39afa6259ccc424d93eb086427b39679 100644
--- a/paddle/fluid/platform/device/event.h
+++ b/paddle/phi/backends/event.h
@@ -13,10 +13,10 @@
 // limitations under the License.
 
 #pragma once
-#include "paddle/fluid/platform/place.h"
+#include "paddle/phi/common/place.h"
+#include "paddle/phi/core/macros.h"
 
-namespace paddle {
-namespace platform {
+namespace phi {
 
 class Device;
 
@@ -57,5 +57,4 @@ class Event {
 };
 }  // namespace event
 
-}  // namespace platform
-}  // namespace paddle
+}  // namespace phi
diff --git a/paddle/phi/backends/gpu/forwards.h b/paddle/phi/backends/gpu/forwards.h
index d0787159e1e308e2aed9b5b914d37d8a56e5e68b..33daa2bba6b7d9fba4b0815a86351da2947efd92 100644
--- a/paddle/phi/backends/gpu/forwards.h
+++ b/paddle/phi/backends/gpu/forwards.h
@@ -1,4 +1,5 @@
 /* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+Copyright (c) 2022 NVIDIA Corporation. All rights reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -56,6 +57,9 @@ using cudnnFusedOpsPlan_t = struct cudnnFusedOpsPlanStruct *;
 // Forward declaration of cuBLAS types.
 using cublasHandle_t = struct cublasContext *;
 
+// Forward declaration of cuBLASLt types.
+using cublasLtHandle_t = struct cublasLtContext *;
+
 // Forward declaration of cuSOLVER types.
 using cusolverDnHandle_t = struct cusolverDnContext *;
 
diff --git a/paddle/phi/backends/gpu/gpu_context.cc b/paddle/phi/backends/gpu/gpu_context.cc
index dbcc1660c6472cdddaaa3bea72854f61370c19a0..a3b252598582bc212ba66f9c18ec52e035a29a68 100644
--- a/paddle/phi/backends/gpu/gpu_context.cc
+++ b/paddle/phi/backends/gpu/gpu_context.cc
@@ -1,4 +1,5 @@
 /* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+Copyright (c) 2022 NVIDIA Corporation. All rights reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -171,6 +172,7 @@ struct GPUContext::Impl {
     InitStream();
     InitEigenDevice();
     InitBlasHandle();
+    InitBlasLtHandle();
     InitDNNHandle();
     InitSolverHandle();
     InitSparseHandle();
@@ -183,6 +185,7 @@ struct GPUContext::Impl {
     InitGpuProperties();
     InitStream();
     InitBlasHandle();
+    InitBlasLtHandle();
     InitDNNHandle();
     InitSolverHandle();
     InitSparseHandle();
@@ -212,6 +215,7 @@ struct GPUContext::Impl {
     }
 #endif
     DestroyInternalBlasHandle();
+    DestroyInternalBlasLtHandle();
     DestoryInternalStream();
   }
 
@@ -418,6 +422,25 @@ struct GPUContext::Impl {
 
   void SetBlasHandle(blasHandle_t blas) { blas_handle_ = blas; }
 
+  void InitBlasLtHandle() {
+#if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 11060
+    phi::dynload::cublasLtCreate(&blaslt_handle_);
+#endif
+  }
+
+  void DestroyInternalBlasLtHandle() {
+#if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 11060
+    phi::dynload::cublasLtDestroy(blaslt_handle_);
+#endif
+  }
+
+  void SetBlasLtHandle(blasLtHandle_t blaslt) { blaslt_handle_ = blaslt; }
+
+  blasLtHandle_t GetBlasLtHandle() const {
+    PD_CHECK(blaslt_handle_ != nullptr, "the gpu blasLt handle is nullptr.");
+    return blaslt_handle_;
+  }
+
   void InitDNNHandle() {
     if (phi::dynload::HasCUDNN()) {
 #ifdef PADDLE_WITH_HIP
@@ -631,10 +654,17 @@ struct GPUContext::Impl {
   }
 
   void AddStreamCallback(const std::function<void()>& callback) const {
-    // TODO(wilber): Do we need ThreadPool?
-    auto* func = new std::function<void()>([this, callback] {
+    // NOTE(zhiqiu): better use threadpool here, otherwise "std::async" may
+    // launch too
+    // many threads and result in thread oversubscription.
+    auto* callback_func = new std::function<void()>(std::move(callback));
+    auto* func = new std::function<void()>([this, callback_func] {
       std::lock_guard<std::mutex> lock(stream_call_back_mtx_);
-      last_future_ = std::async(std::launch::deferred, [&]() { callback(); });
+      VLOG(4) << "Stream callback";
+      last_future_ = std::async(std::launch::async, [callback_func]() {
+        std::unique_ptr<std::function<void()>> releaser(callback_func);
+        (*callback_func)();
+      });
     });
 
 #ifdef PADDLE_WITH_HIP
@@ -679,6 +709,7 @@ struct GPUContext::Impl {
   blasHandle_t blas_handle_{nullptr};
   blasHandle_t blas_tensor_core_handle_{nullptr};
   blasHandle_t blas_tf32_tensor_core_handle_{nullptr};
+  blasLtHandle_t blaslt_handle_{nullptr};
   dnnHandle_t dnn_handle_{nullptr};
   solverHandle_t solver_handle_{nullptr};
   sparseHandle_t sparse_handle_{nullptr};
@@ -725,6 +756,10 @@ blasHandle_t GPUContext::cublas_handle() const {
   return impl_->GetBlasHandle();
 }
 
+blasLtHandle_t GPUContext::cublaslt_handle() const {
+  return impl_->GetBlasLtHandle();
+}
+
 solverHandle_t GPUContext::cusolver_dn_handle() const {
   return impl_->GetSolverHandle();
 }
@@ -815,6 +850,10 @@ void GPUContext::SetBlasHandle(blasHandle_t blas) {
   impl_->SetBlasHandle(blas);
 }
 
+void GPUContext::SetBlasLtHandle(blasLtHandle_t blaslt) {
+  impl_->SetBlasLtHandle(blaslt);
+}
+
 void GPUContext::SetDnnHandle(dnnHandle_t handle) {
   impl_->SetDnnHandle(handle);
 }
diff --git a/paddle/phi/backends/gpu/gpu_context.h b/paddle/phi/backends/gpu/gpu_context.h
index b9d843982dc5ebb8312a4912ebfa96c73e22b6c5..3eb4360ad35382369681308b46050cc3e6e04ea0 100644
--- a/paddle/phi/backends/gpu/gpu_context.h
+++ b/paddle/phi/backends/gpu/gpu_context.h
@@ -1,4 +1,5 @@
 /* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+Copyright (c) 2022 NVIDIA Corporation. All rights reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -93,6 +94,9 @@ class GPUContext : public DeviceContext {
   /*! \brief  Return cublas handle in the device context. */
   blasHandle_t cublas_handle() const;
 
+  /*! \brief  Return cublasLt handle in the device context. */
+  blasLtHandle_t cublaslt_handle() const;
+
   /*! \brief  Return cusolver handle in the device context. */
   solverHandle_t cusolver_dn_handle() const;
 
@@ -193,6 +197,8 @@ class GPUContext : public DeviceContext {
 
   void SetBlasHandle(blasHandle_t);
 
+  void SetBlasLtHandle(blasLtHandle_t);
+
   void SetDnnHandle(dnnHandle_t);
 
   void SetSolverHandle(solverHandle_t);
diff --git a/paddle/phi/backends/gpu/gpu_decls.h b/paddle/phi/backends/gpu/gpu_decls.h
index 0be24392e1b40c5955d01f5d4c1834a2f46ce29d..4a6b9d2fd87f13e4b63d3da8e7e98f77d18c69a0 100644
--- a/paddle/phi/backends/gpu/gpu_decls.h
+++ b/paddle/phi/backends/gpu/gpu_decls.h
@@ -1,4 +1,5 @@
 // Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+// Copyright (c) 2022 NVIDIA Corporation. All rights reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -59,6 +60,10 @@ DECLARE_TYPE_FOR_GPU(dnnHandle_t, cudnnHandle_t, miopenHandle_t);
 
 DECLARE_TYPE_FOR_GPU(blasHandle_t, cublasHandle_t, rocblas_handle);
 
+// TODO(Ming Huang): Since there is no blasLt handler,
+// use rocblas_handle for workround.
+DECLARE_TYPE_FOR_GPU(blasLtHandle_t, cublasLtHandle_t, rocblas_handle);
+
 DECLARE_TYPE_FOR_GPU(solverHandle_t, cusolverDnHandle_t, rocsolver_handle);
 
 DECLARE_TYPE_FOR_GPU(sparseHandle_t, cusparseHandle_t, rocsparse_handle);
diff --git a/paddle/fluid/platform/device/stream.cc b/paddle/phi/backends/stream.cc
similarity index 84%
rename from paddle/fluid/platform/device/stream.cc
rename to paddle/phi/backends/stream.cc
index 7f867e5ee7737d45f26a1967a3112c7075843454..30939f31fcc3c80931cf627451f46cb620dc21df 100644
--- a/paddle/fluid/platform/device/stream.cc
+++ b/paddle/phi/backends/stream.cc
@@ -12,13 +12,12 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/platform/device/stream.h"
-#include "paddle/fluid/platform/device/device_guard.h"
+#include "paddle/phi/backends/stream.h"
 #include "paddle/fluid/platform/device/device_wrapper.h"
-#include "paddle/fluid/platform/device/event.h"
+#include "paddle/phi/backends/device_guard.h"
+#include "paddle/phi/backends/event.h"
 
-namespace paddle {
-namespace platform {
+namespace phi {
 namespace stream {
 
 Stream::~Stream() { Destroy(); }
@@ -30,15 +29,16 @@ void Stream::set_stream(stream_t stream) { stream_ = stream; }
 // For compatiable
 Stream::Stream(const Place& place, stream_t stream)
     : place_(place),
-      device_(platform::DeviceManager::GetDeviceWithPlace(place)),
+      device_(phi::DeviceManager::GetDeviceWithPlace(place)),
       stream_(stream),
       callback_manager_(new CallbackManager(this)),
       own_data_(false) {}
 
-bool Stream::Init(const Place& place, const Priority& priority,
+bool Stream::Init(const Place& place,
+                  const Priority& priority,
                   const Flag& flag) {
   place_ = place;
-  device_ = platform::DeviceManager::GetDeviceWithPlace(place);
+  device_ = phi::DeviceManager::GetDeviceWithPlace(place);
   DeviceGuard guard(place_);
   device_->CreateStream(this, priority, flag);
 
@@ -92,5 +92,4 @@ void Stream::Synchronize() const { device_->SynchronizeStream(this); }
 const Place& Stream::GetPlace() const { return place_; }
 
 }  // namespace stream
-}  // namespace platform
-}  // namespace paddle
+}  // namespace phi
diff --git a/paddle/fluid/platform/device/stream.h b/paddle/phi/backends/stream.h
similarity index 87%
rename from paddle/fluid/platform/device/stream.h
rename to paddle/phi/backends/stream.h
index 25cf705ee0951847bfda84b336d3579403e8ab37..d1578c90ec1a97f51646dd206d48df3146c35cee 100644
--- a/paddle/fluid/platform/device/stream.h
+++ b/paddle/phi/backends/stream.h
@@ -14,11 +14,11 @@
 
 #pragma once
 
-#include "paddle/fluid/platform/device/callback_manager.h"
-#include "paddle/fluid/platform/place.h"
+#include "paddle/phi/backends/callback_manager.h"
+#include "paddle/phi/common/place.h"
+#include "paddle/phi/core/macros.h"
 
-namespace paddle {
-namespace platform {
+namespace phi {
 
 class Device;
 
@@ -49,7 +49,8 @@ class Stream {
   ~Stream();
   const stream_t& raw_stream() const;
   void set_stream(stream_t stream);
-  bool Init(const Place& place, const Priority& priority = Priority::kNormal,
+  bool Init(const Place& place,
+            const Priority& priority = Priority::kNormal,
             const Flag& flag = Flag::kDefaultFlag);
   template <typename Callback>
   void AddCallback(Callback&& callback) const {
@@ -75,5 +76,4 @@ class Stream {
 };
 
 }  // namespace stream
-}  // namespace platform
-}  // namespace paddle
+}  // namespace phi
diff --git a/paddle/phi/common/CMakeLists.txt b/paddle/phi/common/CMakeLists.txt
index 85a1424ee34e04b50a077f5d8ac88d0a0d2fbe78..0947870dcd35a689d7ebfde5ddcab12c361358e9 100644
--- a/paddle/phi/common/CMakeLists.txt
+++ b/paddle/phi/common/CMakeLists.txt
@@ -1 +1,2 @@
 cc_library(phi_place SRCS place.cc)
+cc_library(scalar SRCS scalar.cc)
diff --git a/paddle/phi/common/bfloat16.h b/paddle/phi/common/bfloat16.h
index cf99bb8f19af0516ed5524c1d2af89777c1e1d0b..5f30ee4077b5c615d73a7446fff211a6c86d69b6 100644
--- a/paddle/phi/common/bfloat16.h
+++ b/paddle/phi/common/bfloat16.h
@@ -310,6 +310,10 @@ HOSTDEVICE inline bool(isfinite)(const bfloat16& a) {
   return !((isnan)(a)) && !((isinf)(a));
 }
 
+HOSTDEVICE inline bfloat16(abs)(const bfloat16& a) {
+  return bfloat16(std::abs(static_cast<float>(a)));
+}
+
 inline std::ostream& operator<<(std::ostream& os, const bfloat16& a) {
   os << static_cast<float>(a);
   return os;
diff --git a/paddle/phi/common/data_type.h b/paddle/phi/common/data_type.h
index d9dc103e48e737c1e88f6ae262566523494bff84..38239f0fa9dc1c2c2c5f67645745a1b0bf0281ac 100644
--- a/paddle/phi/common/data_type.h
+++ b/paddle/phi/common/data_type.h
@@ -82,7 +82,7 @@ inline size_t SizeOf(DataType data_type) {
   return 0;
 }
 
-#define PT_FOR_EACH_DATA_TYPE(_)    \
+#define PD_FOR_EACH_DATA_TYPE(_)    \
   _(bool, DataType::BOOL)           \
   _(int8_t, DataType::INT8)         \
   _(uint8_t, DataType::UINT8)       \
@@ -105,25 +105,25 @@ struct DataTypeToCppType;
 template <typename T>
 struct CppTypeToDataType;
 
-#define PT_SPECIALIZE_DataTypeToCppType(cpp_type, data_type) \
+#define PD_SPECIALIZE_DataTypeToCppType(cpp_type, data_type) \
   template <>                                                \
   struct DataTypeToCppType<data_type> {                      \
     using type = cpp_type;                                   \
   };
 
-PT_FOR_EACH_DATA_TYPE(PT_SPECIALIZE_DataTypeToCppType)
+PD_FOR_EACH_DATA_TYPE(PD_SPECIALIZE_DataTypeToCppType)
 
-#undef PT_SPECIALIZE_DataTypeToCppType
+#undef PD_SPECIALIZE_DataTypeToCppType
 
-#define PT_SPECIALIZE_CppTypeToDataType(cpp_type, data_type) \
+#define PD_SPECIALIZE_CppTypeToDataType(cpp_type, data_type) \
   template <>                                                \
   struct CppTypeToDataType<cpp_type> {                       \
     constexpr static DataType Type() { return data_type; }   \
   };
 
-PT_FOR_EACH_DATA_TYPE(PT_SPECIALIZE_CppTypeToDataType)
+PD_FOR_EACH_DATA_TYPE(PD_SPECIALIZE_CppTypeToDataType)
 
-#undef PT_SPECIALIZE_CppTypeToDataType
+#undef PD_SPECIALIZE_CppTypeToDataType
 
 inline std::ostream& operator<<(std::ostream& os, DataType dtype) {
   switch (dtype) {
diff --git a/paddle/phi/common/scalar.cc b/paddle/phi/common/scalar.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5cd55c1e88bed6f805a72cff92024d9dc219a1a2
--- /dev/null
+++ b/paddle/phi/common/scalar.cc
@@ -0,0 +1,35 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/phi/common/scalar.h"
+
+#include "paddle/phi/core/enforce.h"
+
+namespace paddle {
+namespace experimental {
+
+// NOTE(xiongkun): why we put definition here?
+// test_custom_op can't include enforce.h, because enforce.h includes gflags.
+// so we decouple the include dependence of enforce.h by link.
+void ThrowTensorConvertError(int num) {
+  PADDLE_ENFORCE_EQ(num,
+                    1,
+                    phi::errors::InvalidArgument(
+                        "The Scalar only supports Tensor with 1 element, but "
+                        "now Tensor has `%d` elements",
+                        num));
+}
+
+}  // namespace experimental
+}  // namespace paddle
diff --git a/paddle/phi/common/scalar.h b/paddle/phi/common/scalar.h
index 72cef89d300c8d60811bde7cf667275b37fedc6f..5134f4eb72639650a0bde34f2abbb0e05ced13c7 100644
--- a/paddle/phi/common/scalar.h
+++ b/paddle/phi/common/scalar.h
@@ -19,9 +19,12 @@ limitations under the License. */
 
 #include "paddle/phi/api/ext/exception.h"
 #include "paddle/phi/api/include/tensor.h"
+
 namespace paddle {
 namespace experimental {
 
+void ThrowTensorConvertError(int);
+
 template <typename T>
 class ScalarBase {
  public:
@@ -104,11 +107,7 @@ class ScalarBase {
   // The Tensor must have one dim
   ScalarBase(const T& tensor) : dtype_(tensor.dtype()) {  // NOLINT
     is_from_tensor_ = true;
-    PD_CHECK(
-        tensor.numel() == 1,
-        "The Scalar only supports Tensor with 1 element, but now Tensor has `",
-        tensor.numel(),
-        "` element.");
+    ThrowTensorConvertError(tensor.numel());
     switch (dtype_) {
       case DataType::FLOAT32:
         data_.f32 = tensor.template data<float>()[0];
@@ -156,6 +155,8 @@ class ScalarBase {
     CopyScalar(other, this);
   }
 
+  // NOTE(xiongkun): some op need to judge the dtype of the Scalar, we expose a
+  // interface.
   bool FromTensor() const { return is_from_tensor_; }
 
   void SetFromTensor(bool from_tensor) { is_from_tensor_ = from_tensor; }
diff --git a/paddle/phi/core/CMakeLists.txt b/paddle/phi/core/CMakeLists.txt
index 8ffacbb39bb249c57fb5c9ef1462d03747356f96..b4a6b54d0fe3a96eea831a439f1555e14c367fa7 100644
--- a/paddle/phi/core/CMakeLists.txt
+++ b/paddle/phi/core/CMakeLists.txt
@@ -25,7 +25,7 @@ cc_library(infermeta_utils SRCS infermeta_utils.cc DEPS meta_tensor)
 cc_library(selected_rows SRCS selected_rows_impl.cc DEPS dense_tensor phi_enforce ddim memcpy)
 cc_library(phi_device_context SRCS device_context.cc DEPS dense_tensor selected_rows)
 
-cc_library(phi_custom_kernel SRCS custom_kernel.cc DEPS kernel_factory convert_utils)
+cc_library(custom_kernel SRCS custom_kernel.cc DEPS kernel_factory)
 
 # Will remove once we implemented MKLDNN_Tensor
 if(WITH_MKLDNN)
diff --git a/paddle/phi/core/compat/arg_map_context.h b/paddle/phi/core/compat/arg_map_context.h
index f625d57df2ef2dc2f9505853dc5e07e5d9e0022e..25b80279ecf10619d97b8800b24ab5353c79745d 100644
--- a/paddle/phi/core/compat/arg_map_context.h
+++ b/paddle/phi/core/compat/arg_map_context.h
@@ -18,6 +18,7 @@ limitations under the License. */
 #include <string>
 #include <tuple>
 
+#include "paddle/phi/common/place.h"
 #include "paddle/utils/any.h"
 #include "paddle/utils/flat_hash_map.h"
 #include "paddle/utils/small_vector.h"
@@ -95,6 +96,13 @@ class ArgumentMappingContext {
   // use this function to mark it comes from InferShapeArgumentMappingContext
   // and will be used in infershape
   virtual bool IsForInferShape() const = 0;
+
+  // NOTE(paddle-dev): [ Why do we export this interface? ]
+  // In old Fluid framework, some operators' Attribute can be a Tensor or
+  // TensorList. In this case, the InferShape logic will be different
+  // under CompileTime and RuntimeTime. So we export this interface to
+  // handle it conveniently. See "gaussian_random_sig.cc" for details.
+  virtual bool IsRuntime() const { return true; }
 };
 
 }  // namespace phi
diff --git a/paddle/phi/core/compat/convert_utils.cc b/paddle/phi/core/compat/convert_utils.cc
index b85db07bd9dfa0d798304aac6bd86089a9f0b4c0..67245f1da5a6b65cdf81d2cd329d7d6b3828852e 100644
--- a/paddle/phi/core/compat/convert_utils.cc
+++ b/paddle/phi/core/compat/convert_utils.cc
@@ -20,7 +20,7 @@ limitations under the License. */
 #include "paddle/phi/core/compat/op_utils.h"
 
 #ifdef PADDLE_WITH_CUSTOM_DEVICE
-#include "paddle/fluid/platform/device/device_manager.h"
+#include "paddle/phi/backends/device_manager.h"
 #endif
 
 namespace phi {
@@ -83,9 +83,7 @@ phi::Place TransToPhiPlace(const Backend& backend, bool set_device_id) {
       if (!device_type.empty()) {
         return phi::CustomPlace(
             device_type,
-            set_device_id
-                ? paddle::platform::DeviceManager::GetDevice(device_type)
-                : 0);
+            set_device_id ? phi::DeviceManager::GetDevice(device_type) : 0);
       }
 #endif
       PADDLE_THROW(phi::errors::Unimplemented(
diff --git a/paddle/phi/core/compat/op_utils.h b/paddle/phi/core/compat/op_utils.h
index bbf634b4b09b90a086505bc173b588d7da2e9668..b1da573c49f2f20c6b25beae189fe5952efd3cef 100644
--- a/paddle/phi/core/compat/op_utils.h
+++ b/paddle/phi/core/compat/op_utils.h
@@ -40,15 +40,28 @@ const std::unordered_set<std::string> standard_kernel_suffixs({
 const std::unordered_set<std::string> deprecated_op_names({"diag",
                                                            "flatten",
                                                            "flatten_grad",
+                                                           "isinf",
+                                                           "isnan",
+                                                           "isfinite",
                                                            "matmul",
                                                            "matmul_grad",
                                                            "matmul_grad_grad",
                                                            "mean",
+                                                           "max",
+                                                           "min",
+                                                           "any",
+                                                           "all",
                                                            "reshape",
                                                            "reshape_grad",
                                                            "expand",
+                                                           "expand_as",
                                                            "expand_grad",
-                                                           "sum"});
+                                                           "expand_as_grad",
+                                                           "sum",
+                                                           "one_hot",
+                                                           "sum_grad",
+                                                           "top_k",
+                                                           "top_k_grad"});
 
 class DefaultKernelSignatureMap {
  public:
@@ -166,7 +179,7 @@ struct ArgumentMappingFnRegistrar {
 };
 
 #define PD_REGISTER_BASE_KERNEL_NAME(op_type, base_kernel_name)                \
-  PT_STATIC_ASSERT_GLOBAL_NAMESPACE(                                           \
+  PD_STATIC_ASSERT_GLOBAL_NAMESPACE(                                           \
       PD_REGISTER_base_kernel_name_ns_check_##op_type,                         \
       "PD_REGISTER_BASE_KERNEL_NAME must be called in global namespace.");     \
   static const ::phi::BaseKernelNameRegistrar                                  \
@@ -174,7 +187,7 @@ struct ArgumentMappingFnRegistrar {
   int TouchBaseKernelNameSymbol_##op_type() { return 0; }
 
 #define PD_DECLARE_BASE_KERNEL_NAME(op_type)                              \
-  PT_STATIC_ASSERT_GLOBAL_NAMESPACE(                                      \
+  PD_STATIC_ASSERT_GLOBAL_NAMESPACE(                                      \
       PD_DECLARE_ai_name_ns_check_##op_type,                              \
       "PD_DECLARE_BASE_KERNEL_NAME must be called in global namespace."); \
   extern int TouchBaseKernelNameSymbol_##op_type();                       \
@@ -182,7 +195,7 @@ struct ArgumentMappingFnRegistrar {
       TouchBaseKernelNameSymbol_##op_type()
 
 #define PD_REGISTER_ARG_MAPPING_FN(op_type, arg_mapping_fn)              \
-  PT_STATIC_ASSERT_GLOBAL_NAMESPACE(                                     \
+  PD_STATIC_ASSERT_GLOBAL_NAMESPACE(                                     \
       PD_REGISTER_arg_map_fn_ns_check_##op_type,                         \
       "PD_REGISTER_ARG_MAPPING_FN must be called in global namespace."); \
   static const ::phi::ArgumentMappingFnRegistrar                         \
@@ -190,7 +203,7 @@ struct ArgumentMappingFnRegistrar {
   int TouchArgumentMappingFnSymbol_##op_type() { return 0; }
 
 #define PD_DECLARE_ARG_MAPPING_FN(op_type)                              \
-  PT_STATIC_ASSERT_GLOBAL_NAMESPACE(                                    \
+  PD_STATIC_ASSERT_GLOBAL_NAMESPACE(                                    \
       PD_DECLARE_arg_map_fn_ns_check_##op_type,                         \
       "PD_DECLARE_ARG_MAPPING_FN must be called in global namespace."); \
   extern int TouchArgumentMappingFnSymbol_##op_type();                  \
diff --git a/paddle/phi/core/custom_kernel.cc b/paddle/phi/core/custom_kernel.cc
index 58f9e1c623e81b4f2877099d1cdc2a8fe2e18b9e..48778bb38e5487506f4b402176fff26cbe485de7 100644
--- a/paddle/phi/core/custom_kernel.cc
+++ b/paddle/phi/core/custom_kernel.cc
@@ -16,12 +16,29 @@
 
 namespace phi {
 
-void RegisterCustomKernels(const CustomKernelMap& custom_kernel_map) {
-  auto& kernel_info_map = custom_kernel_map.GetMap();
-  VLOG(3) << "Size of custom_kernel_map: " << kernel_info_map.size();
+void CustomKernelMap::RegisterCustomKernel(const std::string& name,
+                                           const KernelKey& key,
+                                           const Kernel& kernel) {
+  PADDLE_ENFORCE_EQ(kernels_[name].find(key),
+                    kernels_[name].end(),
+                    phi::errors::AlreadyExists(
+                        "The custom kernel [%s:%s] has been already existed in "
+                        "CustomKernelMap, please check if any duplicate kernel "
+                        "info in your lib(s) before load again.",
+                        name,
+                        key));
+  kernels_[name][key] = kernel;
+}
+
+void CustomKernelMap::RegisterCustomKernels() {
+  VLOG(3) << "Size of custom_kernel_map: " << kernels_.size();
 
+  if (kernels_.size() <= 0) {
+    LOG(INFO) << "No custom kernel info found in loaded lib(s).";
+    return;
+  }
   auto& kernels = KernelFactory::Instance().kernels();
-  for (auto& pair : kernel_info_map) {
+  for (auto& pair : kernels_) {
     PADDLE_ENFORCE_NE(
         kernels.find(pair.first),
         kernels.end(),
@@ -33,8 +50,8 @@ void RegisterCustomKernels(const CustomKernelMap& custom_kernel_map) {
       PADDLE_ENFORCE_EQ(
           kernels[pair.first].find(info_pair.first),
           kernels[pair.first].end(),
-          phi::errors::InvalidArgument(
-              "The operator <%s>'s kernel: %s has been already existed "
+          phi::errors::AlreadyExists(
+              "The kernel [%s:%s] has been already existed "
               "in Paddle, please contribute PR if it is necessary "
               "to optimize the kernel code. Custom kernel does NOT support "
               "to replace existing kernel in Paddle.",
@@ -43,24 +60,15 @@ void RegisterCustomKernels(const CustomKernelMap& custom_kernel_map) {
 
       kernels[pair.first][info_pair.first] = info_pair.second;
 
-      VLOG(3) << "Successed in registering operator <" << pair.first
-              << ">'s kernel: " << info_pair.first
-              << " to Paddle. It will be used like native ones.";
+      VLOG(3) << "Successed in registering kernel [" << pair.first << ":"
+              << info_pair.first
+              << "] to Paddle. It will be used like native ones.";
     }
   }
+  LOG(INFO) << "Successed in loading " << kernels_.size()
+            << " custom kernel(s) from loaded lib(s), will be "
+            << "used like native ones.";
+  kernels_.clear();
 }
 
 }  // namespace phi
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-// C-API to get global CustomKernelMap.
-phi::CustomKernelMap& PD_GetCustomKernelMap() {
-  return phi::CustomKernelMap::Instance();
-}
-
-#ifdef __cplusplus
-}  // end extern "C"
-#endif
diff --git a/paddle/phi/core/custom_kernel.h b/paddle/phi/core/custom_kernel.h
index 20ae2b7bb7360ab6878617234784157584e01858..5ba14de6a6131c1b2e2fefb5d4eca306d0c4e0b3 100644
--- a/paddle/phi/core/custom_kernel.h
+++ b/paddle/phi/core/custom_kernel.h
@@ -29,6 +29,12 @@ class CustomKernelMap {
     return g_custom_kernel_info_map;
   }
 
+  void RegisterCustomKernel(const std::string& kernel_name,
+                            const KernelKey& kernel_key,
+                            const Kernel& kernel);
+
+  void RegisterCustomKernels();
+
   KernelNameMap& Kernels() { return kernels_; }
 
   const KernelNameMap& GetMap() const { return kernels_; }
@@ -40,10 +46,4 @@ class CustomKernelMap {
   KernelNameMap kernels_;
 };
 
-/**
- * Note:
- * Used to register custom kernels to KernelFactory.
- */
-void RegisterCustomKernels(const CustomKernelMap& custom_kernel_map);
-
 }  // namespace phi
diff --git a/paddle/phi/core/infermeta_utils.h b/paddle/phi/core/infermeta_utils.h
index a5775db74382c1aeda95a4351842444b5ad1e47e..9c351ce9063ecf56b76a2eb351d1d2d23f13a83b 100644
--- a/paddle/phi/core/infermeta_utils.h
+++ b/paddle/phi/core/infermeta_utils.h
@@ -86,10 +86,10 @@ class InferMetaContext {
   paddle::SmallVector<std::pair<int, int>> output_range_;
 };
 
-#define PT_INFER_META(...) \
+#define PD_INFER_META(...) \
   ::phi::InferMetaFnImpl<decltype(&__VA_ARGS__), &__VA_ARGS__>::Call
 
-#define PT_SPECIALIZE_InferMetaFnCallHelper_FOR_ATTRIBUTE(attr_type)           \
+#define PD_SPECIALIZE_InferMetaFnCallHelper_FOR_ATTRIBUTE(attr_type)           \
   template <typename... Tail>                                                  \
   struct InferMetaFnCallHelper<attr_type, Tail...> {                           \
     template <int in_idx, int attr_idx, int out_idx, typename... PreviousArgs> \
@@ -175,24 +175,24 @@ struct InferMetaFnImpl<Return (*)(Args...), infer_meta_fn> {
   };
 
   // TODO(chenweihang): support other attr type later
-  PT_SPECIALIZE_InferMetaFnCallHelper_FOR_ATTRIBUTE(bool);
-  PT_SPECIALIZE_InferMetaFnCallHelper_FOR_ATTRIBUTE(int);
-  PT_SPECIALIZE_InferMetaFnCallHelper_FOR_ATTRIBUTE(int64_t);
-  PT_SPECIALIZE_InferMetaFnCallHelper_FOR_ATTRIBUTE(float);
-  PT_SPECIALIZE_InferMetaFnCallHelper_FOR_ATTRIBUTE(const std::string&);
-  PT_SPECIALIZE_InferMetaFnCallHelper_FOR_ATTRIBUTE(const std::vector<bool>&);
-  PT_SPECIALIZE_InferMetaFnCallHelper_FOR_ATTRIBUTE(const std::vector<int>&);
-  PT_SPECIALIZE_InferMetaFnCallHelper_FOR_ATTRIBUTE(
+  PD_SPECIALIZE_InferMetaFnCallHelper_FOR_ATTRIBUTE(bool);
+  PD_SPECIALIZE_InferMetaFnCallHelper_FOR_ATTRIBUTE(int);
+  PD_SPECIALIZE_InferMetaFnCallHelper_FOR_ATTRIBUTE(int64_t);
+  PD_SPECIALIZE_InferMetaFnCallHelper_FOR_ATTRIBUTE(float);
+  PD_SPECIALIZE_InferMetaFnCallHelper_FOR_ATTRIBUTE(const std::string&);
+  PD_SPECIALIZE_InferMetaFnCallHelper_FOR_ATTRIBUTE(const std::vector<bool>&);
+  PD_SPECIALIZE_InferMetaFnCallHelper_FOR_ATTRIBUTE(const std::vector<int>&);
+  PD_SPECIALIZE_InferMetaFnCallHelper_FOR_ATTRIBUTE(
       const std::vector<int64_t>&);
-  PT_SPECIALIZE_InferMetaFnCallHelper_FOR_ATTRIBUTE(const std::vector<float>&);
-  PT_SPECIALIZE_InferMetaFnCallHelper_FOR_ATTRIBUTE(const std::vector<double>&);
-  PT_SPECIALIZE_InferMetaFnCallHelper_FOR_ATTRIBUTE(
+  PD_SPECIALIZE_InferMetaFnCallHelper_FOR_ATTRIBUTE(const std::vector<float>&);
+  PD_SPECIALIZE_InferMetaFnCallHelper_FOR_ATTRIBUTE(const std::vector<double>&);
+  PD_SPECIALIZE_InferMetaFnCallHelper_FOR_ATTRIBUTE(
       const std::vector<std::string>&);
-  PT_SPECIALIZE_InferMetaFnCallHelper_FOR_ATTRIBUTE(DataType);
-  PT_SPECIALIZE_InferMetaFnCallHelper_FOR_ATTRIBUTE(Backend);
-  PT_SPECIALIZE_InferMetaFnCallHelper_FOR_ATTRIBUTE(DataLayout);
-  PT_SPECIALIZE_InferMetaFnCallHelper_FOR_ATTRIBUTE(const Scalar&);
-  PT_SPECIALIZE_InferMetaFnCallHelper_FOR_ATTRIBUTE(const ScalarArray&);
+  PD_SPECIALIZE_InferMetaFnCallHelper_FOR_ATTRIBUTE(DataType);
+  PD_SPECIALIZE_InferMetaFnCallHelper_FOR_ATTRIBUTE(Backend);
+  PD_SPECIALIZE_InferMetaFnCallHelper_FOR_ATTRIBUTE(DataLayout);
+  PD_SPECIALIZE_InferMetaFnCallHelper_FOR_ATTRIBUTE(const Scalar&);
+  PD_SPECIALIZE_InferMetaFnCallHelper_FOR_ATTRIBUTE(const ScalarArray&);
 
   // TODO(chenweihang): support vector<MetaTensor> input later
 
@@ -304,11 +304,11 @@ struct InferMetaFnRegistrar {
 };
 
 #define PD_REGISTER_INFER_META_FN(kernel_name_prefix, variadic_infer_meta_fn) \
-  PT_STATIC_ASSERT_GLOBAL_NAMESPACE(                                          \
+  PD_STATIC_ASSERT_GLOBAL_NAMESPACE(                                          \
       PD_REGISTER_infer_meta_fn_ns_check_##kernel_name_prefix,                \
       "PD_REGISTER_INFER_META_FN must be called in global namespace.");       \
   static const ::phi::InferMetaFnRegistrar                                    \
       __registrar_arg_map_fn_for_##kernel_name_prefix(                        \
-          #kernel_name_prefix, PT_INFER_META(variadic_infer_meta_fn))
+          #kernel_name_prefix, PD_INFER_META(variadic_infer_meta_fn))
 
 }  // namespace phi
diff --git a/paddle/phi/core/kernel_context.h b/paddle/phi/core/kernel_context.h
index 57e2db60c24caea8cbac323d9c47bdb53acc8a8c..213ac47d30bfdd28541bd1b9cb24bf2053b1c939 100644
--- a/paddle/phi/core/kernel_context.h
+++ b/paddle/phi/core/kernel_context.h
@@ -82,12 +82,11 @@ class KernelContext {
   }
 
   template <typename TensorType>
-  std::vector<TensorType> MoveInputsBetween(size_t start, size_t end) {
-    std::vector<TensorType> v;
+  std::vector<const TensorType*> InputsBetween(size_t start, size_t end) {
+    std::vector<const TensorType*> v;
     for (size_t i = start; i < end; ++i) {
-      auto t = static_cast<const TensorType*>(inputs_.at(i));
-      v.emplace_back(*t);
-      inputs_[i] = nullptr;
+      auto* t = static_cast<const TensorType*>(inputs_.at(i));
+      v.emplace_back(t);
     }
     return v;
   }
diff --git a/paddle/phi/core/kernel_registry.h b/paddle/phi/core/kernel_registry.h
index 2b04d173af06990bbbec91d9a3c6e0929dd814cc..d9ed68593cd610790ee4a0015069ac5a8cfea61b 100644
--- a/paddle/phi/core/kernel_registry.h
+++ b/paddle/phi/core/kernel_registry.h
@@ -87,8 +87,8 @@ struct KernelArgsParseFunctor<Return_ (*)(Args_...)> {
                               default_tensor_layout,
                               default_key.dtype(),
                               arg_type);
-      } else if (arg_type ==
-                 std::type_index(typeid(const std::vector<DenseTensor>&))) {
+      } else if (arg_type == std::type_index(typeid(
+                                 const std::vector<const DenseTensor*>&))) {
         args_def->AppendInput(default_key.backend(),
                               default_tensor_layout,
                               default_key.dtype(),
@@ -210,7 +210,8 @@ struct KernelRegistrar {
     if (reg_type == RegType::INNER) {
       KernelFactory::Instance().kernels()[kernel_name][kernel_key] = kernel;
     } else {
-      CustomKernelMap::Instance().Kernels()[kernel_name][kernel_key] = kernel;
+      CustomKernelMap::Instance().RegisterCustomKernel(
+          kernel_name, kernel_key, kernel);
     }
   }
 };
@@ -228,13 +229,13 @@ struct KernelRegistrar {
  *   http://connect.microsoft.com/VisualStudio/feedback/details/380090/variadic-macro-replacement
  *   http://cplusplus.co.il/2010/07/17/variadic-macro-to-count-number-of-arguments/#comment-644
  */
-#define PT_NARGS(...) _PT_NARGS((__VA_ARGS__, _PT_RESQ_N()))
-#define _PT_NARGS(...) _PT_ARG_N(__VA_ARGS__)
-#define _PT_ARG_N_EXPAND(                                                     \
+#define PD_NARGS(...) _PD_NARGS((__VA_ARGS__, _PD_RESQ_N()))
+#define _PD_NARGS(...) _PD_ARG_N(__VA_ARGS__)
+#define _PD_ARG_N_EXPAND(                                                     \
     _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, N, ...) \
   N
-#define _PT_ARG_N(args) _PT_ARG_N_EXPAND args
-#define _PT_RESQ_N() 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+#define _PD_ARG_N(args) _PD_ARG_N_EXPAND args
+#define _PD_RESQ_N() 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
 
 /** PD_REGISTER_KERNEL
  *
@@ -256,10 +257,10 @@ struct KernelRegistrar {
 
 #define _PD_REGISTER_KERNEL(                                               \
     reg_type, kernel_name, backend, context, layout, meta_kernel_fn, ...)  \
-  PT_STATIC_ASSERT_GLOBAL_NAMESPACE(                                       \
+  PD_STATIC_ASSERT_GLOBAL_NAMESPACE(                                       \
       PD_REGISTER_tp_kernel_ns_check_##kernel_name##_##backend##_##layout, \
       "PD_REGISTER_KERNEL must be called in global namespace.");           \
-  PT_EXPAND(_PD_REGISTER_2TA_KERNEL(reg_type,                              \
+  PD_EXPAND(_PD_REGISTER_2TA_KERNEL(reg_type,                              \
                                     kernel_name,                           \
                                     backend,                               \
                                     context,                               \
@@ -270,19 +271,19 @@ struct KernelRegistrar {
 #ifndef _WIN32
 #define _PD_REGISTER_2TA_KERNEL(                                            \
     reg_type, kernel_name, backend, context, layout, meta_kernel_fn, ...)   \
-  PT_KERNEL_INSTANTIATION(meta_kernel_fn, backend, context, __VA_ARGS__);   \
-  static void __PT_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout( \
+  PD_KERNEL_INSTANTIATION(meta_kernel_fn, backend, context, __VA_ARGS__);   \
+  static void __PD_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout( \
       const ::phi::KernelKey& kernel_key, ::phi::Kernel* kernel);           \
-  PT_KERNEL_REGISTRAR_INIT(                                                 \
+  PD_KERNEL_REGISTRAR_INIT(                                                 \
       reg_type,                                                             \
       kernel_name,                                                          \
       backend,                                                              \
       context,                                                              \
       layout,                                                               \
-      &__PT_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout,        \
+      &__PD_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout,        \
       meta_kernel_fn,                                                       \
       __VA_ARGS__);                                                         \
-  void __PT_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout(        \
+  void __PD_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout(        \
       const ::phi::KernelKey& kernel_key, ::phi::Kernel* kernel)
 #else
 /**
@@ -299,119 +300,119 @@ struct KernelRegistrar {
  */
 #define _PD_REGISTER_2TA_KERNEL(                                            \
     reg_type, kernel_name, backend, context, layout, meta_kernel_fn, ...)   \
-  static void __PT_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout( \
+  static void __PD_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout( \
       const ::phi::KernelKey& kernel_key, ::phi::Kernel* kernel);           \
-  PT_EXPAND(PT_KERNEL_REGISTRAR_INIT(                                       \
+  PD_EXPAND(PD_KERNEL_REGISTRAR_INIT(                                       \
       reg_type,                                                             \
       kernel_name,                                                          \
       backend,                                                              \
       context,                                                              \
       layout,                                                               \
-      &__PT_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout,        \
+      &__PD_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout,        \
       meta_kernel_fn,                                                       \
       __VA_ARGS__));                                                        \
-  void __PT_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout(        \
+  void __PD_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout(        \
       const ::phi::KernelKey& kernel_key, ::phi::Kernel* kernel)
 #endif
 
-#define PT_KERNEL_INSTANTIATION(meta_kernel_fn, backend, context, ...) \
-  _PT_KERNEL_INSTANTIATION(                                            \
-      PT_NARGS(__VA_ARGS__), meta_kernel_fn, backend, context, __VA_ARGS__)
+#define PD_KERNEL_INSTANTIATION(meta_kernel_fn, backend, context, ...) \
+  _PD_KERNEL_INSTANTIATION(                                            \
+      PD_NARGS(__VA_ARGS__), meta_kernel_fn, backend, context, __VA_ARGS__)
 
-#define _PT_KERNEL_INSTANTIATION(N, meta_kernel_fn, backend, context, ...) \
-  PT_CONCATENATE(_PT_KERNEL_INSTANTIATION_, N)                             \
+#define _PD_KERNEL_INSTANTIATION(N, meta_kernel_fn, backend, context, ...) \
+  PD_CONCATENATE(_PD_KERNEL_INSTANTIATION_, N)                             \
   (meta_kernel_fn, backend, context, __VA_ARGS__)
 
-#define _PT_KERNEL_INSTANTIATION_1(              \
+#define _PD_KERNEL_INSTANTIATION_1(              \
     meta_kernel_fn, backend, context, cpp_dtype) \
   template decltype(                             \
       meta_kernel_fn<cpp_dtype, context>) meta_kernel_fn<cpp_dtype, context>
-#define _PT_KERNEL_INSTANTIATION_2(                                           \
+#define _PD_KERNEL_INSTANTIATION_2(                                           \
     meta_kernel_fn, backend, context, cpp_dtype, ...)                         \
   template decltype(                                                          \
       meta_kernel_fn<cpp_dtype, context>) meta_kernel_fn<cpp_dtype, context>; \
-  PT_EXPAND(_PT_KERNEL_INSTANTIATION_1(                                       \
+  PD_EXPAND(_PD_KERNEL_INSTANTIATION_1(                                       \
       meta_kernel_fn, backend, context, __VA_ARGS__))
-#define _PT_KERNEL_INSTANTIATION_3(                                           \
+#define _PD_KERNEL_INSTANTIATION_3(                                           \
     meta_kernel_fn, backend, context, cpp_dtype, ...)                         \
   template decltype(                                                          \
       meta_kernel_fn<cpp_dtype, context>) meta_kernel_fn<cpp_dtype, context>; \
-  PT_EXPAND(_PT_KERNEL_INSTANTIATION_2(                                       \
+  PD_EXPAND(_PD_KERNEL_INSTANTIATION_2(                                       \
       meta_kernel_fn, backend, context, __VA_ARGS__))
-#define _PT_KERNEL_INSTANTIATION_4(                                           \
+#define _PD_KERNEL_INSTANTIATION_4(                                           \
     meta_kernel_fn, backend, context, cpp_dtype, ...)                         \
   template decltype(                                                          \
       meta_kernel_fn<cpp_dtype, context>) meta_kernel_fn<cpp_dtype, context>; \
-  PT_EXPAND(_PT_KERNEL_INSTANTIATION_3(                                       \
+  PD_EXPAND(_PD_KERNEL_INSTANTIATION_3(                                       \
       meta_kernel_fn, backend, context, __VA_ARGS__))
-#define _PT_KERNEL_INSTANTIATION_5(                                           \
+#define _PD_KERNEL_INSTANTIATION_5(                                           \
     meta_kernel_fn, backend, context, cpp_dtype, ...)                         \
   template decltype(                                                          \
       meta_kernel_fn<cpp_dtype, context>) meta_kernel_fn<cpp_dtype, context>; \
-  PT_EXPAND(_PT_KERNEL_INSTANTIATION_4(                                       \
+  PD_EXPAND(_PD_KERNEL_INSTANTIATION_4(                                       \
       meta_kernel_fn, backend, context, __VA_ARGS__))
-#define _PT_KERNEL_INSTANTIATION_6(                                           \
+#define _PD_KERNEL_INSTANTIATION_6(                                           \
     meta_kernel_fn, backend, context, cpp_dtype, ...)                         \
   template decltype(                                                          \
       meta_kernel_fn<cpp_dtype, context>) meta_kernel_fn<cpp_dtype, context>; \
-  PT_EXPAND(_PT_KERNEL_INSTANTIATION_5(                                       \
+  PD_EXPAND(_PD_KERNEL_INSTANTIATION_5(                                       \
       meta_kernel_fn, backend, context, __VA_ARGS__))
-#define _PT_KERNEL_INSTANTIATION_7(                                           \
+#define _PD_KERNEL_INSTANTIATION_7(                                           \
     meta_kernel_fn, backend, context, cpp_dtype, ...)                         \
   template decltype(                                                          \
       meta_kernel_fn<cpp_dtype, context>) meta_kernel_fn<cpp_dtype, context>; \
-  PT_EXPAND(_PT_KERNEL_INSTANTIATION_6(                                       \
+  PD_EXPAND(_PD_KERNEL_INSTANTIATION_6(                                       \
       meta_kernel_fn, backend, context, __VA_ARGS__))
-#define _PT_KERNEL_INSTANTIATION_8(                                           \
+#define _PD_KERNEL_INSTANTIATION_8(                                           \
     meta_kernel_fn, backend, context, cpp_dtype, ...)                         \
   template decltype(                                                          \
       meta_kernel_fn<cpp_dtype, context>) meta_kernel_fn<cpp_dtype, context>; \
-  PT_EXPAND(_PT_KERNEL_INSTANTIATION_7(                                       \
+  PD_EXPAND(_PD_KERNEL_INSTANTIATION_7(                                       \
       meta_kernel_fn, backend, context, __VA_ARGS__))
-#define _PT_KERNEL_INSTANTIATION_9(                                           \
+#define _PD_KERNEL_INSTANTIATION_9(                                           \
     meta_kernel_fn, backend, context, cpp_dtype, ...)                         \
   template decltype(                                                          \
       meta_kernel_fn<cpp_dtype, context>) meta_kernel_fn<cpp_dtype, context>; \
-  PT_EXPAND(_PT_KERNEL_INSTANTIATION_8(                                       \
+  PD_EXPAND(_PD_KERNEL_INSTANTIATION_8(                                       \
       meta_kernel_fn, backend, context, __VA_ARGS__))
-#define _PT_KERNEL_INSTANTIATION_10(                                          \
+#define _PD_KERNEL_INSTANTIATION_10(                                          \
     meta_kernel_fn, backend, context, cpp_dtype, ...)                         \
   template decltype(                                                          \
       meta_kernel_fn<cpp_dtype, context>) meta_kernel_fn<cpp_dtype, context>; \
-  PT_EXPAND(_PT_KERNEL_INSTANTIATION_9(                                       \
+  PD_EXPAND(_PD_KERNEL_INSTANTIATION_9(                                       \
       meta_kernel_fn, backend, context, __VA_ARGS__))
-#define _PT_KERNEL_INSTANTIATION_11(                                          \
+#define _PD_KERNEL_INSTANTIATION_11(                                          \
     meta_kernel_fn, backend, context, cpp_dtype, ...)                         \
   template decltype(                                                          \
       meta_kernel_fn<cpp_dtype, context>) meta_kernel_fn<cpp_dtype, context>; \
-  PT_EXPAND(_PT_KERNEL_INSTANTIATION_10(                                      \
+  PD_EXPAND(_PD_KERNEL_INSTANTIATION_10(                                      \
       meta_kernel_fn, backend, context, __VA_ARGS__))
-#define _PT_KERNEL_INSTANTIATION_12(                                          \
+#define _PD_KERNEL_INSTANTIATION_12(                                          \
     meta_kernel_fn, backend, context, cpp_dtype, ...)                         \
   template decltype(                                                          \
       meta_kernel_fn<cpp_dtype, context>) meta_kernel_fn<cpp_dtype, context>; \
-  PT_EXPAND(_PT_KERNEL_INSTANTIATION_11(                                      \
+  PD_EXPAND(_PD_KERNEL_INSTANTIATION_11(                                      \
       meta_kernel_fn, backend, context, __VA_ARGS__))
-#define _PT_KERNEL_INSTANTIATION_13(                                          \
+#define _PD_KERNEL_INSTANTIATION_13(                                          \
     meta_kernel_fn, backend, context, cpp_dtype, ...)                         \
   template decltype(                                                          \
       meta_kernel_fn<cpp_dtype, context>) meta_kernel_fn<cpp_dtype, context>; \
-  PT_EXPAND(_PT_KERNEL_INSTANTIATION_12(                                      \
+  PD_EXPAND(_PD_KERNEL_INSTANTIATION_12(                                      \
       meta_kernel_fn, backend, context, __VA_ARGS__))
-#define _PT_KERNEL_INSTANTIATION_14(                                          \
+#define _PD_KERNEL_INSTANTIATION_14(                                          \
     meta_kernel_fn, backend, context, cpp_dtype, ...)                         \
   template decltype(                                                          \
       meta_kernel_fn<cpp_dtype, context>) meta_kernel_fn<cpp_dtype, context>; \
-  PT_EXPAND(_PT_KERNEL_INSTANTIATION_13(                                      \
+  PD_EXPAND(_PD_KERNEL_INSTANTIATION_13(                                      \
       meta_kernel_fn, backend, context, __VA_ARGS__))
-#define _PT_KERNEL_INSTANTIATION_15(                                          \
+#define _PD_KERNEL_INSTANTIATION_15(                                          \
     meta_kernel_fn, backend, context, cpp_dtype, ...)                         \
   template decltype(                                                          \
       meta_kernel_fn<cpp_dtype, context>) meta_kernel_fn<cpp_dtype, context>; \
-  PT_EXPAND(_PT_KERNEL_INSTANTIATION_14(                                      \
+  PD_EXPAND(_PD_KERNEL_INSTANTIATION_14(                                      \
       meta_kernel_fn, backend, context, __VA_ARGS__))
 
-#define PT_KERNEL_REGISTRAR_INIT(reg_type,                   \
+#define PD_KERNEL_REGISTRAR_INIT(reg_type,                   \
                                  kernel_name,                \
                                  backend,                    \
                                  context,                    \
@@ -419,7 +420,7 @@ struct KernelRegistrar {
                                  args_def_fn,                \
                                  meta_kernel_fn,             \
                                  ...)                        \
-  PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT(PT_NARGS(__VA_ARGS__), \
+  PD_EXPAND(_PD_KERNEL_REGISTRAR_INIT(PD_NARGS(__VA_ARGS__), \
                                       reg_type,              \
                                       kernel_name,           \
                                       backend,               \
@@ -433,7 +434,7 @@ struct KernelRegistrar {
 
 /* The =pre-commit always treats this macro into the wrong format,
   and multi-line macros cannot be skipped with NOLINT.*/
-#define _PT_KERNEL_REGISTRAR_INIT(N,                       \
+#define _PD_KERNEL_REGISTRAR_INIT(N,                       \
                                   reg_type,                \
                                   kernel_name,             \
                                   backend,                 \
@@ -442,20 +443,20 @@ struct KernelRegistrar {
                                   args_def_fn,             \
                                   meta_kernel_fn,          \
                                   ...)                     \
-  PT_EXPAND(PT_CONCATENATE(_PT_KERNEL_REGISTRAR_INIT_, N) ( \
+  PD_EXPAND(PD_CONCATENATE(_PD_KERNEL_REGISTRAR_INIT_, N) ( \
     reg_type,                                              \
     kernel_name,                                           \
     backend,                                               \
     context,                                               \
     layout,                                                \
-    PT_ID,                                                 \
+    PD_ID,                                                 \
     args_def_fn,                                           \
     meta_kernel_fn,                                        \
     __VA_ARGS__))
 
 // clang-format on
 
-#define _PT_KERNEL_REGISTRAR_INIT_1(reg_type,                                 \
+#define _PD_KERNEL_REGISTRAR_INIT_1(reg_type,                                 \
                                     kernel_name,                              \
                                     backend,                                  \
                                     context,                                  \
@@ -464,7 +465,7 @@ struct KernelRegistrar {
                                     args_def_fn,                              \
                                     meta_kernel_fn,                           \
                                     cpp_dtype)                                \
-  static const ::phi::KernelRegistrar PT_CONCATENATE(                         \
+  static const ::phi::KernelRegistrar PD_CONCATENATE(                         \
       __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \
       reg_type,                                                               \
       #kernel_name,                                                           \
@@ -474,10 +475,10 @@ struct KernelRegistrar {
       ::phi::KernelArgsParseFunctor<decltype(                                 \
           &meta_kernel_fn<cpp_dtype, context>)>::Parse,                       \
       args_def_fn,                                                            \
-      PT_KERNEL(meta_kernel_fn<cpp_dtype, context>),                          \
-      PT_VARIADIC_KERNEL(meta_kernel_fn<cpp_dtype, context>));                \
+      PHI_KERNEL(meta_kernel_fn<cpp_dtype, context>),                         \
+      PHI_VARIADIC_KERNEL(meta_kernel_fn<cpp_dtype, context>));               \
   int TouchKernelSymbolFor_##kernel_name##_##backend##_##layout() { return 0; }
-#define _PT_KERNEL_REGISTRAR_INIT_2(reg_type,                                 \
+#define _PD_KERNEL_REGISTRAR_INIT_2(reg_type,                                 \
                                     kernel_name,                              \
                                     backend,                                  \
                                     context,                                  \
@@ -487,7 +488,7 @@ struct KernelRegistrar {
                                     meta_kernel_fn,                           \
                                     cpp_dtype,                                \
                                     ...)                                      \
-  static const ::phi::KernelRegistrar PT_CONCATENATE(                         \
+  static const ::phi::KernelRegistrar PD_CONCATENATE(                         \
       __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \
       reg_type,                                                               \
       #kernel_name,                                                           \
@@ -497,18 +498,18 @@ struct KernelRegistrar {
       ::phi::KernelArgsParseFunctor<decltype(                                 \
           &meta_kernel_fn<cpp_dtype, context>)>::Parse,                       \
       args_def_fn,                                                            \
-      PT_KERNEL(meta_kernel_fn<cpp_dtype, context>),                          \
-      PT_VARIADIC_KERNEL(meta_kernel_fn<cpp_dtype, context>));                \
-  PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_1(reg_type,                             \
+      PHI_KERNEL(meta_kernel_fn<cpp_dtype, context>),                         \
+      PHI_VARIADIC_KERNEL(meta_kernel_fn<cpp_dtype, context>));               \
+  PD_EXPAND(_PD_KERNEL_REGISTRAR_INIT_1(reg_type,                             \
                                         kernel_name,                          \
                                         backend,                              \
                                         context,                              \
                                         layout,                               \
-                                        PT_ID,                                \
+                                        PD_ID,                                \
                                         args_def_fn,                          \
                                         meta_kernel_fn,                       \
                                         __VA_ARGS__))
-#define _PT_KERNEL_REGISTRAR_INIT_3(reg_type,                                 \
+#define _PD_KERNEL_REGISTRAR_INIT_3(reg_type,                                 \
                                     kernel_name,                              \
                                     backend,                                  \
                                     context,                                  \
@@ -518,7 +519,7 @@ struct KernelRegistrar {
                                     meta_kernel_fn,                           \
                                     cpp_dtype,                                \
                                     ...)                                      \
-  static const ::phi::KernelRegistrar PT_CONCATENATE(                         \
+  static const ::phi::KernelRegistrar PD_CONCATENATE(                         \
       __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \
       reg_type,                                                               \
       #kernel_name,                                                           \
@@ -528,18 +529,18 @@ struct KernelRegistrar {
       ::phi::KernelArgsParseFunctor<decltype(                                 \
           &meta_kernel_fn<cpp_dtype, context>)>::Parse,                       \
       args_def_fn,                                                            \
-      PT_KERNEL(meta_kernel_fn<cpp_dtype, context>),                          \
-      PT_VARIADIC_KERNEL(meta_kernel_fn<cpp_dtype, context>));                \
-  PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_2(reg_type,                             \
+      PHI_KERNEL(meta_kernel_fn<cpp_dtype, context>),                         \
+      PHI_VARIADIC_KERNEL(meta_kernel_fn<cpp_dtype, context>));               \
+  PD_EXPAND(_PD_KERNEL_REGISTRAR_INIT_2(reg_type,                             \
                                         kernel_name,                          \
                                         backend,                              \
                                         context,                              \
                                         layout,                               \
-                                        PT_ID,                                \
+                                        PD_ID,                                \
                                         args_def_fn,                          \
                                         meta_kernel_fn,                       \
                                         __VA_ARGS__))
-#define _PT_KERNEL_REGISTRAR_INIT_4(reg_type,                                 \
+#define _PD_KERNEL_REGISTRAR_INIT_4(reg_type,                                 \
                                     kernel_name,                              \
                                     backend,                                  \
                                     context,                                  \
@@ -549,7 +550,7 @@ struct KernelRegistrar {
                                     meta_kernel_fn,                           \
                                     cpp_dtype,                                \
                                     ...)                                      \
-  static const ::phi::KernelRegistrar PT_CONCATENATE(                         \
+  static const ::phi::KernelRegistrar PD_CONCATENATE(                         \
       __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \
       reg_type,                                                               \
       #kernel_name,                                                           \
@@ -559,18 +560,18 @@ struct KernelRegistrar {
       ::phi::KernelArgsParseFunctor<decltype(                                 \
           &meta_kernel_fn<cpp_dtype, context>)>::Parse,                       \
       args_def_fn,                                                            \
-      PT_KERNEL(meta_kernel_fn<cpp_dtype, context>),                          \
-      PT_VARIADIC_KERNEL(meta_kernel_fn<cpp_dtype, context>));                \
-  PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_3(reg_type,                             \
+      PHI_KERNEL(meta_kernel_fn<cpp_dtype, context>),                         \
+      PHI_VARIADIC_KERNEL(meta_kernel_fn<cpp_dtype, context>));               \
+  PD_EXPAND(_PD_KERNEL_REGISTRAR_INIT_3(reg_type,                             \
                                         kernel_name,                          \
                                         backend,                              \
                                         context,                              \
                                         layout,                               \
-                                        PT_ID,                                \
+                                        PD_ID,                                \
                                         args_def_fn,                          \
                                         meta_kernel_fn,                       \
                                         __VA_ARGS__))
-#define _PT_KERNEL_REGISTRAR_INIT_5(reg_type,                                 \
+#define _PD_KERNEL_REGISTRAR_INIT_5(reg_type,                                 \
                                     kernel_name,                              \
                                     backend,                                  \
                                     context,                                  \
@@ -580,7 +581,7 @@ struct KernelRegistrar {
                                     meta_kernel_fn,                           \
                                     cpp_dtype,                                \
                                     ...)                                      \
-  static const ::phi::KernelRegistrar PT_CONCATENATE(                         \
+  static const ::phi::KernelRegistrar PD_CONCATENATE(                         \
       __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \
       reg_type,                                                               \
       #kernel_name,                                                           \
@@ -590,18 +591,18 @@ struct KernelRegistrar {
       ::phi::KernelArgsParseFunctor<decltype(                                 \
           &meta_kernel_fn<cpp_dtype, context>)>::Parse,                       \
       args_def_fn,                                                            \
-      PT_KERNEL(meta_kernel_fn<cpp_dtype, context>),                          \
-      PT_VARIADIC_KERNEL(meta_kernel_fn<cpp_dtype, context>));                \
-  PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_4(reg_type,                             \
+      PHI_KERNEL(meta_kernel_fn<cpp_dtype, context>),                         \
+      PHI_VARIADIC_KERNEL(meta_kernel_fn<cpp_dtype, context>));               \
+  PD_EXPAND(_PD_KERNEL_REGISTRAR_INIT_4(reg_type,                             \
                                         kernel_name,                          \
                                         backend,                              \
                                         context,                              \
                                         layout,                               \
-                                        PT_ID,                                \
+                                        PD_ID,                                \
                                         args_def_fn,                          \
                                         meta_kernel_fn,                       \
                                         __VA_ARGS__))
-#define _PT_KERNEL_REGISTRAR_INIT_6(reg_type,                                 \
+#define _PD_KERNEL_REGISTRAR_INIT_6(reg_type,                                 \
                                     kernel_name,                              \
                                     backend,                                  \
                                     context,                                  \
@@ -611,7 +612,7 @@ struct KernelRegistrar {
                                     meta_kernel_fn,                           \
                                     cpp_dtype,                                \
                                     ...)                                      \
-  static const ::phi::KernelRegistrar PT_CONCATENATE(                         \
+  static const ::phi::KernelRegistrar PD_CONCATENATE(                         \
       __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \
       reg_type,                                                               \
       #kernel_name,                                                           \
@@ -621,18 +622,18 @@ struct KernelRegistrar {
       ::phi::KernelArgsParseFunctor<decltype(                                 \
           &meta_kernel_fn<cpp_dtype, context>)>::Parse,                       \
       args_def_fn,                                                            \
-      PT_KERNEL(meta_kernel_fn<cpp_dtype, context>),                          \
-      PT_VARIADIC_KERNEL(meta_kernel_fn<cpp_dtype, context>));                \
-  PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_5(reg_type,                             \
+      PHI_KERNEL(meta_kernel_fn<cpp_dtype, context>),                         \
+      PHI_VARIADIC_KERNEL(meta_kernel_fn<cpp_dtype, context>));               \
+  PD_EXPAND(_PD_KERNEL_REGISTRAR_INIT_5(reg_type,                             \
                                         kernel_name,                          \
                                         backend,                              \
                                         context,                              \
                                         layout,                               \
-                                        PT_ID,                                \
+                                        PD_ID,                                \
                                         args_def_fn,                          \
                                         meta_kernel_fn,                       \
                                         __VA_ARGS__))
-#define _PT_KERNEL_REGISTRAR_INIT_7(reg_type,                                 \
+#define _PD_KERNEL_REGISTRAR_INIT_7(reg_type,                                 \
                                     kernel_name,                              \
                                     backend,                                  \
                                     context,                                  \
@@ -642,7 +643,7 @@ struct KernelRegistrar {
                                     meta_kernel_fn,                           \
                                     cpp_dtype,                                \
                                     ...)                                      \
-  static const ::phi::KernelRegistrar PT_CONCATENATE(                         \
+  static const ::phi::KernelRegistrar PD_CONCATENATE(                         \
       __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \
       reg_type,                                                               \
       #kernel_name,                                                           \
@@ -652,18 +653,18 @@ struct KernelRegistrar {
       ::phi::KernelArgsParseFunctor<decltype(                                 \
           &meta_kernel_fn<cpp_dtype, context>)>::Parse,                       \
       args_def_fn,                                                            \
-      PT_KERNEL(meta_kernel_fn<cpp_dtype, context>),                          \
-      PT_VARIADIC_KERNEL(meta_kernel_fn<cpp_dtype, context>));                \
-  PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_6(reg_type,                             \
+      PHI_KERNEL(meta_kernel_fn<cpp_dtype, context>),                         \
+      PHI_VARIADIC_KERNEL(meta_kernel_fn<cpp_dtype, context>));               \
+  PD_EXPAND(_PD_KERNEL_REGISTRAR_INIT_6(reg_type,                             \
                                         kernel_name,                          \
                                         backend,                              \
                                         context,                              \
                                         layout,                               \
-                                        PT_ID,                                \
+                                        PD_ID,                                \
                                         args_def_fn,                          \
                                         meta_kernel_fn,                       \
                                         __VA_ARGS__))
-#define _PT_KERNEL_REGISTRAR_INIT_8(reg_type,                                 \
+#define _PD_KERNEL_REGISTRAR_INIT_8(reg_type,                                 \
                                     kernel_name,                              \
                                     backend,                                  \
                                     context,                                  \
@@ -673,7 +674,7 @@ struct KernelRegistrar {
                                     meta_kernel_fn,                           \
                                     cpp_dtype,                                \
                                     ...)                                      \
-  static const ::phi::KernelRegistrar PT_CONCATENATE(                         \
+  static const ::phi::KernelRegistrar PD_CONCATENATE(                         \
       __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \
       reg_type,                                                               \
       #kernel_name,                                                           \
@@ -683,18 +684,18 @@ struct KernelRegistrar {
       ::phi::KernelArgsParseFunctor<decltype(                                 \
           &meta_kernel_fn<cpp_dtype, context>)>::Parse,                       \
       args_def_fn,                                                            \
-      PT_KERNEL(meta_kernel_fn<cpp_dtype, context>),                          \
-      PT_VARIADIC_KERNEL(meta_kernel_fn<cpp_dtype, context>));                \
-  PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_7(reg_type,                             \
+      PHI_KERNEL(meta_kernel_fn<cpp_dtype, context>),                         \
+      PHI_VARIADIC_KERNEL(meta_kernel_fn<cpp_dtype, context>));               \
+  PD_EXPAND(_PD_KERNEL_REGISTRAR_INIT_7(reg_type,                             \
                                         kernel_name,                          \
                                         backend,                              \
                                         context,                              \
                                         layout,                               \
-                                        PT_ID,                                \
+                                        PD_ID,                                \
                                         args_def_fn,                          \
                                         meta_kernel_fn,                       \
                                         __VA_ARGS__))
-#define _PT_KERNEL_REGISTRAR_INIT_9(reg_type,                                 \
+#define _PD_KERNEL_REGISTRAR_INIT_9(reg_type,                                 \
                                     kernel_name,                              \
                                     backend,                                  \
                                     context,                                  \
@@ -704,7 +705,7 @@ struct KernelRegistrar {
                                     meta_kernel_fn,                           \
                                     cpp_dtype,                                \
                                     ...)                                      \
-  static const ::phi::KernelRegistrar PT_CONCATENATE(                         \
+  static const ::phi::KernelRegistrar PD_CONCATENATE(                         \
       __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \
       reg_type,                                                               \
       #kernel_name,                                                           \
@@ -714,18 +715,18 @@ struct KernelRegistrar {
       ::phi::KernelArgsParseFunctor<decltype(                                 \
           &meta_kernel_fn<cpp_dtype, context>)>::Parse,                       \
       args_def_fn,                                                            \
-      PT_KERNEL(meta_kernel_fn<cpp_dtype, context>),                          \
-      PT_VARIADIC_KERNEL(meta_kernel_fn<cpp_dtype, context>));                \
-  PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_8(reg_type,                             \
+      PHI_KERNEL(meta_kernel_fn<cpp_dtype, context>),                         \
+      PHI_VARIADIC_KERNEL(meta_kernel_fn<cpp_dtype, context>));               \
+  PD_EXPAND(_PD_KERNEL_REGISTRAR_INIT_8(reg_type,                             \
                                         kernel_name,                          \
                                         backend,                              \
                                         context,                              \
                                         layout,                               \
-                                        PT_ID,                                \
+                                        PD_ID,                                \
                                         args_def_fn,                          \
                                         meta_kernel_fn,                       \
                                         __VA_ARGS__))
-#define _PT_KERNEL_REGISTRAR_INIT_10(reg_type,                                \
+#define _PD_KERNEL_REGISTRAR_INIT_10(reg_type,                                \
                                      kernel_name,                             \
                                      backend,                                 \
                                      context,                                 \
@@ -735,7 +736,7 @@ struct KernelRegistrar {
                                      meta_kernel_fn,                          \
                                      cpp_dtype,                               \
                                      ...)                                     \
-  static const ::phi::KernelRegistrar PT_CONCATENATE(                         \
+  static const ::phi::KernelRegistrar PD_CONCATENATE(                         \
       __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \
       reg_type,                                                               \
       #kernel_name,                                                           \
@@ -745,18 +746,18 @@ struct KernelRegistrar {
       ::phi::KernelArgsParseFunctor<decltype(                                 \
           &meta_kernel_fn<cpp_dtype, context>)>::Parse,                       \
       args_def_fn,                                                            \
-      PT_KERNEL(meta_kernel_fn<cpp_dtype, context>),                          \
-      PT_VARIADIC_KERNEL(meta_kernel_fn<cpp_dtype, context>));                \
-  PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_9(reg_type,                             \
+      PHI_KERNEL(meta_kernel_fn<cpp_dtype, context>),                         \
+      PHI_VARIADIC_KERNEL(meta_kernel_fn<cpp_dtype, context>));               \
+  PD_EXPAND(_PD_KERNEL_REGISTRAR_INIT_9(reg_type,                             \
                                         kernel_name,                          \
                                         backend,                              \
                                         context,                              \
                                         layout,                               \
-                                        PT_ID,                                \
+                                        PD_ID,                                \
                                         args_def_fn,                          \
                                         meta_kernel_fn,                       \
                                         __VA_ARGS__))
-#define _PT_KERNEL_REGISTRAR_INIT_11(reg_type,                                \
+#define _PD_KERNEL_REGISTRAR_INIT_11(reg_type,                                \
                                      kernel_name,                             \
                                      backend,                                 \
                                      context,                                 \
@@ -766,7 +767,7 @@ struct KernelRegistrar {
                                      meta_kernel_fn,                          \
                                      cpp_dtype,                               \
                                      ...)                                     \
-  static const ::phi::KernelRegistrar PT_CONCATENATE(                         \
+  static const ::phi::KernelRegistrar PD_CONCATENATE(                         \
       __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \
       reg_type,                                                               \
       #kernel_name,                                                           \
@@ -776,18 +777,18 @@ struct KernelRegistrar {
       ::phi::KernelArgsParseFunctor<decltype(                                 \
           &meta_kernel_fn<cpp_dtype, context>)>::Parse,                       \
       args_def_fn,                                                            \
-      PT_KERNEL(meta_kernel_fn<cpp_dtype, context>),                          \
-      PT_VARIADIC_KERNEL(meta_kernel_fn<cpp_dtype, context>));                \
-  PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_10(reg_type,                            \
+      PHI_KERNEL(meta_kernel_fn<cpp_dtype, context>),                         \
+      PHI_VARIADIC_KERNEL(meta_kernel_fn<cpp_dtype, context>));               \
+  PD_EXPAND(_PD_KERNEL_REGISTRAR_INIT_10(reg_type,                            \
                                          kernel_name,                         \
                                          backend,                             \
                                          context,                             \
                                          layout,                              \
-                                         PT_ID,                               \
+                                         PD_ID,                               \
                                          args_def_fn,                         \
                                          meta_kernel_fn,                      \
                                          __VA_ARGS__))
-#define _PT_KERNEL_REGISTRAR_INIT_12(reg_type,                                \
+#define _PD_KERNEL_REGISTRAR_INIT_12(reg_type,                                \
                                      kernel_name,                             \
                                      backend,                                 \
                                      context,                                 \
@@ -797,7 +798,7 @@ struct KernelRegistrar {
                                      meta_kernel_fn,                          \
                                      cpp_dtype,                               \
                                      ...)                                     \
-  static const ::phi::KernelRegistrar PT_CONCATENATE(                         \
+  static const ::phi::KernelRegistrar PD_CONCATENATE(                         \
       __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \
       reg_type,                                                               \
       #kernel_name,                                                           \
@@ -807,18 +808,18 @@ struct KernelRegistrar {
       ::phi::KernelArgsParseFunctor<decltype(                                 \
           &meta_kernel_fn<cpp_dtype, context>)>::Parse,                       \
       args_def_fn,                                                            \
-      PT_KERNEL(meta_kernel_fn<cpp_dtype, context>),                          \
-      PT_VARIADIC_KERNEL(meta_kernel_fn<cpp_dtype, context>));                \
-  PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_11(reg_type,                            \
+      PHI_KERNEL(meta_kernel_fn<cpp_dtype, context>),                         \
+      PHI_VARIADIC_KERNEL(meta_kernel_fn<cpp_dtype, context>));               \
+  PD_EXPAND(_PD_KERNEL_REGISTRAR_INIT_11(reg_type,                            \
                                          kernel_name,                         \
                                          backend,                             \
                                          context,                             \
                                          layout,                              \
-                                         PT_ID,                               \
+                                         PD_ID,                               \
                                          args_def_fn,                         \
                                          meta_kernel_fn,                      \
                                          __VA_ARGS__))
-#define _PT_KERNEL_REGISTRAR_INIT_13(reg_type,                                \
+#define _PD_KERNEL_REGISTRAR_INIT_13(reg_type,                                \
                                      kernel_name,                             \
                                      backend,                                 \
                                      context,                                 \
@@ -828,7 +829,7 @@ struct KernelRegistrar {
                                      meta_kernel_fn,                          \
                                      cpp_dtype,                               \
                                      ...)                                     \
-  static const ::phi::KernelRegistrar PT_CONCATENATE(                         \
+  static const ::phi::KernelRegistrar PD_CONCATENATE(                         \
       __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \
       reg_type,                                                               \
       #kernel_name,                                                           \
@@ -838,18 +839,18 @@ struct KernelRegistrar {
       ::phi::KernelArgsParseFunctor<decltype(                                 \
           &meta_kernel_fn<cpp_dtype, context>)>::Parse,                       \
       args_def_fn,                                                            \
-      PT_KERNEL(meta_kernel_fn<cpp_dtype, context>),                          \
-      PT_VARIADIC_KERNEL(meta_kernel_fn<cpp_dtype, context>));                \
-  PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_12(reg_type,                            \
+      PHI_KERNEL(meta_kernel_fn<cpp_dtype, context>),                         \
+      PHI_VARIADIC_KERNEL(meta_kernel_fn<cpp_dtype, context>));               \
+  PD_EXPAND(_PD_KERNEL_REGISTRAR_INIT_12(reg_type,                            \
                                          kernel_name,                         \
                                          backend,                             \
                                          context,                             \
                                          layout,                              \
-                                         PT_ID,                               \
+                                         PD_ID,                               \
                                          args_def_fn,                         \
                                          meta_kernel_fn,                      \
                                          __VA_ARGS__))
-#define _PT_KERNEL_REGISTRAR_INIT_14(reg_type,                                \
+#define _PD_KERNEL_REGISTRAR_INIT_14(reg_type,                                \
                                      kernel_name,                             \
                                      backend,                                 \
                                      context,                                 \
@@ -859,7 +860,7 @@ struct KernelRegistrar {
                                      meta_kernel_fn,                          \
                                      cpp_dtype,                               \
                                      ...)                                     \
-  static const ::phi::KernelRegistrar PT_CONCATENATE(                         \
+  static const ::phi::KernelRegistrar PD_CONCATENATE(                         \
       __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \
       reg_type,                                                               \
       #kernel_name,                                                           \
@@ -869,18 +870,18 @@ struct KernelRegistrar {
       ::phi::KernelArgsParseFunctor<decltype(                                 \
           &meta_kernel_fn<cpp_dtype, context>)>::Parse,                       \
       args_def_fn,                                                            \
-      PT_KERNEL(meta_kernel_fn<cpp_dtype, context>),                          \
-      PT_VARIADIC_KERNEL(meta_kernel_fn<cpp_dtype, context>));                \
-  PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_13(reg_type,                            \
+      PHI_KERNEL(meta_kernel_fn<cpp_dtype, context>),                         \
+      PHI_VARIADIC_KERNEL(meta_kernel_fn<cpp_dtype, context>));               \
+  PD_EXPAND(_PD_KERNEL_REGISTRAR_INIT_13(reg_type,                            \
                                          kernel_name,                         \
                                          backend,                             \
                                          context,                             \
                                          layout,                              \
-                                         PT_ID,                               \
+                                         PD_ID,                               \
                                          args_def_fn,                         \
                                          meta_kernel_fn,                      \
                                          __VA_ARGS__))
-#define _PT_KERNEL_REGISTRAR_INIT_15(reg_type,                                \
+#define _PD_KERNEL_REGISTRAR_INIT_15(reg_type,                                \
                                      kernel_name,                             \
                                      backend,                                 \
                                      context,                                 \
@@ -890,7 +891,7 @@ struct KernelRegistrar {
                                      meta_kernel_fn,                          \
                                      cpp_dtype,                               \
                                      ...)                                     \
-  static const ::phi::KernelRegistrar PT_CONCATENATE(                         \
+  static const ::phi::KernelRegistrar PD_CONCATENATE(                         \
       __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \
       reg_type,                                                               \
       #kernel_name,                                                           \
@@ -900,14 +901,14 @@ struct KernelRegistrar {
       ::phi::KernelArgsParseFunctor<decltype(                                 \
           &meta_kernel_fn<cpp_dtype, context>)>::Parse,                       \
       args_def_fn,                                                            \
-      PT_KERNEL(meta_kernel_fn<cpp_dtype, context>),                          \
-      PT_VARIADIC_KERNEL(meta_kernel_fn<cpp_dtype, context>));                \
-  PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_14(reg_type,                            \
+      PHI_KERNEL(meta_kernel_fn<cpp_dtype, context>),                         \
+      PHI_VARIADIC_KERNEL(meta_kernel_fn<cpp_dtype, context>));               \
+  PD_EXPAND(_PD_KERNEL_REGISTRAR_INIT_14(reg_type,                            \
                                          kernel_name,                         \
                                          backend,                             \
                                          context,                             \
                                          layout,                              \
-                                         PT_ID,                               \
+                                         PD_ID,                               \
                                          args_def_fn,                         \
                                          meta_kernel_fn,                      \
                                          __VA_ARGS__))
@@ -924,7 +925,7 @@ struct KernelRegistrar {
 
 #define _PD_REGISTER_GENERAL_KERNEL(                                         \
     reg_type, kernel_name, backend, layout, kernel_fn, dtype)                \
-  PT_STATIC_ASSERT_GLOBAL_NAMESPACE(                                         \
+  PD_STATIC_ASSERT_GLOBAL_NAMESPACE(                                         \
       PD_REGISTER_no_t_kernel_ns_check_##kernel_name##_##backend##_##layout, \
       "PD_REGISTER_NO_TEMPLATE_KERNEL must be called in global namespace."); \
   __PD_REGISTER_GENERAL_KERNEL(                                              \
@@ -934,7 +935,7 @@ struct KernelRegistrar {
 #define __PD_REGISTER_GENERAL_KERNEL(                                       \
     reg_type, kernel_name, backend, layout, kernel_fn, dtype)               \
   template decltype(kernel_fn) kernel_fn;                                   \
-  static void __PT_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout( \
+  static void __PD_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout( \
       const ::phi::KernelKey& kernel_key, ::phi::Kernel* kernel);           \
   static const ::phi::KernelRegistrar                                       \
       __reg_pt_kernel_##kernel_name##_##backend##_##layout(                 \
@@ -943,18 +944,18 @@ struct KernelRegistrar {
           #backend,                                                         \
           DATALAYOUT(layout),                                               \
           ::phi::KernelArgsParseFunctor<decltype(&kernel_fn)>::Parse,       \
-          &__PT_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout,    \
-          PT_KERNEL(kernel_fn),                                             \
-          PT_VARIADIC_KERNEL(kernel_fn));                                   \
+          &__PD_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout,    \
+          PHI_KERNEL(kernel_fn),                                            \
+          PHI_VARIADIC_KERNEL(kernel_fn));                                  \
   int TouchKernelSymbolFor_##kernel_name##_##backend##_##layout() {         \
     return 0;                                                               \
   }                                                                         \
-  void __PT_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout(        \
+  void __PD_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout(        \
       const ::phi::KernelKey& kernel_key, ::phi::Kernel* kernel)
 #else
 #define __PD_REGISTER_GENERAL_KERNEL(                                       \
     reg_type, kernel_name, backend, layout, kernel_fn, dtype)               \
-  static void __PT_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout( \
+  static void __PD_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout( \
       const ::phi::KernelKey& kernel_key, ::phi::Kernel* kernel);           \
   static const ::phi::KernelRegistrar                                       \
       __reg_pt_kernel_##kernel_name##_##backend##_##layout(                 \
@@ -963,13 +964,13 @@ struct KernelRegistrar {
           #backend,                                                         \
           DATALAYOUT(layout),                                               \
           ::phi::KernelArgsParseFunctor<decltype(&kernel_fn)>::Parse,       \
-          &__PT_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout,    \
-          PT_KERNEL(kernel_fn),                                             \
-          PT_VARIADIC_KERNEL(kernel_fn));                                   \
+          &__PD_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout,    \
+          PHI_KERNEL(kernel_fn),                                            \
+          PHI_VARIADIC_KERNEL(kernel_fn));                                  \
   int TouchKernelSymbolFor_##kernel_name##_##backend##_##layout() {         \
     return 0;                                                               \
   }                                                                         \
-  void __PT_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout(        \
+  void __PD_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout(        \
       const ::phi::KernelKey& kernel_key, ::phi::Kernel* kernel)
 #endif
 
@@ -979,7 +980,7 @@ struct KernelRegistrar {
  * to avoid being removed by linker
  */
 #define PD_DECLARE_KERNEL(kernel_name, backend, layout)                   \
-  PT_STATIC_ASSERT_GLOBAL_NAMESPACE(                                      \
+  PD_STATIC_ASSERT_GLOBAL_NAMESPACE(                                      \
       PD_DECLARE_tp_kernel_ns_check_##kernel_name##_##backend##_##layout, \
       "PD_DECLARE_KERNEL must be called in global namespace.");           \
   extern int TouchKernelSymbolFor_##kernel_name##_##backend##_##layout(); \
diff --git a/paddle/phi/core/kernel_utils.h b/paddle/phi/core/kernel_utils.h
index b582375155a1878c52fd8fe9fb13f6e715df7067..2cc82772cf8aa09c2a67d5329dda5adfe01f21bb 100644
--- a/paddle/phi/core/kernel_utils.h
+++ b/paddle/phi/core/kernel_utils.h
@@ -30,14 +30,15 @@
 
 namespace phi {
 
-#define PT_KERNEL(...) \
+// PD_KERNEL has been used by custom op api
+#define PHI_KERNEL(...) \
   ::phi::KernelImpl<decltype(&__VA_ARGS__), &__VA_ARGS__>::Compute
 
-#define PT_VARIADIC_KERNEL(...)                                      \
+#define PHI_VARIADIC_KERNEL(...)                                     \
   reinterpret_cast<void*>(&::phi::KernelImpl<decltype(&__VA_ARGS__), \
                                              &__VA_ARGS__>::VariadicCompute)
 
-#define PT_SPECIALIZE_KernelCallHelper_FOR_DEVICE_CONTEXT(dev_ctx)           \
+#define PD_SPECIALIZE_KernelCallHelper_FOR_DEVICE_CONTEXT(dev_ctx)           \
   template <typename... Tail>                                                \
   struct KernelCallHelper<const dev_ctx&, Tail...> {                         \
     template <int dev_ctx_idx,                                               \
@@ -60,7 +61,7 @@ namespace phi {
     }                                                                        \
   }
 
-#define PT_SPECIALIZE_KernelCallHelper_FOR_INPUT(tensor_type)           \
+#define PD_SPECIALIZE_KernelCallHelper_FOR_INPUT(tensor_type)           \
   template <typename... Tail>                                           \
   struct KernelCallHelper<const tensor_type&, Tail...> {                \
     template <int dev_ctx_idx,                                          \
@@ -81,7 +82,7 @@ namespace phi {
     }                                                                   \
   }
 
-#define PT_SPECIALIZE_KernelCallHelper_FOR_OPTIONAL_INPUT(tensor_type)     \
+#define PD_SPECIALIZE_KernelCallHelper_FOR_OPTIONAL_INPUT(tensor_type)     \
   template <typename... Tail>                                              \
   struct KernelCallHelper<paddle::optional<const tensor_type&>, Tail...> { \
     template <int dev_ctx_idx,                                             \
@@ -102,29 +103,29 @@ namespace phi {
     }                                                                      \
   }
 
-#define PT_SPECIALIZE_KernelCallHelper_FOR_MULTI_INPUT(tensor_type)        \
-  template <typename... Tail>                                              \
-  struct KernelCallHelper<const std::vector<tensor_type>&, Tail...> {      \
-    template <int dev_ctx_idx,                                             \
-              int in_idx,                                                  \
-              int attr_idx,                                                \
-              int out_idx,                                                 \
-              typename... PreviousArgs>                                    \
-    static void Compute(KernelContext* ctx, PreviousArgs&... pargs) {      \
-      static_assert(attr_idx == 0,                                         \
-                    "Kernel's Input should appear before Attributes.");    \
-      static_assert(out_idx == 0,                                          \
-                    "Kernel's Input should appear before Outputs.");       \
-      const std::pair<int, int> range = ctx->InputRangeAt(in_idx);         \
-      std::vector<tensor_type> arg = std::move(                            \
-          ctx->MoveInputsBetween<tensor_type>(range.first, range.second)); \
-      KernelCallHelper<Tail...>::                                          \
-          template Compute<dev_ctx_idx, in_idx + 1, attr_idx, out_idx>(    \
-              ctx, pargs..., arg);                                         \
-    }                                                                      \
+#define PD_SPECIALIZE_KernelCallHelper_FOR_MULTI_INPUT(tensor_type)          \
+  template <typename... Tail>                                                \
+  struct KernelCallHelper<const std::vector<const tensor_type*>&, Tail...> { \
+    template <int dev_ctx_idx,                                               \
+              int in_idx,                                                    \
+              int attr_idx,                                                  \
+              int out_idx,                                                   \
+              typename... PreviousArgs>                                      \
+    static void Compute(KernelContext* ctx, PreviousArgs&... pargs) {        \
+      static_assert(attr_idx == 0,                                           \
+                    "Kernel's Input should appear before Attributes.");      \
+      static_assert(out_idx == 0,                                            \
+                    "Kernel's Input should appear before Outputs.");         \
+      const std::pair<int, int> range = ctx->InputRangeAt(in_idx);           \
+      std::vector<const tensor_type*> arg = std::move(                       \
+          ctx->InputsBetween<tensor_type>(range.first, range.second));       \
+      KernelCallHelper<Tail...>::                                            \
+          template Compute<dev_ctx_idx, in_idx + 1, attr_idx, out_idx>(      \
+              ctx, pargs..., arg);                                           \
+    }                                                                        \
   }
 
-#define PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(attr_type)           \
+#define PD_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(attr_type)           \
   template <typename... Tail>                                             \
   struct KernelCallHelper<attr_type, Tail...> {                           \
     template <int dev_ctx_idx,                                            \
@@ -142,7 +143,7 @@ namespace phi {
     }                                                                     \
   }
 
-#define PT_SPECIALIZE_KernelCallHelper_FOR_OUTPUT(tensor_type)           \
+#define PD_SPECIALIZE_KernelCallHelper_FOR_OUTPUT(tensor_type)           \
   template <typename... Tail>                                            \
   struct KernelCallHelper<tensor_type*, Tail...> {                       \
     template <int dev_ctx_idx,                                           \
@@ -159,7 +160,7 @@ namespace phi {
     }                                                                    \
   }
 
-#define PT_SPECIALIZE_KernelCallHelper_FOR_MULTI_OUTPUT(tensor_type)          \
+#define PD_SPECIALIZE_KernelCallHelper_FOR_MULTI_OUTPUT(tensor_type)          \
   template <typename... Tail>                                                 \
   struct KernelCallHelper<std::vector<tensor_type*>, Tail...> {               \
     template <int dev_ctx_idx,                                                \
@@ -204,65 +205,66 @@ struct KernelImpl<Return (*)(DevCtx, Args...), kernel_fn> {
 
   /* DeviceContext Helpers */
 
-  PT_SPECIALIZE_KernelCallHelper_FOR_DEVICE_CONTEXT(CPUContext);
+  PD_SPECIALIZE_KernelCallHelper_FOR_DEVICE_CONTEXT(CPUContext);
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-  PT_SPECIALIZE_KernelCallHelper_FOR_DEVICE_CONTEXT(GPUContext);
+  PD_SPECIALIZE_KernelCallHelper_FOR_DEVICE_CONTEXT(GPUContext);
 #endif
 #ifdef PADDLE_WITH_XPU
-  PT_SPECIALIZE_KernelCallHelper_FOR_DEVICE_CONTEXT(XPUContext);
+  PD_SPECIALIZE_KernelCallHelper_FOR_DEVICE_CONTEXT(XPUContext);
 #endif
 #ifdef PADDLE_WITH_CUSTOM_DEVICE
-  PT_SPECIALIZE_KernelCallHelper_FOR_DEVICE_CONTEXT(CustomContext);
+  PD_SPECIALIZE_KernelCallHelper_FOR_DEVICE_CONTEXT(CustomContext);
 #endif
 
   /* Input Helpers */
 
-  PT_SPECIALIZE_KernelCallHelper_FOR_INPUT(DenseTensor);
-  PT_SPECIALIZE_KernelCallHelper_FOR_OPTIONAL_INPUT(DenseTensor);
-  PT_SPECIALIZE_KernelCallHelper_FOR_OPTIONAL_INPUT(SelectedRows);
-  PT_SPECIALIZE_KernelCallHelper_FOR_MULTI_INPUT(DenseTensor);
-  PT_SPECIALIZE_KernelCallHelper_FOR_INPUT(SelectedRows);
+  PD_SPECIALIZE_KernelCallHelper_FOR_INPUT(DenseTensor);
+  PD_SPECIALIZE_KernelCallHelper_FOR_OPTIONAL_INPUT(DenseTensor);
+  PD_SPECIALIZE_KernelCallHelper_FOR_OPTIONAL_INPUT(SelectedRows);
+  PD_SPECIALIZE_KernelCallHelper_FOR_MULTI_INPUT(DenseTensor);
+  PD_SPECIALIZE_KernelCallHelper_FOR_INPUT(SelectedRows);
 
-  PT_SPECIALIZE_KernelCallHelper_FOR_INPUT(SparseCooTensor);
-  PT_SPECIALIZE_KernelCallHelper_FOR_OPTIONAL_INPUT(SparseCooTensor);
-  PT_SPECIALIZE_KernelCallHelper_FOR_MULTI_INPUT(SparseCooTensor);
+  PD_SPECIALIZE_KernelCallHelper_FOR_INPUT(SparseCooTensor);
+  PD_SPECIALIZE_KernelCallHelper_FOR_OPTIONAL_INPUT(SparseCooTensor);
+  PD_SPECIALIZE_KernelCallHelper_FOR_MULTI_INPUT(SparseCooTensor);
 
-  PT_SPECIALIZE_KernelCallHelper_FOR_INPUT(SparseCsrTensor);
-  PT_SPECIALIZE_KernelCallHelper_FOR_OPTIONAL_INPUT(SparseCsrTensor);
-  PT_SPECIALIZE_KernelCallHelper_FOR_MULTI_INPUT(SparseCsrTensor);
+  PD_SPECIALIZE_KernelCallHelper_FOR_INPUT(SparseCsrTensor);
+  PD_SPECIALIZE_KernelCallHelper_FOR_OPTIONAL_INPUT(SparseCsrTensor);
+  PD_SPECIALIZE_KernelCallHelper_FOR_MULTI_INPUT(SparseCsrTensor);
 
   /* Attribute Helpers */
 
-  PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(bool);
-  PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(float);
-  PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(double);
-  PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(int);
-  PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(int64_t);
-  PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(phi::dtype::float16);
-  PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(const Scalar&);
-  PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(DataType);
-  PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(DataLayout);
-  PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(Place);
-  PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(const std::vector<int64_t>&);
-  PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(const ScalarArray&);
-  PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(const std::vector<int>&);
-  PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(const std::string&);
-  PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(const std::vector<bool>&);
-  PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(const std::vector<float>&);
-  PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(const std::vector<double>&);
-  PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(const std::vector<std::string>&);
+  PD_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(bool);
+  PD_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(float);
+  PD_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(double);
+  PD_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(int);
+  PD_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(int64_t);
+  PD_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(phi::dtype::float16);
+  PD_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(const Scalar&);
+  PD_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(DataType);
+  PD_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(DataLayout);
+  PD_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(Place);
+  PD_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(const std::vector<int64_t>&);
+  PD_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(const ScalarArray&);
+  PD_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(const std::vector<int>&);
+  PD_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(const std::string&);
+  PD_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(const std::vector<bool>&);
+  PD_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(const std::vector<float>&);
+  PD_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(const std::vector<double>&);
+  PD_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(const std::vector<std::string>&);
+  PD_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(const std::vector<Scalar>&);
 
   /* Output Helpers */
 
-  PT_SPECIALIZE_KernelCallHelper_FOR_OUTPUT(DenseTensor);
-  PT_SPECIALIZE_KernelCallHelper_FOR_MULTI_OUTPUT(DenseTensor);
-  PT_SPECIALIZE_KernelCallHelper_FOR_OUTPUT(SelectedRows);
+  PD_SPECIALIZE_KernelCallHelper_FOR_OUTPUT(DenseTensor);
+  PD_SPECIALIZE_KernelCallHelper_FOR_MULTI_OUTPUT(DenseTensor);
+  PD_SPECIALIZE_KernelCallHelper_FOR_OUTPUT(SelectedRows);
 
-  PT_SPECIALIZE_KernelCallHelper_FOR_OUTPUT(SparseCooTensor);
-  PT_SPECIALIZE_KernelCallHelper_FOR_MULTI_OUTPUT(SparseCooTensor);
+  PD_SPECIALIZE_KernelCallHelper_FOR_OUTPUT(SparseCooTensor);
+  PD_SPECIALIZE_KernelCallHelper_FOR_MULTI_OUTPUT(SparseCooTensor);
 
-  PT_SPECIALIZE_KernelCallHelper_FOR_OUTPUT(SparseCsrTensor);
-  PT_SPECIALIZE_KernelCallHelper_FOR_MULTI_OUTPUT(SparseCsrTensor);
+  PD_SPECIALIZE_KernelCallHelper_FOR_OUTPUT(SparseCsrTensor);
+  PD_SPECIALIZE_KernelCallHelper_FOR_MULTI_OUTPUT(SparseCsrTensor);
 
   /* End case */
   template <typename T>
diff --git a/paddle/phi/core/macros.h b/paddle/phi/core/macros.h
index 97c5466e1deea891419bd6659faa963dfedab3a5..8049d027a77b808c03636e5ca5a61bc61712652b 100644
--- a/paddle/phi/core/macros.h
+++ b/paddle/phi/core/macros.h
@@ -26,19 +26,19 @@ namespace phi {
   classname& operator=(classname&&) = delete
 #endif
 
-#define PT_STATIC_ASSERT_GLOBAL_NAMESPACE(uniq_name, msg) \
-  _PT_STATIC_ASSERT_GLOBAL_NAMESPACE(uniq_name, msg)
+#define PD_STATIC_ASSERT_GLOBAL_NAMESPACE(uniq_name, msg) \
+  _PD_STATIC_ASSERT_GLOBAL_NAMESPACE(uniq_name, msg)
 
-#define _PT_STATIC_ASSERT_GLOBAL_NAMESPACE(uniq_name, msg)                    \
+#define _PD_STATIC_ASSERT_GLOBAL_NAMESPACE(uniq_name, msg)                    \
   struct __test_global_namespace_##uniq_name##__ {};                          \
   static_assert(std::is_same<::__test_global_namespace_##uniq_name##__,       \
                              __test_global_namespace_##uniq_name##__>::value, \
                 msg)
 
 #ifdef __COUNTER__
-#define PT_ID __COUNTER__
+#define PD_ID __COUNTER__
 #else
-#define PT_ID __LINE__
+#define PD_ID __LINE__
 #endif
 
 #if defined(_WIN32)
@@ -48,9 +48,9 @@ namespace phi {
 #define UNUSED __attribute__((unused))
 #endif
 
-#define PT_CONCATENATE(arg1, arg2) PT_CONCATENATE1(arg1, arg2)
-#define PT_CONCATENATE1(arg1, arg2) PT_CONCATENATE2(arg1, arg2)
-#define PT_CONCATENATE2(arg1, arg2) arg1##arg2
-#define PT_EXPAND(x) x
+#define PD_CONCATENATE(arg1, arg2) PD_CONCATENATE1(arg1, arg2)
+#define PD_CONCATENATE1(arg1, arg2) PD_CONCATENATE2(arg1, arg2)
+#define PD_CONCATENATE2(arg1, arg2) arg1##arg2
+#define PD_EXPAND(x) x
 
 }  // namespace phi
diff --git a/paddle/phi/core/meta_tensor.cc b/paddle/phi/core/meta_tensor.cc
index 2aadce4feda96623553e8583f926af38458f8f9e..bcbb1a4835b9d0397f6e85b7c44311bb9fe57209 100644
--- a/paddle/phi/core/meta_tensor.cc
+++ b/paddle/phi/core/meta_tensor.cc
@@ -72,6 +72,10 @@ void MetaTensor::set_layout(DataLayout layout) {
 }
 
 void MetaTensor::share_lod(const MetaTensor& meta_tensor) {
+  if (meta_tensor.lod().size() == 0) {
+    // no need share
+    return;
+  }
   if (phi::DenseTensor::classof(tensor_)) {
     DenseTensorUtils::GetMutableMeta(static_cast<DenseTensor*>(tensor_))->lod =
         meta_tensor.lod();
@@ -98,13 +102,9 @@ const LoD& MetaTensor::lod() const {
 }
 
 void MetaTensor::share_meta(const MetaTensor& meta_tensor) {
-  if (phi::DenseTensor::classof(tensor_)) {
-    set_dims(meta_tensor.dims());
-    set_dtype(meta_tensor.dtype());
-    set_layout(meta_tensor.layout());
-    share_lod(meta_tensor);
-  } else if (phi::SelectedRows::classof(tensor_)) {
-    set_dims(meta_tensor.dims());
+  if (phi::DenseTensor::classof(tensor_) ||
+      phi::SelectedRows::classof(tensor_)) {
+    share_dims(meta_tensor);
     set_dtype(meta_tensor.dtype());
     set_layout(meta_tensor.layout());
     share_lod(meta_tensor);
@@ -114,4 +114,29 @@ void MetaTensor::share_meta(const MetaTensor& meta_tensor) {
   }
 }
 
+TensorBase* MetaTensor::tensor() const { return tensor_; }
+
+void MetaTensor::share_dims(const MetaTensor& meta_tensor) {
+  bool is_dense_tensor = phi::DenseTensor::classof(tensor_);
+  bool is_selected_rows = phi::SelectedRows::classof(tensor_);
+  if (is_dense_tensor || is_selected_rows) {
+    set_dims(meta_tensor.dims());
+    if (is_selected_rows) {
+      const auto in_tensor_base = meta_tensor.tensor();
+      PADDLE_ENFORCE_EQ(
+          phi::SelectedRows::classof(in_tensor_base),
+          true,
+          errors::InvalidArgument("The input MetaTensor is SelectedRows, but "
+                                  "the output MetaTensor is not this type."));
+      auto* selected_rows_out = static_cast<SelectedRows*>(tensor_);
+      auto* selected_rows_in = static_cast<SelectedRows*>(in_tensor_base);
+      selected_rows_out->set_rows(selected_rows_in->rows());
+      selected_rows_out->set_height(selected_rows_in->height());
+    }
+  } else {
+    PADDLE_THROW(phi::errors::Unimplemented(
+        "Unsupported sharing dims for `%s`.", tensor_->type_info().name()));
+  }
+}
+
 }  // namespace phi
diff --git a/paddle/phi/core/meta_tensor.h b/paddle/phi/core/meta_tensor.h
index 1a32019a190496804c0ef4c64f78f687b8af7577..10c3a7c1a3de376d21805a12ff0b2c98ab4fcbd3 100644
--- a/paddle/phi/core/meta_tensor.h
+++ b/paddle/phi/core/meta_tensor.h
@@ -26,11 +26,13 @@ namespace phi {
 // TODO(chenweihang): add other flags if needed
 struct MetaConfig {
   bool is_runtime{true};
-
+  bool is_run_mkldnn_kernel{false};
   MetaConfig() = default;
 
   // supporting implicit construction is easier to use
-  MetaConfig(bool is_runtime) : is_runtime(is_runtime) {}  // NOLINT
+  MetaConfig(bool is_runtime, bool is_run_mkldnn_kernel)
+      : is_runtime(is_runtime),
+        is_run_mkldnn_kernel(is_run_mkldnn_kernel) {}  // NOLINT
 };
 
 class MetaTensor {
@@ -60,12 +62,13 @@ class MetaTensor {
 
   virtual void share_lod(const MetaTensor& meta_tensor);
   virtual void share_meta(const MetaTensor& meta_tensor);
+  virtual void share_dims(const MetaTensor& meta_tensor);
 
  private:
   // Because the lod in compiletime and runtime is different,
   // so `LoD` cannot in public methods
   const LoD& lod() const;
-
+  TensorBase* tensor() const;
   TensorBase* tensor_;
 };
 
diff --git a/paddle/phi/core/sparse_coo_tensor.h b/paddle/phi/core/sparse_coo_tensor.h
index 0dd5d543414fee444ee28994fcfd78fcdeee9e18..ca3290f33e61eb730d3f17a0b8cc72cbf1c0db58 100644
--- a/paddle/phi/core/sparse_coo_tensor.h
+++ b/paddle/phi/core/sparse_coo_tensor.h
@@ -145,6 +145,7 @@ class SparseCooTensor : public TensorBase,
   void* AllocateFrom(Allocator* allocator,
                      DataType dtype,
                      size_t requested_size = 0) override;
+  void set_dims(const DDim& dims) { this->dims_ = dims; }
 
  private:
   // save the indices of non zero elements in original dense tensor
diff --git a/paddle/phi/infermeta/backward.cc b/paddle/phi/infermeta/backward.cc
index 7d403fee94300e9517fcc517f4d088470d772e35..37d1a234b5767a3873bda6b41e6e410df1c452af 100644
--- a/paddle/phi/infermeta/backward.cc
+++ b/paddle/phi/infermeta/backward.cc
@@ -64,6 +64,16 @@ void BilinearTensorProductGradInferMeta(const MetaTensor& x,
   }
 }
 
+void GatherNdGradInferMeta(const MetaTensor& x,
+                           const MetaTensor& index,
+                           const MetaTensor& out_grad,
+                           MetaTensor* x_grad) {
+  const auto& dtype = out_grad.dtype();
+  x_grad->set_dims(x.dims());
+  x_grad->share_lod(x);
+  x_grad->set_dtype(dtype);
+}
+
 void GeneralBinaryGradInferMeta(const MetaTensor& x,
                                 const MetaTensor& y,
                                 MetaTensor* dx,
@@ -93,6 +103,12 @@ void GeneralTernaryGradInferMeta(const MetaTensor& x,
   }
 }
 
+void GeneralUnaryGradInferMeta(const MetaTensor& x, MetaTensor* dx) {
+  if (dx) {
+    dx->share_meta(x);
+  }
+}
+
 void GumbelSoftmaxGradInferMeta(const MetaTensor& out,
                                 const MetaTensor& dout,
                                 int axis,
@@ -102,7 +118,84 @@ void GumbelSoftmaxGradInferMeta(const MetaTensor& out,
       dout.dims(),
       errors::InvalidArgument(
           "Input(Out) and its gradients should have the same shape."));
+
   dx->share_meta(dout);
 }
 
+void MaxPoolWithIndexGradInferMeta(const MetaTensor& x,
+                                   const MetaTensor& mask,
+                                   const MetaTensor& dout,
+                                   const std::vector<int>& kernel_size,
+                                   const std::vector<int>& strides,
+                                   const std::vector<int>& paddings,
+                                   bool global_pooling,
+                                   bool adaptive,
+                                   MetaTensor* dx) {
+  dx->share_meta(x);
+}
+
+void PoolGradInferMeta(const MetaTensor& x,
+                       const MetaTensor& out,
+                       const MetaTensor& dout,
+                       const std::vector<int>& kernel_size,
+                       const std::vector<int>& strides,
+                       const std::vector<int>& paddings,
+                       bool ceil_mode,
+                       bool exclusive,
+                       const std::string& data_format,
+                       const std::string& pooling_type,
+                       bool global_pooling,
+                       bool adaptive,
+                       const std::string& padding_algorithm,
+                       MetaTensor* dx) {
+  dx->share_meta(x);
+}
+
+void PsroiPoolGradInferMeta(const MetaTensor& x,
+                            const MetaTensor& rois,
+                            paddle::optional<const MetaTensor&> rois_num,
+                            const MetaTensor& dout,
+                            int pooled_height,
+                            int pooled_width,
+                            int output_channels,
+                            float spatial_scale,
+                            MetaTensor* dx) {
+  dx->share_meta(x);
+}
+
+void ScatterGradInferMeta(const MetaTensor& index,
+                          const MetaTensor& updates,
+                          const MetaTensor& out_grad,
+                          bool overwrite,
+                          MetaTensor* x_grad,
+                          MetaTensor* updates_grad) {
+  const auto& dtype = out_grad.dtype();
+  if (updates_grad) {
+    updates_grad->set_dims(updates.dims());
+    updates_grad->set_dtype(dtype);
+  }
+
+  if (x_grad) {
+    x_grad->set_dims(out_grad.dims());
+    x_grad->set_dtype(dtype);
+  }
+}
+
+void ScatterNdAddGradInferMeta(const MetaTensor& index,
+                               const MetaTensor& updates,
+                               const MetaTensor& out_grad,
+                               MetaTensor* x_grad,
+                               MetaTensor* updates_grad) {
+  const auto& dtype = out_grad.dtype();
+  if (updates_grad) {
+    updates_grad->set_dims(updates.dims());
+    updates_grad->set_dtype(dtype);
+  }
+
+  if (x_grad) {
+    x_grad->set_dims(out_grad.dims());
+    x_grad->set_dtype(dtype);
+  }
+}
+
 }  // namespace phi
diff --git a/paddle/phi/infermeta/backward.h b/paddle/phi/infermeta/backward.h
index c7090ed664b286e5a8d2c8e327f3c1ea37a71f04..06ee5a205d7b0f2f842e1b9b4b8fad8948168b64 100644
--- a/paddle/phi/infermeta/backward.h
+++ b/paddle/phi/infermeta/backward.h
@@ -17,6 +17,7 @@ limitations under the License. */
 #include <tuple>
 
 #include "paddle/phi/core/meta_tensor.h"
+#include "paddle/phi/infermeta/unary.h"
 
 namespace phi {
 
@@ -29,6 +30,11 @@ void BilinearTensorProductGradInferMeta(const MetaTensor& x,
                                         MetaTensor* dweight,
                                         MetaTensor* dbias);
 
+void GatherNdGradInferMeta(const MetaTensor& x,
+                           const MetaTensor& index,
+                           const MetaTensor& out_grad,
+                           MetaTensor* x_grad);
+
 void GeneralBinaryGradInferMeta(const MetaTensor& x,
                                 const MetaTensor& y,
                                 MetaTensor* dx,
@@ -41,8 +47,59 @@ void GeneralTernaryGradInferMeta(const MetaTensor& x,
                                  MetaTensor* dy,
                                  MetaTensor* dz);
 
+void GeneralUnaryGradInferMeta(const MetaTensor& x, MetaTensor* dx);
+
 void GumbelSoftmaxGradInferMeta(const MetaTensor& out,
                                 const MetaTensor& dout,
                                 int axis,
                                 MetaTensor* dx);
+
+void MaxPoolWithIndexGradInferMeta(const MetaTensor& x,
+                                   const MetaTensor& mask,
+                                   const MetaTensor& dout,
+                                   const std::vector<int>& kernel_size,
+                                   const std::vector<int>& strides,
+                                   const std::vector<int>& paddings,
+                                   bool global_pooling,
+                                   bool adaptive,
+                                   MetaTensor* dx);
+
+void PsroiPoolGradInferMeta(const MetaTensor& x,
+                            const MetaTensor& rois,
+                            paddle::optional<const MetaTensor&> rois_num,
+                            const MetaTensor& dout,
+                            int pooled_height,
+                            int pooled_width,
+                            int output_channels,
+                            float spatial_scale,
+                            MetaTensor* dx);
+
+void PoolGradInferMeta(const MetaTensor& x,
+                       const MetaTensor& out,
+                       const MetaTensor& dout,
+                       const std::vector<int>& kernel_size,
+                       const std::vector<int>& strides,
+                       const std::vector<int>& paddings,
+                       bool ceil_mode,
+                       bool exclusive,
+                       const std::string& data_format,
+                       const std::string& pooling_type,
+                       bool global_pooling,
+                       bool adaptive,
+                       const std::string& padding_algorithm,
+                       MetaTensor* dx);
+
+void ScatterGradInferMeta(const MetaTensor& index,
+                          const MetaTensor& updates,
+                          const MetaTensor& out_grad,
+                          bool overwrite,
+                          MetaTensor* x_grad,
+                          MetaTensor* updates_grad);
+
+void ScatterNdAddGradInferMeta(const MetaTensor& index,
+                               const MetaTensor& updates,
+                               const MetaTensor& out_grad,
+                               MetaTensor* x_grad,
+                               MetaTensor* updates_grad);
+
 }  // namespace phi
diff --git a/paddle/phi/infermeta/binary.cc b/paddle/phi/infermeta/binary.cc
index 675e68af74339b508f589a55a9c3cf3aed37cecb..ff2cf81a904e0a7a47b7ca44fdc3918cbdac902c 100644
--- a/paddle/phi/infermeta/binary.cc
+++ b/paddle/phi/infermeta/binary.cc
@@ -13,10 +13,325 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/phi/infermeta/binary.h"
+
+#include <algorithm>
+#include <vector>
+#include "paddle/phi/common/data_type.h"
 #include "paddle/phi/core/ddim.h"
+#include "paddle/phi/core/infermeta_utils.h"
 #include "paddle/phi/kernels/funcs/common_shape.h"
 
 namespace phi {
+namespace detail {
+
+static void BinarySameInputDimsCheck(const MetaTensor& x,
+                                     const MetaTensor& y,
+                                     MetaConfig config) {
+  auto input_dim = x.dims();
+  auto other_dim = y.dims();
+  PADDLE_ENFORCE_EQ(input_dim.size(),
+                    other_dim.size(),
+                    phi::errors::PreconditionNotMet(
+                        "Input(Input) and Input(Other) must have the same "
+                        "dimension size."));
+  int n = input_dim.size();
+  bool is_runtime = config.is_runtime;
+  for (int i = 0; i < n; i++) {
+    if (is_runtime) {
+      PADDLE_ENFORCE_EQ(input_dim[i],
+                        other_dim[i],
+                        phi::errors::PreconditionNotMet(
+                            "The value at dim %d of Input(Input) is not "
+                            "equal to the Input(Other): %ld != %ld.",
+                            i,
+                            input_dim[i],
+                            other_dim[i]));
+    } else {
+      if (!(input_dim[i] < 0 || other_dim[i] < 0)) {
+        PADDLE_ENFORCE_EQ(input_dim[i],
+                          other_dim[i],
+                          phi::errors::PreconditionNotMet(
+                              "The value at dim %d of Input(Input) is not "
+                              "equal to the Input(Other): %ld != %ld.",
+                              i,
+                              input_dim[i],
+                              other_dim[i]));
+      }
+    }
+  }
+}
+
+}  // namespace detail
+
+void AllValueCompareInferMeta(const MetaTensor& x,
+                              const MetaTensor& y,
+                              MetaTensor* out,
+                              MetaConfig config) {
+  detail::BinarySameInputDimsCheck(x, y, config);
+
+  out->set_dims(phi::make_ddim({1}));
+  out->set_dtype(DataType::BOOL);
+}
+
+void Atan2InferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out) {
+  out->share_meta(x);
+}
+
+void BCELossInferMeta(const MetaTensor& input,
+                      const MetaTensor& label,
+                      MetaTensor* out,
+                      MetaConfig config) {
+  auto input_dims = input.dims();
+  auto label_dims = label.dims();
+
+  int rank = input_dims.size();
+  PADDLE_ENFORCE_EQ(rank,
+                    label_dims.size(),
+                    phi::errors::InvalidArgument(
+                        "Input(X) and Input(Label) shall have the same rank."
+                        "But received: the rank of Input(X) is [%d], "
+                        "the rank of Input(Label) is [%d].",
+                        rank,
+                        label_dims.size()));
+
+  bool check = true;
+  if ((!config.is_runtime) &&
+      (phi::product(input_dims) <= 0 || phi::product(label_dims) <= 0)) {
+    check = false;
+  }
+
+  if (check) {
+    PADDLE_ENFORCE_EQ(input_dims,
+                      label_dims,
+                      phi::errors::InvalidArgument(
+                          "Input(X) and Input(Label) shall have the same "
+                          "shape. But received: the shape of Input(X) is "
+                          "[%s], the shape of Input(Label) is [%s].",
+                          input_dims,
+                          label_dims));
+  }
+
+  out->set_dims(input_dims);
+  out->set_dtype(input.dtype());
+  out->share_lod(input);
+}
+
+void BincountInferMeta(const MetaTensor& x,
+                       const paddle::optional<const MetaTensor&> weights,
+                       int minlength,
+                       MetaTensor* out) {
+  auto input_dim = x.dims();
+
+  PADDLE_ENFORCE_GE(minlength,
+                    0,
+                    phi::errors::InvalidArgument(
+                        "The minlength should be greater than or equal to 0."
+                        "But received minlength is %d",
+                        minlength));
+
+  PADDLE_ENFORCE_EQ(
+      input_dim.size(),
+      1,
+      phi::errors::InvalidArgument("The 'shape' of Input(X) must be 1-D tensor."
+                                   "But the dimension of Input(X) is [%d]",
+                                   input_dim.size()));
+
+  if (weights.is_initialized()) {
+    auto weights_dim = weights->dims();
+    PADDLE_ENFORCE_EQ(weights_dim.size(),
+                      1,
+                      phi::errors::InvalidArgument(
+                          "The 'shape' of Input(Weights) must be 1-D tensor."
+                          "But the dimension of Input(Weights) is [%d]",
+                          weights_dim.size()));
+
+    PADDLE_ENFORCE_EQ(
+        weights_dim[0],
+        input_dim[0],
+        phi::errors::InvalidArgument(
+            "The 'shape' of Input(Weights) must be equal to the 'shape' of "
+            "Input(X)."
+            "But received: the 'shape' of Input(Weights) is [%s],"
+            "the 'shape' of Input(X) is [%s]",
+            weights_dim,
+            input_dim));
+  }
+  out->set_dims(phi::make_ddim({-1}));
+  if (weights.is_initialized()) {
+    out->set_dtype(weights->dtype());
+  } else {
+    out->set_dtype(x.dtype());
+  }
+
+  out->share_lod(x);
+}
+
+void CholeskySolveInferMeta(const MetaTensor& x,
+                            const MetaTensor& y,
+                            bool upper,
+                            MetaTensor* out) {
+  auto x_dims = x.dims();
+  auto y_dims = y.dims();
+
+  auto x_dims_n = x_dims.size();
+  auto y_dims_n = y_dims.size();
+
+  PADDLE_ENFORCE_GE(x_dims_n,
+                    2,
+                    phi::errors::InvalidArgument(
+                        "the rank of input Y must greater or equal to 2"));
+  PADDLE_ENFORCE_GE(y_dims_n,
+                    2,
+                    phi::errors::InvalidArgument(
+                        "the rank of input X must greater or equal to 2"));
+  PADDLE_ENFORCE_EQ(
+      y_dims[y_dims_n - 1],
+      y_dims[y_dims_n - 2],
+      phi::errors::InvalidArgument("input Matrix Y should be square matrix,"
+                                   "But Got last shape of %ld x %ld",
+                                   y_dims[y_dims_n - 1],
+                                   y_dims[y_dims_n - 2]));
+  PADDLE_ENFORCE_EQ(
+      x_dims[x_dims_n - 2],
+      y_dims[y_dims_n - 2],
+      phi::errors::InvalidArgument("the first dim of Matrix X must be equal to "
+                                   "the fisrt dim of Matrix Y,"
+                                   "But Got %ld and %ld",
+                                   x_dims[x_dims_n - 2],
+                                   y_dims[y_dims_n - 2]));
+
+  std::vector<int64_t> x_dims_vec = phi::vectorize(x_dims);
+  std::vector<int64_t> y_dims_vec = phi::vectorize(y_dims);
+
+  std::vector<int64_t> x_dims_vec_cut(x_dims_vec.begin(), x_dims_vec.end() - 2);
+  std::vector<int64_t> y_dims_vec_cut(y_dims_vec.begin(), y_dims_vec.end() - 2);
+
+  std::vector<int64_t> expand_batch_portion =
+      funcs::MatrixGetBroadcastBatchPortion(x_dims_vec_cut, y_dims_vec_cut);
+
+  std::vector<int64_t> x_broadcast_dims({expand_batch_portion});
+  x_broadcast_dims.insert(x_broadcast_dims.end(),
+                          {x_dims_vec[x_dims_n - 2], x_dims_vec[x_dims_n - 1]});
+
+  // dim of 'out' is the same with 'X' after broadcast
+  out->set_dims(phi::make_ddim(x_broadcast_dims));
+  out->set_dtype(x.dtype());
+  out->set_layout(x.layout());
+  out->share_lod(x);
+}
+
+void CompareInferMeta(const MetaTensor& x,
+                      const MetaTensor& y,
+                      int axis,
+                      MetaTensor* out) {
+  auto dim_x = x.dims();
+  auto dim_y = y.dims();
+
+  if (dim_x == dim_y) {
+    out->share_meta(x);
+  } else {
+    int max_dim = std::max(dim_x.size(), dim_y.size());
+    int axis = std::abs(dim_x.size() - dim_y.size());
+    std::vector<int> x_dims_array(max_dim);
+    std::vector<int> y_dims_array(max_dim);
+    std::vector<int> out_dims_array(max_dim);
+    funcs::GetBroadcastDimsArrays(dim_x,
+                                  dim_y,
+                                  x_dims_array.data(),
+                                  y_dims_array.data(),
+                                  out_dims_array.data(),
+                                  max_dim,
+                                  axis);
+
+    out->set_dims(make_ddim(out_dims_array));
+    out->share_lod(x);
+  }
+
+  out->set_dtype(DataType::BOOL);
+}
+
+void CompareAllInferMeta(const MetaTensor& x,
+                         const MetaTensor& y,
+                         MetaTensor* out) {
+  auto dim_x = x.dims();
+  auto dim_y = y.dims();
+  PADDLE_ENFORCE_GE(
+      dim_x.size(),
+      dim_y.size(),
+      errors::InvalidArgument(
+          "The size of dim_y should not be greater than dim_x's."));
+  out->share_lod(x);
+  out->set_dims(make_ddim({1}));
+  out->set_dtype(DataType::BOOL);
+}
+
+void CrossInferMeta(const MetaTensor& x,
+                    const MetaTensor& y,
+                    int axis,
+                    MetaTensor* out) {
+  auto x_dim = x.dims();
+  auto y_dim = y.dims();
+  auto dim = axis;
+
+  bool dims_match = phi::funcs::CheckDims(x_dim, y_dim);
+  PADDLE_ENFORCE_EQ(
+      dims_match,
+      true,
+      phi::errors::InvalidArgument("The 'shape' of Input(X) should be equal to "
+                                   "the 'shape' of Input(Y). But received "
+                                   "Input(X).dimensions = [%s], "
+                                   "Input(Y).dimensions = [%s]",
+                                   x_dim,
+                                   y_dim));
+
+  if (dim != DDim::kMaxRank) {
+    PADDLE_ENFORCE_EQ(
+        dim < x_dim.size() && dim >= (0 - x_dim.size()),
+        true,
+        phi::errors::OutOfRange(
+            "Attr(dim) is out of range, It's expected "
+            "to be in range of [-%d, %d]. But received Attr(dim) = %d.",
+            x_dim.size(),
+            x_dim.size() - 1,
+            dim));
+    if (dim < 0) {
+      dim += x_dim.size();
+    }
+    PADDLE_ENFORCE_EQ(x_dim[dim] == 3 && y_dim[dim] == 3,
+                      true,
+                      phi::errors::InvalidArgument(
+                          "Input(X/Y).dims()[dim] should be equal to 3."
+                          "But received Input(X/Y).dims()[dim] = %d.",
+                          x_dim[dim]));
+  }
+  out->set_dims(x_dim);
+  out->set_dtype(x.dtype());
+  out->set_layout(x.layout());
+  out->share_lod(x);
+}
+
+void DistInferMeta(const MetaTensor& x,
+                   const MetaTensor& y,
+                   float p,
+                   MetaTensor* out) {
+  auto x_dims = x.dims();
+  auto y_dims = y.dims();
+
+  PADDLE_ENFORCE_NE(phi::product(x_dims),
+                    0,
+                    phi::errors::InvalidArgument(
+                        "The Input(X) has not been initialized properly. The "
+                        "shape of Input(X) = [%s].",
+                        x_dims));
+  PADDLE_ENFORCE_NE(phi::product(y_dims),
+                    0,
+                    phi::errors::InvalidArgument(
+                        "The Input(Y) has not been initialized properly. The "
+                        "shape of Input(Y) = [%s].",
+                        y_dims));
+  out->set_dims({1});
+  out->set_dtype(x.dtype());
+}
 
 void DotInferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out) {
   auto x_dims = x.dims();
@@ -60,79 +375,6 @@ void DotInferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out) {
   out->set_layout(x.layout());
 }
 
-void MatmulInferMeta(const MetaTensor& x,
-                     const MetaTensor& y,
-                     bool trans_x,
-                     bool trans_y,
-                     MetaTensor* out) {
-  std::vector<int64_t> dims_x = phi::vectorize(x.dims());
-  std::vector<int64_t> dims_y = phi::vectorize(y.dims());
-  auto ndims_x = dims_x.size();
-  auto ndims_y = dims_y.size();
-  PADDLE_ENFORCE_GT(ndims_x,
-                    0UL,
-                    phi::errors::InvalidArgument(
-                        "The Input(x) dims size must be greater than 0,"
-                        " but reviced dims size is 0. "));
-  PADDLE_ENFORCE_GT(ndims_y,
-                    0UL,
-                    phi::errors::InvalidArgument(
-                        "The Input(y) dims size must be greater than 0,"
-                        " but reviced dims size is 0. "));
-
-  bool x_broadcasted = false, y_broadcasted = false;
-  if (ndims_x == 1) {
-    dims_x.insert(dims_x.begin(), 1);
-    ndims_x = 2;
-    x_broadcasted = true;
-  }
-
-  if (ndims_y == 1) {
-    dims_y.push_back(1);
-    ndims_y = 2;
-    y_broadcasted = true;
-  }
-
-  size_t M, N;
-  if (trans_x) {
-    M = dims_x[ndims_x - 1];
-  } else {
-    M = dims_x[ndims_x - 2];
-  }
-  if (trans_y) {
-    N = dims_y[ndims_y - 2];
-  } else {
-    N = dims_y[ndims_y - 1];
-  }
-
-  std::vector<int64_t> new_dims;
-  if (ndims_x > ndims_y) {
-    new_dims.assign(dims_x.begin(), dims_x.end() - 2);
-  } else if (ndims_x < ndims_y) {
-    new_dims.assign(dims_y.begin(), dims_y.end() - 2);
-  } else {
-    new_dims.reserve(ndims_x);
-    for (size_t i = 0; i < ndims_x - 2; ++i) {
-      new_dims.push_back(std::max(dims_x[i], dims_y[i]));
-    }
-  }
-  if (!x_broadcasted) {
-    new_dims.push_back(M);
-  }
-  if (!y_broadcasted) {
-    new_dims.push_back(N);
-  }
-  if (x_broadcasted && y_broadcasted) {
-    new_dims.push_back(1);
-  }
-
-  auto ddim_out = phi::make_ddim(new_dims);
-
-  out->set_dims(ddim_out);
-  out->set_dtype(x.dtype());
-  out->set_layout(x.layout());
-}
-
 void ElementwiseInferMeta(const MetaTensor& x,
                           const MetaTensor& y,
                           MetaTensor* out) {
@@ -189,6 +431,52 @@ void ElementwiseRawInferMeta(const MetaTensor& x,
   out->share_lod(x);
 }
 
+void GatherNdInferMeta(const MetaTensor& x,
+                       const MetaTensor& index,
+                       MetaTensor* out) {
+  auto x_dims = x.dims();
+  auto x_dims_size = x_dims.size();
+  auto index_dims = index.dims();
+  auto index_dims_size = index_dims.size();
+
+  PADDLE_ENFORCE_LE(
+      index_dims[index_dims_size - 1],
+      x_dims_size,
+      phi::errors::InvalidArgument(
+          "Input(Index).shape[-1] should be no greater than Input(X).rank"));
+  PADDLE_ENFORCE_GE(index_dims_size,
+                    1UL,
+                    phi::errors::InvalidArgument(
+                        "The rank of Input(Index) should be greater than 1"));
+
+  std::vector<int64_t> result_dims;
+  // The result dims is
+  //   Index.shape[:-1] + X.shape[Index.shape[-1]:]
+  for (int i = 0; i < index_dims_size - 1; ++i) {
+    result_dims.emplace_back(index_dims[i]);
+  }
+  for (int i = index_dims[index_dims_size - 1]; i < x_dims_size; ++i) {
+    result_dims.emplace_back(x_dims[i]);
+  }
+
+  out->set_dims(phi::make_ddim(result_dims));
+  out->share_lod(x);
+  out->set_dtype(x.dtype());
+}
+
+void GatherTreeMeta(const MetaTensor& ids,
+                    const MetaTensor& parents,
+                    MetaTensor* out) {
+  auto ids_dims = ids.dims();
+  auto parents_dims = parents.dims();
+  PADDLE_ENFORCE_EQ(ids_dims == parents_dims,
+                    true,
+                    phi::errors::InvalidArgument(
+                        "The shape of Input(Parents) must be same with the "
+                        "shape of Input(Ids)."));
+  out->set_dims(ids_dims);
+}
+
 void HuberLossInferMeta(const MetaTensor& input,
                         const MetaTensor& label,
                         float delta,
@@ -260,92 +548,271 @@ void IndexSampleInferMeta(const MetaTensor& x,
   out->set_dims(index_dims);
   out->share_lod(y);
 }
-void CrossInferMeta(const MetaTensor& x,
-                    const MetaTensor& y,
-                    int axis,
-                    MetaTensor* out) {
-  auto x_dim = x.dims();
-  auto y_dim = y.dims();
-  auto dim = axis;
 
-  bool dims_match = phi::funcs::CheckDims(x_dim, y_dim);
-  PADDLE_ENFORCE_EQ(
-      dims_match,
-      true,
-      phi::errors::InvalidArgument("The 'shape' of Input(X) should be equal to "
-                                   "the 'shape' of Input(Y). But received "
-                                   "Input(X).dimensions = [%s], "
-                                   "Input(Y).dimensions = [%s]",
-                                   x_dim,
-                                   y_dim));
+void LogLossInferMeta(const MetaTensor& input,
+                      const MetaTensor& label,
+                      float epsilon,
+                      MetaTensor* out,
+                      MetaConfig config) {
+  auto pred_dims = input.dims();
+  auto label_dims = label.dims();
 
-  if (dim != DDim::kMaxRank) {
+  if (config.is_runtime ||
+      (phi::product(pred_dims) > 0 && phi::product(label_dims) > 0)) {
     PADDLE_ENFORCE_EQ(
-        dim < x_dim.size() && dim >= (0 - x_dim.size()),
-        true,
-        phi::errors::OutOfRange(
-            "Attr(dim) is out of range, It's expected "
-            "to be in range of [-%d, %d]. But received Attr(dim) = %d.",
-            x_dim.size(),
-            x_dim.size() - 1,
-            dim));
-    if (dim < 0) {
-      dim += x_dim.size();
-    }
-    PADDLE_ENFORCE_EQ(x_dim[dim] == 3 && y_dim[dim] == 3,
-                      true,
+        pred_dims,
+        label_dims,
+        phi::errors::InvalidArgument(
+            "The dimensions of Input(Predicted) must be equal to the"
+            "dimensions of Input(Labels), but received dimensions of "
+            "Input(Predicted)"
+            "is [%s], received dimensions of Input(Labels) is [%s].",
+            pred_dims,
+            label_dims));
+  }
+  PADDLE_ENFORCE_EQ(pred_dims.size(),
+                    2,
+                    phi::errors::InvalidArgument(
+                        "The dimensions of Input(Predicted) must be 2,"
+                        "But received dimensions of Input(Predicted)"
+                        "is [%d]",
+                        pred_dims.size()));
+  if (config.is_runtime) {
+    PADDLE_ENFORCE_EQ(pred_dims[1],
+                      1,
                       phi::errors::InvalidArgument(
-                          "Input(X/Y).dims()[dim] should be equal to 3."
-                          "But received Input(X/Y).dims()[dim] = %d.",
-                          x_dim[dim]));
+                          "Each row of Input(Predicted) contains a real value, "
+                          "so the 2nd dimension of Input(X) must be 1,"
+                          "But got [%d]",
+                          pred_dims[1]));
   }
-  out->set_dims(x_dim);
+  out->set_dims({pred_dims[0], 1});
+  out->set_dtype(input.dtype());
+  out->share_lod(input);
+}
+
+void MatmulInferMeta(const MetaTensor& x,
+                     const MetaTensor& y,
+                     bool trans_x,
+                     bool trans_y,
+                     MetaTensor* out) {
+  std::vector<int64_t> dims_x = phi::vectorize(x.dims());
+  std::vector<int64_t> dims_y = phi::vectorize(y.dims());
+  auto ndims_x = dims_x.size();
+  auto ndims_y = dims_y.size();
+  PADDLE_ENFORCE_GT(ndims_x,
+                    0UL,
+                    phi::errors::InvalidArgument(
+                        "The Input(x) dims size must be greater than 0,"
+                        " but reviced dims size is 0. "));
+  PADDLE_ENFORCE_GT(ndims_y,
+                    0UL,
+                    phi::errors::InvalidArgument(
+                        "The Input(y) dims size must be greater than 0,"
+                        " but reviced dims size is 0. "));
+
+  bool x_broadcasted = false, y_broadcasted = false;
+  if (ndims_x == 1) {
+    dims_x.insert(dims_x.begin(), 1);
+    ndims_x = 2;
+    x_broadcasted = true;
+  }
+
+  if (ndims_y == 1) {
+    dims_y.push_back(1);
+    ndims_y = 2;
+    y_broadcasted = true;
+  }
+
+  size_t M, N;
+  if (trans_x) {
+    M = dims_x[ndims_x - 1];
+  } else {
+    M = dims_x[ndims_x - 2];
+  }
+  if (trans_y) {
+    N = dims_y[ndims_y - 2];
+  } else {
+    N = dims_y[ndims_y - 1];
+  }
+
+  std::vector<int64_t> new_dims;
+  if (ndims_x > ndims_y) {
+    new_dims.assign(dims_x.begin(), dims_x.end() - 2);
+  } else if (ndims_x < ndims_y) {
+    new_dims.assign(dims_y.begin(), dims_y.end() - 2);
+  } else {
+    new_dims.reserve(ndims_x);
+    for (size_t i = 0; i < ndims_x - 2; ++i) {
+      new_dims.push_back(std::max(dims_x[i], dims_y[i]));
+    }
+  }
+  if (!x_broadcasted) {
+    new_dims.push_back(M);
+  }
+  if (!y_broadcasted) {
+    new_dims.push_back(N);
+  }
+  if (x_broadcasted && y_broadcasted) {
+    new_dims.push_back(1);
+  }
+
+  auto ddim_out = phi::make_ddim(new_dims);
+
+  out->set_dims(ddim_out);
   out->set_dtype(x.dtype());
   out->set_layout(x.layout());
-  out->share_lod(x);
 }
 
-void Atan2InferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out) {
-  out->share_meta(x);
+void MvInferMeta(const MetaTensor& x, const MetaTensor& vec, MetaTensor* out) {
+  auto dim_x = x.dims();
+  auto dim_vec = vec.dims();
+  PADDLE_ENFORCE_EQ(
+      dim_x.size(),
+      2,
+      phi::errors::InvalidArgument("The rank of input X should be 2, but is %d",
+                                   dim_x.size()));
+  PADDLE_ENFORCE_EQ(
+      dim_vec.size(),
+      1,
+      phi::errors::InvalidArgument(
+          "The rank of input Vec should be 1, but is %d", dim_vec.size()));
+  PADDLE_ENFORCE_EQ(dim_x[1],
+                    dim_vec[0],
+                    phi::errors::InvalidArgument(
+                        "X's second dimension is expected to be equal to "
+                        "Vec's first dimension"
+                        "but recieved X'shape = [%s], Vec's shape = [%s]",
+                        dim_x,
+                        dim_vec));
+
+  auto dim_out = phi::make_ddim({dim_x[0]});
+
+  out->set_dims(dim_out);
+  out->set_dtype(x.dtype());
+  out->set_layout(x.layout());
+  out->share_lod(x);
 }
 
-void BCELossInferMeta(const MetaTensor& input,
-                      const MetaTensor& label,
-                      MetaTensor* out,
-                      MetaConfig config) {
-  auto input_dims = input.dims();
-  auto label_dims = label.dims();
+void SegmentPoolInferMeta(const MetaTensor& x,
+                          const MetaTensor& segment_ids,
+                          const std::string& pooltype,
+                          MetaTensor* out,
+                          MetaTensor* summed_ids,
+                          MetaConfig config) {
+  auto dims = x.dims();
+  dims[0] = -1;
+  out->set_dims(dims);
+  out->set_dtype(x.dtype());
+  out->set_layout(x.layout());
 
-  int rank = input_dims.size();
+  if (pooltype == "MEAN") {
+    summed_ids->set_dims({-1, 1});
+    summed_ids->set_dtype(x.dtype());
+    summed_ids->set_layout(x.layout());
+  }
+}
+
+void SigmoidCrossEntropyWithLogitsInferMeta(const MetaTensor& x,
+                                            const MetaTensor& label,
+                                            bool normalize,
+                                            int ignore_index,
+                                            MetaTensor* out,
+                                            MetaConfig config) {
+  auto x_dims = x.dims();
+  auto labels_dims = label.dims();
+  int rank = x_dims.size();
   PADDLE_ENFORCE_EQ(rank,
-                    label_dims.size(),
+                    labels_dims.size(),
                     phi::errors::InvalidArgument(
                         "Input(X) and Input(Label) shall have the same rank."
                         "But received: the rank of Input(X) is [%d], "
                         "the rank of Input(Label) is [%d].",
                         rank,
-                        label_dims.size()));
+                        labels_dims.size()));
 
   bool check = true;
   if ((!config.is_runtime) &&
-      (phi::product(input_dims) <= 0 || phi::product(label_dims) <= 0)) {
+      (phi::product(x_dims) <= 0 || phi::product(labels_dims) <= 0)) {
     check = false;
   }
 
   if (check) {
-    PADDLE_ENFORCE_EQ(input_dims,
-                      label_dims,
-                      phi::errors::InvalidArgument(
-                          "Input(X) and Input(Label) shall have the same "
-                          "shape. But received: the shape of Input(X) is "
-                          "[%s], the shape of Input(Label) is [%s].",
-                          input_dims,
-                          label_dims));
+    PADDLE_ENFORCE_EQ(
+        phi::slice_ddim(x_dims, 0, rank),
+        phi::slice_ddim(labels_dims, 0, rank),
+        phi::errors::InvalidArgument(
+            "Input(X) and Input(Label) shall have the same shape "
+            "except the last dimension. But received: the shape of "
+            "Input(X) is [%s], the shape of Input(Label) is [%s].",
+            x_dims,
+            labels_dims));
   }
 
-  out->set_dims(input_dims);
-  out->set_dtype(input.dtype());
-  out->share_lod(input);
+  out->set_dims(x_dims);
+  out->set_dtype(x.dtype());
+  out->share_lod(x);
+}
+
+void TriangularSolveInferMeta(const MetaTensor& x,
+                              const MetaTensor& y,
+                              bool upper,
+                              bool transpose,
+                              bool unitriangular,
+                              MetaTensor* out) {
+  auto x_dims = x.dims();
+  auto y_dims = y.dims();
+
+  auto x_dims_n = x_dims.size();
+  auto y_dims_n = y_dims.size();
+
+  PADDLE_ENFORCE_GE(x_dims_n,
+                    2,
+                    phi::errors::InvalidArgument(
+                        "The input tensor X's dimensions of TriangularSolveOp "
+                        "should be >= 2. But received X's "
+                        "dimensions = %d, X's shape = [%s]",
+                        x_dims.size(),
+                        x_dims));
+
+  PADDLE_ENFORCE_GE(y_dims_n,
+                    2,
+                    phi::errors::InvalidArgument(
+                        "The input tensor Y's dimensions of TriangularSolveOp "
+                        "should be >=2. But received Y's "
+                        "dimensions = %d, Y's shape = [%s]",
+                        y_dims.size(),
+                        y_dims));
+
+  PADDLE_ENFORCE_EQ(x_dims[x_dims_n - 2],
+                    x_dims[x_dims_n - 1],
+                    phi::errors::InvalidArgument(
+                        "The inner-most 2 dimensions of Input(X) all should "
+                        "be square matrices "
+                        "But received X's shape[-2] = %d and shape[-1] = %d.",
+                        x_dims[x_dims_n - 2],
+                        x_dims[x_dims_n - 1]));
+
+  std::vector<int64_t> x_dims_vec = phi::vectorize(x_dims);
+  std::vector<int64_t> y_dims_vec = phi::vectorize(y_dims);
+
+  std::vector<int64_t> x_dims_vec_cut(x_dims_vec.begin(), x_dims_vec.end() - 2);
+  std::vector<int64_t> y_dims_vec_cut(y_dims_vec.begin(), y_dims_vec.end() - 2);
+
+  std::vector<int64_t> expand_batch_portion =
+      funcs::MatrixGetBroadcastBatchPortion(x_dims_vec_cut, y_dims_vec_cut);
+
+  std::vector<int64_t> y_broadcast_dims({expand_batch_portion});
+  y_broadcast_dims.insert(y_broadcast_dims.end(),
+                          {y_dims_vec[y_dims_n - 2], y_dims_vec[y_dims_n - 1]});
+
+  // dim of 'out' is the same with 'Y' after broadcast
+  out->set_dims(phi::make_ddim(y_broadcast_dims));
+  out->set_dtype(y.dtype());
+  out->set_layout(y.layout());
+  out->share_lod(y);
 }
 
 }  // namespace phi
+
+PD_REGISTER_INFER_META_FN(add_raw, phi::ElementwiseRawInferMeta);
diff --git a/paddle/phi/infermeta/binary.h b/paddle/phi/infermeta/binary.h
index a0140c9a5799f79af541b45847d5e44f982a3f58..cfae45cf04b87c287a174d172700a794c8c2a2a3 100644
--- a/paddle/phi/infermeta/binary.h
+++ b/paddle/phi/infermeta/binary.h
@@ -29,13 +29,48 @@ namespace phi {
 //   Because functions in this file not only can infer shape, but also need
 //   infer lod or other useful data.
 
-void DotInferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out);
+void AllValueCompareInferMeta(const MetaTensor& x,
+                              const MetaTensor& y,
+                              MetaTensor* out,
+                              MetaConfig config = MetaConfig());
 
-void MatmulInferMeta(const MetaTensor& x,
-                     const MetaTensor& y,
-                     bool trans_x,
-                     bool trans_y,
-                     MetaTensor* out);
+void Atan2InferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out);
+
+void BCELossInferMeta(const MetaTensor& input,
+                      const MetaTensor& label,
+                      MetaTensor* out,
+                      MetaConfig config = MetaConfig());
+
+void BincountInferMeta(const MetaTensor& x,
+                       const paddle::optional<const MetaTensor&> weights,
+                       int minlength,
+                       MetaTensor* out);
+
+void CholeskySolveInferMeta(const MetaTensor& x,
+                            const MetaTensor& y,
+                            bool upper,
+                            MetaTensor* out);
+
+void CompareAllInferMeta(const MetaTensor& x,
+                         const MetaTensor& y,
+                         MetaTensor* out);
+
+void CompareInferMeta(const MetaTensor& x,
+                      const MetaTensor& y,
+                      int axis,
+                      MetaTensor* out);
+
+void CrossInferMeta(const MetaTensor& x,
+                    const MetaTensor& y,
+                    int axis,
+                    MetaTensor* out);
+
+void DistInferMeta(const MetaTensor& x,
+                   const MetaTensor& y,
+                   float p,
+                   MetaTensor* out);
+
+void DotInferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out);
 
 void ElementwiseInferMeta(const MetaTensor& x,
                           const MetaTensor& y,
@@ -46,6 +81,14 @@ void ElementwiseRawInferMeta(const MetaTensor& x_meta,
                              int axis,
                              MetaTensor* out);
 
+void GatherNdInferMeta(const MetaTensor& x,
+                       const MetaTensor& index,
+                       MetaTensor* out);
+
+void GatherTreeMeta(const MetaTensor& ids,
+                    const MetaTensor& parents,
+                    MetaTensor* out);
+
 void HuberLossInferMeta(const MetaTensor& input_meta,
                         const MetaTensor& label_meta,
                         float delta,
@@ -58,14 +101,39 @@ void IndexSampleInferMeta(const MetaTensor& x,
                           MetaTensor* out,
                           MetaConfig config = MetaConfig());
 
-void CrossInferMeta(const MetaTensor& x,
-                    const MetaTensor& y,
-                    int axis,
-                    MetaTensor* out);
-
-void Atan2InferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out);
-void BCELossInferMeta(const MetaTensor& input,
+void LogLossInferMeta(const MetaTensor& input,
                       const MetaTensor& label,
+                      float epsilon,
                       MetaTensor* out,
                       MetaConfig config = MetaConfig());
+
+void MatmulInferMeta(const MetaTensor& x,
+                     const MetaTensor& y,
+                     bool trans_x,
+                     bool trans_y,
+                     MetaTensor* out);
+
+void MvInferMeta(const MetaTensor& x, const MetaTensor& vec, MetaTensor* out);
+
+void SegmentPoolInferMeta(const MetaTensor& x,
+                          const MetaTensor& segment_ids,
+                          const std::string& pooltype,
+                          MetaTensor* out,
+                          MetaTensor* summed_ids,
+                          MetaConfig config = MetaConfig());
+
+void SigmoidCrossEntropyWithLogitsInferMeta(const MetaTensor& x,
+                                            const MetaTensor& label,
+                                            bool normalize,
+                                            int ignore_index,
+                                            MetaTensor* out,
+                                            MetaConfig config = MetaConfig());
+
+void TriangularSolveInferMeta(const MetaTensor& x,
+                              const MetaTensor& y,
+                              bool upper,
+                              bool transpose,
+                              bool unitriangular,
+                              MetaTensor* out);
+
 }  // namespace phi
diff --git a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc
index dc5478e8afb981defa9bc493cb440cead4f5965f..84441ed8b740be172ddaa7de3fc23ad420ebf077 100644
--- a/paddle/phi/infermeta/multiary.cc
+++ b/paddle/phi/infermeta/multiary.cc
@@ -28,6 +28,178 @@ std::vector<DDim> GetMetaTensorsDim(const std::vector<MetaTensor*>& tensors) {
   return dims;
 }
 
+void AdadeltaInferMeta(const MetaTensor& param,
+                       const MetaTensor& grad,
+                       const MetaTensor& avg_squared_grad,
+                       const MetaTensor& avg_squared_update,
+                       float rho,
+                       float epsilon,
+                       MetaTensor* param_out,
+                       MetaTensor* avg_squared_grad_out,
+                       MetaTensor* avg_squared_update_out) {
+  auto param_dims = param.dims();
+  PADDLE_ENFORCE_EQ(
+      param_dims,
+      grad.dims(),
+      errors::InvalidArgument(
+          "Param and grad input of AdadeltaOp should have same dimension."));
+  PADDLE_ENFORCE_EQ(
+      param_dims,
+      avg_squared_grad.dims(),
+      errors::InvalidArgument("Param and AvgSquaredGrad input of AdadeltaOp "
+                              "should have same dimension"));
+  PADDLE_ENFORCE_EQ(
+      param_dims,
+      avg_squared_update.dims(),
+      errors::InvalidArgument("Param and AvgSquaredUpdate input of AdadeltaOp "
+                              "should have same dimension"));
+
+  param_out->set_dims(param_dims);
+  param_out->set_dtype(param.dtype());
+
+  avg_squared_grad_out->set_dims(param_dims);
+  avg_squared_grad_out->set_dtype(avg_squared_grad.dtype());
+
+  avg_squared_update_out->set_dims(param_dims);
+  avg_squared_update_out->set_dtype(avg_squared_update.dtype());
+}
+
+void AdamaxInferMeta(const MetaTensor& param,
+                     const MetaTensor& grad,
+                     const MetaTensor& learning_rate,
+                     const MetaTensor& moment,
+                     const MetaTensor& inf_norm,
+                     const MetaTensor& beta1_pow,
+                     float beta1,
+                     float beta2,
+                     float epsilon,
+                     MetaTensor* param_out,
+                     MetaTensor* moment_out,
+                     MetaTensor* inf_norm_out) {
+  auto lr_dims = learning_rate.dims();
+  PADDLE_ENFORCE_NE(
+      product(lr_dims),
+      0,
+      errors::InvalidArgument("Maybe the Input variable LearningRate has not "
+                              "been initialized. You may need to confirm "
+                              "if you put exe.run(startup_program) "
+                              "after optimizer.minimize function."));
+  PADDLE_ENFORCE_EQ(
+      product(lr_dims),
+      1,
+      errors::InvalidArgument("Learning rate should have 1 dimension"));
+  auto beta1_pow_dims = beta1_pow.dims();
+  PADDLE_ENFORCE_EQ(product(beta1_pow_dims),
+                    1,
+                    errors::InvalidArgument(
+                        "Beta1 power accumulator should have 1 dimension"));
+  auto param_dims = param.dims();
+  PADDLE_ENFORCE_EQ(
+      param_dims,
+      grad.dims(),
+      errors::InvalidArgument(
+          "Param and Grad input of AdamaxOp should have same dimension"));
+  PADDLE_ENFORCE_EQ(
+      param_dims,
+      moment.dims(),
+      errors::InvalidArgument(
+          "Param and Moment input of AdamaxOp should have same dimension"));
+  PADDLE_ENFORCE_EQ(
+      param_dims,
+      inf_norm.dims(),
+      errors::InvalidArgument(
+          "Param and InfNorm input of AdamaxOp should have same dimension"));
+
+  param_out->set_dims(param_dims);
+  param_out->set_dtype(param.dtype());
+
+  moment_out->set_dims(param_dims);
+  moment_out->set_dtype(moment.dtype());
+
+  inf_norm_out->set_dims(param_dims);
+  inf_norm_out->set_dtype(inf_norm.dtype());
+}
+
+void AucInferMeta(const MetaTensor& input,
+                  const MetaTensor& label,
+                  const MetaTensor& stat_pos,
+                  const MetaTensor& stat_neg,
+                  const std::string& curve,
+                  int num_thresholds,
+                  int slide_steps,
+                  MetaTensor* auc,
+                  MetaTensor* stat_pos_out,
+                  MetaTensor* stat_neg_out,
+                  MetaConfig config) {
+  auto predict_dims = input.dims();
+  auto label_dims = label.dims();
+  PADDLE_ENFORCE_GE(
+      predict_dims.size(),
+      2,
+      phi::errors::InvalidArgument(
+          "The Input(Predict) has not been initialized properly. The "
+          "shape of Input(Predict) = [%s], the shape size must be "
+          "greater_equal 2.",
+          predict_dims));
+  auto predict_width = predict_dims[1];
+  PADDLE_ENFORCE_NE(
+      phi::product(predict_dims),
+      0,
+      phi::errors::InvalidArgument(
+          "The Input(Predict) has not been initialized properly. The "
+          "shape of Input(Predict) = [%s], the shape can not involes 0.",
+          predict_dims));
+  PADDLE_ENFORCE_NE(
+      phi::product(label_dims),
+      0,
+      phi::errors::InvalidArgument(
+          "The Input(Label) has not been initialized properly. The "
+          "shape of Input(Label) = [%s], the shape can not involes 0.",
+          label_dims));
+  if (config.is_runtime) {
+    PADDLE_ENFORCE_LE(
+        predict_width,
+        2,
+        phi::errors::InvalidArgument("Only support binary classification,"
+                                     "prediction dims[1] should be 1 or 2"));
+  }
+  auto predict_height = input.dims()[0];
+  auto label_height = label.dims()[0];
+
+  if (config.is_runtime) {
+    PADDLE_ENFORCE_EQ(
+        predict_height,
+        label_height,
+        phi::errors::InvalidArgument("Out and Label should have same height."));
+  }
+
+  int num_pred_buckets = num_thresholds + 1;
+
+  PADDLE_ENFORCE_GE(
+      num_pred_buckets,
+      1,
+      phi::errors::InvalidArgument("num_thresholds must larger than 1"));
+  PADDLE_ENFORCE_GE(
+      slide_steps,
+      0,
+      phi::errors::InvalidArgument("slide_steps must be natural number"));
+
+  auc->set_dims({1});
+  auc->set_dtype(DataType::INT64);
+
+  if (slide_steps) {
+    stat_pos_out->set_dims({(1 + slide_steps) * num_pred_buckets + 1});
+    stat_pos_out->set_dtype(DataType::INT64);
+    stat_neg_out->set_dims({(1 + slide_steps) * num_pred_buckets + 1});
+    stat_neg_out->set_dtype(DataType::INT64);
+  } else {
+    stat_pos_out->set_dims({1, num_pred_buckets});
+    stat_pos_out->set_dtype(DataType::INT64);
+    stat_neg_out->set_dims({1, num_pred_buckets});
+    stat_neg_out->set_dtype(DataType::INT64);
+  }
+}
+
 void BilinearTensorProductInferMeta(const MetaTensor& x,
                                     const MetaTensor& y,
                                     const MetaTensor& weight,
@@ -197,6 +369,81 @@ void ConcatInferMeta(const std::vector<MetaTensor*>& x,
   out->share_lod(*x.at(0));
 }
 
+void PsroiPoolInferMeta(const MetaTensor& x,
+                        const MetaTensor& rois,
+                        paddle::optional<const MetaTensor&> rois_num,
+                        int pooled_height,
+                        int pooled_width,
+                        int output_channels,
+                        float spatial_scale,
+                        MetaTensor* out) {
+  auto input_dims = x.dims();
+  auto rois_dims = rois.dims();
+
+  PADDLE_ENFORCE_EQ(
+      input_dims.size(),
+      4,
+      errors::InvalidArgument("The format of input tensor is NCHW"));
+  PADDLE_ENFORCE_EQ(rois_dims.size(),
+                    2,
+                    errors::InvalidArgument(
+                        "ROIs should be a 2-D LoDTensor of shape (num_rois, 4) "
+                        "given as [(x1, y1, x2, y2), ...]"));
+  PADDLE_ENFORCE_EQ(rois_dims[1],
+                    4,
+                    errors::InvalidArgument(
+                        "ROIs should be a 2-D LoDTensor of shape (num_rois, 4) "
+                        "given as [(x1, y1, x2, y2), ...]"));
+  if (rois_num.get_ptr()) {
+    auto rois_num_dims = rois_num->dims();
+    PADDLE_ENFORCE_EQ(
+        rois_num_dims.size(),
+        1,
+        errors::InvalidArgument("The second dimension of RoisNum should "
+                                "be 1, but received dimension is %d",
+                                rois_num_dims.size()));
+  }
+
+  PADDLE_ENFORCE_EQ(
+      input_dims[1],
+      output_channels * pooled_height * pooled_width,
+      errors::InvalidArgument(
+          "the channel of X(%d) "
+          "should be equal to the product of "
+          "output_channels(%d), pooled_height(%d) and pooled_width(%d)",
+          input_dims[1],
+          output_channels,
+          pooled_height,
+          pooled_width));
+
+  PADDLE_ENFORCE_GT(pooled_height,
+                    0,
+                    errors::InvalidArgument(
+                        "The pooled output height must be greater than 0"));
+  PADDLE_ENFORCE_GT(pooled_width,
+                    0,
+                    errors::InvalidArgument(
+                        "The pooled output width must be greater than 0"));
+  PADDLE_ENFORCE_GT(output_channels,
+                    1,
+                    errors::InvalidArgument(
+                        "The pooled output channels must greater than 1"));
+  PADDLE_ENFORCE_GT(
+      spatial_scale,
+      0.0f,
+      errors::InvalidArgument("The spatial scale must greater than 0."));
+
+  auto out_dims = input_dims;
+  out_dims[0] = rois_dims[0];
+  out_dims[1] =
+      output_channels;  // input_dims[1] / (pooled_height * pooled_width);
+  out_dims[2] = pooled_height;
+  out_dims[3] = pooled_width;
+
+  out->set_dims(out_dims);
+  out->set_dtype(x.dtype());
+}
+
 void WhereInferMeta(const MetaTensor& condition,
                     const MetaTensor& x,
                     const MetaTensor& y,
diff --git a/paddle/phi/infermeta/multiary.h b/paddle/phi/infermeta/multiary.h
index 51738c5e08e9842c7cffcdd1a2ce7ee3764d6412..c11843212ed33fd8170e6677a4d6e0ad95b730dc 100644
--- a/paddle/phi/infermeta/multiary.h
+++ b/paddle/phi/infermeta/multiary.h
@@ -20,6 +20,41 @@ namespace phi {
 
 std::vector<DDim> GetMetaTensorsDim(const std::vector<MetaTensor*>& tensors);
 
+void AdadeltaInferMeta(const MetaTensor& param,
+                       const MetaTensor& grad,
+                       const MetaTensor& avg_squared_grad,
+                       const MetaTensor& avg_squared_update,
+                       float rho,
+                       float epsilon,
+                       MetaTensor* param_out,
+                       MetaTensor* avg_squared_grad_out,
+                       MetaTensor* avg_squared_update_out);
+
+void AdamaxInferMeta(const MetaTensor& param,
+                     const MetaTensor& grad,
+                     const MetaTensor& learning_rate,
+                     const MetaTensor& moment,
+                     const MetaTensor& inf_norm,
+                     const MetaTensor& beta1_pow,
+                     float beta1,
+                     float beta2,
+                     float epsilon,
+                     MetaTensor* param_out,
+                     MetaTensor* moment_out,
+                     MetaTensor* inf_norm_out);
+
+void AucInferMeta(const MetaTensor& input,
+                  const MetaTensor& label,
+                  const MetaTensor& stat_pos,
+                  const MetaTensor& stat_neg,
+                  const std::string& curve,
+                  int num_thresholds,
+                  int slide_steps,
+                  MetaTensor* auc,
+                  MetaTensor* stat_pos_out,
+                  MetaTensor* stat_neg_out,
+                  MetaConfig config = MetaConfig());
+
 void BilinearTensorProductInferMeta(const MetaTensor& x,
                                     const MetaTensor& y,
                                     const MetaTensor& weight,
@@ -35,8 +70,18 @@ void ConcatInferMeta(const std::vector<MetaTensor*>& x,
                      MetaTensor* out,
                      MetaConfig config = MetaConfig());
 
+void PsroiPoolInferMeta(const MetaTensor& x,
+                        const MetaTensor& rois,
+                        paddle::optional<const MetaTensor&> rois_num,
+                        int pooled_height,
+                        int pooled_width,
+                        int output_channels,
+                        float spatial_scale,
+                        MetaTensor* out);
+
 void WhereInferMeta(const MetaTensor& condition,
                     const MetaTensor& x,
                     const MetaTensor& y,
                     MetaTensor* out);
+
 }  // namespace phi
diff --git a/paddle/phi/infermeta/nullary.cc b/paddle/phi/infermeta/nullary.cc
index 1fdf8a6940a68f1ffaea9803e151983869e2ab7f..081084567e840f287bb113ee567888f4032f5638 100644
--- a/paddle/phi/infermeta/nullary.cc
+++ b/paddle/phi/infermeta/nullary.cc
@@ -16,6 +16,12 @@ limitations under the License. */
 
 namespace phi {
 
+void CreateInferMeta(const ScalarArray& shape,
+                     DataType dtype,
+                     MetaTensor* out) {
+  CreateInferMetaBase(shape.GetData(), dtype, DataLayout::NCHW, out);
+}
+
 void CreateInferMetaBase(const std::vector<int64_t>& shape,
                          DataType dtype,
                          DataLayout layout,
@@ -26,10 +32,37 @@ void CreateInferMetaBase(const std::vector<int64_t>& shape,
   out->set_layout(layout);
 }
 
-void CreateInferMeta(const ScalarArray& shape,
-                     DataType dtype,
-                     MetaTensor* out) {
-  CreateInferMetaBase(shape.GetData(), dtype, DataLayout::NCHW, out);
+void EyeInferMeta(int64_t num_rows,
+                  int64_t num_columns,
+                  DataType dtype,
+                  MetaTensor* out) {
+  if (num_columns == -1) num_columns = num_rows;
+  out->set_dims({num_rows, num_columns});
+  out->set_dtype(dtype);
+}
+
+void GaussianRandomInferMeta(const ScalarArray& shape,
+                             float mean,
+                             float std,
+                             int seed,
+                             DataType dtype,
+                             MetaTensor* out) {
+  auto out_dims = phi::make_ddim(shape.GetData());
+  out->set_dims(out_dims);
+  out->set_dtype(dtype);
+  out->set_layout(DataLayout::NCHW);
+}
+
+void TruncatedGaussianRandomInferMeta(const std::vector<int>& shape,
+                                      float mean,
+                                      float std,
+                                      int seed,
+                                      DataType dtype,
+                                      MetaTensor* out) {
+  auto out_dims = phi::make_ddim(shape);
+  out->set_dims(out_dims);
+  out->set_dtype(dtype);
+  out->set_layout(DataLayout::NCHW);
 }
 
 }  // namespace phi
diff --git a/paddle/phi/infermeta/nullary.h b/paddle/phi/infermeta/nullary.h
index ea5bb71551b4218a897b73f8c2c7a6ef7b139544..38eaa636f8c8779c5a1f597b8cfb23ce6efc5edc 100644
--- a/paddle/phi/infermeta/nullary.h
+++ b/paddle/phi/infermeta/nullary.h
@@ -28,11 +28,30 @@ namespace phi {
 //   Because functions in this file not only can infer shape, but also need
 //   infer lod or other useful data.
 
+void CreateInferMeta(const ScalarArray& shape, DataType dtype, MetaTensor* out);
+
 void CreateInferMetaBase(const std::vector<int64_t>& shape,
                          DataType dtype,
                          DataLayout layout,
                          MetaTensor* out);
 
-void CreateInferMeta(const ScalarArray& shape, DataType dtype, MetaTensor* out);
+void EyeInferMeta(int64_t num_rows,
+                  int64_t num_columns,
+                  DataType dtype,
+                  MetaTensor* out);
+
+void GaussianRandomInferMeta(const ScalarArray& shape,
+                             float mean,
+                             float std,
+                             int seed,
+                             DataType dtype,
+                             MetaTensor* out);
+
+void TruncatedGaussianRandomInferMeta(const std::vector<int>& shape,
+                                      float mean,
+                                      float std,
+                                      int seed,
+                                      DataType dtype,
+                                      MetaTensor* out);
 
 }  // namespace phi
diff --git a/paddle/phi/infermeta/ternary.cc b/paddle/phi/infermeta/ternary.cc
index 52aeaef8438548542e2ecac4219f6eb2a8e8462b..235cfe368c1921eac546b670470963fb49100290 100644
--- a/paddle/phi/infermeta/ternary.cc
+++ b/paddle/phi/infermeta/ternary.cc
@@ -18,6 +18,58 @@ limitations under the License. */
 
 namespace phi {
 
+void AccuracyInferMeta(const MetaTensor& out,
+                       const MetaTensor& indice,
+                       const MetaTensor& label,
+                       MetaTensor* accuracy,
+                       MetaTensor* correct,
+                       MetaTensor* total,
+                       MetaConfig config) {
+  auto inference_dim = out.dims();
+  auto label_dim = label.dims();
+  // Assume indices has same shape as inference, because
+  // it's the output of topk.
+  PADDLE_ENFORCE_EQ(
+      label_dim.size(),
+      2,
+      phi::errors::InvalidArgument(
+          "ShapeError: label's dimensions of AccuracyOp must be 2. "
+          "But received label's dimensions = %d, label's shape = [%s]",
+          label_dim.size(),
+          label_dim));
+  if (config.is_runtime) {
+    PADDLE_ENFORCE_EQ(label_dim[1],
+                      1,
+                      phi::errors::InvalidArgument(
+                          "ShapeError: label's second dimension of "
+                          "AccuracyOp must be 1. But received label's "
+                          "second dimension is = %d, label's shape = [%s]",
+                          label_dim[1],
+                          label_dim));
+    PADDLE_ENFORCE_EQ(
+        inference_dim[0],
+        label_dim[0],
+        phi::errors::InvalidArgument(
+            "ShapeError: the output's num_rows of AccuracyOp must be"
+            " the same as label's num_rows. But received output's "
+            "shape = [%s], label's shape = [%s], output's num_rows = %d, "
+            "label's "
+            "num_rows = %d",
+            inference_dim,
+            label_dim,
+            inference_dim[0],
+            label_dim[0]));
+  }
+
+  accuracy->set_dims({1});
+  accuracy->set_dtype(out.dtype());
+  correct->set_dims({1});
+  correct->set_dtype(out.dtype());
+  total->set_dims({1});
+  total->set_dtype(out.dtype());
+  accuracy->share_lod(out);
+}
+
 void AddmmInferMeta(const MetaTensor& input,
                     const MetaTensor& x,
                     const MetaTensor& y,
@@ -89,4 +141,335 @@ void AddmmInferMeta(const MetaTensor& input,
   out->set_dtype(input.dtype());
 }
 
+void GraphSendRecvInferMeta(const MetaTensor& x,
+                            const MetaTensor& src_index,
+                            const MetaTensor& dst_index,
+                            const std::string& pool_type,
+                            MetaTensor* out,
+                            MetaTensor* dst_count) {
+  auto src_index_dims = src_index.dims();
+  if (src_index_dims.size() == 2) {
+    PADDLE_ENFORCE_EQ(src_index_dims[1],
+                      1,
+                      phi::errors::InvalidArgument(
+                          "The last dim of Src_index should be 1 when it "
+                          "is 2D, but we get %d",
+                          src_index_dims[1]));
+  } else {
+    PADDLE_ENFORCE_EQ(
+        src_index_dims.size(),
+        1,
+        phi::errors::InvalidArgument(
+            "The Src_index should be 1D, when it is not 2D, but we get %d",
+            src_index_dims.size()));
+  }
+
+  auto dst_index_dims = dst_index.dims();
+  if (dst_index_dims.size() == 2) {
+    PADDLE_ENFORCE_EQ(dst_index_dims[1],
+                      1,
+                      phi::errors::InvalidArgument(
+                          "The last dim of Dst_index should be 1 when it "
+                          "is 2D, but we get %d",
+                          dst_index_dims[1]));
+  } else {
+    PADDLE_ENFORCE_EQ(
+        dst_index_dims.size(),
+        1,
+        phi::errors::InvalidArgument("The Dst_index should be 1D, "
+                                     "when it is not 2D, but we get %d",
+                                     dst_index_dims.size()));
+  }
+
+  PADDLE_ENFORCE_EQ(src_index_dims[0],
+                    dst_index_dims[0],
+                    phi::errors::InvalidArgument(
+                        "Src_index and Dst_index should have the same shape."));
+
+  auto dims = x.dims();
+  out->set_dims(dims);
+  out->set_dtype(x.dtype());
+
+  if (pool_type == "MEAN") {
+    dst_count->set_dims({dims[0]});
+    dst_count->set_dtype(DataType::INT32);
+  }
+}
+
+void LerpInferMeta(const MetaTensor& x,
+                   const MetaTensor& y,
+                   const MetaTensor& weight,
+                   MetaTensor* out) {
+  auto x_dims = x.dims();
+  auto y_dims = y.dims();
+  auto w_dims = weight.dims();
+  DDim out_dims;
+  out_dims = funcs::GetOutputDims(x_dims, y_dims);
+  if (w_dims.size() > 1 || w_dims[0] != 1) {
+    out_dims = funcs::GetOutputDims(out_dims, w_dims);
+  }
+  out->set_dims(out_dims);
+  out->set_dtype(x.dtype());
+  out->share_lod(x);
+}
+
+void LinspaceInferMeta(const MetaTensor& start,
+                       const MetaTensor& stop,
+                       const MetaTensor& number,
+                       MetaTensor* out) {
+  auto s_dims = start.dims();
+  PADDLE_ENFORCE_EQ(
+      (s_dims.size() == 1) && (s_dims[0] == 1),
+      true,
+      phi::errors::InvalidArgument("The shape of Input(Start) must be [1],"
+                                   "but received input shape is [%s].",
+                                   s_dims));
+  auto e_dims = stop.dims();
+  PADDLE_ENFORCE_EQ(
+      (e_dims.size() == 1) && (e_dims[0] == 1),
+      true,
+      phi::errors::InvalidArgument("The shape of Input(Stop) must be [1],"
+                                   "but received input shape is [%s].",
+                                   e_dims));
+  auto step_dims = number.dims();
+  PADDLE_ENFORCE_EQ(
+      (step_dims.size() == 1) && (step_dims[0] == 1),
+      true,
+      phi::errors::InvalidArgument("The shape of Input(Num) must be [1],"
+                                   "but received input shape is [%s].",
+                                   step_dims));
+  out->set_dims(phi::make_ddim({-1}));
+  out->set_dtype(start.dtype());
+}
+
+void NllLossRawInferMeta(const MetaTensor& input,
+                         const MetaTensor& label,
+                         paddle::optional<const MetaTensor&> weight,
+                         int64_t ignore_index,
+                         const std::string& reduction,
+                         MetaTensor* out,
+                         MetaTensor* total_weight,
+                         MetaConfig config) {
+  auto x_dims = input.dims();
+  auto label_dims = label.dims();
+  PADDLE_ENFORCE_EQ(x_dims.size() == 2 || x_dims.size() == 4,
+                    true,
+                    phi::errors::InvalidArgument(
+                        "The tensor rank of Input(X) must be 2 or 4."));
+  bool contain_unknown_dim =
+      phi::contain_unknown_dim(x_dims) || phi::contain_unknown_dim(label_dims);
+  bool check = config.is_runtime || !contain_unknown_dim;
+  if (check) {
+    PADDLE_ENFORCE_EQ(
+        x_dims[0],
+        label_dims[0],
+        phi::errors::InvalidArgument(
+            "ShapeError: Expected input batch_size to match label batch_size,"
+            "But received: the Input(x) batch_size is [%s], the Input(label) "
+            " batch_size is [%s].",
+            x_dims[0],
+            label_dims[0]));
+    if (weight.get_ptr() != nullptr) {
+      auto w_dims = weight->dims();
+      PADDLE_ENFORCE_EQ(
+          w_dims.size(),
+          1,
+          phi::errors::InvalidArgument("Input(Weight) should be a 1D tensor."));
+      PADDLE_ENFORCE_EQ(
+          x_dims[1],
+          w_dims[0],
+          phi::errors::InvalidArgument(
+              "Expected input tensor Weight's size should equal "
+              "to the first dimension of the input tensor X. But received "
+              "Weight's "
+              "size is %d, the first dimension of input X is %d",
+              w_dims[0],
+              x_dims[1]));
+    }
+  }
+  if (x_dims.size() == 2) {
+    if (reduction == "none") {
+      out->set_dims({x_dims[0]});
+    } else {
+      out->set_dims({1});
+    }
+  } else if (x_dims.size() == 4) {
+    PADDLE_ENFORCE_EQ(label_dims.size(),
+                      3,
+                      phi::errors::InvalidArgument(
+                          "Expected Input(Lable) dimensions=3, received %d.",
+                          label_dims.size()));
+    auto input0 = x_dims[0];
+    auto input2 = x_dims[2];
+    auto input3 = x_dims[3];
+    auto label0 = label_dims[0];
+    auto label1 = label_dims[1];
+    auto label2 = label_dims[2];
+    PADDLE_ENFORCE_EQ(
+        input0 == label0 && input2 == label1 && input3 == label2,
+        true,
+        phi::errors::InvalidArgument("Input(X) tensor shape should "
+                                     "match to Input(Label) tensor "
+                                     "shape."));
+    if (reduction == "none") {
+      out->set_dims({x_dims[0], x_dims[2], x_dims[3]});
+    } else {
+      out->set_dims({1});
+    }
+  }
+  total_weight->set_dims({1});
+  out->set_dtype(input.dtype());
+  total_weight->set_dtype(input.dtype());
+}
+
+void ScatterInferMeta(const MetaTensor& x,
+                      const MetaTensor& index,
+                      const MetaTensor& updates,
+                      bool overwrite,
+                      MetaTensor* out) {
+  const auto& updates_dims = updates.dims();
+  const auto& ref_dims = x.dims();
+  const auto& index_dims = index.dims();
+  PADDLE_ENFORCE_EQ(
+      index_dims.size(),
+      1,
+      phi::errors::InvalidArgument(
+          "The size of Input(Ids)'s shape should be equal to 1, but "
+          "received the rank of Input(Ids) is %d.",
+          index_dims.size()));
+  PADDLE_ENFORCE_EQ(
+      ref_dims.size(),
+      updates_dims.size(),
+      phi::errors::InvalidArgument(
+          "Input(X) and Input(Updates) should have the same shape size, "
+          "but received the size of Input(x)'s shape is %d, the size of "
+          "Input(Updates)'s shape is %d.",
+          ref_dims.size(),
+          updates_dims.size()));
+  PADDLE_ENFORCE_EQ(
+      updates_dims[0],
+      index_dims[0],
+      phi::errors::InvalidArgument(
+          "Input(Updates) and Input(Ids) should have same batch-size, but"
+          " received Input(Updates)'s batch-size is %d, Input(Ids)'s "
+          "batch-size is %d.",
+          updates_dims[0],
+          index_dims[0]));
+  out->set_dims(ref_dims);
+  out->share_lod(x);
+  out->set_dtype(x.dtype());
+}
+
+void ScatterNdAddInferMeta(const MetaTensor& x,
+                           const MetaTensor& index,
+                           const MetaTensor& updates,
+                           MetaTensor* out) {
+  const auto& ref_dims = x.dims();
+  auto ref_dims_size = ref_dims.size();
+  const auto& index_dims = index.dims();
+  auto index_dims_size = index_dims.size();
+  const auto& updates_dims = updates.dims();
+  auto updates_dims_size = updates_dims.size();
+
+  PADDLE_ENFORCE_LE(
+      index_dims[index_dims_size - 1],
+      ref_dims_size,
+      phi::errors::InvalidArgument(
+          "The last dimension of Input(Index)'s shape should be no greater "
+          "than the rank of Input(X), but received the last dimension of "
+          "Input(Index)'s shape is %d, the rank of Input(X) is %d.",
+          index_dims[index_dims_size - 1],
+          ref_dims_size));
+  PADDLE_ENFORCE_GE(index_dims_size,
+                    2UL,
+                    phi::errors::InvalidArgument(
+                        "The rank of Input(Index) should be greater than 1, "
+                        "but received the rank of Input(Index) is %d.",
+                        index_dims_size));
+
+  // update.shape = index.shape[:-1] + output.shape[index.shape[-1]:]
+  std::vector<int64_t> r_updates_dims;
+  for (int64_t i = 0; i < index_dims_size - 1; ++i) {
+    r_updates_dims.emplace_back(index_dims[i]);
+  }
+  for (int64_t i = index_dims[index_dims_size - 1]; i < ref_dims_size; ++i) {
+    r_updates_dims.emplace_back(ref_dims[i]);
+  }
+
+  PADDLE_ENFORCE_EQ(
+      r_updates_dims.size(),
+      updates_dims_size,
+      phi::errors::InvalidArgument(
+          "Updates has wrong shape. The shape of Updates and Input(Updates) "
+          "should be same, but received the shape of Updates is %d, "
+          "the shape of Input(Updates) is %d.",
+          r_updates_dims.size(),
+          updates_dims_size));
+
+  for (int64_t i = 0; i < updates_dims_size; ++i) {
+    PADDLE_ENFORCE_EQ(
+        r_updates_dims[i],
+        updates_dims[i],
+        phi::errors::InvalidArgument(
+            "Updates has wrong shape. The dimensions of Updates and "
+            "Input(Updates) should match, but received Updates's"
+            "%d-th dimension is %d, Input(Updates)'s %d-th "
+            "dimension is %d.",
+            i,
+            r_updates_dims[i],
+            i,
+            updates_dims[i]));
+  }
+  out->set_dims(ref_dims);
+  out->share_lod(x);
+  out->set_dtype(x.dtype());
+}
+
+void ViterbiDecodeInferMeta(const MetaTensor& input,
+                            const MetaTensor& transition,
+                            const MetaTensor& length,
+                            bool include_bos_eos_tag,
+                            MetaTensor* scores,
+                            MetaTensor* path,
+                            MetaConfig config) {
+  auto in_dims = input.dims();
+  PADDLE_ENFORCE_EQ(in_dims.size(),
+                    3,
+                    phi::errors::InvalidArgument(
+                        "The rank of Input in ViterbiDecode  must be 3. But "
+                        "received Input's rank is %d.",
+                        in_dims.size()));
+  auto length_dims = length.dims();
+  PADDLE_ENFORCE_EQ(length_dims.size(),
+                    1,
+                    phi::errors::InvalidArgument(
+                        "The rank of Length in ViterbiDecode must be 1. But "
+                        "received Length's rank is %d.",
+                        length_dims.size()));
+  auto transition_dims = transition.dims();
+  PADDLE_ENFORCE_EQ(
+      transition_dims.size(),
+      2,
+      phi::errors::InvalidArgument(
+          "The rank of Transition in ViterbiDecode must be 2. But "
+          "received Transition's rank is %d.",
+          transition_dims.size()));
+  if (config.is_runtime) {
+    PADDLE_ENFORCE_EQ(
+        in_dims[0],
+        length_dims[0],
+        phi::errors::InvalidArgument(
+            "The batch size of Input and Length should be equal."));
+    PADDLE_ENFORCE_EQ(in_dims[2],
+                      transition_dims[0],
+                      phi::errors::InvalidArgument(
+                          "The number of tags of Input (%d) and Transition "
+                          "(%d) should be equal.",
+                          transition_dims[0],
+                          in_dims[2]));
+  }
+  scores->set_dims(length_dims);
+  scores->set_dtype(length.dtype());
+}
+
 }  // namespace phi
diff --git a/paddle/phi/infermeta/ternary.h b/paddle/phi/infermeta/ternary.h
index d6223dd87aaf8e8c20c00ad72523e160ee15faee..209a07db18b5c7a87ba094c5839149533757220d 100644
--- a/paddle/phi/infermeta/ternary.h
+++ b/paddle/phi/infermeta/ternary.h
@@ -29,6 +29,14 @@ namespace phi {
 // NOTE: The name "InferShape" may be not appropriate. "InferMeta" may be good.
 //   Because functions in this file not only can infer shape, but also need
 //   infer lod or other useful data.
+//
+void AccuracyInferMeta(const MetaTensor& out,
+                       const MetaTensor& indice,
+                       const MetaTensor& label,
+                       MetaTensor* accuracy,
+                       MetaTensor* correct,
+                       MetaTensor* total,
+                       MetaConfig config = MetaConfig());
 
 void AddmmInferMeta(const MetaTensor& input,
                     const MetaTensor& x,
@@ -37,4 +45,49 @@ void AddmmInferMeta(const MetaTensor& input,
                     float beta,
                     MetaTensor* out);
 
+void GraphSendRecvInferMeta(const MetaTensor& x,
+                            const MetaTensor& src_index,
+                            const MetaTensor& dst_index,
+                            const std::string& pool_type,
+                            MetaTensor* out,
+                            MetaTensor* dst_count);
+
+void LerpInferMeta(const MetaTensor& x,
+                   const MetaTensor& y,
+                   const MetaTensor& weight,
+                   MetaTensor* out);
+
+void LinspaceInferMeta(const MetaTensor& start,
+                       const MetaTensor& stop,
+                       const MetaTensor& number,
+                       MetaTensor* out);
+
+void NllLossRawInferMeta(const MetaTensor& input,
+                         const MetaTensor& label,
+                         paddle::optional<const MetaTensor&> weight,
+                         int64_t ignore_index,
+                         const std::string& reduction,
+                         MetaTensor* out,
+                         MetaTensor* total_weight,
+                         MetaConfig config = MetaConfig());
+
+void ScatterInferMeta(const MetaTensor& x,
+                      const MetaTensor& index,
+                      const MetaTensor& updates,
+                      bool overwrite,
+                      MetaTensor* out);
+
+void ScatterNdAddInferMeta(const MetaTensor& x,
+                           const MetaTensor& index,
+                           const MetaTensor& updates,
+                           MetaTensor* out);
+
+void ViterbiDecodeInferMeta(const MetaTensor& input,
+                            const MetaTensor& transition,
+                            const MetaTensor& length,
+                            bool include_bos_eos_tag,
+                            MetaTensor* scores,
+                            MetaTensor* path,
+                            MetaConfig config = MetaConfig());
+
 }  // namespace phi
diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc
index fbd9259a83f86a137544f3b91cd1b2a469696e19..4d1cb42bd59f072e5926b237528a742c231bcdcf 100644
--- a/paddle/phi/infermeta/unary.cc
+++ b/paddle/phi/infermeta/unary.cc
@@ -16,48 +16,326 @@ limitations under the License. */
 
 #include <algorithm>
 #include <set>
+
+#include "paddle/fluid/framework/convert_utils.h"
 #include "paddle/phi/common/data_type.h"
 #include "paddle/phi/common/type_traits.h"
 #include "paddle/phi/core/enforce.h"
 #include "paddle/phi/core/infermeta_utils.h"
+#include "paddle/phi/kernels/funcs/pooling.h"
 #include "paddle/phi/kernels/funcs/unfold_functor.h"
 
 namespace phi {
 
-void UnchangedInferMeta(const MetaTensor& x, MetaTensor* out) {
-  out->share_meta(x);
+void ArgMinMaxInferMeta(const MetaTensor& x,
+                        int64_t axis,
+                        bool keepdims,
+                        bool flatten,
+                        int dtype,
+                        MetaTensor* out,
+                        MetaConfig config) {
+  const auto& x_dims = x.dims();
+
+  PADDLE_ENFORCE_GE(
+      axis,
+      -x_dims.size(),
+      phi::errors::InvalidArgument("'axis'(%d) must be greater than or equal to"
+                                   " -Rank(X)(%d).",
+                                   axis,
+                                   -x_dims.size()));
+  PADDLE_ENFORCE_LT(axis,
+                    x_dims.size(),
+                    phi::errors::InvalidArgument(
+                        "'axis'(%d) must be less than Rank(X)(%d) of Input(X).",
+                        axis,
+                        x_dims.size()));
+
+  PADDLE_ENFORCE_EQ(
+      (dtype < 0 || dtype == 2 || dtype == 3),
+      true,
+      phi::errors::InvalidArgument(
+          "The attribute of dtype in argmin/argmax must be [%s] or [%s], but "
+          "received [%s]",
+          paddle::framework::DataTypeToString(
+              paddle::framework::proto::VarType::INT32),
+          paddle::framework::DataTypeToString(
+              paddle::framework::proto::VarType::INT64),
+          paddle::framework::DataTypeToString(
+              static_cast<paddle::framework::proto::VarType::Type>(dtype))));
+
+  auto x_rank = x_dims.size();
+  if (axis < 0) axis += x_rank;
+  if (config.is_runtime) {
+    if (dtype == paddle::framework::proto::VarType::INT32) {
+      int64_t all_element_num = 0;
+      if (flatten) {
+        all_element_num = phi::product(x_dims);
+
+      } else {
+        all_element_num = x_dims[axis];
+      }
+      PADDLE_ENFORCE_LE(
+          all_element_num,
+          INT_MAX,
+          phi::errors::InvalidArgument(
+              "The element num of the argmin/argmax input at axis is "
+              "%d, is larger than int32 maximum value:%d, you must "
+              "set the dtype of argmin/argmax to 'int64'.",
+              all_element_num,
+              INT_MAX));
+    }
+  }
+  std::vector<int64_t> vec;
+  if (flatten) {
+    vec.emplace_back(static_cast<int64_t>(1));
+  } else {
+    for (int64_t i = 0; i < axis; i++) vec.emplace_back(x_dims[i]);
+    if (keepdims) {
+      vec.emplace_back(static_cast<int64_t>(1));
+    }
+    for (int64_t i = axis + 1; i < x_rank; i++) vec.emplace_back(x_dims[i]);
+  }
+  out->set_dims(phi::make_ddim(vec));
+  if (dtype == 2) {
+    out->set_dtype(DataType::INT32);
+  } else if (dtype == 3) {
+    out->set_dtype(DataType::INT64);
+  }
 }
 
-// meta x -> out without change, check if axis in range [-Rank(x), Rank(x)-1]
-void UnchangedInferMetaCheckAxis(const MetaTensor& x,
-                                 int axis,
-                                 MetaTensor* out) {
-  auto rank = x.dims().size();
+void ArgsortInferMeta(const MetaTensor& input,
+                      int axis,
+                      bool descending,
+                      MetaTensor* output,
+                      MetaTensor* indices) {
+  auto in_dims = input.dims();
+  auto num_dims = in_dims.size();
   PADDLE_ENFORCE_GE(
       axis,
-      -rank,
-      errors::InvalidArgument(
-          "Attr(axis) value should be in range [-R, R-1], "
-          "R is the rank of Input(X). But received axis: %d, R: %d.",
-          axis,
-          rank));
+      -num_dims,
+      phi::errors::InvalidArgument("'axis'(%d) must be greater than or equal to"
+                                   " -num_dims(%d).",
+                                   axis,
+                                   -num_dims));
   PADDLE_ENFORCE_LT(
       axis,
-      rank,
+      num_dims,
       phi::errors::InvalidArgument(
-          "Attr(axis) value should be in range [-R, R-1], "
-          "R is the rank of Input(X). But received axis: %d, R: %d.",
-          axis,
-          rank));
-  out->share_meta(x);
+          "'axis'(%d) must be less than num_dims(%d).", axis, num_dims));
+
+  output->share_dims(input);
+  output->set_dtype(input.dtype());
+  indices->share_dims(input);
+  indices->set_dtype(DataType::INT64);
+  output->share_lod(input);
+  indices->share_lod(input);
 }
 
-void RealAndImagInferMeta(const MetaTensor& x, MetaTensor* out) {
+void CastInferMeta(const MetaTensor& x, DataType out_dtype, MetaTensor* out) {
   out->set_dims(x.dims());
-  out->set_dtype(dtype::ToReal(x.dtype()));
+  out->set_dtype(out_dtype);
+  out->set_layout(x.layout());
+}
+
+void CholeskyInferMeta(const MetaTensor& x, bool upper, MetaTensor* out) {
+  auto dims = x.dims();
+  auto rank = dims.size();
+  PADDLE_ENFORCE_GE(rank,
+                    2,
+                    errors::InvalidArgument(
+                        "The Input(X) should have at least 2 dimensions. But "
+                        "received a %d dimension tensor.",
+                        rank));
+  PADDLE_ENFORCE_EQ(
+      dims[rank - 2],
+      dims[rank - 1],
+      errors::InvalidArgument(
+          "The inner-most 2 dimensions of Input(X) all should be symmetric "
+          "positive-definite matrices and have the same size. But received "
+          "X's shape[-2] = %d and shape[-1] = %d.",
+          dims[rank - 2],
+          dims[rank - 1]));
+  out->set_dims(x.dims());
+  out->set_dtype(x.dtype());
+}
+
+void CopyToInferMeta(const MetaTensor& x,
+                     Backend backend,
+                     bool blocking,
+                     MetaTensor* out) {
+  UnchangedInferMeta(x, out);
+}
+
+void CreateLikeInferMeta(const MetaTensor& x, DataType dtype, MetaTensor* out) {
+  out->set_dims(x.dims());
+  out->set_dtype(dtype == DataType::UNDEFINED ? x.dtype() : dtype);
   out->set_layout(x.layout());
 }
 
+void CumsumInferMeta(const MetaTensor& x,
+                     int axis,
+                     bool flatten,
+                     bool exclusive,
+                     bool reverse,
+                     MetaTensor* out) {
+  auto x_dims = x.dims();
+  if (flatten) {
+    out->set_dims(phi::make_ddim({phi::product(x_dims)}));
+    out->set_dtype(x.dtype());
+  } else {
+    out->set_dims(x_dims);
+    out->set_dtype(x.dtype());
+  }
+
+  out->share_lod(x);
+}
+
+void DiagInferMeta(const MetaTensor& x,
+                   int offset,
+                   float padding_value,
+                   MetaTensor* out) {
+  auto x_dims = x.dims();
+
+  if (x_dims.size() == 1UL) {
+    int64_t size_ = x_dims[0] + std::abs(offset);
+    out->set_dims({size_, size_});
+    out->set_dtype(x.dtype());
+  } else if (x_dims.size() == 2UL) {
+    int64_t size_ = 0;
+    if (offset >= 0) {
+      // Note(LutaoChu): Do not use std::min here, otherwise the calculation
+      // of `size_` will have unexpected result on Windows Python3.8
+      if (x_dims[0] < x_dims[1] - offset) {
+        size_ = x_dims[0];
+      } else {
+        size_ = x_dims[1] - offset;
+      }
+    } else {
+      // Note(LutaoChu): Do not use std::min here, otherwise the calculation
+      // of `size_` will have unexpected result on Windows Python3.8
+      if (x_dims[0] + offset < x_dims[1]) {
+        size_ = x_dims[0] + offset;
+      } else {
+        size_ = x_dims[1];
+      }
+    }
+    out->set_dims({size_});
+    out->set_dtype(x.dtype());
+  } else {
+    PADDLE_THROW(phi::errors::InvalidArgument(
+        "The input tensor X's dimensions of DiagV2Op should be either 1 or "
+        "2, but received %d.",
+        x_dims.size()));
+  }
+}
+
+void DiagonalInferMeta(const MetaTensor& input,
+                       int offset,
+                       int axis1,
+                       int axis2,
+                       MetaTensor* out) {
+  auto x_dims = input.dims();
+  int offset_ = offset;
+  int axis1_ = axis1 < 0 ? x_dims.size() + axis1 : axis1;
+  int axis2_ = axis2 < 0 ? x_dims.size() + axis2 : axis2;
+
+  PADDLE_ENFORCE_GE(
+      x_dims.size(),
+      2,
+      phi::errors::OutOfRange("Input's dim is out of range (expected at "
+                              "least 2 dimensions, but got %ld).",
+                              x_dims.size()));
+  PADDLE_ENFORCE_LT(
+      axis1_,
+      x_dims.size(),
+      phi::errors::OutOfRange(
+          "Attr(axis1) is out of range (expected to be in range of [%ld, "
+          "%ld], but got %ld).",
+          -(x_dims.size()),
+          (x_dims.size() - 1),
+          axis1));
+  PADDLE_ENFORCE_LT(
+      axis2_,
+      x_dims.size(),
+      phi::errors::OutOfRange(
+          "Attr(axis2) is out of range (expected to be in range of [%ld, "
+          "%ld], but got %ld).",
+          -(x_dims.size()),
+          (x_dims.size() - 1),
+          axis2));
+  PADDLE_ENFORCE_NE(
+      axis1_,
+      axis2_,
+      phi::errors::InvalidArgument("The dimensions should not be identical "
+                                   "%d vs %d.",
+                                   axis1,
+                                   axis2));
+
+  auto out_dims = vectorize(x_dims);
+  // from out_dims get the dim size of axis1_.
+  auto axis1_size = out_dims[axis1_];
+  auto axis2_size = out_dims[axis2_];
+  // delete two dims by attr axis1 and axis2 from out_dims.
+  /* example:
+     out_dim = [2, 3, 4];
+     axis1 = 0;
+     axis2 = 1;
+     according to the attr of axis1 and axis2, we get:
+     out_dim = [4].
+  */
+  out_dims.erase(out_dims.begin() + std::max(axis1_, axis2_));
+  out_dims.erase(out_dims.begin() + std::min(axis1_, axis2_));
+
+  if (offset_ == 0) {
+    out_dims.push_back(std::min(axis1_size, axis2_size));
+  } else if (offset_ > 0) {
+    if ((axis2_size - offset_) > 0) {
+      out_dims.push_back(std::min(axis1_size, axis2_size - offset_));
+    } else {
+      out_dims.push_back(0);
+    }
+  } else {
+    if ((axis1_size + offset_) > 0) {
+      out_dims.push_back(std::min(axis1_size + offset_, axis2_size));
+    } else {
+      out_dims.push_back(0);
+    }
+  }
+  out->set_dims(phi::make_ddim(out_dims));
+}
+
+void EighInferMeta(const MetaTensor& x,
+                   const std::string& uplo,
+                   MetaTensor* out_w,
+                   MetaTensor* out_v) {
+  auto input_dim = x.dims();
+  auto rank = input_dim.size();
+
+  PADDLE_ENFORCE_GE(rank,
+                    2,
+                    phi::errors::InvalidArgument(
+                        "The Input(X) should have at least 2 dimensions."
+                        "But received a %d dimension tensor.",
+                        rank));
+  PADDLE_ENFORCE_EQ(
+      input_dim[rank - 2],
+      input_dim[rank - 1],
+      phi::errors::InvalidArgument(
+          "Eigh op is designed for square matrix, consequently"
+          "inner-most 2 dimensions of Input(X) should be symmetric."
+          "But received X's shape[-2] = %d and shape[-1] = %d.",
+          input_dim[rank - 2],
+          input_dim[rank - 1]));
+
+  std::vector<int64_t> values_dim;
+
+  for (auto i = 0; i < rank - 1; i++) {
+    values_dim.emplace_back(input_dim[i]);
+  }
+  out_w->set_dims(phi::make_ddim(values_dim));
+  out_v->set_dims(input_dim);
+}
+
 void FlattenInferMeta(const MetaTensor& x,
                       int start_axis,
                       int stop_axis,
@@ -114,47 +392,6 @@ void GumbelSoftmaxInferMeta(const MetaTensor& x,
   UnchangedInferMetaCheckAxis(x, axis, out);
 }
 
-void CastInferMeta(const MetaTensor& x, DataType out_dtype, MetaTensor* out) {
-  out->set_dims(x.dims());
-  out->set_dtype(out_dtype);
-  out->set_layout(x.layout());
-}
-
-void CholeskyInferMeta(const MetaTensor& x, bool upper, MetaTensor* out) {
-  auto dims = x.dims();
-  auto rank = dims.size();
-  PADDLE_ENFORCE_GE(rank,
-                    2,
-                    errors::InvalidArgument(
-                        "The Input(X) should have at least 2 dimensions. But "
-                        "received a %d dimension tensor.",
-                        rank));
-  PADDLE_ENFORCE_EQ(
-      dims[rank - 2],
-      dims[rank - 1],
-      errors::InvalidArgument(
-          "The inner-most 2 dimensions of Input(X) all should be symmetric "
-          "positive-definite matrices and have the same size. But received "
-          "X's shape[-2] = %d and shape[-1] = %d.",
-          dims[rank - 2],
-          dims[rank - 1]));
-  out->set_dims(x.dims());
-  out->set_dtype(x.dtype());
-}
-
-void CopyToInferMeta(const MetaTensor& x,
-                     Backend backend,
-                     bool blocking,
-                     MetaTensor* out) {
-  UnchangedInferMeta(x, out);
-}
-
-void CreateLikeInferMeta(const MetaTensor& x, DataType dtype, MetaTensor* out) {
-  out->set_dims(x.dims());
-  out->set_dtype(dtype == DataType::UNDEFINED ? x.dtype() : dtype);
-  out->set_layout(x.layout());
-}
-
 void IncrementInferMeta(const MetaTensor& x, float value, MetaTensor* out) {
   PADDLE_ENFORCE_EQ(
       product(x.dims()),
@@ -307,6 +544,88 @@ void InferMetaFromVecValue(const MetaTensor& x,
   }
 }
 
+void IsEmptyInferMeta(const MetaTensor& x, MetaTensor* out) {
+  out->set_dims(phi::make_ddim({1}));
+  out->set_dtype(DataType::BOOL);
+}
+
+void IsfiniteInferMeta(const MetaTensor& x, MetaTensor* out) {
+  out->set_dims(x.dims());
+  out->set_dtype(DataType::BOOL);
+}
+
+void MaxPoolWithIndexInferMeta(const MetaTensor& x,
+                               const std::vector<int>& kernel_size,
+                               const std::vector<int>& strides,
+                               const std::vector<int>& paddings,
+                               bool global_pooling,
+                               bool adaptive,
+                               MetaTensor* out,
+                               MetaTensor* mask,
+                               MetaConfig config) {
+  std::vector<int> paddings_ = paddings;
+  std::vector<int> kernel_size_ = kernel_size;
+
+  auto x_dims = x.dims();
+
+  PADDLE_ENFORCE(
+      x_dims.size() == 4 || x_dims.size() == 5,
+      errors::InvalidArgument(
+          "Pooling intput should be 4-D or 5-D tensor but received %dD-Tensor",
+          x_dims.size()));
+
+  if (global_pooling) {
+    kernel_size_.resize(static_cast<size_t>(x_dims.size()) - 2);
+    for (size_t i = 0; i < kernel_size_.size(); ++i) {
+      paddings_[i] = 0;
+      kernel_size_[i] = static_cast<int>(x_dims[i + 2]);
+    }
+  }
+
+  PADDLE_ENFORCE_EQ(
+      x_dims.size() - kernel_size_.size(),
+      2U,
+      errors::InvalidArgument(
+          "The input size %d minus the kernel size %d should equal to 2.",
+          x_dims.size(),
+          kernel_size_.size()));
+  PADDLE_ENFORCE_EQ(
+      kernel_size_.size(),
+      strides.size(),
+      errors::InvalidArgument(
+          "Strides size %d and pooling size %d should be the same.",
+          strides.size(),
+          kernel_size_.size()));
+  PADDLE_ENFORCE_EQ(
+      kernel_size_.size(),
+      paddings_.size(),
+      errors::InvalidArgument(
+          "Paddings size %d and pooling size %d should be the same.",
+          paddings_.size(),
+          kernel_size_.size()));
+
+  std::vector<int64_t> output_shape({x_dims[0], x_dims[1]});
+  if (adaptive) {
+    output_shape.insert(
+        output_shape.end(), kernel_size_.begin(), kernel_size_.end());
+  } else {
+    for (size_t i = 0; i < kernel_size_.size(); ++i) {
+      if ((!config.is_runtime) && (x_dims[i + 2] < 0)) {
+        output_shape.push_back(x_dims[i + 2]);
+      } else {
+        output_shape.push_back(funcs::MaxPoolOutputSize(
+            x_dims[i + 2], kernel_size_[i], paddings_[i], strides[i]));
+      }
+    }
+  }
+
+  out->set_dims(make_ddim(output_shape));
+  out->set_dtype(x.dtype());
+
+  mask->set_dims(make_ddim(output_shape));
+  mask->set_dtype(paddle::experimental::CppTypeToDataType<int>::Type());
+}
+
 void MultinomialInferMeta(const MetaTensor& x,
                           int num_samples,
                           bool replacement,
@@ -342,64 +661,215 @@ void MultinomialInferMeta(const MetaTensor& x,
   out->set_dtype(DataType::INT64);
 }
 
-void ReshapeInferMeta(const MetaTensor& x,
-                      const ScalarArray& shape,
-                      MetaTensor* out,
-                      MetaConfig config) {
-  auto& shape_data = shape.GetData();
-  PADDLE_ENFORCE_NOT_NULL(out,
-                          phi::errors::InvalidArgument(
-                              "Output(Out) of ReshapeOp should not be null."));
-  if (!config.is_runtime && shape.FromTensor()) {
-    out->set_dims(phi::make_ddim(shape_data));
-    out->share_lod(x);
-    return;
+void PadInferMeta(const MetaTensor& input,
+                  const std::vector<int>& paddings,
+                  float pad_value,
+                  MetaTensor* out,
+                  MetaConfig config) {
+  auto x_dim = input.dims();
+  PADDLE_ENFORCE_EQ(
+      static_cast<int>(paddings.size()),
+      x_dim.size() * 2,
+      phi::errors::InvalidArgument(
+          "Size of 'paddings' dimension should be equal to 2 * size of "
+          "Input(X)'s dimension, but received (size of 'paddings' dimension "
+          "is) %d vs (2 * size of Input(X)'s dimension is) %d.",
+          static_cast<int>(paddings.size()),
+          x_dim.size() * 2));
+  for (size_t i = 0; i < paddings.size(); ++i) {
+    PADDLE_ENFORCE_GE(paddings[i],
+                      0,
+                      phi::errors::InvalidArgument(
+                          "The element of 'paddings' should >= 0, but "
+                          "received %d for index %d.",
+                          paddings[i],
+                          static_cast<int>(i)));
   }
-  PADDLE_ENFORCE_GT(shape_data.size(),
-                    0,
+  std::vector<int64_t> out_dims(x_dim.size());
+  for (int i = 0; i < x_dim.size(); ++i) {
+    if ((!config.is_runtime) && (x_dim[i] == -1)) {
+      out_dims[i] = -1;
+    } else {
+      out_dims[i] = x_dim[i] + paddings[i * 2] + paddings[i * 2 + 1];
+    }
+  }
+  out->set_dims(phi::make_ddim(out_dims));
+  if (out_dims[0] == x_dim[0]) {
+    // Only pass LoD when the first dimension is equal between
+    // output and input.
+    out->share_lod(input);
+  }
+  out->set_dtype(input.dtype());
+}
+
+void PixelShuffleInferMeta(const MetaTensor& x,
+                           int upscale_factor,
+                           const std::string& data_format,
+                           MetaTensor* out) {
+  auto input_dims = x.dims();
+  PADDLE_ENFORCE_EQ(input_dims.size(),
+                    4,
                     phi::errors::InvalidArgument(
-                        "The shape's size in ReshapeOp can't be zero."));
-  InferMetaFromVecValue(x, shape_data, out);
+                        "Input should be a 4-D tensor of format [N, C, H, W] "
+                        "or [N, H, W, C], but got %u.",
+                        input_dims.size()));
+
+  const bool channel_last = (data_format == "NHWC");
+
+  if (!channel_last) {
+    PADDLE_ENFORCE_EQ(input_dims[1] % (upscale_factor * upscale_factor),
+                      0,
+                      phi::errors::InvalidArgument(
+                          "The square of upscale_factor[%u] should divide the "
+                          "number of channel[%u]",
+                          upscale_factor * upscale_factor,
+                          input_dims[1]));
+  } else {
+    PADDLE_ENFORCE_EQ(input_dims[3] % (upscale_factor * upscale_factor),
+                      0,
+                      phi::errors::InvalidArgument(
+                          "The square of upscale_factor[%u] should divide the "
+                          "number of channel[%u]",
+                          upscale_factor * upscale_factor,
+                          input_dims[3]));
+  }
+  auto output_dims = input_dims;
+  output_dims[0] = input_dims[0];
+  if (!channel_last) {
+    output_dims[1] = input_dims[1] / (upscale_factor * upscale_factor);
+    output_dims[2] = input_dims[2] * upscale_factor;
+    output_dims[3] = input_dims[3] * upscale_factor;
+  } else {
+    output_dims[1] = input_dims[1] * upscale_factor;
+    output_dims[2] = input_dims[2] * upscale_factor;
+    output_dims[3] = input_dims[3] / (upscale_factor * upscale_factor);
+  }
+  out->set_dtype(x.dtype());
+  out->set_dims(output_dims);
 }
 
-void ReshapeWithXShapeInferMeta(const MetaTensor& x,
-                                const ScalarArray& shape,
-                                MetaTensor* xshape,
-                                MetaTensor* out,
-                                MetaConfig config) {
-  PADDLE_ENFORCE_NOT_NULL(
-      xshape,
-      phi::errors::InvalidArgument(
-          "Output(XShape) of ReshapeOp should not be null."));
-  const auto& x_dims = x.dims();
-  std::vector<int64_t> xshape_dims(x_dims.size() + 1);
-  xshape_dims[0] = 0;
-  for (int i = 0; i < x_dims.size(); ++i) {
-    xshape_dims[i + 1] = x_dims[i];
+void PoolInferMeta(const MetaTensor& x,
+                   const std::vector<int>& kernel_size,
+                   const std::vector<int>& strides,
+                   const std::vector<int>& paddings,
+                   bool ceil_mode,
+                   bool exclusive,
+                   const std::string& data_format,
+                   const std::string& pooling_type,
+                   bool global_pooling,
+                   bool adaptive,
+                   const std::string& padding_algorithm,
+                   MetaTensor* out,
+                   MetaConfig config) {
+  std::vector<int> paddings_ = paddings;
+  std::vector<int> kernel_size_ = kernel_size;
+
+  auto x_dims = x.dims();
+  PADDLE_ENFORCE_EQ(
+      x_dims.size() == 4 || x_dims.size() == 5,
+      true,
+      errors::InvalidArgument(
+          "the input of Op(pool) should be 4-D or 5-D Tensor. But "
+          "received: %u-D Tensor and it's shape is [%s].",
+          x_dims.size(),
+          x_dims));
+
+  PADDLE_ENFORCE_EQ(x_dims.size() - kernel_size_.size(),
+                    2U,
+                    errors::InvalidArgument(
+                        "the dimension of input minus the size of "
+                        "Attr(kernel_size_) must be euqal to 2 in Op(pool). "
+                        "But received: the dimension of input minus the size "
+                        "of Attr(kernel_size_) is %d, the "
+                        "input's dimension is %d, the shape of input "
+                        "is [%s], the Attr(kernel_size_)'s size is %d, the "
+                        "Attr(kernel_size_) is [%s].",
+                        x_dims.size() - kernel_size_.size(),
+                        x_dims.size(),
+                        x_dims,
+                        kernel_size_.size(),
+                        make_ddim(kernel_size_)));
+
+  PADDLE_ENFORCE_EQ(
+      kernel_size_.size(),
+      strides.size(),
+      errors::InvalidArgument(
+          "the size of Attr(kernel_size_) and Attr(strides) in "
+          "Op(pool) must be equal. "
+          "But received: Attr(kernel_size_)'s size is %d, Attr(strides)'s "
+          "size is %d, Attr(kernel_size_) is [%s], Attr(strides)is [%s].",
+          kernel_size_.size(),
+          strides.size(),
+          make_ddim(kernel_size_),
+          make_ddim(strides)));
+
+  // MKL-DNN Kernels are using NCHW order of dims description
+  // so we ignore data_format consideration for MKL-DNN kernel
+  const bool channel_last = (config.is_run_mkldnn_kernel == false) &&
+                            (data_format == "NHWC" || data_format == "NDHWC");
+
+  // update paddings if "SAME" or global_pooling
+  DDim data_dims;
+  if (channel_last) {
+    data_dims = slice_ddim(x_dims, 1, x_dims.size() - 1);
+  } else {
+    data_dims = slice_ddim(x_dims, 2, x_dims.size());
   }
-  xshape->set_dims(phi::make_ddim(xshape_dims));
-  xshape->share_lod(x);
-  ReshapeInferMeta(x, shape, out, config);
+  funcs::UpdatePadding(&paddings_,
+                       global_pooling,
+                       adaptive,
+                       padding_algorithm,
+                       data_dims,
+                       strides,
+                       kernel_size_);
+
+  if (global_pooling) {
+    funcs::UpdateKernelSize(&kernel_size_, data_dims);
+  }
+
+  std::vector<int64_t> output_shape;
+  if (adaptive) {
+    output_shape.insert(
+        output_shape.end(), kernel_size_.begin(), kernel_size_.end());
+  } else {
+    for (int i = 0; i < data_dims.size(); ++i) {
+      if ((!config.is_runtime) && (data_dims[i] < 0)) {
+        output_shape.push_back(data_dims[i]);
+      } else {
+        output_shape.push_back(funcs::PoolOutputSize(data_dims[i],
+                                                     kernel_size_[i],
+                                                     paddings_[2 * i],
+                                                     paddings_[2 * i + 1],
+                                                     strides[i],
+                                                     ceil_mode));
+      }
+    }
+  }
+
+  // output_N = input_N
+  output_shape.insert(output_shape.begin(), x_dims[0]);
+  // output_C = input_C
+  if (channel_last) {
+    output_shape.push_back(x_dims[x_dims.size() - 1]);
+  } else {
+    output_shape.insert(output_shape.begin() + 1, x_dims[1]);
+  }
+
+  out->set_dims(make_ddim(output_shape));
+  out->share_lod(x);
+  out->set_dtype(x.dtype());
 }
 
-/*  Why not use ReduceInferMetaBase directly?
-    Because we need make InferMetaFunction's args follow the design of api.yaml
-*/
-void SumInferMeta(const MetaTensor& x,
-                  const std::vector<int64_t>& axis,
-                  DataType dtype,
-                  bool keep_dim,
-                  MetaTensor* out) {
-  bool reduce_all = false;
-  ReduceInferMetaBase(x, axis, keep_dim, reduce_all, dtype, out);
+void RealAndImagInferMeta(const MetaTensor& x, MetaTensor* out) {
+  out->set_dims(x.dims());
+  out->set_dtype(dtype::ToReal(x.dtype()));
+  out->set_layout(x.layout());
 }
 
-void ReduceInferMetaBase(const MetaTensor& x,
-                         const std::vector<int64_t>& axis,
-                         bool keep_dim,
-                         bool reduce_all,
-                         DataType dtype,
-                         MetaTensor* out) {
+DDim ReduceInferDim(const MetaTensor& x,
+                    const std::vector<int64_t>& axis,
+                    bool keep_dim,
+                    bool reduce_all) {
   auto x_rank = x.dims().size();
 
   std::vector<int64_t> formated_axis = axis;
@@ -462,45 +932,118 @@ void ReduceInferMetaBase(const MetaTensor& x,
   }
   DDim out_dim = phi::make_ddim(out_dim_vector);
 
-  DataType out_dtype;
-  if (dtype != DataType::UNDEFINED) {
-    out_dtype = dtype;
-  } else {
-    if (x.dtype() == DataType::BOOL || x.dtype() == DataType::INT32 ||
-        x.dtype() == DataType::INT64) {
-      out_dtype = DataType::INT64;
-    } else {
-      out_dtype = x.dtype();
-    }
-  }
+  return out_dim;
+}
+
+void ReduceInferMeta(const MetaTensor& x,
+                     const std::vector<int64_t>& axis,
+                     bool keep_dim,
+                     MetaTensor* out) {
+  bool reduce_all = false;
+  ReduceInferMetaBase(x, axis, keep_dim, reduce_all, out);
+}
 
+void ReduceInferMetaBase(const MetaTensor& x,
+                         const std::vector<int64_t>& axis,
+                         bool keep_dim,
+                         bool reduce_all,
+                         MetaTensor* out) {
+  DDim out_dim = ReduceInferDim(x, axis, keep_dim, reduce_all);
   out->set_dims(out_dim);
-  out->set_dtype(out_dtype);
+  out->set_dtype(x.dtype());
   out->set_layout(x.layout());
 }
 
-void MeanRawInferMeta(const MetaTensor& x,
-                      const std::vector<int64_t>& axis,
-                      bool keep_dim,
-                      bool reduce_all,
-                      MetaTensor* out) {
-  ReduceInferMetaBase(x, axis, keep_dim, reduce_all, DataType::UNDEFINED, out);
+void ReshapeInferMeta(const MetaTensor& x,
+                      const ScalarArray& shape,
+                      MetaTensor* out,
+                      MetaConfig config) {
+  auto& shape_data = shape.GetData();
+  PADDLE_ENFORCE_NOT_NULL(out,
+                          phi::errors::InvalidArgument(
+                              "Output(Out) of ReshapeOp should not be null."));
+  if (!config.is_runtime && shape.FromTensor()) {
+    out->set_dims(phi::make_ddim(shape_data));
+    out->share_lod(x);
+    return;
+  }
+  PADDLE_ENFORCE_GT(shape_data.size(),
+                    0,
+                    phi::errors::InvalidArgument(
+                        "The shape's size in ReshapeOp can't be zero."));
+  InferMetaFromVecValue(x, shape_data, out);
+}
+
+void ReshapeWithXShapeInferMeta(const MetaTensor& x,
+                                const ScalarArray& shape,
+                                MetaTensor* xshape,
+                                MetaTensor* out,
+                                MetaConfig config) {
+  PADDLE_ENFORCE_NOT_NULL(
+      xshape,
+      phi::errors::InvalidArgument(
+          "Output(XShape) of ReshapeOp should not be null."));
+  const auto& x_dims = x.dims();
+  std::vector<int64_t> xshape_dims(x_dims.size() + 1);
+  xshape_dims[0] = 0;
+  for (int i = 0; i < x_dims.size(); ++i) {
+    xshape_dims[i + 1] = x_dims[i];
+  }
+  xshape->set_dims(phi::make_ddim(xshape_dims));
+  xshape->share_lod(x);
+  ReshapeInferMeta(x, shape, out, config);
+}
+
+void ShardIndexInferMeta(const MetaTensor& in,
+                         int index_num,
+                         int nshards,
+                         int shard_id,
+                         int ignore_value,
+                         MetaTensor* out,
+                         MetaConfig config) {
+  auto x_dims = in.dims();
+  PADDLE_ENFORCE_GE(
+      x_dims.size(),
+      2,
+      phi::errors::InvalidArgument("Rank of Input(X) should be at least 2, "
+                                   "but the value given is %d.",
+                                   x_dims.size()));
+  if (config.is_runtime || x_dims[x_dims.size() - 1] > 0) {
+    PADDLE_ENFORCE_EQ(x_dims[x_dims.size() - 1],
+                      1U,
+                      phi::errors::InvalidArgument(
+                          "The last dimension of Input(X) should be 1, "
+                          "but the value given is %d.",
+                          x_dims[x_dims.size() - 1]));
+  }
+
+  out->set_dims(x_dims);
+  out->share_lod(in);
+  out->set_dtype(in.dtype());
+}
+
+void SizeInferMeta(const MetaTensor& input, MetaTensor* out) {
+  out->set_dtype(DataType::INT64);
+  out->set_dims({1});
 }
 
-void MeanInferMeta(const MetaTensor& x,
-                   const std::vector<int64_t>& axis,
-                   bool keep_dim,
-                   MetaTensor* out) {
-  bool reduce_all = false;
-  ReduceInferMetaBase(x, axis, keep_dim, reduce_all, DataType::UNDEFINED, out);
-}
+void SoftmaxInferMeta(const MetaTensor& x, int axis, MetaTensor* out) {
+  auto dim_x = x.dims();
+  auto rank_x = dim_x.size();
+  PADDLE_ENFORCE_GE(axis,
+                    -rank_x,
+                    phi::errors::InvalidArgument(
+                        "Attr(axis) value should be in range [-R, R-1], "
+                        "R is the rank of Input(X)."));
+  PADDLE_ENFORCE_LT(axis,
+                    rank_x,
+                    phi::errors::InvalidArgument(
+                        "Attr(axis) value should be in range [-R, R-1], "
+                        "R is the rank of Input(X)."));
 
-void TransferLayoutInferMeta(const MetaTensor& x,
-                             DataLayout layout,
-                             MetaTensor* out) {
   out->set_dims(x.dims());
   out->set_dtype(x.dtype());
-  out->set_layout(layout);
+  out->share_lod(x);
 }
 
 void SplitInferMeta(const MetaTensor& x,
@@ -508,17 +1051,6 @@ void SplitInferMeta(const MetaTensor& x,
                     const Scalar& axis,
                     std::vector<MetaTensor*> out,
                     MetaConfig config) {
-  if (!config.is_runtime) {
-    if (axis.FromTensor() || num_or_sections.FromTensor()) {
-      auto out_dims = phi::make_ddim(std::vector<int>(x.dims().size(), -1));
-      for (auto* item : out) {
-        item->set_dims(out_dims);
-        item->share_lod(x);
-      }
-      return;
-    }
-  }
-
   int axis_value = axis.to<int>();
   int rank = x.dims().size();
   PADDLE_ENFORCE_EQ(
@@ -533,34 +1065,27 @@ void SplitInferMeta(const MetaTensor& x,
     axis_value = axis_value + rank;
   }
 
-  std::vector<phi::DDim> out_dims(out.size(), x.dims());
-
   auto input_axis_dim = x.dims().at(axis_value);
   auto num_or_sections_data = num_or_sections.GetData();
+  // step1: get formated sections
+  std::vector<int64_t> sections;
   // num_or_sections is a number
   if (num_or_sections_data.size() == 1) {
-    if (config.is_runtime || input_axis_dim > 0) {
-      int num = num_or_sections_data.at(0);
-      PADDLE_ENFORCE_EQ(
-          input_axis_dim % num,
-          0,
-          phi::errors::InvalidArgument(
-              "The input's size along the split dimension "
-              "must be evenly divisible by Attr(num_or_sections). "
-              "But received Attr(num_or_sections) "
-              "= %d, input(X)'s shape = [%s], Attr(dim) = %d.",
-              num,
-              x.dims(),
-              axis_value));
+    int num = num_or_sections_data.at(0);
 
-      size_t out_axis_dim = input_axis_dim / num;
-      for (auto& out_dim : out_dims) {
-        out_dim[axis_value] = out_axis_dim;
-      }
-    } else {
-      for (auto& out_dim : out_dims) {
-        out_dim[axis_value] = -1;
-      }
+    PADDLE_ENFORCE_EQ(input_axis_dim % num,
+                      0,
+                      phi::errors::InvalidArgument(
+                          "The input's size along the split dimension "
+                          "must be evenly divisible by Attr(num_or_sections). "
+                          "But received Attr(num_or_sections) "
+                          "= %d, input(X)'s shape = [%s], Attr(dim) = %d.",
+                          num,
+                          x.dims(),
+                          axis_value));
+
+    for (int i = 0; i < num; ++i) {
+      sections.push_back(input_axis_dim / num);
     }
   } else {
     // num_or_sections is a sections
@@ -568,9 +1093,10 @@ void SplitInferMeta(const MetaTensor& x,
     int unknow_dim_idx = -1;
     int num_of_unknow = 0;
     int sum_of_section = 0;
-    std::vector<int64_t> sections = num_or_sections_data;
 
     for (size_t i = 0; i < num_or_sections_data.size(); ++i) {
+      sections.push_back(num_or_sections_data[i]);
+
       if (num_or_sections_data[i] == unknow_dim_val) {
         num_of_unknow++;
         unknow_dim_idx = i;
@@ -622,42 +1148,137 @@ void SplitInferMeta(const MetaTensor& x,
               x.dims(),
               axis_value));
     }
-    for (size_t i = 0; i < out_dims.size(); ++i) {
+  }
+
+  // setp2: fill out dims
+  std::vector<phi::DDim> out_dims(sections.size(), x.dims());
+  if (config.is_runtime || input_axis_dim > 0) {
+    for (size_t i = 0; i < sections.size(); ++i) {
       out_dims[i][axis_value] = sections[i];
     }
+  } else {
+    for (size_t i = 0; i < sections.size(); ++i) {
+      out_dims[i][axis_value] = -1;
+    }
   }
 
-  for (size_t i = 0; i < out.size(); ++i) {
+  for (size_t i = 0; i < sections.size(); ++i) {
     if (axis_value != 0) {
       // Only pass LoD when not spliting along the first dim.
-      out.at(i)->set_dtype(x.dtype());
-      out.at(i)->set_dims(out_dims[i]);
-      out.at(i)->set_layout(x.layout());
+      out[i]->set_dtype(x.dtype());
+      out[i]->set_dims(out_dims[i]);
+      out[i]->set_layout(x.layout());
     } else {
-      out.at(i)->set_dtype(x.dtype());
-      out.at(i)->set_dims(out_dims[i]);
-      out.at(i)->set_layout(x.layout());
-      out.at(i)->share_lod(x);
+      out[i]->set_dtype(x.dtype());
+      out[i]->set_dims(out_dims[i]);
+      out[i]->set_layout(x.layout());
+      out[i]->share_lod(x);
     }
   }
 }
 
-void UnbindInferMeta(const MetaTensor& x,
-                     int axis,
-                     std::vector<MetaTensor>* outs) {
-  auto in_dims = x.dims();
-  std::vector<int> out_dim;
-  axis = axis < 0 ? in_dims.size() + axis : axis;
-  for (int i = 0; i < in_dims.size(); ++i) {
-    if (i != axis) out_dim.push_back(in_dims[i]);
+/*  Why not use SumRawInferMeta directly?
+    Because we need make InferMetaFunction's args follow the design of api.yaml
+*/
+void SumInferMeta(const MetaTensor& x,
+                  const std::vector<int64_t>& axis,
+                  DataType dtype,
+                  bool keep_dim,
+                  MetaTensor* out) {
+  bool reduce_all = false;
+  SumRawInferMeta(x, axis, keep_dim, reduce_all, dtype, out);
+}
+
+void SumRawInferMeta(const MetaTensor& x,
+                     const std::vector<int64_t>& axis,
+                     bool keep_dim,
+                     bool reduce_all,
+                     DataType dtype,
+                     MetaTensor* out) {
+  DDim out_dim = ReduceInferDim(x, axis, keep_dim, reduce_all);
+
+  DataType out_dtype;
+  if (dtype != DataType::UNDEFINED) {
+    out_dtype = dtype;
+  } else {
+    if (x.dtype() == DataType::BOOL || x.dtype() == DataType::INT32 ||
+        x.dtype() == DataType::INT64) {
+      out_dtype = DataType::INT64;
+    } else {
+      out_dtype = x.dtype();
+    }
   }
-  auto out_dims = phi::make_ddim(out_dim);
 
-  for (size_t i = 0; i < outs->size(); ++i) {
-    (*outs)[i].set_dtype(x.dtype());
-    (*outs)[i].set_dims(out_dims);
-    (*outs)[i].set_layout(x.layout());
-    (*outs)[i].share_lod(x);
+  out->set_dims(out_dim);
+  out->set_dtype(out_dtype);
+  out->set_layout(x.layout());
+}
+
+void TileInferMeta(const MetaTensor& x,
+                   const ScalarArray& repeat_times,
+                   MetaTensor* out,
+                   MetaConfig config) {
+#define MAX_RANK_SUPPORTED 6
+
+  auto repeat_times_data = repeat_times.GetData();
+  auto x_dims = x.dims();
+  if (repeat_times_data.size() == 0) {
+    repeat_times_data = std::vector<int64_t>(x_dims.size(), -1);
+  }
+
+  PADDLE_ENFORCE_LE(
+      x_dims.size(),
+      MAX_RANK_SUPPORTED,
+      errors::InvalidArgument(
+          "The rank of the input 'x' for tile op "
+          "must not be greater than %d, but the value received is %d.",
+          MAX_RANK_SUPPORTED,
+          x_dims.size()));
+  PADDLE_ENFORCE_LE(
+      repeat_times_data.size(),
+      MAX_RANK_SUPPORTED,
+      errors::InvalidArgument(
+          "The size of the shape of input 'repeat_times' for tile op "
+          "must not be greater than %d, but the value received is %d.",
+          MAX_RANK_SUPPORTED,
+          repeat_times_data.size()));
+  PADDLE_ENFORCE_GE(
+      repeat_times_data.size(),
+      1,
+      errors::InvalidArgument(
+          "The size of the shape of input 'repeat_times' for tile op "
+          "must be positive integers, but the value received is %d.",
+          repeat_times_data.size()));
+
+  auto out_rank =
+      std::max(static_cast<size_t>(x_dims.size()), repeat_times_data.size());
+  std::vector<int64_t> out_shape(out_rank);
+  auto x_dim_vec = phi::vectorize<int>(x_dims);
+  if (x_dim_vec.size() > repeat_times_data.size()) {
+    auto diff = x_dim_vec.size() - repeat_times_data.size();
+    repeat_times_data.insert(repeat_times_data.begin(), diff, -1);
+  } else {
+    auto diff = repeat_times_data.size() - x_dim_vec.size();
+    x_dim_vec.insert(x_dim_vec.begin(), diff, -1);
+  }
+  for (size_t i = 0; i < repeat_times_data.size(); ++i) {
+    if (x_dim_vec[i] == -1 || repeat_times_data[i] == -1) {
+      out_shape[i] = -1;
+    } else {
+      PADDLE_ENFORCE_GT(
+          repeat_times_data[i],
+          0,
+          errors::InvalidArgument(
+              "Every element of the input 'repeat_times' for tile op must be "
+              "greater than 0, but the value given is %d.",
+              repeat_times_data[i]));
+      out_shape[i] = x_dim_vec[i] * repeat_times_data[i];
+    }
+  }
+
+  out->set_dims(phi::make_ddim(out_shape));
+  if (out_shape[0] == x_dims[0]) {
+    out->share_lod(x);
   }
 }
 
@@ -712,6 +1333,115 @@ void TraceInferMeta(
     sizes.erase(sizes.begin() + std::min(dim1_, dim2_));
   }
   out->set_dims(phi::make_ddim(sizes));
+  out->set_dtype(x.dtype());
+}
+
+void TransferLayoutInferMeta(const MetaTensor& x,
+                             DataLayout layout,
+                             MetaTensor* out) {
+  out->set_dims(x.dims());
+  out->set_dtype(x.dtype());
+  out->set_layout(layout);
+}
+
+void TransposeInferMeta(const MetaTensor& x,
+                        const std::vector<int>& axis,
+                        MetaTensor* out) {
+  auto x_dims = x.dims();
+  size_t x_rank = x_dims.size();
+  size_t axis_size = axis.size();
+
+  PADDLE_ENFORCE_EQ(
+      x_rank,
+      axis_size,
+      errors::InvalidArgument("The input tensor's dimension "
+                              "should be equal to the axis's size. "
+                              "But received input tensor's dimension is %d, "
+                              "axis's size is %d",
+                              x_rank,
+                              axis_size));
+
+  std::vector<int> count(axis_size, 0);
+  for (size_t i = 0; i < axis_size; i++) {
+    PADDLE_ENFORCE_GE(
+        axis[i],
+        0,
+        errors::InvalidArgument("The axis should be greater than or equal to 0."
+                                "But received %d of axis[%d]",
+                                axis[i],
+                                i));
+
+    PADDLE_ENFORCE_EQ(
+        axis[i] < static_cast<int>(axis_size) && ++count[axis[i]] == 1,
+        true,
+        errors::InvalidArgument(
+            "Each element of Attribute axis should "
+            "be a unique value range from 0 to (dims - 1), "
+            "where the dims is the axis's size, "
+            "unique value means this axis value can appear only once. "
+            "But received axis[%d] is %d, axis_size is %d, "
+            "count[axis[%d]] is %d",
+            i,
+            axis[i],
+            axis_size,
+            i,
+            count[axis[i]]));
+  }
+
+  phi::DDim out_dims(x_dims);
+  for (size_t i = 0; i < axis_size; ++i) {
+    out_dims[i] = x_dims[axis[i]];
+  }
+
+  out->set_dims(out_dims);
+  out->set_dtype(x.dtype());
+}
+
+void UnbindInferMeta(const MetaTensor& x,
+                     int axis,
+                     std::vector<MetaTensor>* outs) {
+  auto in_dims = x.dims();
+  std::vector<int> out_dim;
+  axis = axis < 0 ? in_dims.size() + axis : axis;
+  for (int i = 0; i < in_dims.size(); ++i) {
+    if (i != axis) out_dim.push_back(in_dims[i]);
+  }
+  auto out_dims = phi::make_ddim(out_dim);
+
+  for (size_t i = 0; i < outs->size(); ++i) {
+    (*outs)[i].set_dtype(x.dtype());
+    (*outs)[i].set_dims(out_dims);
+    (*outs)[i].set_layout(x.layout());
+    (*outs)[i].share_lod(x);
+  }
+}
+
+void UnchangedInferMeta(const MetaTensor& x, MetaTensor* out) {
+  out->share_meta(x);
+}
+
+// meta x -> out without change, check if axis in range [-Rank(x), Rank(x)-1]
+void UnchangedInferMetaCheckAxis(const MetaTensor& x,
+                                 int axis,
+                                 MetaTensor* out) {
+  auto rank = x.dims().size();
+  PADDLE_ENFORCE_GE(
+      axis,
+      -rank,
+      errors::InvalidArgument(
+          "Attr(axis) value should be in range [-R, R-1], "
+          "R is the rank of Input(X). But received axis: %d, R: %d.",
+          axis,
+          rank));
+  PADDLE_ENFORCE_LT(
+      axis,
+      rank,
+      phi::errors::InvalidArgument(
+          "Attr(axis) value should be in range [-R, R-1], "
+          "R is the rank of Input(X). But received axis: %d, R: %d.",
+          axis,
+          rank));
+  out->share_meta(x);
 }
 
 void UnfoldInferMeta(const MetaTensor& x,
@@ -872,94 +1602,52 @@ void UnfoldInferMeta(const MetaTensor& x,
   out->set_dims(phi::make_ddim(out_dims));
 }
 
-void DiagInferMeta(const MetaTensor& x,
-                   int offset,
-                   float padding_value,
-                   MetaTensor* out) {
+void OneHotRawInferMeta(const MetaTensor& x,
+                        int32_t depth,
+                        DataType dtype,
+                        bool allow_out_of_range,
+                        MetaTensor* out) {
   auto x_dims = x.dims();
+  PADDLE_ENFORCE_GE(
+      x_dims.size(),
+      1,
+      phi::errors::InvalidArgument("Rank of Input(X) should be at least 1."));
 
-  if (x_dims.size() == 1UL) {
-    int64_t size_ = x_dims[0] + std::abs(offset);
-    out->set_dims({size_, size_});
-    out->set_dtype(x.dtype());
-  } else if (x_dims.size() == 2UL) {
-    int64_t size_ = 0;
-    if (offset >= 0) {
-      // Note(LutaoChu): Do not use std::min here, otherwise the calculation
-      // of `size_` will have unexpected result on Windows Python3.8
-      if (x_dims[0] < x_dims[1] - offset) {
-        size_ = x_dims[0];
-      } else {
-        size_ = x_dims[1] - offset;
-      }
-    } else {
-      // Note(LutaoChu): Do not use std::min here, otherwise the calculation
-      // of `size_` will have unexpected result on Windows Python3.8
-      if (x_dims[0] + offset < x_dims[1]) {
-        size_ = x_dims[0] + offset;
-      } else {
-        size_ = x_dims[1];
-      }
-    }
-    out->set_dims({size_});
-    out->set_dtype(x.dtype());
-  } else {
-    PADDLE_THROW(phi::errors::InvalidArgument(
-        "The input tensor X's dimensions of DiagV2Op should be either 1 or "
-        "2, but received %d.",
-        x_dims.size()));
-  }
-}
-
-void SizeInferMeta(const MetaTensor& input, MetaTensor* out) {
-  out->set_dtype(DataType::INT64);
-  out->set_dims({1});
+  auto out_dims_vec = phi::vectorize(x_dims);
+  out_dims_vec.push_back(depth);
+  auto out_dims = phi::make_ddim(out_dims_vec);
+  out->set_dims(out_dims);
+  out->share_lod(x);
+  out->set_dtype(dtype);
 }
 
-void PixelShuffleInferMeta(const MetaTensor& x,
-                           int upscale_factor,
-                           const std::string& data_format,
-                           MetaTensor* out) {
-  auto input_dims = x.dims();
-  PADDLE_ENFORCE_EQ(input_dims.size(),
-                    4,
-                    phi::errors::InvalidArgument(
-                        "Input should be a 4-D tensor of format [N, C, H, W] "
-                        "or [N, H, W, C], but got %u.",
-                        input_dims.size()));
+void OneHotInferMeta(const MetaTensor& x,
+                     const Scalar& depth_t,
+                     MetaTensor* out) {
+  auto x_dims = x.dims();
+  PADDLE_ENFORCE_GE(
+      x_dims.size(),
+      1,
+      phi::errors::InvalidArgument("Rank of Input(X) should be at least 1."));
 
-  const bool channel_last = (data_format == "NHWC");
+  int depth = depth_t.to<int>();
+  auto out_dims_vec = phi::vectorize(x_dims);
+  out_dims_vec.push_back(depth);
+  auto out_dims = phi::make_ddim(out_dims_vec);
+  out->set_dims(out_dims);
+  out->share_lod(x);
+  out->set_dtype(phi::DataType::FLOAT32);
+}
 
-  if (!channel_last) {
-    PADDLE_ENFORCE_EQ(input_dims[1] % (upscale_factor * upscale_factor),
-                      0,
-                      phi::errors::InvalidArgument(
-                          "The square of upscale_factor[%u] should divide the "
-                          "number of channel[%u]",
-                          upscale_factor * upscale_factor,
-                          input_dims[1]));
-  } else {
-    PADDLE_ENFORCE_EQ(input_dims[3] % (upscale_factor * upscale_factor),
-                      0,
-                      phi::errors::InvalidArgument(
-                          "The square of upscale_factor[%u] should divide the "
-                          "number of channel[%u]",
-                          upscale_factor * upscale_factor,
-                          input_dims[3]));
-  }
-  auto output_dims = input_dims;
-  output_dims[0] = input_dims[0];
-  if (!channel_last) {
-    output_dims[1] = input_dims[1] / (upscale_factor * upscale_factor);
-    output_dims[2] = input_dims[2] * upscale_factor;
-    output_dims[3] = input_dims[3] * upscale_factor;
-  } else {
-    output_dims[1] = input_dims[1] * upscale_factor;
-    output_dims[2] = input_dims[2] * upscale_factor;
-    output_dims[3] = input_dims[3] / (upscale_factor * upscale_factor);
-  }
-  out->set_dtype(x.dtype());
-  out->set_dims(output_dims);
+void WhereIndexInferMeta(const MetaTensor& condition, MetaTensor* out) {
+  auto rank = condition.dims().size();
+  PADDLE_ENFORCE_GE(
+      rank,
+      1UL,
+      phi::errors::InvalidArgument(
+          "Input(Condition) should have number of dimension at least 1"));
+  out->set_dims(phi::make_ddim({-1, rank}));
+  out->set_dtype(DataType::INT64);
 }
 
 }  // namespace phi
diff --git a/paddle/phi/infermeta/unary.h b/paddle/phi/infermeta/unary.h
index 3c0628981af7c92ff60e8199131b682e4f0f557e..75fb9fadf82dc87ac18814e0674e5012fea95ec4 100644
--- a/paddle/phi/infermeta/unary.h
+++ b/paddle/phi/infermeta/unary.h
@@ -32,14 +32,50 @@ class MetaConfig;
 // Because functions in this file not only can infer shape, but also need
 // infer lod or other useful data.
 
-void UnchangedInferMeta(const MetaTensor& x, MetaTensor* out);
+void ArgMinMaxInferMeta(const MetaTensor& x,
+                        int64_t axis,
+                        bool keepdims,
+                        bool flatten,
+                        int dtype,
+                        MetaTensor* out,
+                        MetaConfig config = MetaConfig());
+
+void ArgsortInferMeta(const MetaTensor& input,
+                      int axis,
+                      bool descending,
+                      MetaTensor* output,
+                      MetaTensor* indices);
 
-// meta x -> out without change, check if axis in range [-Rank(x), Rank(x)-1]
-void UnchangedInferMetaCheckAxis(const MetaTensor& x,
-                                 int axis,
-                                 MetaTensor* out);
+void CastInferMeta(const MetaTensor& x, DataType out_dtype, MetaTensor* out);
 
-void RealAndImagInferMeta(const MetaTensor& x, MetaTensor* out);
+void CholeskyInferMeta(const MetaTensor& x, bool upper, MetaTensor* out);
+
+void CopyToInferMeta(const MetaTensor& x,
+                     Backend backend,
+                     bool blocking,
+                     MetaTensor* out);
+
+void CreateLikeInferMeta(const MetaTensor& x, DataType dtype, MetaTensor* out);
+
+void CumsumInferMeta(const MetaTensor& x,
+                     int axis,
+                     bool flatten,
+                     bool exclusive,
+                     bool reverse,
+                     MetaTensor* out);
+
+void DiagInferMeta(const MetaTensor& x,
+                   int offset,
+                   float padding_value,
+                   MetaTensor* out);
+
+void DiagonalInferMeta(
+    const MetaTensor& input, int offset, int axis1, int axis2, MetaTensor* out);
+
+void EighInferMeta(const MetaTensor& x,
+                   const std::string& uplo,
+                   MetaTensor* out_w,
+                   MetaTensor* out_v);
 
 void FlattenInferMeta(const MetaTensor& x,
                       int start_axis,
@@ -52,28 +88,69 @@ void GumbelSoftmaxInferMeta(const MetaTensor& x,
                             int axis,
                             MetaTensor* out);
 
-void CastInferMeta(const MetaTensor& x, DataType out_dtype, MetaTensor* out);
-
-void CholeskyInferMeta(const MetaTensor& x, bool upper, MetaTensor* out);
-
-void CopyToInferMeta(const MetaTensor& x,
-                     Backend backend,
-                     bool blocking,
-                     MetaTensor* out);
-
-void CreateLikeInferMeta(const MetaTensor& x, DataType dtype, MetaTensor* out);
-
 void IncrementInferMeta(const MetaTensor& x, float value, MetaTensor* out);
 
 void InferMetaFromVecValue(const MetaTensor& x,
                            const std::vector<int64_t>& shape,
                            MetaTensor* out);
 
+void IsEmptyInferMeta(const MetaTensor& x, MetaTensor* out);
+
+void IsfiniteInferMeta(const MetaTensor& input, MetaTensor* out);
+
+void MaxPoolWithIndexInferMeta(const MetaTensor& x,
+                               const std::vector<int>& kernel_size,
+                               const std::vector<int>& strides,
+                               const std::vector<int>& paddings,
+                               bool global_pooling,
+                               bool adaptive,
+                               MetaTensor* out,
+                               MetaTensor* mask,
+                               MetaConfig config = MetaConfig());
+
 void MultinomialInferMeta(const MetaTensor& x,
                           int num_samples,
                           bool replacement,
                           MetaTensor* out);
 
+void PadInferMeta(const MetaTensor& input,
+                  const std::vector<int>& paddings,
+                  float pad_value,
+                  MetaTensor* out,
+                  MetaConfig config = MetaConfig());
+
+void PixelShuffleInferMeta(const MetaTensor& x,
+                           int upscale_factor,
+                           const std::string& data_format,
+                           MetaTensor* out);
+
+void PoolInferMeta(const MetaTensor& x,
+                   const std::vector<int>& kernel_size,
+                   const std::vector<int>& strides,
+                   const std::vector<int>& paddings,
+                   bool ceil_mode,
+                   bool exclusive,
+                   const std::string& data_format,
+                   const std::string& pooling_type,
+                   bool global_pooling,
+                   bool adaptive,
+                   const std::string& padding_algorithm,
+                   MetaTensor* out,
+                   MetaConfig config = MetaConfig());
+
+void RealAndImagInferMeta(const MetaTensor& x, MetaTensor* out);
+
+void ReduceInferMeta(const MetaTensor& x,
+                     const std::vector<int64_t>& axis,
+                     bool keep_dim,
+                     MetaTensor* out);
+
+void ReduceInferMetaBase(const MetaTensor& x,
+                         const std::vector<int64_t>& axis,
+                         bool keep_dim,
+                         bool reduce_all,
+                         MetaTensor* out);
+
 void ReshapeInferMeta(const MetaTensor& x,
                       const ScalarArray& shape,
                       MetaTensor* out,
@@ -85,23 +162,23 @@ void ReshapeWithXShapeInferMeta(const MetaTensor& x,
                                 MetaTensor* out,
                                 MetaConfig config = MetaConfig());
 
-void ReduceInferMetaBase(const MetaTensor& x,
-                         const std::vector<int64_t>& axis,
-                         bool keep_dim,
-                         bool reduce_all,
-                         DataType dtype,
-                         MetaTensor* out);
+void ShardIndexInferMeta(const MetaTensor& in,
+                         int index_num,
+                         int nshards,
+                         int shard_id,
+                         int ignore_value,
+                         MetaTensor* out,
+                         MetaConfig config = MetaConfig());
 
-void MeanRawInferMeta(const MetaTensor& x,
-                      const std::vector<int64_t>& axis,
-                      bool keep_dim,
-                      bool reduce_all,
-                      MetaTensor* out);
+void SizeInferMeta(const MetaTensor& input, MetaTensor* out);
 
-void MeanInferMeta(const MetaTensor& x,
-                   const std::vector<int64_t>& axis,
-                   bool keep_dim,
-                   MetaTensor* out);
+void SoftmaxInferMeta(const MetaTensor& x, int axis, MetaTensor* out);
+
+void SplitInferMeta(const MetaTensor& x_meta,
+                    const ScalarArray& num_or_sections,
+                    const Scalar& axis,
+                    std::vector<MetaTensor*> out,
+                    MetaConfig config = MetaConfig());
 
 void SumInferMeta(const MetaTensor& x,
                   const std::vector<int64_t>& axis,
@@ -109,21 +186,39 @@ void SumInferMeta(const MetaTensor& x,
                   bool keep_dim,
                   MetaTensor* out);
 
+void SumRawInferMeta(const MetaTensor& x,
+                     const std::vector<int64_t>& axis,
+                     bool keep_dim,
+                     bool reduce_all,
+                     DataType dtype,
+                     MetaTensor* out);
+
+void TileInferMeta(const MetaTensor& x,
+                   const ScalarArray& repeat_times,
+                   MetaTensor* out,
+                   MetaConfig config = MetaConfig());
+
+void TraceInferMeta(
+    const MetaTensor& x, int offset, int axis1, int axis2, MetaTensor* out);
+
 void TransferLayoutInferMeta(const MetaTensor& x,
                              DataLayout layout,
                              MetaTensor* out);
 
-void SplitInferMeta(const MetaTensor& x_meta,
-                    const ScalarArray& num_or_sections,
-                    const Scalar& axis,
-                    std::vector<MetaTensor*> out,
-                    MetaConfig config = MetaConfig());
+void TransposeInferMeta(const MetaTensor& x,
+                        const std::vector<int>& axis,
+                        MetaTensor* out);
 
 void UnbindInferMeta(const MetaTensor& x,
                      int axis,
                      std::vector<MetaTensor>* outs);
-void TraceInferMeta(
-    const MetaTensor& x, int offset, int axis1, int axis2, MetaTensor* out);
+
+void UnchangedInferMeta(const MetaTensor& x, MetaTensor* out);
+
+// meta x -> out without change, check if axis in range [-Rank(x), Rank(x)-1]
+void UnchangedInferMetaCheckAxis(const MetaTensor& x,
+                                 int axis,
+                                 MetaTensor* out);
 
 void UnfoldInferMeta(const MetaTensor& x,
                      const std::vector<int>& kernel_sizes,
@@ -133,16 +228,14 @@ void UnfoldInferMeta(const MetaTensor& x,
                      MetaTensor* out,
                      MetaConfig config = MetaConfig());
 
-void DiagInferMeta(const MetaTensor& x,
-                   int offset,
-                   float padding_value,
-                   MetaTensor* out);
+void OneHotRawInferMeta(const MetaTensor& x,
+                        int32_t depth,
+                        DataType dtype,
+                        bool allow_out_of_range,
+                        MetaTensor* out);
 
-void SizeInferMeta(const MetaTensor& input, MetaTensor* out);
+void OneHotInferMeta(const MetaTensor& x, const Scalar& depth, MetaTensor* out);
 
-void PixelShuffleInferMeta(const MetaTensor& x,
-                           int upscale_factor,
-                           const std::string& data_format,
-                           MetaTensor* out);
+void WhereIndexInferMeta(const MetaTensor& condition, MetaTensor* out);
 
 }  // namespace phi
diff --git a/paddle/phi/kernels/CMakeLists.txt b/paddle/phi/kernels/CMakeLists.txt
index ef51d6daf6a0052f39c2cf6253c208412cbb6904..d443b7bb2a09225e78a7001374821114c59a1557 100644
--- a/paddle/phi/kernels/CMakeLists.txt
+++ b/paddle/phi/kernels/CMakeLists.txt
@@ -9,22 +9,46 @@ add_subdirectory(funcs)
 # phi depends all phi kernel targets
 set_property(GLOBAL PROPERTY PHI_KERNELS "")
 
-set(COMMON_KERNEL_DEPS dense_tensor sparse_coo_tensor sparse_csr_tensor kernel_context kernel_factory arg_map_context convert_utils lod_utils)
-set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} eigen_function blas math_function im2col concat_and_split_functor softmax)
+# [ 1. Common kernel compilation dependencies ]
+set(COMMON_KERNEL_DEPS dense_tensor sparse_coo_tensor sparse_csr_tensor kernel_context kernel_factory arg_map_context convert_utils lod_utils custom_kernel)
+set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} eigen_function blas math_function im2col vol2col concat_and_split_functor)
 # remove this dep after removing fluid deps on tensor creation
 set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} phi_api_utils)
 set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} infermeta)
 
-# NOTE: Some kernels depend on some targets that are not commonly used.
+# [ 2. Kernels that most kernels depend on ]
+# There are a few kernels that are very basic operations, and most of the
+# kernels depend on these kernels.
+set(COMMON_BAISC_KERNELS empty_kernel full_kernel)
+kernel_library(empty_kernel DEPS ${COMMON_KERNEL_DEPS})
+kernel_library(full_kernel DEPS ${COMMON_KERNEL_DEPS} empty_kernel)
+
+# [ 3. Kernels with special dependencies ]
+# Some kernels depend on some targets that are not commonly used.
 # These targets are not suitable for common dependencies.
 # In this case, you need to manually generate them here.
-set(MANUAL_BUILD_KERNELS math_kernel softmax_kernel softmax_grad_kernel)
+set(MANUAL_BUILD_KERNELS eigh_kernel gumbel_softmax_kernel gumbel_softmax_grad_kernel math_kernel matrix_power_kernel matrix_power_grad_kernel maxout_kernel maxout_grad_kernel pool_kernel put_along_axis_kernel put_along_axis_grad_kernel segment_pool_kernel segment_pool_grad_kernel softmax_kernel softmax_grad_kernel take_along_axis_kernel take_along_axis_grad_kernel triangular_solve_grad_kernel)
+kernel_library(eigh_kernel DEPS ${COMMON_KERNEL_DEPS} lapack_function)
+kernel_library(gumbel_softmax_kernel DEPS ${COMMON_KERNEL_DEPS} softmax)
+kernel_library(gumbel_softmax_grad_kernel DEPS ${COMMON_KERNEL_DEPS} softmax)
 kernel_library(math_kernel DEPS ${COMMON_KERNEL_DEPS} cast_kernel copy_kernel)
+kernel_library(matrix_power_kernel DEPS ${COMMON_KERNEL_DEPS} matrix_inverse)
+kernel_library(matrix_power_grad_kernel DEPS ${COMMON_KERNEL_DEPS} matrix_inverse)
+kernel_library(maxout_kernel DEPS ${COMMON_KERNEL_DEPS} maxouting)
+kernel_library(maxout_grad_kernel DEPS ${COMMON_KERNEL_DEPS} maxouting)
+kernel_library(pool_kernel DEPS ${COMMON_KERNEL_DEPS} pooling)
+kernel_library(put_along_axis_kernel DEPS ${COMMON_KERNEL_DEPS} gather_scatter_kernel)
+kernel_library(put_along_axis_grad_kernel DEPS ${COMMON_KERNEL_DEPS} gather_scatter_kernel)
+kernel_library(segment_pool_kernel DEPS ${COMMON_KERNEL_DEPS} segment_pooling)
+kernel_library(segment_pool_grad_kernel DEPS ${COMMON_KERNEL_DEPS} segment_pooling)
 kernel_library(softmax_kernel DEPS ${COMMON_KERNEL_DEPS} softmax)
 kernel_library(softmax_grad_kernel DEPS ${COMMON_KERNEL_DEPS} softmax)
+kernel_library(take_along_axis_kernel DEPS ${COMMON_KERNEL_DEPS} gather_scatter_kernel)
+kernel_library(take_along_axis_grad_kernel DEPS ${COMMON_KERNEL_DEPS} gather_scatter_kernel)
+kernel_library(triangular_solve_grad_kernel DEPS ${COMMON_KERNEL_DEPS} matrix_reduce)
 
-# auto parse and build kernel targets by cmake
-register_kernels(EXCLUDES ${MANUAL_BUILD_KERNELS} DEPS ${COMMON_KERNEL_DEPS})
+# 4. auto parse and build kernel targets by cmake
+register_kernels(EXCLUDES ${COMMON_BAISC_KERNELS} ${MANUAL_BUILD_KERNELS} DEPS ${COMMON_KERNEL_DEPS} ${COMMON_BAISC_KERNELS} )
 
 # phi sparse kernels
 add_subdirectory(sparse)
diff --git a/paddle/phi/kernels/accuracy_kernel.h b/paddle/phi/kernels/accuracy_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..8f2dbb96f86544882c3218a937225dd27978c15f
--- /dev/null
+++ b/paddle/phi/kernels/accuracy_kernel.h
@@ -0,0 +1,30 @@
+
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void AccuracyRawKernel(const Context& dev_ctx,
+                       const DenseTensor& out,
+                       const DenseTensor& indices,
+                       const DenseTensor& label,
+                       DenseTensor* accuracy,
+                       DenseTensor* correct,
+                       DenseTensor* total);
+}  // namespace phi
diff --git a/paddle/phi/kernels/activation_grad_kernel.h b/paddle/phi/kernels/activation_grad_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..a5b737b28c23ba97988915f00cbf447d2e1b1c22
--- /dev/null
+++ b/paddle/phi/kernels/activation_grad_kernel.h
@@ -0,0 +1,104 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/infermeta/unary.h"
+
+namespace phi {
+
+#define DECLARE_ACTIVATION_GRAD_KERNEL_DepX(name) \
+  template <typename T, typename Context>         \
+  void name##GradKernel(const Context& dev_ctx,   \
+                        const DenseTensor& x,     \
+                        const DenseTensor& dout,  \
+                        DenseTensor* dx);
+
+#define DECLARE_ACTIVATION_GRAD_KERNEL_DepOut(name) \
+  template <typename T, typename Context>           \
+  void name##GradKernel(const Context& dev_ctx,     \
+                        const DenseTensor& out,     \
+                        const DenseTensor& dout,    \
+                        DenseTensor* dx);
+
+template <typename T, typename Context>
+void ReluDoubleGradKernel(const Context& dev_ctx,
+                          const DenseTensor& out,
+                          const DenseTensor& ddx,
+                          DenseTensor* ddout);
+
+template <typename T, typename Context>
+void TanhDoubleGradKernel(const Context& dev_ctx,
+                          const DenseTensor& out,
+                          const DenseTensor& ddx,
+                          const DenseTensor& dout,
+                          DenseTensor* dout_new,
+                          DenseTensor* ddout);
+
+template <typename T, typename Context>
+void TanhTripleGradKernel(const Context& dev_ctx,
+                          const DenseTensor& out,
+                          const DenseTensor& ddx,
+                          const DenseTensor& dout,
+                          const DenseTensor& d_ddout,
+                          const DenseTensor& d_dout_new,
+                          DenseTensor* d_out_new,
+                          DenseTensor* d_dout,
+                          DenseTensor* d_ddx);
+
+template <typename T, typename Context>
+void BReluGradKernel(const Context& dev_ctx,
+                     const DenseTensor& x,
+                     const DenseTensor& dout,
+                     float t_min,
+                     float t_max,
+                     DenseTensor* dx);
+
+template <typename T, typename Context>
+void LeakyReluGradKernel(const Context& dev_ctx,
+                         const DenseTensor& x,
+                         const DenseTensor& dout,
+                         float alpha,
+                         DenseTensor* dx);
+
+template <typename T, typename Context>
+void LeakyReluDoubleGradKernel(const Context& dev_ctx,
+                               const DenseTensor& x,
+                               const DenseTensor& ddx,
+                               float alpha,
+                               DenseTensor* ddout);
+
+template <typename T, typename Context>
+void ThresholdedReluGradKernel(const Context& dev_ctx,
+                               const DenseTensor& x,
+                               const DenseTensor& dout,
+                               float threshold,
+                               DenseTensor* dx);
+
+DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Cos);
+DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Tan);
+DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Acos);
+DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Sin);
+DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Asin);
+DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Atan);
+DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Sinh);
+DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Cosh);
+DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Asinh);
+DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Acosh);
+DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Atanh);
+DECLARE_ACTIVATION_GRAD_KERNEL_DepOut(Relu);
+DECLARE_ACTIVATION_GRAD_KERNEL_DepOut(Tanh);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/activation_kernel.h b/paddle/phi/kernels/activation_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..885dccad8e377642b4cb9e36832ac4bd45f7915f
--- /dev/null
+++ b/paddle/phi/kernels/activation_kernel.h
@@ -0,0 +1,60 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/infermeta/unary.h"
+
+namespace phi {
+
+#define DECLARE_ACTIVATION_KERNEL(name)   \
+  template <typename T, typename Context> \
+  void name##Kernel(                      \
+      const Context& dev_ctx, const DenseTensor& x, DenseTensor* out);
+
+DECLARE_ACTIVATION_KERNEL(Cos)
+DECLARE_ACTIVATION_KERNEL(Tan)
+DECLARE_ACTIVATION_KERNEL(Acos)
+DECLARE_ACTIVATION_KERNEL(Sin)
+DECLARE_ACTIVATION_KERNEL(Asin)
+DECLARE_ACTIVATION_KERNEL(Atan)
+DECLARE_ACTIVATION_KERNEL(Sinh)
+DECLARE_ACTIVATION_KERNEL(Cosh)
+DECLARE_ACTIVATION_KERNEL(Asinh)
+DECLARE_ACTIVATION_KERNEL(Acosh)
+DECLARE_ACTIVATION_KERNEL(Atanh)
+DECLARE_ACTIVATION_KERNEL(Relu)
+DECLARE_ACTIVATION_KERNEL(Tanh)
+
+template <typename T, typename Context>
+void BReluKernel(const Context& dev_ctx,
+                 const DenseTensor& x,
+                 float t_min,
+                 float t_max,
+                 DenseTensor* out);
+
+template <typename T, typename Context>
+void LeakyReluKernel(const Context& dev_ctx,
+                     const DenseTensor& x,
+                     float alpha,
+                     DenseTensor* out);
+
+template <typename T, typename Context>
+void ThresholdedReluKernel(const Context& dev_ctx,
+                           const DenseTensor& x,
+                           float threshold,
+                           DenseTensor* out);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/adadelta_kernel.h b/paddle/phi/kernels/adadelta_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..65a6aad415193be62424a6c6ac19c1aec6927e8b
--- /dev/null
+++ b/paddle/phi/kernels/adadelta_kernel.h
@@ -0,0 +1,33 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void AdadeltaKernel(const Context& dev_ctx,
+                    const DenseTensor& param,
+                    const DenseTensor& grad,
+                    const DenseTensor& avg_squared_grad,
+                    const DenseTensor& avg_squared_update,
+                    float rho,
+                    float epsilon,
+                    DenseTensor* param_out,
+                    DenseTensor* avg_squared_grad_out,
+                    DenseTensor* avg_squared_update_out);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/adamax_kernel.h b/paddle/phi/kernels/adamax_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..feaf996f16266abbada655907fee68f4ab25bad3
--- /dev/null
+++ b/paddle/phi/kernels/adamax_kernel.h
@@ -0,0 +1,36 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void AdamaxKernel(const Context& dev_ctx,
+                  const DenseTensor& param,
+                  const DenseTensor& grad,
+                  const DenseTensor& learning_rate,
+                  const DenseTensor& moment,
+                  const DenseTensor& inf_norm,
+                  const DenseTensor& beta1_pow,
+                  float beta1,
+                  float beta2,
+                  float epsilon,
+                  DenseTensor* param_out,
+                  DenseTensor* moment_out,
+                  DenseTensor* inf_norm_out);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/allclose_kernel.h b/paddle/phi/kernels/allclose_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..3f24078b86ca1736411cb929754441d737ee6028
--- /dev/null
+++ b/paddle/phi/kernels/allclose_kernel.h
@@ -0,0 +1,31 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/common/scalar.h"
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void AllCloseKernel(const Context& dev_ctx,
+                    const DenseTensor& x,
+                    const DenseTensor& y,
+                    const Scalar& rtol,
+                    const Scalar& atol,
+                    bool equal_nan,
+                    DenseTensor* out);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/arg_min_max_kernel.h b/paddle/phi/kernels/arg_min_max_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..917babeef07e99c9cdd6ab71e772a4e0cb1f9e12
--- /dev/null
+++ b/paddle/phi/kernels/arg_min_max_kernel.h
@@ -0,0 +1,39 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void ArgMinKernel(const Context& dev_ctx,
+                  const DenseTensor& x,
+                  int64_t axis,
+                  bool keepdims,
+                  bool flatten,
+                  int dtype,
+                  DenseTensor* out);
+
+template <typename T, typename Context>
+void ArgMaxKernel(const Context& dev_ctx,
+                  const DenseTensor& x,
+                  int64_t axis,
+                  bool keepdims,
+                  bool flatten,
+                  int dtype,
+                  DenseTensor* out);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/argsort_grad_kernel.h b/paddle/phi/kernels/argsort_grad_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..b91bd69911351dcd330325fabb515df0eebba29f
--- /dev/null
+++ b/paddle/phi/kernels/argsort_grad_kernel.h
@@ -0,0 +1,30 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void ArgsortGradKernel(const Context& dev_ctx,
+                       const DenseTensor& indices,
+                       const DenseTensor& input,
+                       const DenseTensor& out_grad,
+                       int axis,
+                       bool descending,
+                       DenseTensor* in_grad);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/argsort_kernel.h b/paddle/phi/kernels/argsort_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..683e8631d2e3423f7fe81e10d82dbf701def5338
--- /dev/null
+++ b/paddle/phi/kernels/argsort_kernel.h
@@ -0,0 +1,29 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void ArgsortKernel(const Context& dev_ctx,
+                   const DenseTensor& input,
+                   int axis,
+                   bool descending,
+                   DenseTensor* output,
+                   DenseTensor* indices);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/auc_kernel.h b/paddle/phi/kernels/auc_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..acbd17c7801e240e933798773d54a50822dec68f
--- /dev/null
+++ b/paddle/phi/kernels/auc_kernel.h
@@ -0,0 +1,36 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <string>
+#include "paddle/phi/common/scalar.h"
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void AucKernel(const Context& dev_ctx,
+               const DenseTensor& input,
+               const DenseTensor& label,
+               const DenseTensor& stat_pos,
+               const DenseTensor& stat_neg,
+               const std::string& curve,
+               int num_thresholds,
+               int slide_steps,
+               DenseTensor* auc,
+               DenseTensor* stat_pos_out,
+               DenseTensor* stat_neg_out);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/batch_norm_grad_kernel.h b/paddle/phi/kernels/batch_norm_grad_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..c15dbd2f63f588e0bb20ae41a08146e3ac781458
--- /dev/null
+++ b/paddle/phi/kernels/batch_norm_grad_kernel.h
@@ -0,0 +1,90 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <string>
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void BatchNormGradRawKernel(const Context& dev_ctx,
+                            const DenseTensor& y_grad,
+                            const DenseTensor& x,
+                            const DenseTensor& scale,
+                            const DenseTensor& bias,
+                            const DenseTensor& saved_mean,
+                            const DenseTensor& saved_variance,
+                            paddle::optional<const DenseTensor&> reserve_space,
+                            paddle::optional<const DenseTensor&> mean,
+                            paddle::optional<const DenseTensor&> variance,
+                            float momentum,
+                            float epsilon,
+                            const std::string& data_layout,
+                            bool is_test,
+                            bool use_global_stats,
+                            bool trainable_statistics,
+                            bool fuse_with_relu,
+                            bool is_inplace,
+                            DenseTensor* x_grad,
+                            DenseTensor* scale_grad,
+                            DenseTensor* bias_grad);
+
+template <typename T, typename Context>
+void BatchNormGradKernel(const Context& dev_ctx,
+                         const DenseTensor& y_grad,
+                         const DenseTensor& x,
+                         const DenseTensor& scale,
+                         const DenseTensor& bias,
+                         const DenseTensor& saved_mean,
+                         const DenseTensor& saved_variance,
+                         paddle::optional<const DenseTensor&> reserve_space,
+                         paddle::optional<const DenseTensor&> mean,
+                         paddle::optional<const DenseTensor&> variance,
+                         float momentum,
+                         float epsilon,
+                         const std::string& data_layout,
+                         bool is_test,
+                         bool use_global_stats,
+                         bool trainable_statistics,
+                         bool fuse_with_relu,
+                         DenseTensor* x_grad,
+                         DenseTensor* scale_grad,
+                         DenseTensor* bias_grad);
+
+template <typename T, typename Context>
+void BatchNormDoubleGradKernel(const Context& dev_ctx,
+                               const DenseTensor& x_grad_grad,
+                               const DenseTensor& scale_grad_grad,
+                               const DenseTensor& bias_grad_grad,
+                               const DenseTensor& y_grad,
+                               const DenseTensor& x,
+                               const DenseTensor& scale,
+                               const DenseTensor& saved_mean,
+                               const DenseTensor& saved_variance,
+                               paddle::optional<const DenseTensor&> mean,
+                               paddle::optional<const DenseTensor&> variance,
+                               float momentum,
+                               float epsilon,
+                               const std::string& data_layout,
+                               bool is_test,
+                               bool use_global_stats,
+                               bool trainable_statistics,
+                               bool fuse_with_relu,
+                               DenseTensor* x_grad,
+                               DenseTensor* scale_grad,
+                               DenseTensor* y_grad_grad);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/batch_norm_kernel.h b/paddle/phi/kernels/batch_norm_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..7ddf32e27c7d73a7249d92f7835afdf6b8f3ed5a
--- /dev/null
+++ b/paddle/phi/kernels/batch_norm_kernel.h
@@ -0,0 +1,43 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <string>
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void BatchNormKernel(const Context& dev_ctx,
+                     const DenseTensor& x,
+                     const DenseTensor& scale,
+                     const DenseTensor& bias,
+                     const DenseTensor& mean,
+                     const DenseTensor& variance,
+                     float momentum,
+                     float epsilon,
+                     const std::string& data_layout,
+                     bool is_test,
+                     bool use_global_stats,
+                     bool trainable_statistics,
+                     bool fuse_with_relu,
+                     DenseTensor* y,
+                     DenseTensor* mean_out,
+                     DenseTensor* variance_out,
+                     DenseTensor* saved_mean,
+                     DenseTensor* saved_variance,
+                     DenseTensor* reserve_space);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/bincount_kernel.h b/paddle/phi/kernels/bincount_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..3ba69d365480f7c192836137ab22d1d2eb85ea03
--- /dev/null
+++ b/paddle/phi/kernels/bincount_kernel.h
@@ -0,0 +1,28 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void BincountKernel(const Context& dev_ctx,
+                    const DenseTensor& x,
+                    const paddle::optional<const DenseTensor&> weights,
+                    int minlength,
+                    DenseTensor* out);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/bitwise_kernel.h b/paddle/phi/kernels/bitwise_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..17307004f360e10ada708fba276ec8de1a129259
--- /dev/null
+++ b/paddle/phi/kernels/bitwise_kernel.h
@@ -0,0 +1,44 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void BitwiseAndKernel(const Context& dev_ctx,
+                      const DenseTensor& x,
+                      const DenseTensor& y,
+                      DenseTensor* out);
+
+template <typename T, typename Context>
+void BitwiseOrKernel(const Context& dev_ctx,
+                     const DenseTensor& x,
+                     const DenseTensor& y,
+                     DenseTensor* out);
+
+template <typename T, typename Context>
+void BitwiseXorKernel(const Context& dev_ctx,
+                      const DenseTensor& x,
+                      const DenseTensor& y,
+                      DenseTensor* out);
+
+template <typename T, typename Context>
+void BitwiseNotKernel(const Context& dev_ctx,
+                      const DenseTensor& x,
+                      DenseTensor* out);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/broadcast_tensors_grad_kernel.h b/paddle/phi/kernels/broadcast_tensors_grad_kernel.h
index 5ec2e35cc9b0cfe09fd281605984e72a603b8f5e..5d24f6684a48f2fb4a65673e0bb888a9f37b1246 100644
--- a/paddle/phi/kernels/broadcast_tensors_grad_kernel.h
+++ b/paddle/phi/kernels/broadcast_tensors_grad_kernel.h
@@ -21,7 +21,7 @@ namespace phi {
 
 template <typename T, typename Context>
 void BroadcastTensorsGradKernel(const Context& ctx,
-                                const std::vector<DenseTensor>& dout,
+                                const std::vector<const DenseTensor*>& dout,
                                 std::vector<DenseTensor*> dx);
 
 }  // namespace phi
diff --git a/paddle/phi/kernels/broadcast_tensors_kernel.h b/paddle/phi/kernels/broadcast_tensors_kernel.h
index fb2a6f1136c26cb1bee1ca26ae7d214566862709..22b5201b6900dd18b645c2d7645adb96a6f11e91 100644
--- a/paddle/phi/kernels/broadcast_tensors_kernel.h
+++ b/paddle/phi/kernels/broadcast_tensors_kernel.h
@@ -21,7 +21,7 @@ namespace phi {
 
 template <typename T, typename Context>
 void BroadcastTensorsKernel(const Context& ctx,
-                            const std::vector<DenseTensor>& x,
+                            const std::vector<const DenseTensor*>& x,
                             std::vector<DenseTensor*> out);
 
 }  // namespace phi
diff --git a/paddle/phi/kernels/cast_kernel.h b/paddle/phi/kernels/cast_kernel.h
index c760b2842d0c97f0afd848ee3dcc333517349c9e..5e07388f5fb20d3a791bcf288e1e6597479e12c5 100644
--- a/paddle/phi/kernels/cast_kernel.h
+++ b/paddle/phi/kernels/cast_kernel.h
@@ -29,7 +29,7 @@ template <typename T, typename Context>
 DenseTensor Cast(const Context& dev_ctx,
                  const DenseTensor& x,
                  DataType out_dtype) {
-  auto dense_out = phi::Empty<T, Context>(dev_ctx);
+  DenseTensor dense_out;
   MetaTensor meta_out(&dense_out);
   CastInferMeta(x, out_dtype, &meta_out);
   CastKernel<T, Context>(dev_ctx, x, out_dtype, &dense_out);
diff --git a/paddle/phi/kernels/cholesky_solve_grad_kernel.h b/paddle/phi/kernels/cholesky_solve_grad_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..e2ce67abae6234d1bfbc919cef06250bcd7c6b6e
--- /dev/null
+++ b/paddle/phi/kernels/cholesky_solve_grad_kernel.h
@@ -0,0 +1,31 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void CholeskySolveGradKernel(const Context& dev_ctx,
+                             const DenseTensor& x,
+                             const DenseTensor& y,
+                             const DenseTensor& out,
+                             const DenseTensor& dout,
+                             bool upper,
+                             DenseTensor* dx,
+                             DenseTensor* dy);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/cholesky_solve_kernel.h b/paddle/phi/kernels/cholesky_solve_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..b304a20e611d1d417e25dccc946a7992f911155b
--- /dev/null
+++ b/paddle/phi/kernels/cholesky_solve_kernel.h
@@ -0,0 +1,28 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void CholeskySolveKernel(const Context& dev_ctx,
+                         const DenseTensor& x,
+                         const DenseTensor& y,
+                         bool upper,
+                         DenseTensor* out);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/compare_kernel.h b/paddle/phi/kernels/compare_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..5b6b8cd868f9fcb9048e00526b272e3cd4c54682
--- /dev/null
+++ b/paddle/phi/kernels/compare_kernel.h
@@ -0,0 +1,47 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+#define DECALRE_COMPARE_KERNEL(compare_kernel) \
+  template <typename T, typename Context>      \
+  void compare_kernel(const Context& ctx,      \
+                      const DenseTensor& x,    \
+                      const DenseTensor& y,    \
+                      int axis,                \
+                      DenseTensor* out);
+
+DECALRE_COMPARE_KERNEL(LessThanKernel)
+DECALRE_COMPARE_KERNEL(LessEqualKernel)
+DECALRE_COMPARE_KERNEL(GreaterThanKernel)
+DECALRE_COMPARE_KERNEL(GreaterEqualKernel)
+DECALRE_COMPARE_KERNEL(EqualKernel)
+DECALRE_COMPARE_KERNEL(NotEqualKernel)
+#undef DECALRE_COMPARE_KERNEL
+
+#define DECALRE_COMPARE_ALL_KERNEL(compare_all_kernel) \
+  template <typename T, typename Context>              \
+  void compare_all_kernel(const Context& ctx,          \
+                          const DenseTensor& x,        \
+                          const DenseTensor& y,        \
+                          DenseTensor* out);
+
+DECALRE_COMPARE_ALL_KERNEL(EqualAll)
+#undef DECALRE_COMPARE_KERNEL
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/complex_kernel.h b/paddle/phi/kernels/complex_kernel.h
index 3b3003392d37f384416643a3b8a52b4a6809216d..07f93f9b926f174c374bbc20b7c655a65732423f 100644
--- a/paddle/phi/kernels/complex_kernel.h
+++ b/paddle/phi/kernels/complex_kernel.h
@@ -24,6 +24,12 @@ namespace phi {
 template <typename T, typename Context>
 void ConjKernel(const Context& dev_ctx, const DenseTensor& x, DenseTensor* out);
 
+template <typename T, typename Context>
+void RealKernel(const Context& dev_ctx, const DenseTensor& x, DenseTensor* out);
+
+template <typename T, typename Context>
+void ImagKernel(const Context& dev_ctx, const DenseTensor& x, DenseTensor* out);
+
 // If T is complex
 template <
     typename T,
@@ -32,7 +38,7 @@ template <
                          std::is_same<T, phi::dtype::complex<double>>::value,
                      bool> = true>
 DenseTensor Conj(const Context& dev_ctx, const DenseTensor& x) {
-  auto dense_out = phi::Empty<T, Context>(dev_ctx);
+  DenseTensor dense_out;
   MetaTensor meta_out(&dense_out);
   UnchangedInferMeta(x, &meta_out);
   ConjKernel<T>(dev_ctx, x, &dense_out);
@@ -50,10 +56,56 @@ DenseTensor Conj(const Context& dev_ctx, const DenseTensor& x) {
   return x;
 }
 
-template <typename T, typename Context>
-void RealKernel(const Context& dev_ctx, const DenseTensor& x, DenseTensor* out);
+// If T is complex
+template <
+    typename T,
+    typename Context,
+    std::enable_if_t<std::is_same<T, phi::dtype::complex<float>>::value ||
+                         std::is_same<T, phi::dtype::complex<double>>::value,
+                     bool> = true>
+DenseTensor Real(const Context& dev_ctx, const DenseTensor& x) {
+  DenseTensor dense_out;
+  MetaTensor meta_out(&dense_out);
+  RealAndImagInferMeta(x, &meta_out);
+  RealKernel<T>(dev_ctx, x, &dense_out);
+  return dense_out;
+}
 
-template <typename T, typename Context>
-void ImagKernel(const Context& dev_ctx, const DenseTensor& x, DenseTensor* out);
+// If T is not complex
+template <
+    typename T,
+    typename Context,
+    std::enable_if_t<!std::is_same<T, phi::dtype::complex<float>>::value &&
+                         !std::is_same<T, phi::dtype::complex<double>>::value,
+                     bool> = true>
+DenseTensor Real(const Context& dev_ctx, const DenseTensor& x) {
+  return x;
+}
+
+// If T is complex
+template <
+    typename T,
+    typename Context,
+    std::enable_if_t<std::is_same<T, phi::dtype::complex<float>>::value ||
+                         std::is_same<T, phi::dtype::complex<double>>::value,
+                     bool> = true>
+DenseTensor Imag(const Context& dev_ctx, const DenseTensor& x) {
+  DenseTensor dense_out;
+  MetaTensor meta_out(&dense_out);
+  RealAndImagInferMeta(x, &meta_out);
+  ImagKernel<T>(dev_ctx, x, &dense_out);
+  return dense_out;
+}
+
+// If T is not complex
+template <
+    typename T,
+    typename Context,
+    std::enable_if_t<!std::is_same<T, phi::dtype::complex<float>>::value &&
+                         !std::is_same<T, phi::dtype::complex<double>>::value,
+                     bool> = true>
+DenseTensor Imag(const Context& dev_ctx, const DenseTensor& x) {
+  return x;
+}
 
 }  // namespace phi
diff --git a/paddle/phi/kernels/concat_kernel.h b/paddle/phi/kernels/concat_kernel.h
index f13667881468e15183c3d770df638641f1dc6ed0..cf83ab9aaabe135573a2887a01166f4a7bd0d5e1 100644
--- a/paddle/phi/kernels/concat_kernel.h
+++ b/paddle/phi/kernels/concat_kernel.h
@@ -22,25 +22,25 @@ namespace phi {
 
 template <typename T, typename Context>
 void ConcatKernel(const Context& dev_ctx,
-                  const std::vector<DenseTensor>& x,
+                  const std::vector<const DenseTensor*>& x,
                   const Scalar& axis,
                   DenseTensor* out);
 
 template <typename T, typename Context>
 DenseTensor Concat(const Context& dev_ctx,
-                   const std::vector<DenseTensor>& x,
+                   const std::vector<const DenseTensor*>& x,
                    const Scalar& axis) {
   std::vector<MetaTensor> meta_x;
   meta_x.reserve(x.size());
   std::vector<MetaTensor*> meta_x_ptr;
-  for (const auto& t : x) {
-    meta_x.emplace_back(t);
+  for (const auto* t : x) {
+    meta_x.emplace_back(*t);
     meta_x_ptr.push_back(&meta_x.back());
   }
 
-  auto dense_out = phi::Empty<T, Context>(dev_ctx);
+  DenseTensor dense_out;
   MetaTensor meta_out(&dense_out);
-  ConcatInferMeta(meta_x_ptr, axis.to<int>(), &meta_out, /*is_runtime=*/true);
+  ConcatInferMeta(meta_x_ptr, axis.to<int>(), &meta_out);
   ConcatKernel<T, Context>(dev_ctx, x, axis, &dense_out);
   return dense_out;
 }
diff --git a/paddle/phi/kernels/conv_grad_grad_kernel.h b/paddle/phi/kernels/conv_grad_grad_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..339f1c00eaa505cdcf8976abae92a6c93cfd50eb
--- /dev/null
+++ b/paddle/phi/kernels/conv_grad_grad_kernel.h
@@ -0,0 +1,61 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void ConvGradGradKernel(const Context& dev_ctx,
+                        paddle::optional<const DenseTensor&> input_grad_grad,
+                        paddle::optional<const DenseTensor&> filter_grad_grad,
+                        const DenseTensor& out_grad,
+                        const DenseTensor& input,
+                        const DenseTensor& filter,
+                        const std::vector<int>& strides,
+                        const std::vector<int>& paddings,
+                        const std::string& paddding_algorithm,
+                        int groups,
+                        const std::vector<int>& dilations,
+                        const std::string& data_format,
+                        bool use_addto,
+                        int workspace_size_MB,
+                        bool exhaustive_search,
+                        DenseTensor* out_grad_grad,
+                        DenseTensor* input_grad,
+                        DenseTensor* filter_grad);
+
+template <typename T, typename Context>
+void Conv3DGradGradKernel(const Context& dev_ctx,
+                          paddle::optional<const DenseTensor&> input_grad_grad,
+                          paddle::optional<const DenseTensor&> filter_grad_grad,
+                          const DenseTensor& out_grad,
+                          const DenseTensor& input,
+                          const DenseTensor& filter,
+                          const std::vector<int>& strides,
+                          const std::vector<int>& paddings,
+                          const std::string& paddding_algorithm,
+                          int groups,
+                          const std::vector<int>& dilations,
+                          const std::string& data_format,
+                          bool use_addto,
+                          int workspace_size_MB,
+                          bool exhaustive_search,
+                          DenseTensor* out_grad_grad,
+                          DenseTensor* input_grad,
+                          DenseTensor* filter_grad);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/conv_grad_kernel.h b/paddle/phi/kernels/conv_grad_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..bad30989ac90d8b46fee99039c2772d00c7d939a
--- /dev/null
+++ b/paddle/phi/kernels/conv_grad_kernel.h
@@ -0,0 +1,73 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void ConvGradKernel(const Context& dev_ctx,
+                    const DenseTensor& out_grad,
+                    const DenseTensor& input,
+                    const DenseTensor& filter,
+                    const std::vector<int>& strides,
+                    const std::vector<int>& paddings,
+                    const std::string& paddding_algorithm,
+                    int groups,
+                    const std::vector<int>& dilations,
+                    const std::string& data_format,
+                    bool use_addto,
+                    int workspace_size_MB,
+                    bool exhaustive_search,
+                    DenseTensor* input_grad,
+                    DenseTensor* filter_grad);
+
+template <typename T, typename Context>
+void Conv3DGradKernel(const Context& dev_ctx,
+                      const DenseTensor& out_grad,
+                      const DenseTensor& input,
+                      const DenseTensor& filter,
+                      const std::vector<int>& strides,
+                      const std::vector<int>& paddings,
+                      const std::string& paddding_algorithm,
+                      int groups,
+                      const std::vector<int>& dilations,
+                      const std::string& data_format,
+                      bool use_addto,
+                      int workspace_size_MB,
+                      bool exhaustive_search,
+                      DenseTensor* input_grad,
+                      DenseTensor* filter_grad);
+
+template <typename T, typename Context>
+void DepthwiseConvGradKernel(const Context& dev_ctx,
+                             const DenseTensor& out_grad,
+                             const DenseTensor& input,
+                             const DenseTensor& filter,
+                             const std::vector<int>& strides,
+                             const std::vector<int>& paddings,
+                             const std::string& paddding_algorithm,
+                             int groups,
+                             const std::vector<int>& dilations,
+                             const std::string& data_format,
+                             bool use_addto,
+                             int workspace_size_MB,
+                             bool exhaustive_search,
+                             bool fuse_relu,
+                             DenseTensor* input_grad,
+                             DenseTensor* filter_grad);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/conv_kernel.h b/paddle/phi/kernels/conv_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..eb0bfdd0275b5050054c620e722b0e7653fd678a
--- /dev/null
+++ b/paddle/phi/kernels/conv_kernel.h
@@ -0,0 +1,67 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void ConvKernel(const Context& dev_ctx,
+                const DenseTensor& input,
+                const DenseTensor& filter,
+                const std::vector<int>& strides,
+                const std::vector<int>& paddings,
+                const std::string& paddding_algorithm,
+                int groups,
+                const std::vector<int>& dilations,
+                const std::string& data_format,
+                bool use_addto,
+                int workspace_size_MB,
+                bool exhaustive_search,
+                DenseTensor* out);
+
+template <typename T, typename Context>
+void Conv3DKernel(const Context& dev_ctx,
+                  const DenseTensor& input,
+                  const DenseTensor& filter,
+                  const std::vector<int>& strides,
+                  const std::vector<int>& paddings,
+                  const std::string& padding_algorithm,
+                  int groups,
+                  const std::vector<int>& dilations,
+                  const std::string& data_format,
+                  bool use_addto,
+                  int workspace_size_MB,
+                  bool exhaustive_search,
+                  DenseTensor* out);
+
+template <typename T, typename Context>
+void DepthwiseConvKernel(const Context& dev_ctx,
+                         const DenseTensor& input,
+                         const DenseTensor& filter,
+                         const std::vector<int>& strides,
+                         const std::vector<int>& paddings,
+                         const std::string& paddding_algorithm,
+                         int groups,
+                         const std::vector<int>& dilations,
+                         const std::string& data_format,
+                         bool use_addto,
+                         int workspace_size_MB,
+                         bool exhaustive_search,
+                         bool fuse_relu,
+                         DenseTensor* out);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/cpu/accuracy_kernel.cc b/paddle/phi/kernels/cpu/accuracy_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c57ec69b73a230df48411f4074935e2bb4bce461
--- /dev/null
+++ b/paddle/phi/kernels/cpu/accuracy_kernel.cc
@@ -0,0 +1,72 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/accuracy_kernel.h"
+
+#include <algorithm>
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+namespace phi {
+template <typename T, typename Context>
+void AccuracyRawKernel(const Context& dev_ctx,
+                       const DenseTensor& inference,
+                       const DenseTensor& indices,
+                       const DenseTensor& label,
+                       DenseTensor* accuracy,
+                       DenseTensor* correct,
+                       DenseTensor* total) {
+  int* correct_data = dev_ctx.template Alloc<int>(correct);
+  int* total_data = dev_ctx.template Alloc<int>(total);
+  float* accuracy_data = dev_ctx.template Alloc<float>(accuracy);
+
+  const int64_t* indices_data = indices.data<int64_t>();
+  const int64_t* label_data = label.data<int64_t>();
+
+  size_t num_samples = inference.dims()[0];
+  size_t class_dim = inference.dims()[1];
+  *accuracy_data = 0.0f;
+
+  if (num_samples == 0) {
+    return;
+  }
+
+  int num_correct = 0;
+  // assume inference is already the topk of the output
+  for (size_t i = 0; i < num_samples; ++i) {
+    PADDLE_ENFORCE_GE(
+        label_data[i],
+        0,
+        phi::errors::InvalidArgument(
+            "label of AccuracyOp must >= 0, But received label[%d] is %d",
+            i,
+            label_data[i]));
+    for (size_t j = 0; j < class_dim; ++j) {
+      if (indices_data[i * class_dim + j] == label_data[i]) {
+        ++num_correct;
+        break;
+      }
+    }
+  }
+
+  *correct_data = num_correct;
+  *total_data = num_samples;
+  *accuracy_data =
+      static_cast<float>(num_correct) / static_cast<float>(num_samples);
+}
+}  // namespace phi
+
+// TODO(add supported dtype.)
+PD_REGISTER_KERNEL(
+    accuracy, CPU, ALL_LAYOUT, phi::AccuracyRawKernel, float, double) {}
diff --git a/paddle/phi/kernels/cpu/activation_grad_kernel.cc b/paddle/phi/kernels/cpu/activation_grad_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f9af50f6832a1884f3ef58ccb5708b1f2636ccea
--- /dev/null
+++ b/paddle/phi/kernels/cpu/activation_grad_kernel.cc
@@ -0,0 +1,161 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/phi/kernels/activation_grad_kernel.h"
+
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/common/float16.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/activation_grad_impl.h"
+
+namespace phi {
+
+#define DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(name, functor_class) \
+  template <typename T, typename Context>                           \
+  void name##GradKernel(const Context& dev_ctx,                     \
+                        const DenseTensor& x,                       \
+                        const DenseTensor& dout,                    \
+                        DenseTensor* dx) {                          \
+    functor_class<T> functor;                                       \
+    ActivationGradImpl<T, Context, functor_class<T>>(               \
+        dev_ctx, &x, nullptr, &dout, dx, functor);                  \
+  }
+
+#define DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepX( \
+    name, functor_class, attr)                          \
+  template <typename T, typename Context>               \
+  void name##GradKernel(const Context& dev_ctx,         \
+                        const DenseTensor& x,           \
+                        const DenseTensor& dout,        \
+                        float attr,                     \
+                        DenseTensor* dx) {              \
+    functor_class<T> functor;                           \
+    auto attrs = functor.GetAttrs();                    \
+    *(attrs[0].second) = attr;                          \
+    ActivationGradImpl<T, Context, functor_class<T>>(   \
+        dev_ctx, &x, nullptr, &dout, dx, functor);      \
+  }
+
+#define DEFINE_CPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DepX( \
+    name, functor_class, attr1, attr2)                  \
+  template <typename T, typename Context>               \
+  void name##GradKernel(const Context& dev_ctx,         \
+                        const DenseTensor& x,           \
+                        const DenseTensor& dout,        \
+                        float attr1,                    \
+                        float attr2,                    \
+                        DenseTensor* dx) {              \
+    functor_class<T> functor;                           \
+    auto attrs = functor.GetAttrs();                    \
+    *(attrs[0].second) = attr1;                         \
+    *(attrs[1].second) = attr2;                         \
+    ActivationGradImpl<T, Context, functor_class<T>>(   \
+        dev_ctx, &x, nullptr, &dout, dx, functor);      \
+  }
+
+#define DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepOut(name, functor_class) \
+  template <typename T, typename Context>                             \
+  void name##GradKernel(const Context& dev_ctx,                       \
+                        const DenseTensor& out,                       \
+                        const DenseTensor& dout,                      \
+                        DenseTensor* dx) {                            \
+    functor_class<T> functor;                                         \
+    ActivationGradImpl<T, Context, functor_class<T>>(                 \
+        dev_ctx, nullptr, &out, &dout, dx, functor);                  \
+  }
+
+#define DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepOut( \
+    name, functor_class, attr)                            \
+  template <typename T, typename Context>                 \
+  void name##GradKernel(const Context& dev_ctx,           \
+                        const DenseTensor& out,           \
+                        const DenseTensor& dout,          \
+                        float attr,                       \
+                        DenseTensor* dx) {                \
+    functor_class<T> functor;                             \
+    auto attrs = functor.GetAttrs();                      \
+    *(attrs[0].second) = attr;                            \
+    ActivationGradImpl<T, Context, functor_class<T>>(     \
+        dev_ctx, nullptr, &out, &dout, dx, functor);      \
+  }
+
+DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Cos, funcs::CosGradFunctor);
+DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Tan, funcs::TanGradFunctor);
+DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Acos, funcs::AcosGradFunctor);
+DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Sin, funcs::SinGradFunctor);
+DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Asin, funcs::AsinGradFunctor);
+DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Atan, funcs::AtanGradFunctor);
+DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Sinh, funcs::SinhGradFunctor);
+DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Cosh, funcs::CoshGradFunctor);
+DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Asinh, funcs::AsinhGradFunctor);
+DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Acosh, funcs::AcoshGradFunctor);
+DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Atanh, funcs::AtanhGradFunctor);
+
+DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepOut(Relu, funcs::ReluGradFunctor);
+DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepOut(Tanh, funcs::TanhGradFunctor);
+
+DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepX(LeakyRelu,
+                                               funcs::LeakyReluGradFunctor,
+                                               alpha);
+DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepX(
+    ThresholdedRelu, funcs::ThresholdedReluGradFunctor, threshold);
+
+DEFINE_CPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DepX(BRelu,
+                                               funcs::BReluGradFunctor,
+                                               t_min,
+                                               t_max);
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(
+    relu_grad, CPU, ALL_LAYOUT, phi::ReluGradKernel, float, double) {}
+
+#define PD_REGISTER_ACTIVATION_GRAD_KERNEL(name, func) \
+  PD_REGISTER_KERNEL(name, CPU, ALL_LAYOUT, phi::func, float, double) {}
+
+#define PD_REGISTER_ACTIVATION_DOUBLE_GRAD_KERNEL(name, func) \
+  PD_REGISTER_KERNEL(                                         \
+      name, CPU, ALL_LAYOUT, phi::func, float, double, phi::dtype::float16) {}
+
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(sin_grad, SinGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(cos_grad, CosGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(tan_grad, TanGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(acos_grad, AcosGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(asin_grad, AsinGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(atan_grad, AtanGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(sinh_grad, SinhGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(cosh_grad, CoshGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(asinh_grad, AsinhGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(acosh_grad, AcoshGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(atanh_grad, AtanhGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(tanh_grad, TanhGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(brelu_grad, BReluGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(leaky_relu_grad, LeakyReluGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(thresholded_relu_grad,
+                                   ThresholdedReluGradKernel)
+
+PD_REGISTER_ACTIVATION_DOUBLE_GRAD_KERNEL(relu_double_grad,
+                                          ReluDoubleGradKernel)
+PD_REGISTER_ACTIVATION_DOUBLE_GRAD_KERNEL(tanh_double_grad,
+                                          TanhDoubleGradKernel)
+PD_REGISTER_ACTIVATION_DOUBLE_GRAD_KERNEL(leaky_relu_double_grad,
+                                          LeakyReluDoubleGradKernel)
+
+PD_REGISTER_KERNEL(tanh_triple_grad,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::TanhTripleGradKernel,
+                   float,
+                   double,
+                   phi::dtype::float16) {}
diff --git a/paddle/phi/kernels/cpu/activation_kernel.cc b/paddle/phi/kernels/cpu/activation_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0d13429c8f651ccb40646fddd82a3529a95ab45d
--- /dev/null
+++ b/paddle/phi/kernels/cpu/activation_kernel.cc
@@ -0,0 +1,96 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/phi/kernels/activation_kernel.h"
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/activation_impl.h"
+
+namespace phi {
+
+#define DEFINE_CPU_ACTIVATION_KERNEL(name, functor_class)                \
+  template <typename T, typename Context>                                \
+  void name##Kernel(                                                     \
+      const Context& dev_ctx, const DenseTensor& x, DenseTensor* out) {  \
+    functor_class functor;                                               \
+    ActivationImpl<T, Context, functor_class>(dev_ctx, x, out, functor); \
+  }
+
+#define DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(name, functor_class, attr)     \
+  template <typename T, typename Context>                                   \
+  void name##Kernel(const Context& dev_ctx,                                 \
+                    const DenseTensor& x,                                   \
+                    float attr,                                             \
+                    DenseTensor* out) {                                     \
+    functor_class<T> functor;                                               \
+    auto attrs = functor.GetAttrs();                                        \
+    *(attrs[0].second) = attr;                                              \
+    ActivationImpl<T, Context, functor_class<T>>(dev_ctx, x, out, functor); \
+  }
+
+#define DEFINE_CPU_ACT_KERNEL_WITH_TWO_ATTRS(                               \
+    name, functor_class, attr1, attr2)                                      \
+  template <typename T, typename Context>                                   \
+  void name##Kernel(const Context& dev_ctx,                                 \
+                    const DenseTensor& x,                                   \
+                    float attr1,                                            \
+                    float attr2,                                            \
+                    DenseTensor* out) {                                     \
+    functor_class<T> functor;                                               \
+    auto attrs = functor.GetAttrs();                                        \
+    *(attrs[0].second) = attr1;                                             \
+    *(attrs[1].second) = attr2;                                             \
+    ActivationImpl<T, Context, functor_class<T>>(dev_ctx, x, out, functor); \
+  }
+
+DEFINE_CPU_ACTIVATION_KERNEL(Sin, funcs::SinFunctor<T>)
+DEFINE_CPU_ACTIVATION_KERNEL(Cos, funcs::CosFunctor<T>)
+DEFINE_CPU_ACTIVATION_KERNEL(Tan, funcs::TanFunctor<T>)
+DEFINE_CPU_ACTIVATION_KERNEL(Asin, funcs::AsinFunctor<T>)
+DEFINE_CPU_ACTIVATION_KERNEL(Atan, funcs::AtanFunctor<T>)
+DEFINE_CPU_ACTIVATION_KERNEL(Acos, funcs::AcosFunctor<T>)
+DEFINE_CPU_ACTIVATION_KERNEL(Sinh, funcs::SinhFunctor<T>)
+DEFINE_CPU_ACTIVATION_KERNEL(Cosh, funcs::CoshFunctor<T>)
+DEFINE_CPU_ACTIVATION_KERNEL(Asinh, funcs::AsinhFunctor<T>)
+DEFINE_CPU_ACTIVATION_KERNEL(Acosh, funcs::AcoshFunctor<T>)
+DEFINE_CPU_ACTIVATION_KERNEL(Atanh, funcs::AtanhFunctor<T>)
+DEFINE_CPU_ACTIVATION_KERNEL(Relu, funcs::ReluCPUFunctor<T>)
+DEFINE_CPU_ACTIVATION_KERNEL(Tanh, funcs::TanhFunctor<T>)
+DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(LeakyRelu, funcs::LeakyReluFunctor, alpha)
+DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(ThresholdedRelu,
+                                     funcs::ThresholdedReluFunctor,
+                                     threshold)
+DEFINE_CPU_ACT_KERNEL_WITH_TWO_ATTRS(BRelu, funcs::BReluFunctor, t_min, t_max)
+
+}  // namespace phi
+PD_REGISTER_KERNEL(relu, CPU, ALL_LAYOUT, phi::ReluKernel, float, double) {}
+
+#define PD_REGISTER_ACTIVATION_KERNEL(name, func) \
+  PD_REGISTER_KERNEL(name, CPU, ALL_LAYOUT, phi::func##Kernel, float, double) {}
+
+PD_REGISTER_ACTIVATION_KERNEL(sin, Sin)
+PD_REGISTER_ACTIVATION_KERNEL(cos, Cos)
+PD_REGISTER_ACTIVATION_KERNEL(tan, Tan)
+PD_REGISTER_ACTIVATION_KERNEL(acos, Acos)
+PD_REGISTER_ACTIVATION_KERNEL(asin, Asin)
+PD_REGISTER_ACTIVATION_KERNEL(atan, Atan)
+PD_REGISTER_ACTIVATION_KERNEL(sinh, Sinh)
+PD_REGISTER_ACTIVATION_KERNEL(cosh, Cosh)
+PD_REGISTER_ACTIVATION_KERNEL(asinh, Asinh)
+PD_REGISTER_ACTIVATION_KERNEL(acosh, Acosh)
+PD_REGISTER_ACTIVATION_KERNEL(atanh, Atanh)
+PD_REGISTER_ACTIVATION_KERNEL(tanh, Tanh)
+PD_REGISTER_ACTIVATION_KERNEL(brelu, BRelu)
+PD_REGISTER_ACTIVATION_KERNEL(leaky_relu, LeakyRelu)
+PD_REGISTER_ACTIVATION_KERNEL(thresholded_relu, ThresholdedRelu)
diff --git a/paddle/phi/kernels/cpu/adadelta_kernel.cc b/paddle/phi/kernels/cpu/adadelta_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e9b5397b616d7208969636d2f3114ecc46611d7b
--- /dev/null
+++ b/paddle/phi/kernels/cpu/adadelta_kernel.cc
@@ -0,0 +1,22 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/adadelta_kernel.h"
+
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/adadelta_kernel_impl.h"
+
+PD_REGISTER_KERNEL(
+    adadelta, CPU, ALL_LAYOUT, phi::AdadeltaKernel, float, double) {}
diff --git a/paddle/phi/kernels/cpu/adamax_kernel.cc b/paddle/phi/kernels/cpu/adamax_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..867c900e70b687995930de697e1a9ee4c426e255
--- /dev/null
+++ b/paddle/phi/kernels/cpu/adamax_kernel.cc
@@ -0,0 +1,21 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/adamax_kernel.h"
+
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/adamax_kernel_impl.h"
+
+PD_REGISTER_KERNEL(adamax, CPU, ALL_LAYOUT, phi::AdamaxKernel, float, double) {}
diff --git a/paddle/phi/kernels/cpu/allclose_kernel.cc b/paddle/phi/kernels/cpu/allclose_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..7ffeadfeed8aaa60c13b651188c5099c949b98ab
--- /dev/null
+++ b/paddle/phi/kernels/cpu/allclose_kernel.cc
@@ -0,0 +1,71 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/allclose_kernel.h"
+
+#include <cmath>
+
+#include "paddle/phi/core/enforce.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void AllCloseKernel(const Context& dev_ctx,
+                    const DenseTensor& x,
+                    const DenseTensor& y,
+                    const Scalar& rtol,
+                    const Scalar& atol,
+                    bool equal_nan,
+                    DenseTensor* out) {
+  PADDLE_ENFORCE_EQ(
+      rtol.dtype(),
+      DataType::FLOAT64,
+      phi::errors::InvalidArgument(
+          "Input (Rtol) type must be double, but get %s.", rtol.dtype()));
+  PADDLE_ENFORCE_EQ(
+      atol.dtype(),
+      DataType::FLOAT64,
+      phi::errors::InvalidArgument(
+          "Input (Atol) type must be double, but get %s.", atol.dtype()));
+
+  auto* in_a = x.data<T>();
+  auto* in_b = y.data<T>();
+  auto rtol_v = rtol.to<double>();
+  auto atol_v = atol.to<double>();
+  auto* out_data = dev_ctx.template Alloc<bool>(out);
+  *out_data = true;
+
+  auto num = x.numel();
+  for (int64_t i = 0; i < num; ++i) {
+    const T a = in_a[i], b = in_b[i];
+    bool val;
+    if (std::isnan(a) || std::isnan(b)) {
+      val = equal_nan && std::isnan(a) == std::isnan(b);
+    } else {
+      T left = (a > b ? a - b : b - a);
+      T right = atol_v + (b > 0 ? rtol_v * b : (-rtol_v) * b);
+      T diff = (left > right ? left - right : right - left);
+      val = a == b || left <= right || diff <= 1e-15;
+    }
+    *out_data &= val;
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(
+    allclose, CPU, ALL_LAYOUT, phi::AllCloseKernel, float, double) {
+  kernel->OutputAt(0).SetDataType(phi::DataType::BOOL);
+}
diff --git a/paddle/phi/kernels/cpu/arg_min_max_kernel.cc b/paddle/phi/kernels/cpu/arg_min_max_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f4ad830e149321f417392f6e83bbc8cc06ad3876
--- /dev/null
+++ b/paddle/phi/kernels/cpu/arg_min_max_kernel.cc
@@ -0,0 +1,203 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/arg_min_max_kernel.h"
+
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/ddim.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/eigen/common.h"
+#include "paddle/phi/kernels/funcs/math_function.h"
+
+namespace phi {
+
+enum ArgMinMaxType { kArgMin, kArgMax };
+
+template <typename Context,
+          typename T,
+          typename Tout,
+          int64_t Rank,
+          ArgMinMaxType argMinMaxValue>
+struct ArgMinMaxFunctor {};
+
+#define DECLARE_ARG_MIN_MAX_FUNCTOR(eigen_op_type, enum_argminmax_value)  \
+  template <typename Context, typename T, typename Tout, int64_t Rank>    \
+  struct ArgMinMaxFunctor<Context, T, Tout, Rank, enum_argminmax_value> { \
+    void operator()(const Context& dev_ctx,                               \
+                    const DenseTensor& in,                                \
+                    DenseTensor* out,                                     \
+                    phi::DDim x_dims,                                     \
+                    int64_t axis,                                         \
+                    bool keepdims) {                                      \
+      auto in_eigen = EigenTensor<T, Rank>::From(in, x_dims);             \
+      if (keepdims) {                                                     \
+        auto out_eigen = EigenTensor<Tout, Rank>::From(*out);             \
+        out_eigen.device(*(dev_ctx.eigen_device())) =                     \
+            in_eigen.eigen_op_type(axis).template cast<Tout>();           \
+      } else {                                                            \
+        auto out_eigen = EigenTensor<Tout, Rank - 1>::From(*out);         \
+        out_eigen.device(*(dev_ctx.eigen_device())) =                     \
+            in_eigen.eigen_op_type(axis).template cast<Tout>();           \
+      }                                                                   \
+    }                                                                     \
+  }
+
+DECLARE_ARG_MIN_MAX_FUNCTOR(argmin, ArgMinMaxType::kArgMin);
+DECLARE_ARG_MIN_MAX_FUNCTOR(argmax, ArgMinMaxType::kArgMax);
+
+template <typename Context, typename T, ArgMinMaxType EnumArgMinMaxValue>
+struct VisitDataArgMinMaxFunctor {
+  const Context& dev_ctx;
+  const DenseTensor& x;
+  int64_t axis;
+  bool keepdims;
+  bool flatten;
+  DenseTensor* out;
+
+  explicit VisitDataArgMinMaxFunctor(const Context& dev_ctx,
+                                     const DenseTensor& x,
+                                     int64_t axis,
+                                     bool keepdims,
+                                     bool flatten,
+                                     DenseTensor* out)
+      : dev_ctx(dev_ctx),
+        x(x),
+        axis(axis),
+        keepdims(keepdims),
+        flatten(flatten),
+        out(out) {}
+  template <typename Tout>
+  void apply() const {
+    dev_ctx.template Alloc<Tout>(out);
+    bool new_keepdims = keepdims;
+    if (flatten) new_keepdims = true;
+
+    // if flatten, will construct the new dims for the cacluate
+    phi::DDim x_dims;
+    int new_axis = axis;
+    if (flatten) {
+      x_dims = phi::make_ddim({x.numel()});
+      // if flatten, the axis just as 0
+      new_axis = 0;
+    } else {
+      x_dims = x.dims();
+      if (axis < 0) new_axis = axis + x_dims.size();
+    }
+
+#define CALL_ARG_MINMAX_FUNCTOR(rank)                                         \
+  ArgMinMaxFunctor<Context, T, Tout, rank, EnumArgMinMaxValue> functor##rank; \
+  functor##rank(dev_ctx, x, out, x_dims, new_axis, new_keepdims)
+
+    switch (x_dims.size()) {
+      case 1:
+        CALL_ARG_MINMAX_FUNCTOR(1);
+        break;
+      case 2:
+        CALL_ARG_MINMAX_FUNCTOR(2);
+        break;
+      case 3:
+        CALL_ARG_MINMAX_FUNCTOR(3);
+        break;
+      case 4:
+        CALL_ARG_MINMAX_FUNCTOR(4);
+        break;
+      case 5:
+        CALL_ARG_MINMAX_FUNCTOR(5);
+        break;
+      case 6:
+        CALL_ARG_MINMAX_FUNCTOR(6);
+        break;
+      default:
+        PADDLE_ENFORCE_LE(
+            x_dims.size(),
+            6,
+            phi::errors::InvalidArgument(
+                "%s operator doesn't supports tensors whose ranks are greater "
+                "than 6.",
+                (EnumArgMinMaxValue == kArgMin ? "argmin" : "argmax")));
+        break;
+#undef CALL_ARG_MINMAX_FUNCTOR
+    }
+  }
+};
+
+template <typename Context, typename T, ArgMinMaxType EnumArgMinMaxValue>
+void ArgMinMaxKernel(const Context& dev_ctx,
+                     const DenseTensor& x,
+                     int64_t axis,
+                     bool keepdims,
+                     bool flatten,
+                     int dtype,
+                     DenseTensor* out) {
+  if (dtype < 0) {
+    paddle::framework::VisitDataTypeTiny(
+        static_cast<paddle::framework::proto::VarType::Type>(
+            paddle::framework::proto::VarType::INT64),
+        VisitDataArgMinMaxFunctor<Context, T, EnumArgMinMaxValue>(
+            dev_ctx, x, axis, keepdims, flatten, out));
+    return;
+  }
+  paddle::framework::VisitDataTypeTiny(
+      static_cast<paddle::framework::proto::VarType::Type>(dtype),
+      VisitDataArgMinMaxFunctor<Context, T, EnumArgMinMaxValue>(
+          dev_ctx, x, axis, keepdims, flatten, out));
+}
+
+template <typename T, typename Context>
+void ArgMinKernel(const Context& dev_ctx,
+                  const DenseTensor& x,
+                  int64_t axis,
+                  bool keepdims,
+                  bool flatten,
+                  int dtype,
+                  DenseTensor* out) {
+  ArgMinMaxKernel<Context, T, ArgMinMaxType::kArgMin>(
+      dev_ctx, x, axis, keepdims, flatten, dtype, out);
+}
+
+template <typename T, typename Context>
+void ArgMaxKernel(const Context& dev_ctx,
+                  const DenseTensor& x,
+                  int64_t axis,
+                  bool keepdims,
+                  bool flatten,
+                  int dtype,
+                  DenseTensor* out) {
+  ArgMinMaxKernel<Context, T, ArgMinMaxType::kArgMax>(
+      dev_ctx, x, axis, keepdims, flatten, dtype, out);
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(arg_min,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::ArgMinKernel,
+                   float,
+                   double,
+                   int32_t,
+                   int64_t,
+                   int16_t,
+                   uint8_t) {}
+
+PD_REGISTER_KERNEL(arg_max,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::ArgMaxKernel,
+                   float,
+                   double,
+                   int32_t,
+                   int64_t,
+                   int16_t,
+                   uint8_t) {}
diff --git a/paddle/phi/kernels/cpu/argsort_grad_kernel.cc b/paddle/phi/kernels/cpu/argsort_grad_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..1e60847232c70b2af9382331809d73a7208fc956
--- /dev/null
+++ b/paddle/phi/kernels/cpu/argsort_grad_kernel.cc
@@ -0,0 +1,133 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/argsort_grad_kernel.h"
+
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/eigen/common.h"
+#include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
+#include "paddle/phi/kernels/transpose_kernel.h"
+
+namespace phi {
+
+template <typename T, typename Type>
+static void FullAssign(Type input_height,
+                       Type input_width,
+                       int input_dim,
+                       const DenseTensor* input,
+                       const DenseTensor* indices,
+                       T* t_out) {
+#ifdef PADDLE_WITH_MKLML
+#pragma omp parallel for
+#endif
+  for (Type i = 0; i < input_height; ++i) {
+    if (input_dim == 1) {
+      auto e_input = EigenVector<T>::Flatten(*input);
+      auto e_indices = EigenVector<Type>::Flatten(*indices);
+      for (Type j = 0; j < input_width; ++j) {
+        t_out[i * input_width + e_indices(j)] = e_input(j);
+      }
+    } else {
+      auto e_input = EigenMatrix<T>::Reshape(*input, input_dim - 1);
+      auto e_indices = EigenMatrix<Type>::Reshape(*indices, input_dim - 1);
+      for (Type j = 0; j < input_width; ++j) {
+        t_out[i * input_width + e_indices(i, j)] = e_input(i, j);
+      }
+    }
+  }
+}
+
+template <typename T, typename Context>
+void ArgsortGradKernel(const Context& dev_ctx,
+                       const DenseTensor& indices,
+                       const DenseTensor& input,
+                       const DenseTensor& out_grad,
+                       int axis,
+                       bool descending,
+                       DenseTensor* in_grad) {
+  auto in_dims = indices.dims();
+  axis = (axis < 0) ? (in_dims.size() + axis) : axis;
+  dev_ctx.template Alloc<T>(in_grad);
+  auto dxt = EigenVector<T>::Flatten(*in_grad);
+  auto& place = *dev_ctx.eigen_device();
+  dxt.device(place) = dxt.constant(static_cast<T>(0));
+  if (out_grad.numel() == 0) return;
+
+  // Do full assign
+  if (axis == -1 || axis + 1 == in_dims.size()) {
+    const int64_t input_height =
+        phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1));
+    const int64_t input_width = in_dims[in_dims.size() - 1];
+
+    FullAssign<T, int64_t>(input_height,
+                           input_width,
+                           in_dims.size(),
+                           &out_grad,
+                           &indices,
+                           in_grad->data<T>());
+  } else {
+    // If not full assign do transpose
+    std::vector<int> trans;
+    for (int i = 0; i < axis; i++) {
+      trans.push_back(i);
+    }
+    trans.push_back(in_dims.size() - 1);
+    for (int i = axis + 1; i < in_dims.size() - 1; i++) {
+      trans.push_back(i);
+    }
+    trans.push_back(axis);
+    phi::DDim trans_dims(in_dims);
+    for (size_t i = 0; i < trans.size(); i++) {
+      trans_dims[i] = in_dims[trans[i]];
+    }
+
+    DenseTensor trans_dO;
+    trans_dO.Resize(trans_dims);
+    dev_ctx.template Alloc<T>(&trans_dO);
+    DenseTensor trans_ind;
+    trans_ind.Resize(trans_dims);
+    dev_ctx.template Alloc<int64_t>(&trans_ind);
+    TransposeKernel<T, Context>(dev_ctx, out_grad, trans, &trans_dO);
+    TransposeKernel<int64_t, Context>(dev_ctx, indices, trans, &trans_ind);
+
+    const int64_t input_height =
+        phi::product(phi::slice_ddim(trans_dims, 0, trans_dims.size() - 1));
+    const int64_t input_width = trans_dims[trans_dims.size() - 1];
+
+    DenseTensor tmp_out;
+    tmp_out.Resize(trans_dims);
+    T* t_out = dev_ctx.template Alloc<T>(&tmp_out);
+
+    FullAssign<T, int64_t>(input_height,
+                           input_width,
+                           in_dims.size(),
+                           &trans_dO,
+                           &trans_ind,
+                           t_out);
+
+    // transpose back
+    TransposeKernel<T, Context>(dev_ctx, tmp_out, trans, in_grad);
+  }
+}
+
+}  // namespace phi
+PD_REGISTER_KERNEL(argsort_grad,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::ArgsortGradKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t) {}
diff --git a/paddle/phi/kernels/cpu/argsort_kernel.cc b/paddle/phi/kernels/cpu/argsort_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0e69afe38c9ad4d1ccfbd42fcde06562d97a4e3f
--- /dev/null
+++ b/paddle/phi/kernels/cpu/argsort_kernel.cc
@@ -0,0 +1,143 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/argsort_kernel.h"
+
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/eigen/common.h"
+#include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
+#include "paddle/phi/kernels/transpose_kernel.h"
+
+namespace phi {
+
+template <typename T, typename Type>
+static void FullSort(Type input_height,
+                     Type input_width,
+                     int input_dim,
+                     const DenseTensor* input,
+                     T* t_out,
+                     Type* t_indices,
+                     bool descending) {
+#ifdef PADDLE_WITH_MKLML
+#pragma omp parallel for
+#endif
+  for (Type i = 0; i < input_height; ++i) {
+    std::vector<std::pair<T, Type>> col_vec;
+    col_vec.reserve(input_width);
+    if (input_dim == 1) {
+      auto e_input = EigenVector<T>::Flatten(*input);
+      for (Type j = 0; j < input_width; ++j) {
+        col_vec.push_back(std::pair<T, Type>(e_input(j), j));
+      }
+    } else {
+      auto e_input = EigenMatrix<T>::Reshape(*input, input_dim - 1);
+      for (Type j = 0; j < input_width; ++j) {
+        col_vec.push_back(std::pair<T, Type>(e_input(i, j), j));
+      }
+    }
+    std::sort(col_vec.begin(),
+              col_vec.end(),
+              [&](const std::pair<T, Type>& l, const std::pair<T, Type>& r) {
+                if (descending)
+                  return l.first > r.first;
+                else
+                  return l.first < r.first;
+              });
+
+    for (Type j = 0; j < input_width; ++j) {
+      t_out[i * input_width + j] = col_vec[j].first;
+      t_indices[i * input_width + j] = col_vec[j].second;
+    }
+  }
+}
+
+template <typename T, typename Context>
+void ArgsortKernel(const Context& dev_ctx,
+                   const DenseTensor& input,
+                   int axis,
+                   bool descending,
+                   DenseTensor* output,
+                   DenseTensor* indices) {
+  auto in_dims = input.dims();
+  axis = (axis < 0) ? (in_dims.size() + axis) : axis;
+  T* out_data = dev_ctx.template Alloc<T>(output);
+
+  // Do full sort
+  if (axis == -1 || axis + 1 == in_dims.size()) {
+    const int64_t input_height =
+        phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1));
+    const int64_t input_width = in_dims[in_dims.size() - 1];
+    int64_t* ids_data = dev_ctx.template Alloc<int64_t>(indices);
+    FullSort<T, int64_t>(input_height,
+                         input_width,
+                         in_dims.size(),
+                         &input,
+                         out_data,
+                         ids_data,
+                         descending);
+  } else {
+    // If not full sort do transpose
+    std::vector<int> trans;
+    for (int i = 0; i < axis; i++) {
+      trans.push_back(i);
+    }
+    trans.push_back(in_dims.size() - 1);
+    for (int i = axis + 1; i < in_dims.size() - 1; i++) {
+      trans.push_back(i);
+    }
+    trans.push_back(axis);
+    phi::DDim trans_dims(in_dims);
+    for (size_t i = 0; i < trans.size(); i++) {
+      trans_dims[i] = in_dims[trans[i]];
+    }
+
+    DenseTensor trans_inp;
+    trans_inp.Resize(trans_dims);
+    dev_ctx.template Alloc<T>(&trans_inp);
+    // Do transpose
+    TransposeKernel<T, Context>(dev_ctx, input, trans, &trans_inp);
+
+    const int64_t input_height =
+        phi::product(phi::slice_ddim(trans_dims, 0, trans_dims.size() - 1));
+    const int64_t input_width = trans_dims[trans_dims.size() - 1];
+
+    DenseTensor tmp_out;
+    tmp_out.Resize(trans_dims);
+    T* t_out = dev_ctx.template Alloc<T>(&tmp_out);
+
+    DenseTensor tmp_indices;
+    tmp_indices.Resize(trans_dims);
+    auto* t_ind = dev_ctx.template Alloc<int64_t>(&tmp_indices);
+
+    FullSort<T, int64_t>(input_height,
+                         input_width,
+                         in_dims.size(),
+                         &trans_inp,
+                         t_out,
+                         t_ind,
+                         descending);
+
+    dev_ctx.template Alloc<int64_t>(indices);
+    TransposeKernel<int64_t, Context>(dev_ctx, tmp_indices, trans, indices);
+    // transpose back
+    TransposeKernel<T, Context>(dev_ctx, tmp_out, trans, output);
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(
+    argsort, CPU, ALL_LAYOUT, phi::ArgsortKernel, float, double, int, int64_t) {
+}
diff --git a/paddle/phi/kernels/cpu/auc_kernel.cc b/paddle/phi/kernels/cpu/auc_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..bc25091de757d029ced9babab9bcc55f1d2a10a6
--- /dev/null
+++ b/paddle/phi/kernels/cpu/auc_kernel.cc
@@ -0,0 +1,190 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/auc_kernel.h"
+
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+namespace phi {
+
+inline static double trapezoidArea(double X1, double X2, double Y1, double Y2) {
+  return (X1 > X2 ? (X1 - X2) : (X2 - X1)) * (Y1 + Y2) / 2.0;
+}
+
+template <typename T>
+void statAuc(const DenseTensor &label,
+             const DenseTensor &predict,
+             const int num_thresholds,
+             const int slide_steps,
+             int64_t *origin_stat_pos,
+             int64_t *origin_stat_neg) {
+  size_t batch_size = predict.dims()[0];
+  size_t inference_width = predict.dims()[1];
+  const T *inference_data = predict.data<T>();
+  const auto *label_data = label.data<int64_t>();
+  const int bucket_length = num_thresholds + 1;
+  if (slide_steps == 0) {
+    for (size_t i = 0; i < batch_size; i++) {
+      // if predict_data[i] has dim of 2, then predict_data[i][1] is pos prob
+      // if predict_data[i] has dim of 1, then predict_data[i][0] is pos prob
+      auto predict_data =
+          inference_data[i * inference_width + (inference_width - 1)];
+      PADDLE_ENFORCE_LE(predict_data,
+                        1,
+                        phi::errors::PreconditionNotMet(
+                            "The predict data must less or equal 1."));
+      PADDLE_ENFORCE_GE(predict_data,
+                        0,
+                        phi::errors::PreconditionNotMet(
+                            "The predict data must gather or equal 0."));
+
+      uint32_t binIdx = static_cast<uint32_t>(predict_data * num_thresholds);
+      if (label_data[i] > 0) {
+        origin_stat_pos[binIdx] += 1;
+      } else if (label_data[i] == 0) {
+        origin_stat_neg[binIdx] += 1;
+      }
+    }
+    return;
+  }
+  // the last number of origin_stat_pos store the index should be used in
+  // current step
+  int cur_step_index =
+      static_cast<int>(origin_stat_pos[(slide_steps + 1) * bucket_length]) %
+      slide_steps;
+  int cur_step_begin = cur_step_index * bucket_length;
+  int sum_step_begin = slide_steps * bucket_length;
+  for (int i = 0; i < bucket_length; ++i) {
+    origin_stat_pos[sum_step_begin + i] -= origin_stat_pos[cur_step_begin + i];
+    origin_stat_neg[sum_step_begin + i] -= origin_stat_neg[cur_step_begin + i];
+  }
+
+  std::memset(
+      origin_stat_pos + cur_step_begin, 0, bucket_length * sizeof(int64_t));
+  std::memset(
+      origin_stat_neg + cur_step_begin, 0, bucket_length * sizeof(int64_t));
+
+  for (size_t i = 0; i < batch_size; i++) {
+    // if predict_data[i] has dim of 2, then predict_data[i][1] is pos prob
+    // if predict_data[i] has dim of 1, then predict_data[i][0] is pos prob
+    auto predict_data =
+        inference_data[i * inference_width + (inference_width - 1)];
+    PADDLE_ENFORCE_LE(predict_data,
+                      1,
+                      phi::errors::PreconditionNotMet(
+                          "The predict data must less or equal 1."));
+    PADDLE_ENFORCE_GE(predict_data,
+                      0,
+                      phi::errors::PreconditionNotMet(
+                          "The predict data must gather or equal 0."));
+
+    uint32_t binIdx = static_cast<uint32_t>(predict_data * num_thresholds);
+    if (label_data[i] > 0) {
+      origin_stat_pos[cur_step_begin + binIdx] += 1;
+    } else if (label_data[i] == 0) {
+      origin_stat_neg[cur_step_begin + binIdx] += 1;
+    }
+  }
+  for (int i = 0; i < bucket_length; ++i) {
+    origin_stat_pos[sum_step_begin + i] += origin_stat_pos[cur_step_begin + i];
+    origin_stat_neg[sum_step_begin + i] += origin_stat_neg[cur_step_begin + i];
+  }
+}
+
+inline static void calcAuc(const int64_t *stat_pos,
+                           const int64_t *stat_neg,
+                           int num_thresholds,
+                           double *auc) {
+  *auc = 0.0f;
+
+  double totPos = 0.0;
+  double totNeg = 0.0;
+  double totPosPrev = 0.0;
+  double totNegPrev = 0.0;
+
+  int idx = num_thresholds;
+
+  while (idx >= 0) {
+    totPosPrev = totPos;
+    totNegPrev = totNeg;
+    totPos += stat_pos[idx];
+    totNeg += stat_neg[idx];
+    *auc += trapezoidArea(totNeg, totNegPrev, totPos, totPosPrev);
+    --idx;
+  }
+
+  if (totPos > 0.0 && totNeg > 0.0) {
+    *auc = *auc / totPos / totNeg;
+  }
+}
+
+template <typename T, typename Context>
+void AucKernel(const Context &dev_ctx,
+               const DenseTensor &input,
+               const DenseTensor &label,
+               const DenseTensor &stat_pos,
+               const DenseTensor &stat_neg,
+               const std::string &curve,
+               int num_thresholds,
+               int slide_steps,
+               DenseTensor *auc,
+               DenseTensor *stat_pos_out,
+               DenseTensor *stat_neg_out) {
+  // Only use output var for now, make sure it's persistable and
+  // not cleaned up for each batch.
+  auto *origin_stat_pos = dev_ctx.template Alloc<int64_t>(stat_pos_out);
+  auto *origin_stat_neg = dev_ctx.template Alloc<int64_t>(stat_neg_out);
+  auto *auc_value = dev_ctx.template Alloc<double>(auc);
+
+  // Just for pass UT, since UT's input & output connot be set same var
+  auto *stat_pos_in_tensor = &stat_pos;
+  auto *stat_neg_in_tensor = &stat_neg;
+  auto *pos_in_data = stat_pos.data<int64_t>();
+  auto *neg_in_data = stat_neg.data<int64_t>();
+  if (stat_pos_in_tensor != stat_pos_out) {
+    memcpy(
+        origin_stat_pos,
+        pos_in_data,
+        ((1 + slide_steps) * (num_thresholds + 1) + (slide_steps > 0 ? 1 : 0)) *
+            sizeof(int64_t));
+  }
+  if (stat_neg_in_tensor != stat_neg_out) {
+    memcpy(
+        origin_stat_neg,
+        neg_in_data,
+        ((1 + slide_steps) * (num_thresholds + 1) + (slide_steps > 0 ? 1 : 0)) *
+            sizeof(int64_t));
+  }
+  statAuc<T>(label,
+             input,
+             num_thresholds,
+             slide_steps,
+             origin_stat_pos,
+             origin_stat_neg);
+
+  int sum_offset = slide_steps * (num_thresholds + 1);
+  calcAuc(origin_stat_pos + sum_offset,
+          origin_stat_neg + sum_offset,
+          num_thresholds,
+          auc_value);
+  if (slide_steps) {
+    origin_stat_pos[(slide_steps + 1) * (num_thresholds + 1)] += 1;
+    origin_stat_neg[(slide_steps + 1) * (num_thresholds + 1)] += 1;
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(auc, CPU, ALL_LAYOUT, phi::AucKernel, float) {}
diff --git a/paddle/phi/kernels/cpu/batch_norm_grad_kernel.cc b/paddle/phi/kernels/cpu/batch_norm_grad_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..de2343a384a5b413591fed981dc03e97bbb89fed
--- /dev/null
+++ b/paddle/phi/kernels/cpu/batch_norm_grad_kernel.cc
@@ -0,0 +1,674 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/batch_norm_kernel.h"
+#include "paddle/phi/kernels/funcs/eigen/common.h"
+
+#include "paddle/fluid/framework/tensor_util.h"
+#include "paddle/phi/kernels/funcs/math_function.h"
+#include "paddle/phi/kernels/gpu/batch_norm_utils.h"
+
+namespace phi {
+
+template <typename T>
+using EigenArrayMap =
+    Eigen::Map<Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>>;
+template <typename T>
+using ConstEigenArrayMap =
+    Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>>;
+template <typename T>
+using EigenVectorArrayMap = Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1>>;
+template <typename T>
+using ConstEigenVectorArrayMap =
+    Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, 1>>;
+
+template <typename T, typename Context>
+void BatchNormGradRawKernel(const Context& ctx,
+                            const DenseTensor& y_grad,
+                            const DenseTensor& x,
+                            const DenseTensor& scale,
+                            const DenseTensor& bias,
+                            const DenseTensor& saved_mean,
+                            const DenseTensor& saved_variance,
+                            paddle::optional<const DenseTensor&> reserve_space,
+                            paddle::optional<const DenseTensor&> mean,
+                            paddle::optional<const DenseTensor&> variance,
+                            float momentum,
+                            float epsilon,
+                            const std::string& data_layout_str,
+                            bool is_test,
+                            bool use_global_stats,
+                            bool trainable_statistics,
+                            bool fuse_with_relu,
+                            bool is_inplace,
+                            DenseTensor* x_grad,
+                            DenseTensor* scale_grad,
+                            DenseTensor* bias_grad) {
+  const auto* d_y = &y_grad;
+
+  DataLayout data_layout =
+      paddle::framework::StringToDataLayout(data_layout_str);
+
+  auto* d_x = x_grad;
+  auto* d_scale = scale_grad;
+  auto* d_bias = bias_grad;
+
+  use_global_stats = is_test || use_global_stats;
+
+  // batch_norm with inplace as false will take X as grad input, which
+  // is same as cuDNN batch_norm backward calculation, batch_norm
+  // with inplace as true only take Y as input and X should be calculate
+  // by inverse operation of batch_norm on Y
+
+  if (is_inplace) {
+    if (d_x) {
+      PADDLE_ENFORCE_EQ(d_x,
+                        d_y,
+                        phi::errors::InvalidArgument(
+                            "X@GRAD and Y@GRAD inplaced in non-inplace mode"));
+    }
+  } else {
+    if (d_x) {
+      PADDLE_ENFORCE_NE(d_x,
+                        d_y,
+                        phi::errors::InvalidArgument(
+                            "X@GRAD and Y@GRAD inplaced in non-inplace mode"));
+    }
+  }
+
+  // Get the size for each dimension.
+  // NCHW [batch_size, in_channels, in_height, in_width]
+  const auto& x_dims = x.dims();
+  PADDLE_ENFORCE_GE(
+      x_dims.size(),
+      2,
+      phi::errors::InvalidArgument(
+          "The size of input X's dimensions should be larger than 1."
+          "But received: the size of input X's dimensions is [%d]",
+          x_dims.size()));
+  PADDLE_ENFORCE_LE(
+      x_dims.size(),
+      5,
+      phi::errors::InvalidArgument(
+          "The size of input X's dimensions should be less than 6."
+          "But received: the size of input X's dimensions is [%d]",
+          x_dims.size()));
+  const int N = x_dims[0];
+  const int C = (data_layout == DataLayout::kNCHW ? x_dims[1]
+                                                  : x_dims[x_dims.size() - 1]);
+  const int sample_size = x.numel() / N / C;
+
+  // input dimension is 2 and the format is NCHW. The input can be regarded as
+  // NHWC format
+  if (x_dims.size() == 2 && data_layout == DataLayout::kNCHW) {
+    data_layout = DataLayout::kNHWC;
+  }
+
+  // init output
+  if (d_x) {
+    ctx.template Alloc<T>(d_x);
+  }
+
+  const T* mean_data = saved_mean.data<T>();
+  const T* inv_var_data = saved_variance.data<T>();
+  DenseTensor inv_var_tensor;
+  if (use_global_stats) {
+    const auto* running_mean = mean.get_ptr();
+    const auto* running_variance = variance.get_ptr();
+    mean_data = running_mean->data<T>();
+    inv_var_tensor.Resize({C});
+    T* running_inv_var_data = ctx.template Alloc<T>(&inv_var_tensor);
+    EigenVectorArrayMap<T> inv_var_tmp(running_inv_var_data, C);
+    ConstEigenVectorArrayMap<T> var_arr(running_variance->data<T>(), C);
+
+    inv_var_tmp = (var_arr + epsilon).sqrt().inverse();
+    inv_var_data = running_inv_var_data;
+  }
+
+  ConstEigenVectorArrayMap<T> scale_arr(scale.data<T>(), C);
+  ConstEigenVectorArrayMap<T> bias_arr(bias.data<T>(), C);
+  ConstEigenVectorArrayMap<T> mean_arr(mean_data, C);
+  ConstEigenVectorArrayMap<T> inv_var_arr(inv_var_data, C);
+
+  T* d_bias_data = nullptr;
+  T* d_scale_data = nullptr;
+  if (d_scale && d_bias) {
+    d_bias_data = ctx.template Alloc<T>(d_bias);
+    d_scale_data = ctx.template Alloc<T>(d_scale);
+  }
+
+  // d_bias = np.sum(d_y, axis=0)
+  // d_scale = np.sum((X - mean) / inv_std * dy, axis=0)
+  // d_x = (1. / N) * scale * inv_var * (N * d_y - np.sum(d_y, axis=0)
+  //   - (X - mean) * inv_var * inv_var * np.sum(d_y * (X - mean), axis=0))
+  EigenVectorArrayMap<T> d_bias_arr(d_bias_data, C);
+  EigenVectorArrayMap<T> d_scale_arr(d_scale_data, C);
+
+  if (d_scale && d_bias) {
+    d_bias_arr.setZero();
+    d_scale_arr.setZero();
+  }
+
+  if (d_x && (N * sample_size) == 1 && !use_global_stats) {
+    paddle::framework::TensorCopy(*d_y, ctx.GetPlace(), d_x);
+    return;
+  }
+
+  int scale_coefff = use_global_stats ? 1 : N * sample_size;
+  const auto scale_inv_var_nhw = scale_arr * inv_var_arr / scale_coefff;
+
+  DenseTensor dy_sum;
+  dy_sum.Resize({C});
+  auto dy_sum_data = ctx.template Alloc<T>(&dy_sum);
+  EigenVectorArrayMap<T> dy_sum_arr(dy_sum_data, C);
+
+  DenseTensor dy_mul_x_sub_mean_mul_invstd_sum;
+  dy_mul_x_sub_mean_mul_invstd_sum.Resize({C});
+  auto dy_mul_x_sub_mean_mul_invstd_sum_data =
+      ctx.template Alloc<T>(&dy_mul_x_sub_mean_mul_invstd_sum);
+  EigenVectorArrayMap<T> dy_mul_x_sub_mean_mul_invstd_sum_arr(
+      dy_mul_x_sub_mean_mul_invstd_sum_data, C);
+
+  dy_sum_arr.setZero();
+  dy_mul_x_sub_mean_mul_invstd_sum_arr.setZero();
+
+  // inplace calculation
+  // Y:  ((x - est_mean) * (inv_var) * scale + bias
+  //   formula transform ====>
+  //   (x * inv_var * scale) + (bias - est_mean * inv_var * scale)
+  // X: (y - bias) / scale / (inv_var) + est_mean
+  //   formula transform ====>
+  //    (y - bias) / (scale * inv_var) + est_mean
+  switch (data_layout) {
+    case DataLayout::kNCHW: {
+      if (is_inplace) {
+        auto px = x;
+        EigenArrayMap<T> x_data(ctx.template Alloc<T>(&px), sample_size, N * C);
+        ConstEigenArrayMap<T> y_data(x.data<T>(), sample_size, N * C);
+        for (int nc = 0; nc < N * C; ++nc) {
+          x_data.col(nc) = (y_data.col(nc) - bias_arr(nc % C)) /
+                               scale_inv_var_nhw(nc % C) / scale_coefff +
+                           mean_arr(nc % C);
+        }
+      }
+      ConstEigenArrayMap<T> x_arr(x.data<T>(), sample_size, N * C);
+      ConstEigenArrayMap<T> d_y_arr(d_y->data<T>(), sample_size, N * C);
+
+      for (int nc = 0; nc < N * C; ++nc) {
+        int c = nc % C;
+        dy_sum_arr(c) += d_y_arr.col(nc).sum();
+        dy_mul_x_sub_mean_mul_invstd_sum_arr(c) +=
+            ((x_arr.col(nc) - mean_arr(c)) * inv_var_arr(c) * d_y_arr.col(nc))
+                .sum();
+      }
+
+      if (d_scale && d_bias) {
+        d_bias_arr = dy_sum_arr;
+        d_scale_arr = dy_mul_x_sub_mean_mul_invstd_sum_arr;
+      }
+
+      if (d_x) {
+        EigenArrayMap<T> d_x_arr(
+            ctx.template Alloc<T>(d_x), sample_size, N * C);
+        if (!use_global_stats) {
+          for (int nc = 0; nc < N * C; ++nc) {
+            int c = nc % C;
+            d_x_arr.col(nc) =
+                scale_inv_var_nhw(c) *
+                (d_y_arr.col(nc) * N * sample_size - dy_sum_arr(c) -
+                 (x_arr.col(nc) - mean_arr[c]) *
+                     dy_mul_x_sub_mean_mul_invstd_sum_arr(c) * inv_var_arr(c));
+          }
+        } else {
+          for (int nc = 0; nc < N * C; ++nc) {
+            int c = nc % C;
+            d_x_arr.col(nc) = scale_inv_var_nhw(c) * d_y_arr.col(nc);
+          }
+        }
+      }
+      break;
+    }
+    case DataLayout::kNHWC: {
+      if (is_inplace) {
+        auto px = x;
+        EigenArrayMap<T> x_data(ctx.template Alloc<T>(&px), C, N * sample_size);
+        ConstEigenArrayMap<T> y_data(x.data<T>(), C, N * sample_size);
+        for (int nhw = 0; nhw < N * sample_size; nhw++) {
+          x_data.col(nhw) =
+              (y_data.col(nhw) - bias_arr) / scale_inv_var_nhw / scale_coefff +
+              mean_arr;
+        }
+      }
+      ConstEigenArrayMap<T> x_arr(x.data<T>(), C, N * sample_size);
+      ConstEigenArrayMap<T> d_y_arr(d_y->data<T>(), C, N * sample_size);
+
+      for (int nhw = 0; nhw < N * sample_size; ++nhw) {
+        dy_sum_arr += d_y_arr.col(nhw);
+        dy_mul_x_sub_mean_mul_invstd_sum_arr +=
+            (x_arr.col(nhw) - mean_arr) * inv_var_arr * d_y_arr.col(nhw);
+      }
+
+      if (d_scale && d_bias) {
+        d_bias_arr = dy_sum_arr;
+        d_scale_arr = dy_mul_x_sub_mean_mul_invstd_sum_arr;
+      }
+
+      if (d_x) {
+        EigenArrayMap<T> d_x_arr(
+            ctx.template Alloc<T>(d_x), C, N * sample_size);
+        if (!use_global_stats) {
+          for (int nhw = 0; nhw < N * sample_size; ++nhw) {
+            d_x_arr.col(nhw) =
+                scale_inv_var_nhw *
+                (d_y_arr.col(nhw) * N * sample_size - dy_sum_arr -
+                 (x_arr.col(nhw) - mean_arr) *
+                     dy_mul_x_sub_mean_mul_invstd_sum_arr * inv_var_arr);
+          }
+        } else {
+          for (int nhw = 0; nhw < N * sample_size; ++nhw) {
+            d_x_arr.col(nhw) = scale_inv_var_nhw * d_y_arr.col(nhw);
+          }
+        }
+      }
+      break;
+    }
+    default:
+      PADDLE_THROW(phi::errors::InvalidArgument("Unknown storage order: %s",
+                                                data_layout_str));
+  }
+}
+
+template <typename T, typename Context>
+void BatchNormGradKernel(const Context& dev_ctx,
+                         const DenseTensor& y_grad,
+                         const DenseTensor& x,
+                         const DenseTensor& scale,
+                         const DenseTensor& bias,
+                         const DenseTensor& saved_mean,
+                         const DenseTensor& saved_variance,
+                         paddle::optional<const DenseTensor&> reserve_space,
+                         paddle::optional<const DenseTensor&> mean,
+                         paddle::optional<const DenseTensor&> variance,
+                         float momentum,
+                         float epsilon,
+                         const std::string& data_layout,
+                         bool is_test,
+                         bool use_global_stats,
+                         bool trainable_statistics,
+                         bool fuse_with_relu,
+                         DenseTensor* x_grad,
+                         DenseTensor* scale_grad,
+                         DenseTensor* bias_grad) {
+  BatchNormGradRawKernel<T, Context>(dev_ctx,
+                                     y_grad,
+                                     x,
+                                     scale,
+                                     bias,
+                                     saved_mean,
+                                     saved_variance,
+                                     reserve_space,
+                                     mean,
+                                     variance,
+                                     momentum,
+                                     epsilon,
+                                     data_layout,
+                                     is_test,
+                                     use_global_stats,
+                                     trainable_statistics,
+                                     fuse_with_relu,
+                                     false,
+                                     x_grad,
+                                     scale_grad,
+                                     bias_grad);
+}
+
+template <typename T, typename Context>
+void BatchNormDoubleGradKernel(const Context& ctx,
+                               const DenseTensor& x_grad_grad,
+                               const DenseTensor& scale_grad_grad,
+                               const DenseTensor& bias_grad_grad,
+                               const DenseTensor& y_grad,
+                               const DenseTensor& x,
+                               const DenseTensor& scale,
+                               const DenseTensor& saved_mean,
+                               const DenseTensor& saved_variance,
+                               paddle::optional<const DenseTensor&> mean,
+                               paddle::optional<const DenseTensor&> variance,
+                               float momentum,
+                               float epsilon,
+                               const std::string& data_layout_str,
+                               bool is_test,
+                               bool use_global_stats,
+                               bool trainable_statistics,
+                               bool fuse_with_relu,
+                               DenseTensor* x_grad,
+                               DenseTensor* scale_grad,
+                               DenseTensor* y_grad_grad) {
+  const auto* X = &x;
+  const auto* Scale = &scale;
+  const auto* dY = &y_grad;
+  const auto* Saved_mean = &saved_mean;
+  const auto* Saved_variance = &saved_variance;
+
+  PADDLE_ENFORCE_EQ(is_test,
+                    false,
+                    phi::errors::InvalidArgument(
+                        "`is_test = True` CANNOT be used in train program. If "
+                        "you want to use global status in pre_train model, "
+                        "please set `use_global_stats = True`"));
+
+  const auto data_layout =
+      paddle::framework::StringToDataLayout(data_layout_str);
+
+  const auto* ddX = &x_grad_grad;
+  const auto* ddScale = &scale_grad_grad;
+  const auto* ddBias = &bias_grad_grad;
+
+  auto* dX = x_grad;
+  auto* dScale = scale_grad;
+  auto* ddY = y_grad_grad;
+  ctx.template Alloc<T>(dX);
+  ctx.template Alloc<T>(ddY);
+
+  const auto& x_dims = X->dims();
+  const int C = (data_layout == DataLayout::kNCHW ? x_dims[1]
+                                                  : x_dims[x_dims.size() - 1]);
+  const int sample_size = X->numel() / C;
+  phi::funcs::SetConstant<Context, T> set_constant;
+
+  const T* mean_data = Saved_mean->data<T>();
+  const T* inv_var_data = Saved_variance->data<T>();
+
+  DenseTensor inv_var_tensor;
+  if (use_global_stats) {
+    const auto* running_mean = mean.get_ptr();
+    const auto* running_variance = variance.get_ptr();
+    mean_data = running_mean->data<T>();
+    inv_var_tensor.Resize({C});
+
+    T* running_inv_var_data = ctx.template Alloc<T>(&inv_var_tensor);
+    EigenVectorArrayMap<T> inv_var_tmp(running_inv_var_data, C);
+    ConstEigenVectorArrayMap<T> var_arr(running_variance->data<T>(), C);
+
+    inv_var_tmp = (var_arr + epsilon).sqrt().inverse();
+    inv_var_data = running_inv_var_data;
+  }
+
+  // transpose NCHW -> NHWC for easy calculate
+  DenseTensor transformed_x(X->type());
+  DenseTensor transformed_dy(dY->type());
+  DenseTensor transformed_ddx(ddX->type());
+
+  DenseTensor transformed_dx(dX->type());
+  DenseTensor transformed_ddy(ddY->type());
+  if (data_layout == DataLayout::kNCHW && x_dims.size() > 2) {
+    VLOG(3) << "Transform batchnorm output from NCHW to NHWC";
+    // Input Tensor
+    ResizeToChannelLast<Context, T>(ctx, X, &transformed_x);
+    TransToChannelLast<Context, T>(ctx, X, &transformed_x);
+    ResizeToChannelLast<Context, T>(ctx, dY, &transformed_dy);
+    TransToChannelLast<Context, T>(ctx, dY, &transformed_dy);
+    ResizeToChannelLast<Context, T>(ctx, ddX, &transformed_ddx);
+    TransToChannelLast<Context, T>(ctx, ddX, &transformed_ddx);
+    // Output Tensor
+    ResizeToChannelLast<Context, T>(ctx, dX, &transformed_dx);
+    ResizeToChannelLast<Context, T>(ctx, ddY, &transformed_ddy);
+  } else {
+    transformed_x.ShareDataWith(*X);
+    transformed_dy.ShareDataWith(*dY);
+    transformed_ddx.ShareDataWith(*ddX);
+
+    transformed_dx.ShareDataWith(*dX);
+    transformed_ddy.ShareDataWith(*ddY);
+  }
+
+  ConstEigenArrayMap<T> x_arr(transformed_x.data<T>(), C, sample_size);
+  ConstEigenVectorArrayMap<T> mean_arr(mean_data, C);
+  ConstEigenVectorArrayMap<T> inv_var_arr(inv_var_data, C);
+
+  Tensor mean_tile;
+  mean_tile.Resize({C, sample_size});
+  EigenArrayMap<T> mean_tile_data(
+      ctx.template Alloc<T>(&mean_tile), C, sample_size);
+
+  DenseTensor inv_var_tile;
+  inv_var_tile.Resize({C, sample_size});
+  EigenArrayMap<T> inv_var_tile_data(
+      ctx.template Alloc<T>(&inv_var_tile), C, sample_size);
+
+  mean_tile_data = mean_arr.replicate(1, sample_size);
+  inv_var_tile_data = inv_var_arr.replicate(1, sample_size);
+
+  DenseTensor Scale_data;
+  if (!Scale) {
+    Scale_data.Resize({C});
+    ctx.template Alloc<T>(&Scale_data);
+    set_constant(ctx, &Scale_data, static_cast<T>(1));
+  }
+  ConstEigenVectorArrayMap<T> scale_arr(
+      Scale ? Scale->data<T>() : Scale_data.data<T>(), C);
+
+  Tensor scale_tile;
+  scale_tile.Resize({C, sample_size});
+  EigenArrayMap<T> scale_tile_data(
+      ctx.template Alloc<T>(&scale_tile), C, sample_size);
+  scale_tile_data = scale_arr.replicate(1, sample_size);
+
+  ConstEigenArrayMap<T> dy_arr(transformed_dy.data<T>(), C, sample_size);
+  ConstEigenArrayMap<T> ddx_arr(transformed_ddx.data<T>(), C, sample_size);
+
+  DenseTensor x_sub_mean_mul_invstd;
+  x_sub_mean_mul_invstd.Resize({C, sample_size});
+
+  EigenArrayMap<T> x_sub_mean_mul_invstd_arr(
+      ctx.template Alloc<T>(&x_sub_mean_mul_invstd), C, sample_size);
+  x_sub_mean_mul_invstd_arr = (x_arr - mean_tile_data) * inv_var_tile_data;
+
+  if (dX) {
+    ctx.template Alloc<T>(dX);
+    EigenArrayMap<T> dx_arr(
+        ctx.template Alloc<T>(&transformed_dx), C, sample_size);
+    dx_arr.setZero();
+    if (use_global_stats) {
+      // math: dx = (ddscale * dy) * inv_var
+      if (ddScale) {
+        ConstEigenVectorArrayMap<T> ddscale_arr(ddScale->data<T>(), C);
+        Tensor ddscale_tile;
+        ddscale_tile.Resize({C, sample_size});
+        EigenArrayMap<T> ddscale_tile_data(
+            ctx.template Alloc<T>(&ddscale_tile), C, sample_size);
+        ddscale_tile_data = ddscale_arr.replicate(1, sample_size);
+
+        dx_arr = dy_arr * ddscale_tile_data * inv_var_tile_data;
+      }
+    } else {
+      // math: dx = scale * ((x - mean) * inv_var / NxHxW * (np.mean(ddx,
+      // axis=(n,h,w)) *
+      //          np.sum(dy, axis=(n,h,w)) -
+      //          np.sum(dy * ddx, axis=(n,h,w)) + 3 * np.mean(dy * (x -
+      //          mean),
+      //          axis=(n,h,w)) * inv_var.pow(2) *
+      //          np.sum(ddx * (x - mean), axis=(n,h,w))) + inv_var.pow(3) /
+      //          NxHxW *
+      //          np.sum(ddx * (x - mean)) *
+      //          (np.mean(dy, axis=(n,h,w)) - dy) + inv_var.pow(3) / NxHxW *
+      //          np.sum(dy,
+      //          axis=(n,h,w)) * (x - mean) *
+      //          (np.mean(ddx, axis=(n,h,w)) - ddx)) + ddr * (dy * inv_var -
+      //          inv_var
+      //          *
+      //          np.mean(dy, axis=(n,h,w)) -
+      //          inv_var.pow(3) * (x - mean) * np.mean(dy * (x - mean),
+      //          axis=(n,h,w)))
+
+      if (ddX) {
+        dx_arr +=
+            (x_sub_mean_mul_invstd_arr * inv_var_tile_data * inv_var_tile_data /
+             sample_size)
+                .colwise() *
+            (ddx_arr.rowwise().sum() * dy_arr.rowwise().sum() / sample_size -
+             (dy_arr * ddx_arr).rowwise().sum() +
+             3. * (dy_arr * x_sub_mean_mul_invstd_arr).rowwise().sum() *
+                 (ddx_arr * x_sub_mean_mul_invstd_arr).rowwise().sum() /
+                 sample_size);
+
+        dx_arr += (inv_var_tile_data * inv_var_tile_data).colwise() *
+                  (ddx_arr * x_sub_mean_mul_invstd_arr).rowwise().sum() /
+                  sample_size * (dy_arr.rowwise().sum() / sample_size - dy_arr);
+
+        dx_arr += (inv_var_tile_data * inv_var_tile_data).colwise() *
+                  (dy_arr * x_sub_mean_mul_invstd_arr).rowwise().sum() /
+                  sample_size *
+                  (ddx_arr.rowwise().sum() / sample_size - ddx_arr);
+
+        dx_arr = scale_tile_data * dx_arr;
+      }
+      if (ddScale) {
+        ConstEigenVectorArrayMap<T> ddscale_arr(ddScale->data<T>(), C);
+        Tensor ddscale_tile;
+        ddscale_tile.Resize({C, sample_size});
+        EigenArrayMap<T> ddscale_tile_data(
+            ctx.template Alloc<T>(&ddscale_tile), C, sample_size);
+        ddscale_tile_data = ddscale_arr.replicate(1, sample_size);
+
+        dx_arr +=
+            (dy_arr * inv_var_tile_data -
+             (dy_arr.rowwise().sum().replicate(1, sample_size) / sample_size) *
+                 inv_var_tile_data -
+             x_sub_mean_mul_invstd_arr * inv_var_tile_data *
+                 (dy_arr * x_sub_mean_mul_invstd_arr)
+                     .rowwise()
+                     .sum()
+                     .replicate(1, sample_size) /
+                 sample_size) *
+            ddscale_tile_data;
+      }
+    }
+    if (data_layout == DataLayout::kNCHW) {
+      VLOG(3) << "Transform batchnorm output from NHWC to NCHW";
+      TransToChannelFirst<Context, T>(ctx, &transformed_dx, dX);
+    }
+  }
+  if (dScale) {
+    EigenVectorArrayMap<T> dscale_arr(ctx.template Alloc<T>(dScale), C);
+    dscale_arr.setZero();
+    if (use_global_stats) {
+      // math: dscale = np.sum(ddx * dy, axis=(n,h,w)) * inv_var
+      if (ddX) {
+        dscale_arr = (ddx_arr * dy_arr * inv_var_tile_data).rowwise().sum();
+      }
+    } else {
+      // math: dscale = inv_var * (dy - np.mean(dy, axis=(n,h,w) - (x-mean) *
+      //            inv_var.pow(2) * np.mean(dy * (x-mean), axis=(n,h,w)))) *
+      //            ddx
+      if (ddX) {
+        Tensor first_grad;
+        first_grad.Resize({C, sample_size});
+        EigenArrayMap<T> first_grad_arr(
+            ctx.template Alloc<T>(&first_grad), C, sample_size);
+        first_grad_arr.setZero();
+
+        first_grad_arr +=
+            inv_var_tile_data *
+            (dy_arr -
+             dy_arr.rowwise().sum().replicate(1, sample_size) / sample_size -
+             x_sub_mean_mul_invstd_arr *
+                 (dy_arr * x_sub_mean_mul_invstd_arr)
+                     .rowwise()
+                     .sum()
+                     .replicate(1, sample_size) /
+                 sample_size);
+        dscale_arr = (first_grad_arr * ddx_arr).rowwise().sum();
+      }
+    }
+  }
+
+  if (ddY) {
+    ctx.template Alloc<T>(ddY);
+    EigenArrayMap<T> ddy_arr(
+        ctx.template Alloc<T>(&transformed_ddy), C, sample_size);
+    ddy_arr.setZero();
+    if (use_global_stats) {
+      // math: ddy = r * ddx * inv_var + ddbias +
+      //           ddscale * (x - mean) * inv_var
+      if (ddX) {
+        ddy_arr = scale_tile_data * ddx_arr * inv_var_tile_data;
+      }
+    } else {
+      // math: ddy = (x - mean) * inv_var * ddscale + ddbias +
+      //           scale * inv_var * (ddx - (x - mean) * inv_var.pow(2) *
+      //           np.mean(ddx * (x - mean), axis=(n,h,w)))
+      if (ddX) {
+        ddy_arr +=
+            scale_tile_data * inv_var_tile_data *
+            (ddx_arr -
+             ddx_arr.rowwise().sum().replicate(1, sample_size) / sample_size -
+             x_sub_mean_mul_invstd_arr *
+                 (ddx_arr * x_sub_mean_mul_invstd_arr)
+                     .rowwise()
+                     .sum()
+                     .replicate(1, sample_size) /
+                 sample_size);
+      }
+    }
+    if (ddScale) {
+      ConstEigenVectorArrayMap<T> ddscale_arr(ddScale->data<T>(), C);
+      Tensor ddscale_tile;
+      ddscale_tile.Resize({C, sample_size});
+      EigenArrayMap<T> ddscale_tile_data(
+          ctx.template Alloc<T>(&ddscale_tile), C, sample_size);
+      ddscale_tile_data = ddscale_arr.replicate(1, sample_size);
+
+      ddy_arr += x_sub_mean_mul_invstd_arr * ddscale_tile_data;
+    }
+
+    if (ddBias) {
+      ConstEigenVectorArrayMap<T> ddbias_arr(ddBias->data<T>(), C);
+      Tensor ddbias_tile;
+      ddbias_tile.Resize({C, sample_size});
+      EigenArrayMap<T> ddbias_tile_data(
+          ctx.template Alloc<T>(&ddbias_tile), C, sample_size);
+      ddbias_tile_data = ddbias_arr.replicate(1, sample_size);
+
+      ddy_arr += ddbias_tile_data;
+    }
+
+    if (data_layout == DataLayout::kNCHW) {
+      VLOG(3) << "Transform batchnorm output from NHWC to NCHW";
+      TransToChannelFirst<Context, T>(ctx, &transformed_ddy, ddY);
+    }
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(
+    batch_norm_grad, CPU, ALL_LAYOUT, phi::BatchNormGradKernel, float, double) {
+}
+
+PD_REGISTER_KERNEL(batch_norm_grad_raw,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::BatchNormGradRawKernel,
+                   float,
+                   double) {}
+
+PD_REGISTER_KERNEL(batch_norm_grad_grad,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::BatchNormDoubleGradKernel,
+                   float,
+                   double) {}
diff --git a/paddle/phi/kernels/cpu/batch_norm_kernel.cc b/paddle/phi/kernels/cpu/batch_norm_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..743128e8dea99296def25f79c47bbcfda8c65f40
--- /dev/null
+++ b/paddle/phi/kernels/cpu/batch_norm_kernel.cc
@@ -0,0 +1,204 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/batch_norm_kernel.h"
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/eigen/common.h"
+
+#include "paddle/fluid/framework/tensor_util.h"
+
+namespace phi {
+
+template <typename T>
+using EigenArrayMap =
+    Eigen::Map<Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>>;
+template <typename T>
+using ConstEigenArrayMap =
+    Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>>;
+template <typename T>
+using EigenVectorArrayMap = Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1>>;
+template <typename T>
+using ConstEigenVectorArrayMap =
+    Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, 1>>;
+
+template <typename T, typename Context>
+void BatchNormKernel(const Context& ctx,
+                     const DenseTensor& x,
+                     const DenseTensor& scale,
+                     const DenseTensor& bias,
+                     const DenseTensor& mean,
+                     const DenseTensor& variance,
+                     float momentum,
+                     float epsilon,
+                     const std::string& data_layout_str,
+                     bool is_test,
+                     bool use_global_stats,
+                     bool trainable_statistics,
+                     bool fuse_with_relu,
+                     DenseTensor* y,
+                     DenseTensor* mean_out,
+                     DenseTensor* variance_out,
+                     DenseTensor* saved_mean,
+                     DenseTensor* saved_variance,
+                     DenseTensor* reserve_space) {
+  bool test_mode = is_test && (!trainable_statistics);
+
+  bool global_stats = test_mode || use_global_stats;
+
+  auto data_layout = paddle::framework::StringToDataLayout(data_layout_str);
+
+  const auto& x_dims = x.dims();
+  PADDLE_ENFORCE_GE(
+      x_dims.size(),
+      2,
+      phi::errors::InvalidArgument(
+          "The size of input X's dimensions should be larger than 1."
+          "But received: the size of input X's dimensions is [%d]",
+          x_dims.size()));
+  PADDLE_ENFORCE_LE(
+      x_dims.size(),
+      5,
+      phi::errors::InvalidArgument(
+          "The size of input X's dimensions should be less than 6."
+          "But received: the size of input X's dimensionss is [%d]",
+          x_dims.size()));
+  const int N = x_dims[0];
+  const int C = (data_layout == DataLayout::kNCHW ? x_dims[1]
+                                                  : x_dims[x_dims.size() - 1]);
+  const int sample_size = x.numel() / N / C;
+
+  // alloc memory
+  ctx.template Alloc<T>(y);
+  ctx.template Alloc<T>(mean_out);
+  ctx.template Alloc<T>(variance_out);
+  ctx.template Alloc<T>(saved_mean);
+  ctx.template Alloc<T>(saved_variance);
+
+  // input dimension is 2 and the format is NCHW. The input can be regarded
+  // as NHWC format
+  if (x_dims.size() == 2 && data_layout == DataLayout::kNCHW) {
+    data_layout = DataLayout::kNHWC;
+  }
+
+  if (!global_stats) {
+    // saved_xx is use just in this batch of data
+    EigenVectorArrayMap<T> saved_mean_e(ctx.template Alloc<T>(saved_mean), C);
+    EigenVectorArrayMap<T> saved_variance_e(
+        ctx.template Alloc<T>(saved_variance), C);
+    saved_mean_e.setZero();
+    saved_variance_e.setZero();
+
+    EigenVectorArrayMap<T> running_mean_arr(ctx.template Alloc<T>(mean_out), C);
+    EigenVectorArrayMap<T> running_var_arr(ctx.template Alloc<T>(variance_out),
+                                           C);
+
+    if ((N * sample_size) == 1) {
+      // Only 1 element in normalization dimension,
+      // we skip the batch norm calculation, let y = x.
+      paddle::framework::TensorCopy(x, ctx.GetPlace(), y);
+      return;
+    }
+
+    switch (data_layout) {
+      case DataLayout::kNCHW: {
+        ConstEigenArrayMap<T> x_arr(x.data<T>(), sample_size, N * C);
+        for (int nc = 0; nc < N * C; ++nc) {
+          saved_mean_e(nc % C) += x_arr.col(nc).sum();
+        }
+        saved_mean_e /= N * sample_size;
+        for (int nc = 0; nc < N * C; ++nc) {
+          saved_variance_e(nc % C) +=
+              (x_arr.col(nc) - saved_mean_e(nc % C)).matrix().squaredNorm();
+        }
+        saved_variance_e /= N * sample_size;
+        break;
+      }
+      case DataLayout::kNHWC: {
+        ConstEigenArrayMap<T> x_arr(x.data<T>(), C, N * sample_size);
+        for (int i = 0; i < N * sample_size; ++i) {
+          saved_mean_e += x_arr.col(i);
+        }
+        saved_mean_e /= N * sample_size;
+        for (int i = 0; i < N * sample_size; ++i) {
+          saved_variance_e +=
+              (x_arr.col(i) - saved_mean_e) * (x_arr.col(i) - saved_mean_e);
+        }
+        saved_variance_e /= N * sample_size;
+        break;
+      }
+      default:
+        PADDLE_THROW(phi::errors::InvalidArgument("Unknown storage order: %s",
+                                                  data_layout_str));
+    }
+
+    // if MomentumTensor is set, use MomentumTensor value, momentum
+    // is only used in this training branch
+
+    running_mean_arr =
+        running_mean_arr * momentum + saved_mean_e * (1. - momentum);
+    running_var_arr =
+        running_var_arr * momentum + saved_variance_e * (1. - momentum);
+  }
+
+  // use SavedMean and SavedVariance to do normalize
+  Eigen::Array<T, Eigen::Dynamic, 1> inv_std(C);
+  if (global_stats) {
+    ConstEigenVectorArrayMap<T> var_arr(variance.data<T>(), C);
+    inv_std = (var_arr + epsilon).sqrt().inverse();
+  } else {
+    EigenVectorArrayMap<T> saved_inv_std(saved_variance->data<T>(), C);
+    // inverse SavedVariance first, gradient will use it too.
+    saved_inv_std = (saved_inv_std + epsilon).inverse().sqrt();
+    inv_std = saved_inv_std;
+  }
+  ConstEigenVectorArrayMap<T> mean_arr(
+      global_stats ? mean.data<T>() : saved_mean->data<T>(), C);
+
+  //   ((x - est_mean) * (inv_var) * scale + bias
+  //   formula transform ====>
+  //   (x * inv_var * scale) + (bias - est_mean * inv_var * scale)
+  ConstEigenVectorArrayMap<T> scale_arr(scale.data<T>(), C);
+  ConstEigenVectorArrayMap<T> bias_arr(bias.data<T>(), C);
+  Eigen::Array<T, Eigen::Dynamic, 1> new_scale = inv_std * scale_arr;
+  Eigen::Array<T, Eigen::Dynamic, 1> new_bias =
+      bias_arr - mean_arr * inv_std * scale_arr;
+
+  switch (data_layout) {
+    case DataLayout::kNCHW: {
+      EigenArrayMap<T> y_arr(ctx.template Alloc<T>(y), sample_size, N * C);
+      ConstEigenArrayMap<T> x_arr(x.data<T>(), sample_size, N * C);
+      for (int nc = 0; nc < N * C; ++nc) {
+        y_arr.col(nc) = x_arr.col(nc) * new_scale(nc % C) + new_bias(nc % C);
+      }
+      break;
+    }
+    case DataLayout::kNHWC: {
+      EigenArrayMap<T>(ctx.template Alloc<T>(y), C, N * sample_size) =
+          (ConstEigenArrayMap<T>(x.data<T>(), C, N * sample_size).colwise() *
+           new_scale)
+              .colwise() +
+          new_bias;
+      break;
+    }
+    default:
+      PADDLE_THROW(phi::errors::InvalidArgument("Unknown storage order: %d",
+                                                data_layout));
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(
+    batch_norm, CPU, ALL_LAYOUT, phi::BatchNormKernel, float, double) {}
diff --git a/paddle/phi/kernels/cpu/bincount_kernel.cc b/paddle/phi/kernels/cpu/bincount_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c9dc44c1e04eb750b7b35eba7241f4d4af40062a
--- /dev/null
+++ b/paddle/phi/kernels/cpu/bincount_kernel.cc
@@ -0,0 +1,106 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/bincount_kernel.h"
+
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/math_function.h"
+
+namespace phi {
+
+template <typename Context, typename T, typename InputT>
+void BincountInner(const Context& dev_ctx,
+                   const DenseTensor& x,
+                   const paddle::optional<const DenseTensor&> weights,
+                   int minlength,
+                   DenseTensor* out) {
+  const DenseTensor* input = &x;
+  DenseTensor* output = out;
+  const InputT* input_data = input->data<InputT>();
+
+  auto input_numel = input->numel();
+
+  if (input_data == nullptr) {
+    phi::DDim out_dim{0};
+    output->Resize(out_dim);
+    dev_ctx.template Alloc<InputT>(output);
+    return;
+  }
+
+  PADDLE_ENFORCE_GE(
+      *std::min_element(input_data, input_data + input_numel),
+      static_cast<InputT>(0),
+      phi::errors::InvalidArgument(
+          "The elements in input tensor must be non-negative ints"));
+
+  int64_t output_size = static_cast<int64_t>(*std::max_element(
+                            input_data, input_data + input_numel)) +
+                        1L;
+  output_size = std::max(output_size, static_cast<int64_t>(minlength));
+
+  phi::DDim out_dim{output_size};
+  output->Resize(out_dim);
+
+  bool has_weights = weights.is_initialized();
+
+  if (has_weights) {
+    const T* weights_data = weights->data<T>();
+    if (weights->dtype() == DataType::FLOAT32) {
+      float* output_data = dev_ctx.template Alloc<float>(output);
+      phi::funcs::SetConstant<Context, float>()(
+          dev_ctx, output, static_cast<float>(0));
+      for (int64_t i = 0; i < input_numel; i++) {
+        output_data[input_data[i]] += static_cast<float>(weights_data[i]);
+      }
+    } else {
+      double* output_data = dev_ctx.template Alloc<double>(output);
+      phi::funcs::SetConstant<Context, double>()(
+          dev_ctx, output, static_cast<double>(0));
+      for (int64_t i = 0; i < input_numel; i++) {
+        output_data[input_data[i]] += static_cast<double>(weights_data[i]);
+      }
+    }
+
+  } else {
+    int64_t* output_data = dev_ctx.template Alloc<int64_t>(output);
+    phi::funcs::SetConstant<Context, int64_t>()(dev_ctx, output, 0L);
+    for (int64_t i = 0; i < input_numel; i++) {
+      output_data[input_data[i]] += 1L;
+    }
+  }
+}
+
+template <typename T, typename Context>
+void BincountKernel(const Context& dev_ctx,
+                    const DenseTensor& x,
+                    const paddle::optional<const DenseTensor&> weights,
+                    int minlength,
+                    DenseTensor* out) {
+  if (x.dtype() == DataType::INT32) {
+    BincountInner<Context, T, int>(dev_ctx, x, weights, minlength, out);
+  } else if (x.dtype() == DataType::INT64) {
+    BincountInner<Context, T, int64_t>(dev_ctx, x, weights, minlength, out);
+  }
+}
+}  // namespace phi
+
+PD_REGISTER_KERNEL(bincount,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::BincountKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t) {}
diff --git a/paddle/phi/kernels/cpu/bitwise_kernel.cc b/paddle/phi/kernels/cpu/bitwise_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..69f52790f77969e8bf29fcb50b777afe504215b7
--- /dev/null
+++ b/paddle/phi/kernels/cpu/bitwise_kernel.cc
@@ -0,0 +1,99 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/phi/kernels/bitwise_kernel.h"
+
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/bitwise_functors.h"
+#include "paddle/phi/kernels/funcs/elementwise_base.h"
+
+// See Note [ Why still include the fluid headers? ]
+#include "paddle/fluid/platform/transform.h"
+
+namespace phi {
+
+#define DEFINE_BITWISE_KERNEL(op_type)                                    \
+  template <typename T, typename Context>                                 \
+  void Bitwise##op_type##Kernel(const Context& dev_ctx,                   \
+                                const DenseTensor& x,                     \
+                                const DenseTensor& y,                     \
+                                DenseTensor* out) {                       \
+    funcs::Bitwise##op_type##Functor<T> func;                             \
+    funcs::ElementwiseCompute<funcs::Bitwise##op_type##Functor<T>, T, T>( \
+        dev_ctx, x, y, -1, func, out);                                    \
+  }
+
+DEFINE_BITWISE_KERNEL(And)
+DEFINE_BITWISE_KERNEL(Or)
+DEFINE_BITWISE_KERNEL(Xor)
+#undef DEFINE_BITWISE_KERNEL
+
+template <typename T, typename Context>
+void BitwiseNotKernel(const Context& dev_ctx,
+                      const DenseTensor& x,
+                      DenseTensor* out) {
+  const T* x_data = x.data<T>();
+  T* out_data = dev_ctx.template Alloc<T>(out);
+  size_t numel = x.numel();
+  funcs::BitwiseNotFunctor<T> func;
+  paddle::platform::Transform<Context> trans;
+  trans(dev_ctx, x_data, x_data + numel, out_data, func);
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(bitwise_and,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::BitwiseAndKernel,
+                   bool,
+                   uint8_t,
+                   int8_t,
+                   int16_t,
+                   int,
+                   int64_t) {}
+
+PD_REGISTER_KERNEL(bitwise_or,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::BitwiseOrKernel,
+                   bool,
+                   uint8_t,
+                   int8_t,
+                   int16_t,
+                   int,
+                   int64_t) {}
+
+PD_REGISTER_KERNEL(bitwise_xor,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::BitwiseXorKernel,
+                   bool,
+                   uint8_t,
+                   int8_t,
+                   int16_t,
+                   int,
+                   int64_t) {}
+
+PD_REGISTER_KERNEL(bitwise_not,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::BitwiseNotKernel,
+                   bool,
+                   uint8_t,
+                   int8_t,
+                   int16_t,
+                   int,
+                   int64_t) {}
diff --git a/paddle/phi/kernels/cpu/broadcast_tensors_grad_kernel.cc b/paddle/phi/kernels/cpu/broadcast_tensors_grad_kernel.cc
index 7a97f8c2189736452a722882f8d86a6cfaeae0f5..0869cd62024dc9e11ce1e1a1fc5349c0c966ef9e 100644
--- a/paddle/phi/kernels/cpu/broadcast_tensors_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/broadcast_tensors_grad_kernel.cc
@@ -59,7 +59,7 @@ namespace phi {
 
 template <typename T, typename Context>
 void BroadcastTensorsGradKernel(const Context& ctx,
-                                const std::vector<DenseTensor>& dout,
+                                const std::vector<const DenseTensor*>& dout,
                                 std::vector<DenseTensor*> dx) {
   // Find reduce dimensions
   const auto& in_tensors = dout;
@@ -85,7 +85,7 @@ void BroadcastTensorsGradKernel(const Context& ctx,
   // For each In-Out tensor pair,
   // Prepare and apply broadcast dims array
   for (size_t i = 0; i < num_ins; i++) {
-    const auto* input_tensor = &in_tensors[i];
+    const auto* input_tensor = in_tensors[i];
     auto* output_tensor = out_tensors[i];
 
     const auto& input_dims = input_tensor->dims();
diff --git a/paddle/phi/kernels/cpu/cholesky_solve_grad_kernel.cc b/paddle/phi/kernels/cpu/cholesky_solve_grad_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b6f5dd29ba2b7a31b8346d3d25148852344221a0
--- /dev/null
+++ b/paddle/phi/kernels/cpu/cholesky_solve_grad_kernel.cc
@@ -0,0 +1,25 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/impl/cholesky_solve_grad_kernel_impl.h"
+
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+PD_REGISTER_KERNEL(cholesky_solve_grad,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::CholeskySolveGradKernel,
+                   float,
+                   double) {}
diff --git a/paddle/phi/kernels/cpu/cholesky_solve_kernel.cc b/paddle/phi/kernels/cpu/cholesky_solve_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..02597560a7f51f2df6173215b09536292a70ef3f
--- /dev/null
+++ b/paddle/phi/kernels/cpu/cholesky_solve_kernel.cc
@@ -0,0 +1,42 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/impl/cholesky_solve_kernel_impl.h"
+
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/lapack/lapack_function.h"
+
+namespace phi {
+
+template <typename T>
+class CholeskySolveFunctor<T, CPUContext> {
+ public:
+  void operator()(const CPUContext &dev_ctx,
+                  bool upper,
+                  int M,
+                  int N,
+                  T *Adata,
+                  int lda,
+                  T *Bdata,
+                  int *devInfo) {
+    char uplo = upper ? 'U' : 'L';
+    funcs::lapackCholeskySolve<T>(uplo, M, N, Adata, lda, Bdata, lda, devInfo);
+  }
+};
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(
+    cholesky_solve, CPU, ALL_LAYOUT, phi::CholeskySolveKernel, float, double) {}
diff --git a/paddle/phi/kernels/cpu/compare_kernel.cc b/paddle/phi/kernels/cpu/compare_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9006325a521ec88eba80a9bf27762730be08fecc
--- /dev/null
+++ b/paddle/phi/kernels/cpu/compare_kernel.cc
@@ -0,0 +1,143 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/compare_kernel.h"
+#include "paddle/phi/kernels/impl/compare_kernel_impl.h"
+
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/elementwise_base.h"
+
+namespace phi {
+
+template <typename T,
+          typename Context,
+          typename Functor,
+          typename InverseFunctor>
+inline void CompareKernelImpl(const Context& ctx,
+                              const DenseTensor& x,
+                              const DenseTensor& y,
+                              int axis,
+                              DenseTensor* out) {
+  ctx.template Alloc<bool>(out);
+  if (x.dims().size() >= y.dims().size()) {
+    funcs::ElementwiseCompute<Functor, T, bool>(
+        ctx, x, y, axis, Functor(), out);
+  } else {
+    funcs::ElementwiseCompute<InverseFunctor, T, bool>(
+        ctx, x, y, axis, InverseFunctor(), out);
+  }
+}
+
+template <typename T, typename Context, typename Functor>
+inline void CompareAllKernelImpl(const Context& ctx,
+                                 const DenseTensor& x,
+                                 const DenseTensor& y,
+                                 DenseTensor* out) {
+  bool* out_data = ctx.template Alloc<bool>(out);
+
+  if (x.dims() != y.dims()) {
+    out_data[0] = false;
+  } else {
+    DenseTensor tmp;
+    tmp.Resize(x.dims());
+    ctx.template Alloc<bool>(&tmp);
+
+    if (x.numel() == 1 && y.numel() == 1) {
+      bool* tmp_data = tmp.data<bool>();
+      tmp_data[0] = Functor()(x.data<T>()[0], y.data<T>()[0]);
+    } else {
+      funcs::ElementwiseCompute<Functor, T, bool>(
+          ctx, x, y, 0, Functor(), &tmp);
+    }
+    auto tmp_flat = EigenVector<bool>::Flatten(tmp);
+    auto out_es = EigenScalar<bool>::From(*out);
+    auto& place = *ctx.eigen_device();
+    auto reduce_dim = Eigen::array<int, 1>({{0}});
+    out_es.device(place) = tmp_flat.all(reduce_dim);
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(less_than,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::LessThanKernel,
+                   bool,
+                   int16_t,
+                   int,
+                   int64_t,
+                   float,
+                   double) {}
+PD_REGISTER_KERNEL(less_equal,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::LessEqualKernel,
+                   bool,
+                   int16_t,
+                   int,
+                   int64_t,
+                   float,
+                   double) {}
+PD_REGISTER_KERNEL(greater_than,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::GreaterThanKernel,
+                   bool,
+                   int16_t,
+                   int,
+                   int64_t,
+                   float,
+                   double) {}
+PD_REGISTER_KERNEL(greater_equal,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::GreaterEqualKernel,
+                   bool,
+                   int16_t,
+                   int,
+                   int64_t,
+                   float,
+                   double) {}
+PD_REGISTER_KERNEL(equal,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::EqualKernel,
+                   bool,
+                   int16_t,
+                   int,
+                   int64_t,
+                   float,
+                   double) {}
+PD_REGISTER_KERNEL(not_equal,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::NotEqualKernel,
+                   bool,
+                   int16_t,
+                   int,
+                   int64_t,
+                   float,
+                   double) {}
+
+PD_REGISTER_KERNEL(equal_all,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::EqualAllKernel,
+                   bool,
+                   int,
+                   int64_t,
+                   float,
+                   double) {}
diff --git a/paddle/phi/kernels/cpu/concat_kernel.cc b/paddle/phi/kernels/cpu/concat_kernel.cc
index 5c4202837c4487361f33b849df7d975e85f8490d..6be825d4ef14e8e9aabf9c1b5b804c3ff5a18347 100644
--- a/paddle/phi/kernels/cpu/concat_kernel.cc
+++ b/paddle/phi/kernels/cpu/concat_kernel.cc
@@ -29,17 +29,17 @@ namespace phi {
 
 template <typename T, typename Context>
 void ConcatKernel(const Context& dev_ctx,
-                  const std::vector<DenseTensor>& x,
+                  const std::vector<const DenseTensor*>& x,
                   const Scalar& axis_scalar,
                   DenseTensor* out) {
   int64_t axis = axis_scalar.to<int64_t>();
 
-  axis = phi::funcs::ComputeAxis(axis, x[0].dims().size());
+  axis = phi::funcs::ComputeAxis(axis, x[0]->dims().size());
 
   std::vector<phi::DDim> x_dims;
   x_dims.reserve(x.size());
   for (size_t i = 0; i < x.size(); ++i) {
-    x_dims.push_back(x[i].dims());
+    x_dims.push_back(x[i]->dims());
   }
 
   phi::DDim out_dims = phi::funcs::ComputeAndCheckShape(true, x_dims, axis);
@@ -47,13 +47,13 @@ void ConcatKernel(const Context& dev_ctx,
   out->mutable_data<T>(dev_ctx.GetPlace());
 
   // If axis is 0, the lod of the output is not the same as inputs.
-  if (axis == 0 && x[0].lod().size() > 0) {
-    size_t lod_size_0 = x[0].lod().size();
+  if (axis == 0 && x[0]->lod().size() > 0) {
+    size_t lod_size_0 = x[0]->lod().size();
     size_t lod_size = lod_size_0;
     for (size_t i = 1; i < x.size(); ++i) {
-      if (x[i].lod().size() > 0) {
+      if (x[i]->lod().size() > 0) {
         PADDLE_ENFORCE_EQ(
-            x[i].lod().size(),
+            x[i]->lod().size(),
             lod_size_0,
             phi::errors::Unimplemented(
                 "The lod level of all input LoDTensors should be same. "
@@ -61,7 +61,7 @@ void ConcatKernel(const Context& dev_ctx,
                 "it is not supported currently. The lod level of %dth input "
                 "is %d and first input is %d.",
                 i,
-                x[i].lod().size(),
+                x[i]->lod().size(),
                 lod_size_0));
       } else {
         lod_size = 0;
@@ -71,7 +71,7 @@ void ConcatKernel(const Context& dev_ctx,
     if (lod_size) {
       auto* out_lod = out->mutable_lod();
       for (size_t i = 1; i < x.size(); ++i) {
-        auto in_lod = phi::ConvertToLengthBasedLoD(x[i].lod());
+        auto in_lod = phi::ConvertToLengthBasedLoD(x[i]->lod());
         phi::AppendLoD(out_lod, in_lod);
       }
     }
@@ -80,28 +80,29 @@ void ConcatKernel(const Context& dev_ctx,
   // Sometimes direct copies will be faster, this maybe need deeply analysis.
   if (axis == 0 && x.size() < 10) {
     size_t output_offset = 0;
-    for (auto& in : x) {
-      if (in.numel() == 0UL) {
+    for (const auto* in : x) {
+      if (in->numel() == 0UL) {
         continue;
       }
-      auto in_stride = phi::stride_numel(in.dims());
+      auto in_stride = phi::stride_numel(in->dims());
       auto out_stride = phi::stride_numel(out->dims());
       paddle::operators::StridedNumelCopyWithAxis<T>(
           dev_ctx,
           axis,
           out->data<T>() + output_offset,
           out_stride,
-          in.data<T>(),
+          in->data<T>(),
           in_stride,
           in_stride[axis]);
       output_offset += in_stride[axis];
     }
   } else {
+    // TODO(chenweihang): concat functor support vector<DenseTensor*> input
     std::vector<phi::DenseTensor> inputs;
     inputs.reserve(x.size());
     for (size_t j = 0; j < x.size(); ++j) {
-      if (x[j].numel() > 0) {
-        inputs.emplace_back(x[j]);
+      if (x[j]->numel() > 0) {
+        inputs.emplace_back(*x[j]);
       } else {
         continue;
       }
diff --git a/paddle/phi/kernels/cpu/conv_grad_grad_kernel.cc b/paddle/phi/kernels/cpu/conv_grad_grad_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f157bb017f81c5636a87ddcecb82e977e4fd18ba
--- /dev/null
+++ b/paddle/phi/kernels/cpu/conv_grad_grad_kernel.cc
@@ -0,0 +1,72 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/conv_grad_grad_kernel.h"
+#include "paddle/phi/kernels/impl/conv_grad_grad_kernel_impl.h"
+
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+namespace phi {
+template <typename T, typename Context>
+void Conv3DGradGradKernel(const Context& ctx,
+                          paddle::optional<const DenseTensor&> input_grad_grad,
+                          paddle::optional<const DenseTensor&> filter_grad_grad,
+                          const DenseTensor& out_grad,
+                          const DenseTensor& input,
+                          const DenseTensor& filter,
+                          const std::vector<int>& strides,
+                          const std::vector<int>& paddings_t,
+                          const std::string& padding_algorithm,
+                          int groups,
+                          const std::vector<int>& dilations_t,
+                          const std::string& data_format,
+                          bool use_addto,
+                          int workspace_size_MB,
+                          bool exhaustive_search_t,
+                          DenseTensor* out_grad_grad,
+                          DenseTensor* input_grad,
+                          DenseTensor* filter_grad) {
+  ConvGradGradKernel<T>(ctx,
+                        input_grad_grad,
+                        filter_grad_grad,
+                        out_grad,
+                        input,
+                        filter,
+                        strides,
+                        paddings_t,
+                        padding_algorithm,
+                        groups,
+                        dilations_t,
+                        data_format,
+                        use_addto,
+                        workspace_size_MB,
+                        exhaustive_search_t,
+                        out_grad_grad,
+                        input_grad,
+                        filter_grad);
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(
+    conv2d_grad_grad, CPU, ALL_LAYOUT, phi::ConvGradGradKernel, float, double) {
+}
+
+PD_REGISTER_KERNEL(conv3d_grad_grad,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::Conv3DGradGradKernel,
+                   float,
+                   double) {}
diff --git a/paddle/phi/kernels/cpu/conv_grad_kernel.cc b/paddle/phi/kernels/cpu/conv_grad_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..994ad861bd15b747524d7e0f47a0ed5b8ee465cd
--- /dev/null
+++ b/paddle/phi/kernels/cpu/conv_grad_kernel.cc
@@ -0,0 +1,103 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/conv_grad_kernel.h"
+#include "paddle/phi/kernels/impl/conv_grad_kernel_impl.h"
+
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void DepthwiseConvGradKernel(const Context& dev_ctx,
+                             const DenseTensor& out_grad,
+                             const DenseTensor& input,
+                             const DenseTensor& filter,
+                             const std::vector<int>& strides,
+                             const std::vector<int>& paddings,
+                             const std::string& paddding_algorithm,
+                             int groups,
+                             const std::vector<int>& dilations,
+                             const std::string& data_format,
+                             bool use_addto,
+                             int workspace_size_MB,
+                             bool exhaustive_search,
+                             bool fuse_relu,
+                             DenseTensor* input_grad,
+                             DenseTensor* filter_grad) {
+  ConvGradKernel<T>(dev_ctx,
+                    out_grad,
+                    input,
+                    filter,
+                    strides,
+                    paddings,
+                    paddding_algorithm,
+                    groups,
+                    dilations,
+                    data_format,
+                    use_addto,
+                    workspace_size_MB,
+                    exhaustive_search,
+                    input_grad,
+                    filter_grad);
+}
+
+template <typename T, typename Context>
+void Conv3DGradKernel(const Context& dev_ctx,
+                      const DenseTensor& out_grad,
+                      const DenseTensor& input,
+                      const DenseTensor& filter,
+                      const std::vector<int>& strides,
+                      const std::vector<int>& paddings,
+                      const std::string& paddding_algorithm,
+                      int groups,
+                      const std::vector<int>& dilations,
+                      const std::string& data_format,
+                      bool use_addto,
+                      int workspace_size_MB,
+                      bool exhaustive_search,
+                      DenseTensor* input_grad,
+                      DenseTensor* filter_grad) {
+  ConvGradKernel<T>(dev_ctx,
+                    out_grad,
+                    input,
+                    filter,
+                    strides,
+                    paddings,
+                    paddding_algorithm,
+                    groups,
+                    dilations,
+                    data_format,
+                    use_addto,
+                    workspace_size_MB,
+                    exhaustive_search,
+                    input_grad,
+                    filter_grad);
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(
+    conv2d_grad, CPU, ALL_LAYOUT, phi::ConvGradKernel, float, double) {}
+
+PD_REGISTER_KERNEL(depthwise_conv2d_grad,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::DepthwiseConvGradKernel,
+                   float,
+                   double) {}
+
+PD_REGISTER_KERNEL(
+    conv3d_grad, CPU, ALL_LAYOUT, phi::Conv3DGradKernel, float, double) {}
diff --git a/paddle/phi/kernels/cpu/conv_kernel.cc b/paddle/phi/kernels/cpu/conv_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e0b4ee7d5776fdaf51955b2d35bb339735411a28
--- /dev/null
+++ b/paddle/phi/kernels/cpu/conv_kernel.cc
@@ -0,0 +1,92 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/conv_kernel.h"
+#include "paddle/phi/kernels/impl/conv_kernel_impl.h"
+
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+namespace phi {
+template <typename T, typename Context>
+void DepthwiseConvKernel(const Context& dev_ctx,
+                         const DenseTensor& input,
+                         const DenseTensor& filter,
+                         const std::vector<int>& strides,
+                         const std::vector<int>& paddings,
+                         const std::string& padding_algorithm,
+                         int groups,
+                         const std::vector<int>& dilations,
+                         const std::string& data_format,
+                         bool use_addto,
+                         int workspace_size_MB,
+                         bool exhaustive_search,
+                         bool fuse_relu,
+                         DenseTensor* out) {
+  ConvKernel<T>(dev_ctx,
+                input,
+                filter,
+                strides,
+                paddings,
+                padding_algorithm,
+                groups,
+                dilations,
+                data_format,
+                use_addto,
+                workspace_size_MB,
+                exhaustive_search,
+                out);
+}
+
+template <typename T, typename Context>
+void Conv3DKernel(const Context& dev_ctx,
+                  const DenseTensor& input,
+                  const DenseTensor& filter,
+                  const std::vector<int>& strides,
+                  const std::vector<int>& paddings,
+                  const std::string& padding_algorithm,
+                  int groups,
+                  const std::vector<int>& dilations,
+                  const std::string& data_format,
+                  bool use_addto,
+                  int workspace_size_MB,
+                  bool exhaustive_search,
+                  DenseTensor* out) {
+  ConvKernel<T>(dev_ctx,
+                input,
+                filter,
+                strides,
+                paddings,
+                padding_algorithm,
+                groups,
+                dilations,
+                data_format,
+                use_addto,
+                workspace_size_MB,
+                exhaustive_search,
+                out);
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(conv2d, CPU, ALL_LAYOUT, phi::ConvKernel, float, double) {}
+
+PD_REGISTER_KERNEL(depthwise_conv2d,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::DepthwiseConvKernel,
+                   float,
+                   double) {}
+
+PD_REGISTER_KERNEL(conv3d, CPU, ALL_LAYOUT, phi::Conv3DKernel, float, double) {}
diff --git a/paddle/phi/kernels/cpu/conv_util.h b/paddle/phi/kernels/cpu/conv_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..d26d89086b27e3db8ccfcf339c51c6a2fdf1988a
--- /dev/null
+++ b/paddle/phi/kernels/cpu/conv_util.h
@@ -0,0 +1,91 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include "paddle/phi/core/ddim.h"
+
+namespace phi {
+
+template <typename T = int>
+inline void UpdatePaddingAndDilation(std::vector<T>* paddings,
+                                     std::vector<T>* dilation,
+                                     const std::string padding_algorithm,
+                                     const DDim data_dims,
+                                     const std::vector<T>& strides,
+                                     const std::vector<T>& ksize) {
+  // set padding size == data_dims.size() * 2
+  auto data_shape = vectorize<T>(data_dims);
+  if (static_cast<int>(paddings->size()) == data_dims.size()) {
+    for (int i = 0; i < data_dims.size(); ++i) {
+      T copy_pad = *(paddings->begin() + 2 * i);
+      paddings->insert(paddings->begin() + 2 * i + 1, copy_pad);
+    }
+  } else {
+    PADDLE_ENFORCE_EQ(
+        data_dims.size() * 2,
+        paddings->size(),
+        phi::errors::InvalidArgument(
+            "Attribute padding's size should be the same or twice as the "
+            "input's dimension. "
+            "But recieved: padding's size is %d, padding is [%s]; input's "
+            "dimension is %d, input's shape is [%s].",
+            paddings->size(),
+            make_ddim(*paddings),
+            data_dims.size(),
+            data_dims));
+  }
+
+  // when padding_algorithm is "VALID" or "SAME"
+  if (padding_algorithm == "SAME") {
+    for (int i = 0; i < data_dims.size(); ++i) {
+      T out_size = (data_dims[i] + strides[i] - 1) / strides[i];
+      T pad_sum =
+          std::max((out_size - 1) * strides[i] + ksize[i] - data_shape[i],
+                   static_cast<T>(0));
+      T pad_0 = pad_sum / 2;
+      T pad_1 = pad_sum - pad_0;
+      *(paddings->begin() + i * 2) = pad_0;
+      *(paddings->begin() + i * 2 + 1) = pad_1;
+
+      // dilation
+      *(dilation->begin() + i) = 1;
+    }
+
+  } else if (padding_algorithm == "VALID") {
+    for (auto it = paddings->begin(); it != paddings->end(); it++) {
+      *it = 0;
+    }
+  }
+}
+
+inline bool IsExpand(const std::vector<int64_t>& filter_dim,
+                     const std::vector<int>& strides,
+                     const std::vector<int>& paddings,
+                     const std::vector<int>& dilations) {
+  bool filter_1 = true, strides_1 = true, padding_0 = true, dilation_1 = true;
+  for (size_t j = 0; j < strides.size(); ++j) {
+    filter_1 = filter_1 && (static_cast<int>(filter_dim[j + 2]) == 1);
+    strides_1 = strides_1 && (strides[j] == 1);
+    padding_0 = padding_0 && (paddings[j] == 0);
+    dilation_1 = dilation_1 && (dilations[j] == 1);
+  }
+  if (paddings.size() != strides.size()) {
+    for (size_t j = 0; j < paddings.size(); ++j) {
+      padding_0 = padding_0 && (paddings[j] == 0);
+    }
+  }
+  return !(filter_1 && strides_1 && padding_0 && dilation_1);
+}
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/cpu/cumprod_grad_kernel.cc b/paddle/phi/kernels/cpu/cumprod_grad_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a25f9650fc50fefa3899da13b55b985c164a394a
--- /dev/null
+++ b/paddle/phi/kernels/cpu/cumprod_grad_kernel.cc
@@ -0,0 +1,113 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/cumprod_grad_kernel.h"
+
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/allocator.h"
+#include "paddle/phi/core/ddim.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/complex_functors.h"
+#include "paddle/phi/kernels/funcs/cumprod.h"
+#include "paddle/phi/kernels/funcs/for_range.h"
+
+// NOTE(@xiongkun): use of IsComplex<>
+#include "paddle/fluid/framework/data_type.h"
+
+namespace phi {
+template <typename T, typename Context>
+void CumprodGradKernel(const Context& dev_ctx,
+                       const DenseTensor& x,
+                       const DenseTensor& out,
+                       const DenseTensor& d_out,
+                       int dim,
+                       DenseTensor* d_x) {
+  DDim shape = x.dims();
+
+  auto* d_out_data = d_out.data<T>();
+  auto* x_data = x.data<T>();
+  auto* out_data = out.data<T>();
+  auto* d_x_data = dev_ctx.template Alloc<T>(d_x);
+
+  size_t outer_dim = 1;
+  size_t mid_dim = 1;
+  size_t inner_dim = 1;
+  GetCumprodDimInfo(shape, dim, &outer_dim, &mid_dim, &inner_dim);
+  size_t numel = outer_dim * mid_dim * inner_dim;
+
+  // deal with complex
+  const T* x_data_deal;
+  const T* out_data_deal;
+  Allocator::AllocationPtr x_conj;
+  Allocator::AllocationPtr out_conj;
+  if (paddle::framework::IsComplex<T>::value) {
+    x_conj = const_cast<Allocator&>(dev_ctx.GetAllocator())
+                 .Allocate(numel * sizeof(T));
+    auto* x_data_conj = reinterpret_cast<T*>(x_conj->ptr());
+    out_conj = const_cast<Allocator&>(dev_ctx.GetAllocator())
+                   .Allocate(numel * sizeof(T));
+    auto* out_data_conj = reinterpret_cast<T*>(out_conj->ptr());
+
+    phi::funcs::ForRange<Context> for_range_x(dev_ctx, numel);
+    phi::funcs::ConjFunctor<T> functor_x(x_data, numel, x_data_conj);
+    for_range_x(functor_x);
+
+    phi::funcs::ForRange<Context> for_range_out(dev_ctx, numel);
+    phi::funcs::ConjFunctor<T> functor_out(out_data, numel, out_data_conj);
+    for_range_out(functor_out);
+
+    x_data_deal = x_data_conj;
+    out_data_deal = out_data_conj;
+  } else {
+    x_data_deal = x_data;
+    out_data_deal = out_data;
+  }
+
+  for (size_t i = 0; i < outer_dim; i++) {
+    for (size_t k = 0; k < inner_dim; k++) {
+      for (size_t j = 0; j < mid_dim; j++) {
+        size_t index = i * mid_dim * inner_dim + j * inner_dim + k;
+        d_x_data[index] = 0;
+        for (size_t n = 0; n < mid_dim; n++) {
+          size_t pos = i * mid_dim * inner_dim + n * inner_dim + k;
+          T elem;
+          if (j == 0) {
+            elem = d_out_data[pos];
+          } else {
+            elem = d_out_data[pos] * out_data_deal[index - inner_dim];
+          }
+          if (pos > index) {
+            for (size_t m = index + inner_dim; m <= pos; m += inner_dim) {
+              elem *= x_data_deal[m];
+            }
+          } else if (pos < index) {
+            elem = static_cast<T>(0);
+          }
+          d_x_data[index] += elem;
+        }
+      }
+    }
+  }
+}
+}  // namespace phi
+PD_REGISTER_KERNEL(cumprod_grad,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::CumprodGradKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t,
+                   phi::dtype::complex<float>,
+                   phi::dtype::complex<double>) {}
diff --git a/paddle/phi/kernels/cpu/cumprod_kernel.cc b/paddle/phi/kernels/cpu/cumprod_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..aea338027f5bb983788c382982dd2e1ad8db5e9a
--- /dev/null
+++ b/paddle/phi/kernels/cpu/cumprod_kernel.cc
@@ -0,0 +1,65 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/cumprod_kernel.h"
+
+#include <cstdint>
+#include <type_traits>
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/complex_functors.h"
+#include "paddle/phi/kernels/funcs/cumprod.h"
+
+namespace phi {
+template <typename T, typename Context>
+void CumprodKernel(const Context& dev_ctx,
+                   const DenseTensor& input,
+                   int dim,
+                   DenseTensor* out) {
+  const DenseTensor* x = &input;
+  auto* x_data = x->data<T>();
+  auto* out_data = dev_ctx.template Alloc<T>(out);
+  DDim shape = x->dims();
+
+  size_t outer_dim = 1;
+  size_t mid_dim = 1;
+  size_t inner_dim = 1;
+  GetCumprodDimInfo(shape, dim, &outer_dim, &mid_dim, &inner_dim);
+
+  for (size_t i = 0; i < outer_dim; i++) {
+    for (size_t j = 0; j < mid_dim; j++) {
+      for (size_t k = 0; k < inner_dim; k++) {
+        size_t pos = i * mid_dim * inner_dim + j * inner_dim + k;
+        if (j == 0) {
+          out_data[pos] = x_data[pos];
+        } else {
+          out_data[pos] = out_data[pos - inner_dim] * x_data[pos];
+        }
+      }
+    }
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(cumprod,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::CumprodKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t,
+                   phi::dtype::complex<float>,
+                   phi::dtype::complex<double>) {}
diff --git a/paddle/phi/kernels/cpu/cumsum_kernel.cc b/paddle/phi/kernels/cpu/cumsum_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d32e18479aae924cc06eee8daaa760226ddd6980
--- /dev/null
+++ b/paddle/phi/kernels/cpu/cumsum_kernel.cc
@@ -0,0 +1,143 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/cumsum_kernel.h"
+
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/eigen/common.h"
+#include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
+
+namespace phi {
+
+struct CumsumFunctor {
+  template <typename X>
+  const typename X::TensorScanSumOp operator()(X x,
+                                               int axis,
+                                               bool exclusive) const {
+    return x.cumsum(axis, exclusive);
+  }
+};
+
+template <typename Device, typename Dim, typename X, typename Out>
+void ComputeImp(Device d,
+                const Dim& dims,
+                X x,
+                Out out,
+                int axis,
+                bool reverse,
+                bool exclusive) {
+  if (!reverse) {
+    out.reshape(dims).device(d) =
+        CumsumFunctor()(x.reshape(dims), axis, exclusive);
+  } else {
+    std::array<bool, Dim::count> rev;
+    rev.fill(false);
+    rev[axis] = reverse;
+    out.reshape(dims).device(d) =
+        CumsumFunctor()(x.reshape(dims).reverse(rev), axis, exclusive)
+            .reverse(rev);
+  }
+}
+
+template <typename T, typename Context>
+void CumsumKernel(const Context& dev_ctx,
+                  const DenseTensor& x,
+                  int axis,
+                  bool flatten,
+                  bool exclusive,
+                  bool reverse,
+                  DenseTensor* out) {
+  auto out_dims = out->dims();
+
+  PADDLE_ENFORCE_EQ(
+      axis < out_dims.size() && axis >= (0 - out_dims.size()),
+      true,
+      phi::errors::OutOfRange(
+          "Attr(axis) is out of range, It's expected "
+          "to be in range of [-%d, %d]. But received Attr(axis) = %d.",
+          out_dims.size(),
+          out_dims.size() - 1,
+          axis));
+  if (axis < 0) {
+    axis += out_dims.size();
+  }
+
+  dev_ctx.template Alloc<T>(out);
+
+  int pre = 1;
+  int post = 1;
+  int mid = out_dims[axis];
+  for (int i = 0; i < axis; ++i) {
+    pre *= out_dims[i];
+  }
+  for (int i = axis + 1; i < out_dims.size(); ++i) {
+    post *= out_dims[i];
+  }
+
+  auto x0 = EigenVector<T>::Flatten(x);
+  auto out0 = EigenVector<T>::Flatten(*out);
+  auto& place = *dev_ctx.eigen_device();
+
+  using IndexT = Eigen::DenseIndex;
+  if (pre == 1) {
+    if (post == 1) {
+      ComputeImp(place,
+                 Eigen::DSizes<IndexT, 1>(mid),
+                 x0,
+                 out0,
+                 /* axis= */ 0,
+                 reverse,
+                 exclusive);
+    } else {
+      ComputeImp(place,
+                 Eigen::DSizes<IndexT, 2>(mid, post),
+                 x0,
+                 out0,
+                 /* axis= */ 0,
+                 reverse,
+                 exclusive);
+    }
+  } else {
+    if (post == 1) {
+      ComputeImp(place,
+                 Eigen::DSizes<IndexT, 2>(pre, mid),
+                 x0,
+                 out0,
+                 /* axis= */ 1,
+                 reverse,
+                 exclusive);
+    } else {
+      ComputeImp(place,
+                 Eigen::DSizes<IndexT, 3>(pre, mid, post),
+                 x0,
+                 out0,
+                 /* axis= */ 1,
+                 reverse,
+                 exclusive);
+    }
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(cumsum,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::CumsumKernel,
+                   float,
+                   double,
+                   int16_t,
+                   int,
+                   int64_t) {}
diff --git a/paddle/phi/kernels/cpu/diag_grad_kernel.cc b/paddle/phi/kernels/cpu/diag_grad_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c56b225e2a753f963651f5e3f0a5cf711f5bb8a6
--- /dev/null
+++ b/paddle/phi/kernels/cpu/diag_grad_kernel.cc
@@ -0,0 +1,72 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/diag_grad_kernel.h"
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/diag_functor.h"
+#include "paddle/phi/kernels/funcs/math_function.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void DiagGradKernel(const Context& dev_ctx,
+                    const DenseTensor& x,
+                    const DenseTensor& out_grad,
+                    int offset,
+                    DenseTensor* x_grad) {
+  T* dx_data = dev_ctx.template Alloc<T>(x_grad);
+  const T* dout_data = out_grad.data<T>();
+  auto dx_dims = x_grad->dims();
+  auto dout_dims = out_grad.dims();
+
+  if (dx_dims.size() == 1) {
+    auto dx_length = dx_dims[0];
+    int dx_stride = phi::funcs::ComputeStride(0, dx_dims);
+
+    auto dout_stride_0 = phi::funcs::ComputeStride(0, dout_dims);
+    auto dout_stride_1 = phi::funcs::ComputeStride(1, dout_dims);
+    dout_data +=
+        (offset >= 0 ? offset * dout_stride_1 : -offset * dout_stride_0);
+
+    for (int i = 0; i < dx_length; i++) {
+      dx_data[i * dx_stride] = dout_data[i * (dout_stride_0 + dout_stride_1)];
+    }
+  } else {
+    phi::funcs::SetConstant<Context, T> set_padding_value;
+    set_padding_value(dev_ctx, x_grad, static_cast<T>(0));
+
+    int dx_stride_0 = phi::funcs::ComputeStride(0, dx_dims);
+    int dx_stride_1 = phi::funcs::ComputeStride(1, dx_dims);
+    auto dout_stride_0 = phi::funcs::ComputeStride(0, dout_dims);
+    dx_data += (offset >= 0 ? offset * dx_stride_1 : -offset * dx_stride_0);
+
+    auto dout_length = dout_dims[0];
+    for (int i = 0; i < dout_length; i++) {
+      dx_data[i * (dx_stride_0 + dx_stride_1)] = dout_data[i * dout_stride_0];
+    }
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(diag_grad,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::DiagGradKernel,
+                   phi::dtype::float16,
+                   int,
+                   int64_t,
+                   float,
+                   double) {}
diff --git a/paddle/phi/kernels/cpu/diag_kernel.cc b/paddle/phi/kernels/cpu/diag_kernel.cc
index d1e0b8e31e78fd74e6a15722546971a3cb72807a..4b060f0372a5bf50d9378239dae635e5723d0c7a 100644
--- a/paddle/phi/kernels/cpu/diag_kernel.cc
+++ b/paddle/phi/kernels/cpu/diag_kernel.cc
@@ -62,5 +62,12 @@ void DiagKernel(const Context& dev_ctx,
 
 }  // namespace phi
 
-PD_REGISTER_KERNEL(
-    diag, CPU, ALL_LAYOUT, phi::DiagKernel, int, float, double, int64_t) {}
+PD_REGISTER_KERNEL(diag,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::DiagKernel,
+                   phi::dtype::float16,
+                   int,
+                   float,
+                   double,
+                   int64_t) {}
diff --git a/paddle/phi/kernels/cpu/dist_grad_kernel.cc b/paddle/phi/kernels/cpu/dist_grad_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2b7f8f98f9473c03e26d6edaebb7dd04e0428072
--- /dev/null
+++ b/paddle/phi/kernels/cpu/dist_grad_kernel.cc
@@ -0,0 +1,22 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/dist_grad_kernel.h"
+#include "paddle/phi/kernels/impl/dist_grad_kernel_impl.h"
+
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+PD_REGISTER_KERNEL(
+    dist_grad, CPU, ALL_LAYOUT, phi::DistGradKernel, float, double) {}
diff --git a/paddle/fluid/operators/reduce_ops/reduce_any_op.cu b/paddle/phi/kernels/cpu/dist_kernel.cc
similarity index 62%
rename from paddle/fluid/operators/reduce_ops/reduce_any_op.cu
rename to paddle/phi/kernels/cpu/dist_kernel.cc
index 2e93e67debbd9d7f8667e0b2994fdd440401ac13..ccf3d4be8323090985a5c7a4eaf0ed8efcfaf5de 100644
--- a/paddle/fluid/operators/reduce_ops/reduce_any_op.cu
+++ b/paddle/phi/kernels/cpu/dist_kernel.cc
@@ -1,4 +1,4 @@
-// Copyright (c) 2018 PaddlePaddle Authors. Any Rights Reserved.
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -12,9 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/operators/reduce_ops/reduce_any_op.h"
-#include "paddle/fluid/operators/reduce_ops/reduce_op.h"
+#include "paddle/phi/kernels/dist_kernel.h"
+#include "paddle/phi/kernels/impl/dist_kernel_impl.h"
 
-REGISTER_OP_CUDA_KERNEL(
-    reduce_any,
-    ops::ReduceCudaKernel<bool, kps::LogicalOrFunctor, kps::IdentityFunctor>);
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+PD_REGISTER_KERNEL(dist, CPU, ALL_LAYOUT, phi::DistKernel, float, double) {}
diff --git a/paddle/phi/kernels/cpu/dropout_grad_kernel.cc b/paddle/phi/kernels/cpu/dropout_grad_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b77a6c55b14716e2747a2cb76d4b1bda380a2d02
--- /dev/null
+++ b/paddle/phi/kernels/cpu/dropout_grad_kernel.cc
@@ -0,0 +1,67 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/dropout_grad_kernel.h"
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/eigen/common.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void DropoutGradRawKernel(const Context& dev_ctx,
+                          const DenseTensor& mask,
+                          const DenseTensor& out_grad,
+                          float p,
+                          bool is_test,
+                          const std::string& mode,
+                          DenseTensor* x_grad) {
+  auto* grad_x = x_grad;
+  auto* grad_y = &out_grad;
+  grad_x->mutable_data<T>(dev_ctx.GetPlace());
+
+  auto dX = EigenVector<T>::Flatten(*grad_x);
+  auto dY = EigenVector<T>::Flatten(*grad_y);
+
+  auto& place = *dev_ctx.eigen_device();
+  auto& dropout_implementation = mode;
+  if (is_test == true) {
+    if (dropout_implementation == "upscale_in_train") {
+      dX.device(place) = static_cast<T>(1) * dY;
+    } else {
+      dX.device(place) = dY * static_cast<T>(1.0f - p);
+    }
+  } else {
+    auto M = EigenVector<uint8_t>::Flatten(mask);
+    if (dropout_implementation == "upscale_in_train") {
+      if (p == 1.0f) {
+        dX.device(place) = static_cast<T>(0) * dY;
+      } else {
+        dX.device(place) = dY * M.cast<T>() / static_cast<T>(1.0f - p);
+      }
+    } else {
+      dX.device(place) = dY * M.cast<T>();
+    }
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(dropout_grad,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::DropoutGradRawKernel,
+                   float,
+                   double,
+                   phi::dtype::bfloat16) {}
diff --git a/paddle/phi/kernels/cpu/dropout_kernel.cc b/paddle/phi/kernels/cpu/dropout_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c00aedef8c67d5b88efb29c746c70f5f7859507a
--- /dev/null
+++ b/paddle/phi/kernels/cpu/dropout_kernel.cc
@@ -0,0 +1,104 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/dropout_kernel.h"
+#include "paddle/fluid/framework/generator.h"
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/eigen/common.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void DropoutRawKernel(const Context& dev_ctx,
+                      const DenseTensor& x,
+                      paddle::optional<const DenseTensor&> seed_tensor,
+                      float p,
+                      bool is_test,
+                      const std::string& mode,
+                      int seed,
+                      bool fix_seed,
+                      DenseTensor* out,
+                      DenseTensor* mask) {
+  auto* y = out;
+  const auto* x_data = x.data<T>();
+  auto* y_data = y->mutable_data<T>(dev_ctx.GetPlace());
+  float dropout_prob = p;
+
+  auto& dropout_implementation = mode;
+  bool upscale_in_train = (dropout_implementation == "upscale_in_train");
+  if (!is_test) {
+    auto* mask_data = mask->mutable_data<uint8_t>(dev_ctx.GetPlace());
+    size_t size = phi::product(mask->dims());
+
+    // Special case when dropout_prob is 1.0
+    if (dropout_prob == 1.0f) {
+      std::memset(y_data, 0, size * sizeof(*y_data));        // NOLINT
+      std::memset(mask_data, 0, size * sizeof(*mask_data));  // NOLINT
+      return;
+    }
+    // std::minstd_rand engine;
+    // NOTE: fixed seed should only be used in unittest or for debug.
+    // Guarantee to use random seed in training.
+    int seed_data = 0;
+    if (seed_tensor.get_ptr() != nullptr) {
+      seed_data = *(seed_tensor->data<int>());
+    } else {
+      seed_data = fix_seed ? seed : 0;
+    }
+    auto engine = paddle::framework::GetCPURandomEngine(seed_data);
+
+    std::uniform_real_distribution<float> dist(0, 1);
+
+    for (size_t i = 0; i < size; ++i) {
+      if (dist(*engine) < dropout_prob) {
+        mask_data[i] = 0;
+        y_data[i] = 0;
+      } else {
+        mask_data[i] = 1;
+        if (upscale_in_train) {
+          y_data[i] = x_data[i] / static_cast<T>(1.0f - dropout_prob);
+        } else {
+          y_data[i] = x_data[i];
+        }
+      }
+    }
+  } else {
+    if (upscale_in_train) {
+      const auto* X_data = x.data<T>();
+      auto* Y_data = y->mutable_data<T>(dev_ctx.GetPlace());
+#ifdef PADDLE_WITH_MKLML
+#pragma omp parallel for
+#endif
+      for (int i = 0; i < x.numel(); i++) {
+        Y_data[i] = X_data[i];
+      }
+    } else {
+      auto X = EigenMatrix<T>::Reshape(x, 1);
+      auto Y = EigenMatrix<T>::Reshape(*y, 1);
+      auto& place = *dev_ctx.eigen_device();
+      Y.device(place) = X * static_cast<T>(1.0f - dropout_prob);
+    }
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(dropout,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::DropoutRawKernel,
+                   float,
+                   double,
+                   phi::dtype::bfloat16) {}
diff --git a/paddle/phi/kernels/cpu/eigh_grad_kernel.cc b/paddle/phi/kernels/cpu/eigh_grad_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5135778db56c5a59b8d1e2cebf49ba2595070f7f
--- /dev/null
+++ b/paddle/phi/kernels/cpu/eigh_grad_kernel.cc
@@ -0,0 +1,28 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/eigh_grad_kernel.h"
+#include "paddle/phi/kernels/impl/eigh_grad_kernel_impl.h"
+
+#include "paddle/phi/common/complex.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+PD_REGISTER_KERNEL(eigh_grad,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::EighGradKernel,
+                   float,
+                   double,
+                   phi::dtype::complex<float>,
+                   phi::dtype::complex<double>) {}
diff --git a/paddle/phi/kernels/cpu/eigh_kernel.cc b/paddle/phi/kernels/cpu/eigh_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..92fd20ca9b8251a3ba1b493f8cd35a803e20dfba
--- /dev/null
+++ b/paddle/phi/kernels/cpu/eigh_kernel.cc
@@ -0,0 +1,43 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/eigh_kernel.h"
+#include "paddle/phi/kernels/funcs/values_vectors_functor.h"
+
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/complex_functors.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void EighKernel(const Context& dev_ctx,
+                const DenseTensor& x,
+                const std::string& uplo,
+                DenseTensor* out_w,
+                DenseTensor* out_v) {
+  bool is_lower = (uplo == "L");
+  phi::funcs::MatrixEighFunctor<Context, T> functor;
+  functor(dev_ctx, x, out_w, out_v, is_lower, true);
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(eigh,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::EighKernel,
+                   float,
+                   double,
+                   phi::dtype::complex<float>,
+                   phi::dtype::complex<double>) {}
diff --git a/paddle/phi/kernels/cpu/elementwise_grad_kernel.cc b/paddle/phi/kernels/cpu/elementwise_grad_kernel.cc
index e48ee805959088aa8a6a3da7fcb8fa02b642c1c8..bf6ec012b24443e877b235e17488725dc0d14151 100644
--- a/paddle/phi/kernels/cpu/elementwise_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/elementwise_grad_kernel.cc
@@ -18,7 +18,6 @@
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/copy_kernel.h"
 #include "paddle/phi/kernels/cpu/elementwise_grad.h"
-#include "paddle/phi/kernels/funcs/elementwise_base.h"
 #include "paddle/phi/kernels/funcs/elementwise_functor.h"
 #include "paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h"
 
@@ -108,6 +107,34 @@ void SubtractDoubleGradKernel(const Context& dev_ctx,
   phi::SubtractDoubleGradImpl<T>(dev_ctx, y, ddx, ddy, dout, axis, ddout);
 }
 
+template <typename T, typename Context>
+void DivideGradKernel(const Context& dev_ctx,
+                      const DenseTensor& x,
+                      const DenseTensor& y,
+                      const DenseTensor& out,
+                      const DenseTensor& dout,
+                      int axis,
+                      DenseTensor* dx,
+                      DenseTensor* dy) {
+  funcs::ElementwiseGradPreProcess(dout, dx);
+  phi::funcs::ElemwiseGradCompute<Context, T, DivGradDX<T>, DivGradDY<T>>(
+      dev_ctx, x, y, out, dout, axis, dx, dy, DivGradDX<T>(), DivGradDY<T>());
+}
+
+template <typename T, typename Context>
+void MultiplyGradKernel(const Context& dev_ctx,
+                        const DenseTensor& x,
+                        const DenseTensor& y,
+                        const DenseTensor& dout,
+                        int axis,
+                        DenseTensor* dx,
+                        DenseTensor* dy) {
+  funcs::ElementwiseGradPreProcess(dout, dx);
+  auto* out = &dout;  // out is not necessary
+  phi::funcs::ElemwiseGradCompute<Context, T, MulGradDX<T>, MulGradDY<T>>(
+      dev_ctx, x, y, *out, dout, axis, dx, dy, MulGradDX<T>(), MulGradDY<T>());
+}
+
 }  // namespace phi
 
 PD_REGISTER_KERNEL(add_grad,
@@ -171,3 +198,81 @@ PD_REGISTER_KERNEL(subtract_double_grad,
                    phi::dtype::bfloat16,
                    phi::dtype::complex<float>,
                    phi::dtype::complex<double>) {}
+
+PD_REGISTER_KERNEL(divide_grad,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::DivideGradKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t,
+                   phi::dtype::complex<float>,
+                   phi::dtype::complex<double>) {}
+
+PD_REGISTER_KERNEL(divide_double_grad,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::DivideDoubleGradKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t,
+                   phi::dtype::complex<float>,
+                   phi::dtype::complex<double>) {}
+
+PD_REGISTER_KERNEL(multiply_grad,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::MultiplyGradKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t,
+                   bool,
+                   phi::dtype::bfloat16,
+                   phi::dtype::complex<float>,
+                   phi::dtype::complex<double>) {}
+
+PD_REGISTER_KERNEL(multiply_double_grad,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::MultiplyDoubleGradKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t,
+                   bool,
+                   phi::dtype::bfloat16,
+                   phi::dtype::complex<float>,
+                   phi::dtype::complex<double>) {}
+
+PD_REGISTER_KERNEL(multiply_triple_grad,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::MultiplyTripleGradKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t,
+                   bool,
+                   phi::dtype::bfloat16,
+                   phi::dtype::complex<float>,
+                   phi::dtype::complex<double>) {}
+PD_REGISTER_KERNEL(elementwise_fmax_grad,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::ElementwiseFMaxGradKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t) {}
+
+PD_REGISTER_KERNEL(elementwise_fmin_grad,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::ElementwiseFMinGradKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t) {}
diff --git a/paddle/phi/kernels/cpu/elementwise_kernel.cc b/paddle/phi/kernels/cpu/elementwise_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..37ad18df56ec30c838dd5bd03c484d7889e976c0
--- /dev/null
+++ b/paddle/phi/kernels/cpu/elementwise_kernel.cc
@@ -0,0 +1,35 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/elementwise_kernel_impl.h"
+
+PD_REGISTER_KERNEL(elementwise_fmax,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::ElementwiseFMaxKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t) {}
+
+PD_REGISTER_KERNEL(elementwise_fmin,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::ElementwiseFMinKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t) {}
diff --git a/paddle/fluid/operators/arg_max_op.cu b/paddle/phi/kernels/cpu/erf_grad_kernel.cc
similarity index 50%
rename from paddle/fluid/operators/arg_max_op.cu
rename to paddle/phi/kernels/cpu/erf_grad_kernel.cc
index 14708c4df10f5160d0e72e7669e0015554d8215f..3c1cd0df1531a50524958b527ee39b09460f042c 100644
--- a/paddle/fluid/operators/arg_max_op.cu
+++ b/paddle/phi/kernels/cpu/erf_grad_kernel.cc
@@ -1,22 +1,27 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/arg_min_max_op_base.cu.h"
-
-REGISTER_OP_CUDA_KERNEL(
-    arg_max, paddle::operators::ArgMinMaxOpCUDAKernel<float, cub::ArgMax>,
-    paddle::operators::ArgMinMaxOpCUDAKernel<double, cub::ArgMax>,
-    paddle::operators::ArgMinMaxOpCUDAKernel<int64_t, cub::ArgMax>,
-    paddle::operators::ArgMinMaxOpCUDAKernel<int32_t, cub::ArgMax>,
-    paddle::operators::ArgMinMaxOpCUDAKernel<int8_t, cub::ArgMax>);
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/phi/kernels/erf_grad_kernel.h"
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/common/float16.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/erf_grad_kernel_impl.h"
+
+PD_REGISTER_KERNEL(erf_grad,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::ErfGradKernel,
+                   float,
+                   double,
+                   phi::dtype::float16) {}
diff --git a/paddle/phi/kernels/cpu/erf_kernel.cc b/paddle/phi/kernels/cpu/erf_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..05ce4cab7fcef4d1da7d378093f9f3d04827acd2
--- /dev/null
+++ b/paddle/phi/kernels/cpu/erf_kernel.cc
@@ -0,0 +1,22 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/phi/kernels/erf_kernel.h"
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/common/float16.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/erf_kernel_impl.h"
+
+PD_REGISTER_KERNEL(
+    erf, CPU, ALL_LAYOUT, phi::ErfKernel, float, double, phi::dtype::float16) {}
diff --git a/paddle/phi/kernels/cpu/expand_as_grad_kernel.cc b/paddle/phi/kernels/cpu/expand_as_grad_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6eafe9aa49dfe820881ca1394716a29e0ced4ec4
--- /dev/null
+++ b/paddle/phi/kernels/cpu/expand_as_grad_kernel.cc
@@ -0,0 +1,28 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/expand_as_grad_kernel.h"
+#include "paddle/phi/kernels/impl/expand_as_grad_kernel_impl.h"
+
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+PD_REGISTER_KERNEL(expand_as_grad,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::ExpandAsGradKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t) {}
diff --git a/paddle/phi/kernels/cpu/expand_as_kernel.cc b/paddle/phi/kernels/cpu/expand_as_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..697ea138097ee9d57f23a11d7403b2d4b78158b6
--- /dev/null
+++ b/paddle/phi/kernels/cpu/expand_as_kernel.cc
@@ -0,0 +1,29 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/expand_as_kernel.h"
+#include "paddle/phi/kernels/impl/expand_as_kernel_impl.h"
+
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+PD_REGISTER_KERNEL(expand_as,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::ExpandAsKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t,
+                   bool) {}
diff --git a/paddle/phi/kernels/cpu/full_kernel.cc b/paddle/phi/kernels/cpu/full_kernel.cc
index 86576a861aa4834a4b39b50594565a2d4b3ac510..556de3adcf498a111bcaad2f5b849bfe79be8adf 100644
--- a/paddle/phi/kernels/cpu/full_kernel.cc
+++ b/paddle/phi/kernels/cpu/full_kernel.cc
@@ -54,12 +54,22 @@ void FullLikeKernel(const Context& dev_ctx,
 
   auto common_type_value = static_cast<CommonType>(value);
 
-  PADDLE_ENFORCE_EQ(
-      (common_type_value >=
+  // Check whether the filled value is valid
+  bool is_out_range = true;
+  if (std::isinf(value) || std::isnan(value)) {
+    is_out_range = false;
+  }
+
+  if ((common_type_value >=
        static_cast<CommonType>(std::numeric_limits<T>::lowest())) &&
-          (common_type_value <=
-           static_cast<CommonType>(std::numeric_limits<T>::max())),
-      true,
+      (common_type_value <=
+       static_cast<CommonType>(std::numeric_limits<T>::max()))) {
+    is_out_range = false;
+  }
+
+  PADDLE_ENFORCE_EQ(
+      is_out_range,
+      false,
       phi::errors::InvalidArgument(
           "The filled value is out of range for target type, "
           "current kernel type is %s, the range should between %f "
diff --git a/paddle/phi/kernels/cpu/gather_nd_grad_kernel.cc b/paddle/phi/kernels/cpu/gather_nd_grad_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b375a7ec4691c723f2f029c39b7e364b8332c402
--- /dev/null
+++ b/paddle/phi/kernels/cpu/gather_nd_grad_kernel.cc
@@ -0,0 +1,64 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/gather_nd_grad_kernel.h"
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
+#include "paddle/phi/kernels/funcs/scatter.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void GatherNdGradKernel(const Context &ctx,
+                        const DenseTensor &x,
+                        const DenseTensor &index,
+                        const DenseTensor &out_grad,
+                        DenseTensor *x_grad) {
+  ctx.template Alloc<T>(x_grad);
+  auto dxt = phi::EigenVector<T>::Flatten(*x_grad);
+  auto &place = *ctx.eigen_device();
+  dxt.device(place) = dxt.constant(static_cast<T>(0));
+  if (out_grad.numel() == 0) return;
+
+  auto index_type = index.dtype();
+  bool index_type_match =
+      index_type == phi::DataType::INT32 || index_type == phi::DataType::INT64;
+  PADDLE_ENFORCE_EQ(
+      index_type_match,
+      true,
+      phi::errors::InvalidArgument("Index holds the wrong type, it holds [%s],"
+                                   "but desires to be [%s] or [%s]",
+                                   index_type,
+                                   phi::DataType::INT32,
+                                   phi::DataType::INT64));
+
+  if (index_type == phi::DataType::INT32) {
+    phi::funcs::ScatterNdAdd<T, int32_t>(ctx, out_grad, index, x_grad);
+  } else if (index_type == phi::DataType::INT64) {
+    phi::funcs::ScatterNdAdd<T, int64_t>(ctx, out_grad, index, x_grad);
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(gather_nd_grad,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::GatherNdGradKernel,
+                   float,
+                   double,
+                   int64_t,
+                   int,
+                   uint8_t) {}
diff --git a/paddle/phi/kernels/cpu/gather_nd_kernel.cc b/paddle/phi/kernels/cpu/gather_nd_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..aa32d036934e838b7630a19a152e0c14de907253
--- /dev/null
+++ b/paddle/phi/kernels/cpu/gather_nd_kernel.cc
@@ -0,0 +1,60 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/gather_nd_kernel.h"
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/gather.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void GatherNdKernel(const Context &ctx,
+                    const DenseTensor &x,
+                    const DenseTensor &index,
+                    DenseTensor *out) {
+  ctx.template Alloc<T>(out);
+  if (x.numel() == 0) return;
+
+  auto index_type = index.dtype();
+  bool index_type_match =
+      index_type == phi::DataType::INT32 || index_type == phi::DataType::INT64;
+  PADDLE_ENFORCE_EQ(
+      index_type_match,
+      true,
+      phi::errors::InvalidArgument("Index holds the wrong type, it holds [%s],"
+                                   "but desires to be [%s] or [%s]",
+                                   index_type,
+                                   phi::DataType::INT32,
+                                   phi::DataType::INT64));
+  if (index_type == phi::DataType::INT32) {
+    phi::funcs::CPUGatherNd<T, int>(ctx, x, index, out);
+  } else if (index_type == phi::DataType::INT64) {
+    phi::funcs::CPUGatherNd<T, int64_t>(ctx, x, index, out);
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(gather_nd,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::GatherNdKernel,
+                   float,
+                   double,
+                   int64_t,
+                   int,
+                   int16_t,
+                   bool,
+                   uint8_t) {}
diff --git a/paddle/phi/kernels/cpu/gaussian_random_kernel.cc b/paddle/phi/kernels/cpu/gaussian_random_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..7e336f18bf80a36d1c954533aa7dc2534c4f7f2c
--- /dev/null
+++ b/paddle/phi/kernels/cpu/gaussian_random_kernel.cc
@@ -0,0 +1,53 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/gaussian_random_kernel.h"
+
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+#include "paddle/fluid/framework/generator.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void GaussianRandomKernel(const Context& dev_ctx,
+                          const ScalarArray& shape,
+                          float mean,
+                          float std,
+                          int seed,
+                          DataType dtype,
+                          DenseTensor* out) {
+  auto tensor = out;
+
+  std::normal_distribution<T> dist(mean, std);
+
+  tensor->Resize(phi::make_ddim(shape.GetData()));
+  int64_t size = tensor->numel();
+  T* data = dev_ctx.template Alloc<T>(tensor);
+  auto engine = paddle::framework::GetCPURandomEngine(seed);
+
+  for (int64_t i = 0; i < size; ++i) {
+    data[i] = dist(*engine);
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(gaussian_random,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::GaussianRandomKernel,
+                   float,
+                   double) {}
diff --git a/paddle/phi/kernels/cpu/graph_send_recv_funcs.h b/paddle/phi/kernels/cpu/graph_send_recv_funcs.h
new file mode 100644
index 0000000000000000000000000000000000000000..df6d9c87be0ed582568684aaa0869342f10e47c7
--- /dev/null
+++ b/paddle/phi/kernels/cpu/graph_send_recv_funcs.h
@@ -0,0 +1,80 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <algorithm>
+#include <vector>
+
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/core/hostdevice.h"
+#include "paddle/phi/kernels/funcs/eigen/common.h"
+
+namespace phi {
+
+template <typename T>
+struct GraphSendRecvSumFunctor {
+  void operator()(const bool& first_flag,
+                  const DenseTensor& src_slice,
+                  DenseTensor* dst_slice) {
+    auto eigen_src = phi::EigenVector<T>::Flatten(src_slice);
+    auto eigen_dst = phi::EigenVector<T>::Flatten(*dst_slice);
+    eigen_dst += eigen_src;
+  }
+};
+
+template <typename T>
+struct GraphSendRecvMinFunctor {
+  void operator()(const bool& first_flag,
+                  const DenseTensor& src_slice,
+                  DenseTensor* dst_slice) {
+    auto eigen_src = phi::EigenVector<T>::Flatten(src_slice);
+    auto eigen_dst = phi::EigenVector<T>::Flatten(*dst_slice);
+    if (first_flag) {
+      eigen_dst += eigen_src;
+    } else {
+      eigen_dst = eigen_dst.cwiseMin(eigen_src);
+    }
+  }
+};
+
+template <typename T>
+struct GraphSendRecvMaxFunctor {
+  void operator()(const int& first_flag,
+                  const DenseTensor& src_slice,
+                  DenseTensor* dst_slice) {
+    auto eigen_src = phi::EigenVector<T>::Flatten(src_slice);
+    auto eigen_dst = phi::EigenVector<T>::Flatten(*dst_slice);
+    if (first_flag) {
+      eigen_dst += eigen_src;
+    } else {
+      eigen_dst = eigen_dst.cwiseMax(eigen_src);
+    }
+  }
+};
+
+template <typename T, typename IndexT, typename Functor>
+void ElementwiseInnerOperation(const DenseTensor& src,
+                               DenseTensor* dst,
+                               const IndexT& src_index,
+                               const IndexT& dst_index,
+                               const bool& first_flag,
+                               Functor functor) {
+  auto src_slice = src.Slice(src_index, src_index + 1);
+  auto dst_slice = dst->Slice(dst_index, dst_index + 1);
+
+  functor(first_flag, src_slice, &dst_slice);
+}
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/cpu/graph_send_recv_grad_kernel.cc b/paddle/phi/kernels/cpu/graph_send_recv_grad_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8538461b1b83b887223224bac231600221547f11
--- /dev/null
+++ b/paddle/phi/kernels/cpu/graph_send_recv_grad_kernel.cc
@@ -0,0 +1,172 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/graph_send_recv_grad_kernel.h"
+#include "paddle/phi/kernels/cpu/graph_send_recv_funcs.h"
+
+#include <algorithm>
+#include <vector>
+
+#include "paddle/phi/core/kernel_registry.h"
+
+namespace phi {
+
+template <typename T, typename IndexT, typename Functor>
+void GraphSendRecvCpuGradLoop(const int& input_size,
+                              const int& index_size,
+                              const IndexT* s_index,
+                              const IndexT* d_index,
+                              const DenseTensor& src,
+                              DenseTensor* dst,
+                              const std::string& pool_type,
+                              const int* dst_count = nullptr,
+                              const DenseTensor* input = nullptr,
+                              const DenseTensor* output = nullptr) {
+  if (pool_type == "SUM") {
+    Functor functor;
+    for (int i = 0; i < index_size; ++i) {
+      const IndexT& src_idx = s_index[i];
+      const IndexT& dst_idx = d_index[i];
+      ElementwiseInnerOperation<T, IndexT, Functor>(
+          src, dst, src_idx, dst_idx, false, functor);
+    }
+  } else if (pool_type == "MEAN") {
+    for (int i = 0; i < index_size; ++i) {
+      const IndexT& src_idx = s_index[i];
+      const IndexT& dst_idx = d_index[i];
+      auto src_slice = src.Slice(src_idx, src_idx + 1);
+      auto dst_slice = dst->Slice(dst_idx, dst_idx + 1);
+      auto eigen_src = phi::EigenVector<T>::Flatten(src_slice);
+      auto eigen_dst = phi::EigenVector<T>::Flatten(dst_slice);
+      eigen_dst += (eigen_src / static_cast<T>(dst_count[src_idx]));
+    }
+  } else if (pool_type == "MIN" || pool_type == "MAX") {
+    for (int i = 0; i < index_size; ++i) {
+      const IndexT& forward_src_idx = d_index[i];
+      const IndexT& forward_dst_idx = s_index[i];
+      auto input_slice = input->Slice(forward_src_idx, forward_src_idx + 1);
+      auto output_slice = output->Slice(forward_dst_idx, forward_dst_idx + 1);
+      auto eigen_input = phi::EigenVector<T>::Flatten(input_slice);
+      auto eigen_output = phi::EigenVector<T>::Flatten(output_slice);
+
+      auto src_slice = src.Slice(forward_dst_idx, forward_dst_idx + 1);
+      auto dst_slice = dst->Slice(forward_src_idx, forward_src_idx + 1);
+      auto eigen_src = phi::EigenVector<T>::Flatten(src_slice);
+      auto eigen_dst = phi::EigenVector<T>::Flatten(dst_slice);
+      eigen_dst += eigen_src * (eigen_output == eigen_input);
+    }
+  }
+}
+
+template <typename Context, typename T, typename IndexT>
+void GraphSendRecvGradOpKernelLaunchHelper(
+    const Context& ctx,
+    const DenseTensor& out_grad,
+    const DenseTensor& src_index,
+    const DenseTensor& dst_index,
+    const std::string& pool_type,
+    DenseTensor* x_grad,
+    const DenseTensor* dst_count = nullptr,
+    const DenseTensor* x = nullptr,
+    const DenseTensor* out = nullptr) {
+  const int& index_size = dst_index.dims()[0];
+
+  ctx.template Alloc<T>(x_grad);
+  T* p_output = x_grad->data<T>();
+  const auto& src_dims = out_grad.dims();
+  int64_t memset_size = 1;
+  for (int i = 0; i < src_dims.size(); ++i) memset_size *= src_dims[i];
+  const size_t& memset_bytes = memset_size * sizeof(T);
+  memset(p_output, 0, memset_bytes);
+
+  if (index_size == 0) return;
+
+  const IndexT* s_index = src_index.data<IndexT>();
+  const IndexT* d_index = dst_index.data<IndexT>();
+
+  if (pool_type == "SUM") {
+    GraphSendRecvCpuGradLoop<T, IndexT, GraphSendRecvSumFunctor<T>>(
+        src_dims[0], index_size, d_index, s_index, out_grad, x_grad, pool_type);
+  } else if (pool_type == "MEAN") {
+    const int* s_count = dst_count->data<int>();
+    // Functor not used here.
+    GraphSendRecvCpuGradLoop<T, IndexT, GraphSendRecvSumFunctor<T>>(src_dims[0],
+                                                                    index_size,
+                                                                    d_index,
+                                                                    s_index,
+                                                                    out_grad,
+                                                                    x_grad,
+                                                                    pool_type,
+                                                                    s_count);
+  } else if (pool_type == "MIN" || pool_type == "MAX") {
+    // Functor not used here.
+    GraphSendRecvCpuGradLoop<T, IndexT, GraphSendRecvMinFunctor<T>>(src_dims[0],
+                                                                    index_size,
+                                                                    d_index,
+                                                                    s_index,
+                                                                    out_grad,
+                                                                    x_grad,
+                                                                    pool_type,
+                                                                    nullptr,
+                                                                    x,
+                                                                    out);
+  }
+}
+
+template <typename T, typename Context>
+void GraphSendRecvGradKernel(const Context& ctx,
+                             const DenseTensor& out_grad,
+                             paddle::optional<const DenseTensor&> x,
+                             paddle::optional<const DenseTensor&> out,
+                             const DenseTensor& src_index,
+                             const DenseTensor& dst_index,
+                             paddle::optional<const DenseTensor&> dst_count,
+                             const std::string& pool_type,
+                             DenseTensor* x_grad) {
+  auto index_type = src_index.dtype();
+  if (index_type == phi::DataType::INT32) {
+    GraphSendRecvGradOpKernelLaunchHelper<Context, T, int32_t>(
+        ctx,
+        out_grad,
+        src_index,
+        dst_index,
+        pool_type,
+        x_grad,
+        dst_count.get_ptr(),
+        x.get_ptr(),
+        out.get_ptr());
+  } else if (index_type == phi::DataType::INT64) {
+    GraphSendRecvGradOpKernelLaunchHelper<Context, T, int64_t>(
+        ctx,
+        out_grad,
+        src_index,
+        dst_index,
+        pool_type,
+        x_grad,
+        dst_count.get_ptr(),
+        x.get_ptr(),
+        out.get_ptr());
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(graph_send_recv_grad,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::GraphSendRecvGradKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t) {}
diff --git a/paddle/phi/kernels/cpu/graph_send_recv_kernel.cc b/paddle/phi/kernels/cpu/graph_send_recv_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..fecbd4b1d7aa05c94d2cb713f302abd6f2aeb1c4
--- /dev/null
+++ b/paddle/phi/kernels/cpu/graph_send_recv_kernel.cc
@@ -0,0 +1,153 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/graph_send_recv_kernel.h"
+#include "paddle/phi/kernels/cpu/graph_send_recv_funcs.h"
+
+#include <algorithm>
+#include <set>
+#include <vector>
+
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/hostdevice.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+namespace phi {
+
+template <typename T, typename IndexT, typename Functor>
+void GraphSendRecvCpuLoop(const int& input_size,
+                          const int& index_size,
+                          const IndexT* s_index,
+                          const IndexT* d_index,
+                          const DenseTensor& src,
+                          DenseTensor* dst,
+                          const std::string& pool_type,
+                          int* dst_count = nullptr) {
+  Functor functor;
+  if (pool_type == "SUM") {
+    for (int i = 0; i < index_size; ++i) {
+      const IndexT& src_idx = s_index[i];
+      const IndexT& dst_idx = d_index[i];
+      ElementwiseInnerOperation<T, IndexT, Functor>(
+          src, dst, src_idx, dst_idx, false, functor);
+    }
+  } else if (pool_type == "MEAN") {
+    for (int i = 0; i < index_size; ++i) {
+      const IndexT& src_idx = s_index[i];
+      const IndexT& dst_idx = d_index[i];
+      ElementwiseInnerOperation<T, IndexT, Functor>(
+          src, dst, src_idx, dst_idx, false, functor);
+    }
+    for (int i = 0; i < index_size; ++i) {
+      IndexT dst_idx = d_index[i];
+      *(dst_count + dst_idx) += 1;
+    }
+    for (int i = 0; i < input_size; ++i) {
+      if (*(dst_count + i) == 0) continue;
+      auto dst_slice = dst->Slice(i, i + 1);
+      auto eigen_dst = phi::EigenVector<T>::Flatten(dst_slice);
+      eigen_dst = eigen_dst / static_cast<T>(*(dst_count + i));
+    }
+  } else if (pool_type == "MIN" || pool_type == "MAX") {
+    std::set<IndexT> existed_dst;
+    for (int i = 0; i < index_size; ++i) {
+      const IndexT& src_idx = s_index[i];
+      const IndexT& dst_idx = d_index[i];
+      bool in_set = existed_dst.find(dst_idx) != existed_dst.end();
+      if (!in_set) {
+        ElementwiseInnerOperation<T, IndexT, Functor>(
+            src, dst, src_idx, dst_idx, true, functor);
+        existed_dst.emplace(dst_idx);
+      } else {
+        ElementwiseInnerOperation<T, IndexT, Functor>(
+            src, dst, src_idx, dst_idx, false, functor);
+      }
+    }
+  }
+}
+
+template <typename Context, typename T, typename IndexT>
+void GraphSendRecvOpKernelLaunchHelper(const Context& ctx,
+                                       const DenseTensor& x,
+                                       const DenseTensor& src_index,
+                                       const DenseTensor& dst_index,
+                                       const std::string& pool_type,
+                                       DenseTensor* out,
+                                       DenseTensor* dst_count = nullptr) {
+  const int& index_size = src_index.dims()[0];
+
+  ctx.template Alloc<T>(out);
+  T* p_output = out->data<T>();
+  const auto& src_dims = x.dims();
+  int64_t memset_size = 1;
+  for (int i = 0; i < src_dims.size(); ++i) memset_size *= src_dims[i];
+  const size_t& memset_bytes = memset_size * sizeof(T);
+  memset(p_output, 0, memset_bytes);
+
+  if (index_size == 0) return;
+
+  const IndexT* s_index = src_index.data<IndexT>();
+  const IndexT* d_index = dst_index.data<IndexT>();
+  if (pool_type == "SUM") {
+    GraphSendRecvCpuLoop<T, IndexT, GraphSendRecvSumFunctor<T>>(
+        src_dims[0], index_size, s_index, d_index, x, out, pool_type);
+  } else if (pool_type == "MIN") {
+    GraphSendRecvCpuLoop<T, IndexT, GraphSendRecvMinFunctor<T>>(
+        src_dims[0], index_size, s_index, d_index, x, out, pool_type);
+  } else if (pool_type == "MAX") {
+    GraphSendRecvCpuLoop<T, IndexT, GraphSendRecvMaxFunctor<T>>(
+        src_dims[0], index_size, s_index, d_index, x, out, pool_type);
+  } else if (pool_type == "MEAN") {
+    ctx.template Alloc<int>(dst_count);
+    int* p_dst_count = dst_count->data<int>();
+    memset(p_dst_count, 0, src_dims[0] * sizeof(int));
+    GraphSendRecvCpuLoop<T, IndexT, GraphSendRecvSumFunctor<T>>(src_dims[0],
+                                                                index_size,
+                                                                s_index,
+                                                                d_index,
+                                                                x,
+                                                                out,
+                                                                pool_type,
+                                                                p_dst_count);
+  }
+}
+
+template <typename T, typename Context>
+void GraphSendRecvKernel(const Context& ctx,
+                         const DenseTensor& x,
+                         const DenseTensor& src_index,
+                         const DenseTensor& dst_index,
+                         const std::string& pool_type,
+                         DenseTensor* out,
+                         DenseTensor* dst_count) {
+  auto index_type = src_index.dtype();
+  if (index_type == phi::DataType::INT32) {
+    GraphSendRecvOpKernelLaunchHelper<Context, T, int32_t>(
+        ctx, x, src_index, dst_index, pool_type, out, dst_count);
+  } else if (index_type == phi::DataType::INT64) {
+    GraphSendRecvOpKernelLaunchHelper<Context, T, int64_t>(
+        ctx, x, src_index, dst_index, pool_type, out, dst_count);
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(graph_send_recv,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::GraphSendRecvKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t) {}
diff --git a/paddle/phi/kernels/cpu/isclose_kernel.cc b/paddle/phi/kernels/cpu/isclose_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..633c6ba093e42762e3d5b64415d6098c3add6b8a
--- /dev/null
+++ b/paddle/phi/kernels/cpu/isclose_kernel.cc
@@ -0,0 +1,21 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/isclose_kernel.h"
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/isclose_kernel_impl.h"
+
+PD_REGISTER_KERNEL(
+    isclose, CPU, ALL_LAYOUT, phi::IscloseKernel, float, double) {}
diff --git a/paddle/phi/kernels/cpu/isfinite_kernel.cc b/paddle/phi/kernels/cpu/isfinite_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..33a7429a22a1a84f5a3c372fb5530092e761ed9e
--- /dev/null
+++ b/paddle/phi/kernels/cpu/isfinite_kernel.cc
@@ -0,0 +1,62 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/isfinite_kernel.h"
+
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/isfinite_kernel_impl.h"
+
+namespace phi {
+
+template <typename T, typename Context, typename Functor>
+inline void IsfiniteKernelImpl(const Context& dev_ctx,
+                               const DenseTensor& x,
+                               DenseTensor* out) {
+  dev_ctx.template Alloc<T>(out);
+  Functor functor;
+  functor(x, out);
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(isinf,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::IsinfKernel,
+                   float,
+                   double,
+                   phi::dtype::float16,
+                   int,
+                   int64_t) {}
+
+PD_REGISTER_KERNEL(isnan,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::IsnanKernel,
+                   float,
+                   double,
+                   phi::dtype::float16,
+                   int,
+                   int64_t) {}
+
+PD_REGISTER_KERNEL(isfinite,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::IsfiniteKernel,
+                   float,
+                   double,
+                   phi::dtype::float16,
+                   int,
+                   int64_t) {}
diff --git a/paddle/phi/kernels/cpu/kldiv_loss_grad_kernel.cc b/paddle/phi/kernels/cpu/kldiv_loss_grad_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f9399d38d711f56305641c9f3170306bacdd6095
--- /dev/null
+++ b/paddle/phi/kernels/cpu/kldiv_loss_grad_kernel.cc
@@ -0,0 +1,22 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/kldiv_loss_grad_kernel.h"
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/kldiv_loss_grad_kernel_impl.h"
+
+PD_REGISTER_KERNEL(
+    kldiv_loss_grad, CPU, ALL_LAYOUT, phi::KLDivLossGradKernel, float, double) {
+}
diff --git a/paddle/phi/kernels/cpu/kldiv_loss_kernel.cc b/paddle/phi/kernels/cpu/kldiv_loss_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c462b8ec32c89dfcf2657018baf9b13764f2858e
--- /dev/null
+++ b/paddle/phi/kernels/cpu/kldiv_loss_kernel.cc
@@ -0,0 +1,23 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/kldiv_loss_kernel.h"
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/kldiv_loss_kernel_impl.h"
+
+namespace phi {}  // namespace phi
+
+PD_REGISTER_KERNEL(
+    kldiv_loss, CPU, ALL_LAYOUT, phi::KLDivLossKernel, float, double) {}
diff --git a/paddle/phi/kernels/cpu/kron_grad_kernel.cc b/paddle/phi/kernels/cpu/kron_grad_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..01f5e5404b61d3ac96ffd6a811e449eae260c27d
--- /dev/null
+++ b/paddle/phi/kernels/cpu/kron_grad_kernel.cc
@@ -0,0 +1,31 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/kron_grad_kernel.h"
+
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/kron_grad_kernel_impl.h"
+
+PD_REGISTER_KERNEL(kron_grad,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::KronGradKernel,
+                   int,
+                   int64_t,
+                   float,
+                   double,
+                   phi::dtype::float16,
+                   phi::dtype::complex<float>,
+                   phi::dtype::complex<double>) {}
diff --git a/paddle/phi/kernels/cpu/kron_kernel.cc b/paddle/phi/kernels/cpu/kron_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..aaea509dc7641b8b6c44b031c44e2b210c0cde39
--- /dev/null
+++ b/paddle/phi/kernels/cpu/kron_kernel.cc
@@ -0,0 +1,31 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/kron_kernel.h"
+
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/kron_kernel_impl.h"
+
+PD_REGISTER_KERNEL(kron,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::KronKernel,
+                   int,
+                   int64_t,
+                   float,
+                   double,
+                   phi::dtype::float16,
+                   phi::dtype::complex<float>,
+                   phi::dtype::complex<double>) {}
diff --git a/paddle/phi/kernels/cpu/lgamma_grad_kernel.cc b/paddle/phi/kernels/cpu/lgamma_grad_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..116fa3f8d3f6a91ec0705b92ff65aa2a411f4f23
--- /dev/null
+++ b/paddle/phi/kernels/cpu/lgamma_grad_kernel.cc
@@ -0,0 +1,20 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/lgamma_grad_kernel.h"
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/lgamma_grad_kernel_impl.h"
+PD_REGISTER_KERNEL(
+    lgamma_grad, CPU, ALL_LAYOUT, phi::LgammaGradKernel, float, double) {}
diff --git a/paddle/phi/kernels/cpu/lgamma_kernel.cc b/paddle/phi/kernels/cpu/lgamma_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d02268940892e3db932692184343807ebe10c1cf
--- /dev/null
+++ b/paddle/phi/kernels/cpu/lgamma_kernel.cc
@@ -0,0 +1,49 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/lgamma_kernel.h"
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/for_range.h"
+
+namespace phi {
+template <typename T>
+struct LgammaFunctor {
+  LgammaFunctor(const T* input, T* output, int64_t numel)
+      : input_(input), output_(output), numel_(numel) {}
+
+  HOSTDEVICE void operator()(int64_t idx) const {
+    output_[idx] = Eigen::numext::lgamma(input_[idx]);
+  }
+
+ private:
+  const T* input_;
+  T* output_;
+  int64_t numel_;
+};
+
+template <typename T, typename Context>
+void LgammaKernel(const Context& dev_ctx,
+                  const DenseTensor& x,
+                  DenseTensor* out) {
+  auto numel = x.numel();
+  auto* x_data = x.data<T>();
+  auto* out_data = dev_ctx.template Alloc<T>(out);
+  phi::funcs::ForRange<Context> for_range(dev_ctx, numel);
+  LgammaFunctor<T> functor(x_data, out_data, numel);
+  for_range(functor);
+}
+}  // namespace phi
+
+PD_REGISTER_KERNEL(lgamma, CPU, ALL_LAYOUT, phi::LgammaKernel, float, double) {}
diff --git a/paddle/phi/kernels/cpu/linspace_kernel.cc b/paddle/phi/kernels/cpu/linspace_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4b8b7f7a2e05c741358bac852770c0e3cbaf49ed
--- /dev/null
+++ b/paddle/phi/kernels/cpu/linspace_kernel.cc
@@ -0,0 +1,71 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/linspace_kernel.h"
+
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/data_type_transform.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void LinspaceKernel(const Context& ctx,
+                    const DenseTensor& start,
+                    const DenseTensor& stop,
+                    const DenseTensor& number,
+                    DataType dtype,
+                    DenseTensor* out) {
+  int32_t num = number.data<int32_t>()[0];
+  auto start_t = phi::funcs::TransDataType(ctx, start, dtype);
+  auto stop_t = phi::funcs::TransDataType(ctx, stop, dtype);
+
+  T start_data = start_t.template data<T>()[0];
+  T stop_data = stop_t.template data<T>()[0];
+  PADDLE_ENFORCE_GT(
+      num,
+      0,
+      phi::errors::InvalidArgument("The num of linspace op should be larger "
+                                   "than 0, but received num is %d",
+                                   num));
+
+  out->Resize(phi::make_ddim({num}));
+  T* out_data = ctx.template Alloc<T>(out);
+
+  if (num > 1) {
+    // step should be of double type for all types
+    double step = (static_cast<double>(stop_data - start_data)) / (num - 1);
+    int half_num = num / 2;
+    for (int i = 0; i < num; ++i) {
+      if (i < half_num) {
+        out_data[i] = static_cast<T>(start_data + step * i);
+      } else {
+        out_data[i] = static_cast<T>(stop_data - step * (num - i - 1));
+      }
+    }
+  } else {
+    out_data[0] = static_cast<T>(start_data);
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(linspace,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::LinspaceKernel,
+                   float,
+                   int32_t,
+                   int64_t,
+                   double) {}
diff --git a/paddle/phi/kernels/cpu/log_loss_grad_kernel.cc b/paddle/phi/kernels/cpu/log_loss_grad_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2e2d94df59eeb8c2eb2ec2a7b5487e48566fbef8
--- /dev/null
+++ b/paddle/phi/kernels/cpu/log_loss_grad_kernel.cc
@@ -0,0 +1,22 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/log_loss_grad_kernel.h"
+
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/log_loss_grad_kernel_impl.h"
+
+PD_REGISTER_KERNEL(
+    log_loss_grad, CPU, ALL_LAYOUT, phi::LogLossGradKernel, float) {}
diff --git a/paddle/phi/kernels/cpu/log_loss_kernel.cc b/paddle/phi/kernels/cpu/log_loss_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..38e93486f7bf882a468f2cebb7f6c5757eadd512
--- /dev/null
+++ b/paddle/phi/kernels/cpu/log_loss_kernel.cc
@@ -0,0 +1,21 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/log_loss_kernel.h"
+
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/log_loss_kernel_impl.h"
+
+PD_REGISTER_KERNEL(log_loss, CPU, ALL_LAYOUT, phi::LogLossKernel, float) {}
diff --git a/paddle/phi/kernels/cpu/matrix_power_grad_kernel.cc b/paddle/phi/kernels/cpu/matrix_power_grad_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ae3b4d2b45582b84d81a2a57865c6cc287f86535
--- /dev/null
+++ b/paddle/phi/kernels/cpu/matrix_power_grad_kernel.cc
@@ -0,0 +1,26 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/phi/kernels/matrix_power_grad_kernel.h"
+#include "paddle/phi/kernels/impl/matrix_power_grad_kernel_impl.h"
+
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+PD_REGISTER_KERNEL(matrix_power_grad,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::MatrixPowerGradKernel,
+                   float,
+                   double) {}
diff --git a/paddle/fluid/operators/optimizers/adadelta_op.cu b/paddle/phi/kernels/cpu/matrix_power_kernel.cc
similarity index 58%
rename from paddle/fluid/operators/optimizers/adadelta_op.cu
rename to paddle/phi/kernels/cpu/matrix_power_kernel.cc
index 562a157f063b44d65254d556d44439eee3636c4c..f40e1e616f526262eee2a50b319935e7ab160bee 100644
--- a/paddle/fluid/operators/optimizers/adadelta_op.cu
+++ b/paddle/phi/kernels/cpu/matrix_power_kernel.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -11,9 +11,12 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include "paddle/fluid/operators/optimizers/adadelta_op.h"
 
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    adadelta, ops::AdadeltaOpKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::AdadeltaOpKernel<paddle::platform::CUDADeviceContext, double>);
+#include "paddle/phi/kernels/matrix_power_kernel.h"
+#include "paddle/phi/kernels/impl/matrix_power_kernel_impl.h"
+
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+PD_REGISTER_KERNEL(
+    matrix_power, CPU, ALL_LAYOUT, phi::MatrixPowerKernel, float, double) {}
diff --git a/paddle/phi/kernels/cpu/matrix_rank_kernel.cc b/paddle/phi/kernels/cpu/matrix_rank_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5e13abe8aed2caf205871a24cfddff0b8b959498
--- /dev/null
+++ b/paddle/phi/kernels/cpu/matrix_rank_kernel.cc
@@ -0,0 +1,43 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/matrix_rank_kernel.h"
+#include "paddle/phi/kernels/matrix_rank_tol_kernel.h"
+
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/full_kernel.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void MatrixRankKernel(const Context& dev_ctx,
+                      const DenseTensor& x,
+                      float tol,
+                      bool use_default_tol,
+                      bool hermitian,
+                      DenseTensor* out) {
+  DenseTensor atol_tensor;
+  if (use_default_tol) {
+    atol_tensor = phi::Full<T, Context>(dev_ctx, {1}, static_cast<T>(0));
+  } else {
+    atol_tensor = phi::Full<T, Context>(dev_ctx, {1}, static_cast<T>(tol));
+  }
+  MatrixRankTolKernel<T, Context>(
+      dev_ctx, x, atol_tensor, use_default_tol, hermitian, out);
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(
+    matrix_rank, CPU, ALL_LAYOUT, phi::MatrixRankKernel, float, double) {}
diff --git a/paddle/phi/kernels/cpu/matrix_rank_tol_kernel.cc b/paddle/phi/kernels/cpu/matrix_rank_tol_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..70b6316e1044426a0743c8d5251ca7d210956356
--- /dev/null
+++ b/paddle/phi/kernels/cpu/matrix_rank_tol_kernel.cc
@@ -0,0 +1,174 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/matrix_rank_tol_kernel.h"
+
+#include <Eigen/Dense>
+#include <Eigen/SVD>
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/full_kernel.h"
+#include "paddle/phi/kernels/funcs/compare_functors.h"
+#include "paddle/phi/kernels/funcs/eigen/common.h"
+#include "paddle/phi/kernels/funcs/elementwise_base.h"
+#include "paddle/phi/kernels/impl/matrix_rank_kernel_impl.h"
+#include "paddle/phi/kernels/math_kernel.h"
+#include "paddle/phi/kernels/reduce_max_kernel.h"
+
+namespace phi {
+
+template <typename T>
+void BatchEigenvalues(const T* x_data,
+                      T* eigenvalues_data,
+                      int batches,
+                      int rows,
+                      int cols,
+                      int k) {
+  // Eigen::Matrix API need non-const pointer.
+  T* input = const_cast<T*>(x_data);
+  int stride = rows * cols;
+  for (int i = 0; i < batches; i++) {
+    auto m = Eigen::Map<
+        Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>(
+        input + i * stride, rows, rows);
+    Eigen::SelfAdjointEigenSolver<
+        Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>
+        eigen_solver(m);
+    auto eigenvalues = eigen_solver.eigenvalues().cwiseAbs();
+    for (int j = 0; j < k; j++) {
+      *(eigenvalues_data + i * k + j) = eigenvalues[j];
+    }
+  }
+}
+
+template <typename T>
+void BatchSVD(const T* x_data,
+              T* eigenvalues_data,
+              int batches,
+              int rows,
+              int cols,
+              int k) {
+  // Eigen::Matrix API need non-const pointer.
+  T* input = const_cast<T*>(x_data);
+  int stride = rows * cols;
+  Eigen::BDCSVD<
+      Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>
+      svd;
+  for (int i = 0; i < batches; i++) {
+    auto m = Eigen::Map<
+        Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>(
+        input + i * stride, rows, cols);
+    svd.compute(m);
+    auto res_s = svd.singularValues();
+    for (int j = 0; j < k; j++) {
+      eigenvalues_data[i * k + j] = res_s[j];
+    }
+  }
+}
+
+template <typename T, typename Context>
+void MatrixRankTolKernel(const Context& dev_ctx,
+                         const DenseTensor& x,
+                         const DenseTensor& atol_tensor,
+                         bool use_default_tol,
+                         bool hermitian,
+                         DenseTensor* out) {
+  auto* x_data = x.data<T>();
+  dev_ctx.template Alloc<int64_t>(out);
+  auto dim_x = x.dims();
+  auto dim_out = out->dims();
+  int rows = dim_x[dim_x.size() - 2];
+  int cols = dim_x[dim_x.size() - 1];
+  int k = std::min(rows, cols);
+  auto numel = x.numel();
+  int batches = numel / (rows * cols);
+
+  T rtol_T = 0;
+
+  if (use_default_tol) {
+    rtol_T = std::numeric_limits<T>::epsilon() * std::max(rows, cols);
+  }
+
+  DenseTensor eigenvalue_tensor;
+  eigenvalue_tensor.Resize(detail::GetEigenvalueDim(dim_x, k));
+  auto* eigenvalue_data = dev_ctx.template Alloc<T>(&eigenvalue_tensor);
+
+  if (hermitian) {
+    BatchEigenvalues<T>(x_data, eigenvalue_data, batches, rows, cols, k);
+  } else {
+    BatchSVD<T>(x_data, eigenvalue_data, batches, rows, cols, k);
+  }
+
+  DenseTensor max_eigenvalue_tensor;
+  max_eigenvalue_tensor.Resize(detail::RemoveLastDim(eigenvalue_tensor.dims()));
+  dev_ctx.template Alloc<T>(&max_eigenvalue_tensor);
+  phi::MaxKernel<T, Context>(dev_ctx,
+                             eigenvalue_tensor,
+                             std::vector<int64_t>{-1},
+                             false,
+                             &max_eigenvalue_tensor);
+
+  DenseTensor temp_rtol_tensor;
+  temp_rtol_tensor =
+      phi::Full<T, Context>(dev_ctx, {1}, static_cast<T>(rtol_T));
+
+  DenseTensor rtol_tensor =
+      phi::Multiply<T>(dev_ctx, temp_rtol_tensor, max_eigenvalue_tensor);
+
+  DenseTensor tol_tensor;
+  tol_tensor.Resize(dim_out);
+  dev_ctx.template Alloc<T>(&tol_tensor);
+  funcs::ElementwiseCompute<GreaterElementFunctor<T>, T, T>(
+      dev_ctx,
+      atol_tensor,
+      rtol_tensor,
+      -1,
+      GreaterElementFunctor<T>(),
+      &tol_tensor);
+
+  tol_tensor.Resize(detail::NewAxisDim(tol_tensor.dims(), 1));
+
+  DenseTensor compare_result;
+  compare_result.Resize(detail::NewAxisDim(dim_out, k));
+  dev_ctx.template Alloc<int64_t>(&compare_result);
+  int axis = -1;
+  if (eigenvalue_tensor.dims().size() >= tol_tensor.dims().size()) {
+    funcs::ElementwiseCompute<funcs::GreaterThanFunctor<T, int64_t>, T, int>(
+        dev_ctx,
+        eigenvalue_tensor,
+        tol_tensor,
+        axis,
+        funcs::GreaterThanFunctor<T, int64_t>(),
+        &compare_result);
+  } else {
+    funcs::ElementwiseCompute<funcs::LessThanFunctor<T, int64_t>, T, int>(
+        dev_ctx,
+        eigenvalue_tensor,
+        tol_tensor,
+        axis,
+        funcs::LessThanFunctor<T, int64_t>(),
+        &compare_result);
+  }
+
+  phi::SumKernel<int64_t>(dev_ctx,
+                          compare_result,
+                          std::vector<int64_t>{-1},
+                          compare_result.dtype(),
+                          false,
+                          out);
+}
+}  // namespace phi
+
+PD_REGISTER_KERNEL(
+    matrix_rank_tol, CPU, ALL_LAYOUT, phi::MatrixRankTolKernel, float, double) {
+}
diff --git a/paddle/phi/kernels/cpu/maxout_grad_kernel.cc b/paddle/phi/kernels/cpu/maxout_grad_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..429344a362b1c3215f6019b70941db0255e304f3
--- /dev/null
+++ b/paddle/phi/kernels/cpu/maxout_grad_kernel.cc
@@ -0,0 +1,20 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/impl/maxout_grad_kernel_impl.h"
+
+#include "paddle/phi/core/kernel_registry.h"
+
+PD_REGISTER_KERNEL(
+    maxout_grad, CPU, ALL_LAYOUT, phi::MaxOutGradKernel, float, double) {}
diff --git a/paddle/fluid/operators/reduce_ops/reduce_all_op.cu b/paddle/phi/kernels/cpu/maxout_kernel.cc
similarity index 69%
rename from paddle/fluid/operators/reduce_ops/reduce_all_op.cu
rename to paddle/phi/kernels/cpu/maxout_kernel.cc
index a1f1a228aeb3a20807059a306a2fbff22d4a0bb8..e7cd3ab07ff598230338fcb1a3804952a000d14d 100644
--- a/paddle/fluid/operators/reduce_ops/reduce_all_op.cu
+++ b/paddle/phi/kernels/cpu/maxout_kernel.cc
@@ -1,4 +1,4 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/operators/reduce_ops/reduce_all_op.h"
+#include "paddle/phi/kernels/impl/maxout_kernel_impl.h"
 
-REGISTER_OP_CUDA_KERNEL(
-    reduce_all,
-    ops::ReduceCudaKernel<bool, kps::LogicalAndFunctor, kps::IdentityFunctor>);
+#include "paddle/phi/core/kernel_registry.h"
+
+PD_REGISTER_KERNEL(maxout, CPU, ALL_LAYOUT, phi::MaxOutKernel, float, double) {}
diff --git a/paddle/phi/kernels/cpu/multi_dot_grad_kernel.cc b/paddle/phi/kernels/cpu/multi_dot_grad_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2cd75404be821ce1f2303237fcb92cae53ee25d1
--- /dev/null
+++ b/paddle/phi/kernels/cpu/multi_dot_grad_kernel.cc
@@ -0,0 +1,22 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/phi/kernels/multi_dot_grad_kernel.h"
+#include "paddle/phi/kernels/impl/multi_dot_kernel_impl.h"
+
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+PD_REGISTER_KERNEL(
+    multi_dot_grad, CPU, ALL_LAYOUT, phi::MultiDotGradKernel, float, double) {}
diff --git a/paddle/fluid/operators/optimizers/adamax_op.cu b/paddle/phi/kernels/cpu/multi_dot_kernel.cc
similarity index 59%
rename from paddle/fluid/operators/optimizers/adamax_op.cu
rename to paddle/phi/kernels/cpu/multi_dot_kernel.cc
index 80e0219d4414db2909b5babc22599d8c0d906c7d..a4249a98e46dde19ea0bf91bee73ba7a0d425f28 100644
--- a/paddle/fluid/operators/optimizers/adamax_op.cu
+++ b/paddle/phi/kernels/cpu/multi_dot_kernel.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -11,9 +11,12 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include "paddle/fluid/operators/optimizers/adamax_op.h"
 
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    adamax, ops::AdamaxOpKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::AdamaxOpKernel<paddle::platform::CUDADeviceContext, double>);
+#include "paddle/phi/kernels/multi_dot_kernel.h"
+#include "paddle/phi/kernels/impl/multi_dot_kernel_impl.h"
+
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+PD_REGISTER_KERNEL(
+    multi_dot, CPU, ALL_LAYOUT, phi::MultiDotKernel, float, double) {}
diff --git a/paddle/phi/kernels/cpu/nll_loss_grad_kernel.cc b/paddle/phi/kernels/cpu/nll_loss_grad_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e7d74759f516ac1d2ba9538a136d9856c6c7ddb1
--- /dev/null
+++ b/paddle/phi/kernels/cpu/nll_loss_grad_kernel.cc
@@ -0,0 +1,171 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/nll_loss_grad_kernel.h"
+
+#include <memory>
+#include <string>
+#include "paddle/fluid/operators/math.h"
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+namespace phi {
+template <typename T>
+static void nll_loss_grad_1D(T* dx_data,
+                             const T* dout_data,
+                             const int64_t* label_data,
+                             const T* weight_data,
+                             const T* total_weight_data,
+                             const int64_t batch_size,
+                             const int64_t n_classes,
+                             const std::string reduction,
+                             const int64_t ignore_index) {
+  if (reduction == "none") {
+    for (int i = 0; i < batch_size; i++) {
+      const auto cur_label = label_data[i];
+      if (cur_label == ignore_index) {
+        continue;
+      }
+      const auto cur_weight =
+          weight_data ? weight_data[cur_label] : static_cast<T>(1);
+      dx_data[i * n_classes + cur_label] = -dout_data[i] * cur_weight;
+    }
+    return;
+  }
+
+  const T dout_val = *dout_data;
+  const T total_weight_val = *total_weight_data;
+  for (int i = 0; i < batch_size; i++) {
+    const auto cur_label = label_data[i];
+    if (cur_label == ignore_index) {
+      continue;
+    }
+    const auto cur_weight =
+        weight_data ? weight_data[cur_label] : static_cast<T>(1);
+    dx_data[i * n_classes + cur_label] = -dout_val * cur_weight;
+    if (reduction == "mean") {
+      dx_data[i * n_classes + cur_label] /= total_weight_val;
+    }
+  }
+}
+
+template <typename T>
+static void nll_loss_grad_2D(T* dx_data,
+                             const T* dout_data,
+                             const int64_t* label_data,
+                             const T* weight_data,
+                             const T* total_weight_data,
+                             const int64_t batch_size,
+                             const int64_t n_classes,
+                             const int64_t in_dim2,
+                             const int64_t in_dim3,
+                             const std::string& reduction,
+                             const int64_t ignore_index) {
+  const auto map_size = in_dim2 * in_dim3;
+  const auto sample_size = n_classes * map_size;
+
+  if (reduction == "none") {
+    for (int i = 0; i < batch_size; i++) {
+      for (int h = 0; h < in_dim2; h++) {
+        for (int w = 0; w < in_dim3; w++) {
+          const auto index = i * map_size + h * in_dim3 + w;
+          const auto cur_label = label_data[index];
+          if (cur_label == ignore_index) {
+            continue;
+          }
+          const auto cur_weight =
+              weight_data ? weight_data[cur_label] : static_cast<T>(1);
+          dx_data[i * sample_size + cur_label * map_size + h * in_dim3 + w] =
+              -cur_weight * dout_data[index];
+        }
+      }
+    }
+    return;
+  }
+
+  const T dout_val = *dout_data;
+  const T total_weight_val = *total_weight_data;
+  for (int i = 0; i < batch_size; i++) {
+    for (int h = 0; h < in_dim2; h++) {
+      for (int w = 0; w < in_dim3; w++) {
+        const auto index = i * map_size + h * in_dim3 + w;
+        const auto cur_label = label_data[index];
+        if (cur_label == ignore_index) {
+          continue;
+        }
+        const auto cur_weight =
+            weight_data ? weight_data[cur_label] : static_cast<T>(1);
+        const auto dx_index =
+            i * sample_size + cur_label * map_size + h * in_dim3 + w;
+        dx_data[dx_index] = -dout_val * cur_weight;
+        if (reduction == "mean") {
+          dx_data[dx_index] /= total_weight_val;
+        }
+      }
+    }
+  }
+}
+
+template <typename T, typename Context>
+void NllLossGradKernel(const Context& dev_ctx,
+                       const DenseTensor& x,
+                       const DenseTensor& labels,
+                       const DenseTensor& total_weight,
+                       paddle::optional<const DenseTensor&> weight,
+                       const DenseTensor& d_out,
+                       int64_t ignore_index,
+                       const std::string& reduction,
+                       DenseTensor* dx) {
+  auto dx_data = dev_ctx.template Alloc<T>(dx);
+  auto dout_data = d_out.data<T>();
+  auto label_data = labels.data<int64_t>();
+  auto weight_data = weight.get_ptr() ? weight.get_ptr()->data<T>() : nullptr;
+  auto total_weight_data = total_weight.data<T>();
+  memset(dx_data, 0, dx->numel() * sizeof(T));
+
+  const auto x_dims = x.dims();
+  const auto batch_size = x_dims[0];
+  const auto n_classes = x_dims[1];
+
+  if (x_dims.size() == 2) {
+    nll_loss_grad_1D(dx_data,
+                     dout_data,
+                     label_data,
+                     weight_data,
+                     total_weight_data,
+                     batch_size,
+                     n_classes,
+                     reduction,
+                     ignore_index);
+  } else if (x_dims.size() == 4) {
+    const auto in_dim2 = x_dims[2];
+    const auto in_dim3 = x_dims[3];
+    nll_loss_grad_2D(dx_data,
+                     dout_data,
+                     label_data,
+                     weight_data,
+                     total_weight_data,
+                     batch_size,
+                     n_classes,
+                     in_dim2,
+                     in_dim3,
+                     reduction,
+                     ignore_index);
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(
+    nll_loss_grad, CPU, ALL_LAYOUT, phi::NllLossGradKernel, float, double) {}
diff --git a/paddle/phi/kernels/cpu/nll_loss_kernel.cc b/paddle/phi/kernels/cpu/nll_loss_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..334b0082bde57fddff67aaeedee5de040ebf7c85
--- /dev/null
+++ b/paddle/phi/kernels/cpu/nll_loss_kernel.cc
@@ -0,0 +1,202 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/nll_loss_kernel.h"
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/enforce.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+namespace phi {
+
+template <typename T>
+static void nll_loss_1D(T* out_data,
+                        T* total_weight_data,
+                        const T* x_data,
+                        const int64_t* label_data,
+                        const T* weight_data,
+                        const int64_t batch_size,
+                        const int64_t n_classes,
+                        const std::string& reduction,
+                        const int64_t ignore_index) {
+  if (reduction == "none") {
+    for (int64_t i = 0; i < batch_size; ++i) {
+      const auto cur_label = label_data[i];
+      if (cur_label == ignore_index) {
+        out_data[i] = 0;
+        continue;
+      }
+      PADDLE_ENFORCE_EQ(cur_label >= 0 && cur_label < n_classes,
+                        true,
+                        phi::errors::InvalidArgument(
+                            "Label value is out of range. "
+                            "Expected label value in range of [0, %d), but "
+                            "received value is %d.",
+                            n_classes,
+                            cur_label));
+
+      const auto cur_weight =
+          weight_data ? weight_data[cur_label] : static_cast<T>(1);
+      out_data[i] = -x_data[i * n_classes + cur_label] * cur_weight;
+    }
+    return;
+  }
+
+  T output_val = 0;
+  T total_weight_val = 0;
+
+  for (int64_t i = 0; i < batch_size; i++) {
+    const auto cur_label = label_data[i];
+    if (cur_label == ignore_index) {
+      out_data[i] = 0;
+      continue;
+    }
+    PADDLE_ENFORCE_EQ(
+        cur_label >= 0 && cur_label < n_classes,
+        true,
+        phi::errors::InvalidArgument("label should not be out of bounds."));
+
+    const auto cur_weight =
+        weight_data ? weight_data[cur_label] : static_cast<T>(1);
+    total_weight_val += cur_weight;
+    output_val -= x_data[i * n_classes + cur_label] * cur_weight;
+  }
+  if (reduction == "mean" && total_weight_val != 0) {
+    output_val /= total_weight_val;
+  }
+  *out_data = output_val;
+  *total_weight_data = total_weight_val;
+}
+
+template <typename T>
+static void nll_loss_2D(T* out_data,
+                        T* total_weight_data,
+                        const T* x_data,
+                        const int64_t* label_data,
+                        const T* weight_data,
+                        const int64_t batch_size,
+                        const int64_t n_classes,
+                        const int64_t in_dim2,
+                        const int64_t in_dim3,
+                        const std::string& reduction,
+                        const int64_t ignore_index) {
+  const auto map_size = in_dim2 * in_dim3;
+  const auto sample_size = n_classes * map_size;
+  if (reduction == "none") {
+    for (int i = 0; i < batch_size; i++) {
+      for (int h = 0; h < in_dim2; h++) {
+        for (int w = 0; w < in_dim3; w++) {
+          const auto index = i * map_size + h * in_dim3 + w;
+          const auto cur_label = label_data[index];
+          if (cur_label == ignore_index) {
+            out_data[index] = 0;
+            continue;
+          }
+          PADDLE_ENFORCE_EQ(cur_label >= 0 && cur_label < n_classes,
+                            true,
+                            phi::errors::InvalidArgument(
+                                "label should not be out of bounds."));
+          const auto cur_weight =
+              weight_data ? weight_data[cur_label] : static_cast<T>(1);
+          out_data[index] = -x_data[i * sample_size + cur_label * map_size +
+                                    h * in_dim3 + w] *
+                            cur_weight;
+        }
+      }
+    }
+    return;
+  }
+
+  T output_val = 0;
+  T total_weight_val = 0;
+
+  for (int i = 0; i < batch_size; i++) {
+    for (int h = 0; h < in_dim2; h++) {
+      for (int w = 0; w < in_dim3; w++) {
+        const auto index = i * map_size + h * in_dim3 + w;
+        const auto cur_label = label_data[index];
+        if (cur_label == ignore_index) {
+          out_data[index] = 0;
+          continue;
+        }
+        PADDLE_ENFORCE_EQ(
+            cur_label >= 0 && cur_label < n_classes,
+            true,
+            phi::errors::InvalidArgument("label should not be out of bounds."));
+        const auto cur_weight =
+            weight_data ? weight_data[cur_label] : static_cast<T>(1);
+        total_weight_val += cur_weight;
+        output_val -=
+            x_data[i * sample_size + cur_label * map_size + h * in_dim3 + w] *
+            cur_weight;
+      }
+    }
+  }
+
+  if (reduction == "mean" && total_weight_val != 0) {
+    output_val /= total_weight_val;
+  }
+  *out_data = output_val;
+  *total_weight_data = total_weight_val;
+}
+
+template <typename T, typename Context>
+void NllLossRawKernel(const Context& dev_ctx,
+                      const DenseTensor& x,
+                      const DenseTensor& labels,
+                      paddle::optional<const DenseTensor&> weight,
+                      int64_t ignore_index,
+                      const std::string& reduction,
+                      DenseTensor* out,
+                      DenseTensor* total_weight) {
+  auto x_data = x.data<T>();
+  auto label_data = labels.data<int64_t>();
+  auto weight_data = weight.get_ptr() ? weight.get_ptr()->data<T>() : nullptr;
+  auto out_data = dev_ctx.template Alloc<T>(out);
+  auto total_weight_data = dev_ctx.template Alloc<T>(total_weight);
+  *total_weight_data = 0;
+
+  auto x_dims = x.dims();
+  const auto batch_size = x_dims[0];
+  const auto n_classes = x_dims[1];
+
+  if (x_dims.size() == 2) {
+    nll_loss_1D<T>(out_data,
+                   total_weight_data,
+                   x_data,
+                   label_data,
+                   weight_data,
+                   batch_size,
+                   n_classes,
+                   reduction,
+                   ignore_index);
+  } else if (x_dims.size() == 4) {
+    const auto in_dim2 = x_dims[2];
+    const auto in_dim3 = x_dims[3];
+    nll_loss_2D<T>(out_data,
+                   total_weight_data,
+                   x_data,
+                   label_data,
+                   weight_data,
+                   batch_size,
+                   n_classes,
+                   in_dim2,
+                   in_dim3,
+                   reduction,
+                   ignore_index);
+  }
+}
+}  // namespace phi
+
+PD_REGISTER_KERNEL(
+    nll_loss, CPU, ALL_LAYOUT, phi::NllLossRawKernel, float, double) {}
diff --git a/paddle/phi/kernels/cpu/norm_grad_kernel.cc b/paddle/phi/kernels/cpu/norm_grad_kernel.cc
index 597207a05a226ac598d9141b42d5682bed5364f1..bd05e2c4c6ec1759b55a6f4fb0e7f83d1aeac954 100644
--- a/paddle/phi/kernels/cpu/norm_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/norm_grad_kernel.cc
@@ -26,9 +26,9 @@ namespace phi {
 
 template <typename T, typename Context>
 void NormGradKernel(const Context& ctx,
-                    const DenseTensor& out_grad,
                     const DenseTensor& x,
                     const DenseTensor& norm,
+                    const DenseTensor& out_grad,
                     int axis,
                     float epsilon,
                     bool is_test,
diff --git a/paddle/fluid/operators/one_hot_v2_op.h b/paddle/phi/kernels/cpu/one_hot_kernel.cc
similarity index 50%
rename from paddle/fluid/operators/one_hot_v2_op.h
rename to paddle/phi/kernels/cpu/one_hot_kernel.cc
index 9d42c5875bb6eecd1244ca1a0dd6442985ec2a02..dc58489ebf70eaaa7efba52775f7ac62bb2ef5b2 100644
--- a/paddle/fluid/operators/one_hot_v2_op.h
+++ b/paddle/phi/kernels/cpu/one_hot_kernel.cc
@@ -1,4 +1,4 @@
-//   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -12,23 +12,25 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#pragma once
-#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/phi/kernels/one_hot_kernel.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/core/utils/data_type.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
-namespace paddle {
-namespace operators {
+namespace phi {
 
 template <typename DeviceContext, typename InT>
 struct OneHotV2OpFunctor {
-  const framework::LoDTensor* in_;
-  framework::LoDTensor* out_;
+  const DenseTensor* in_;
+  DenseTensor* out_;
   int depth_;
   const DeviceContext& ctx_;
   bool allow_out_of_range_;
 
-  OneHotV2OpFunctor(const framework::LoDTensor* in, framework::LoDTensor* out,
-                    int depth, const DeviceContext& ctx,
+  OneHotV2OpFunctor(const DenseTensor* in,
+                    DenseTensor* out,
+                    int depth,
+                    const DeviceContext& ctx,
                     bool allow_out_of_range = false)
       : in_(in),
         out_(out),
@@ -40,8 +42,8 @@ struct OneHotV2OpFunctor {
   void apply() const {
     auto* p_in_data = in_->data<InT>();
     auto numel = in_->numel();
-    auto* p_out_data = out_->mutable_data<OutT>(ctx_.GetPlace());
-    phi::funcs::set_constant(ctx_, out_, 0.0);
+    auto* p_out_data = ctx_.template Alloc<OutT>(out_);
+    funcs::set_constant(ctx_, out_, 0.0);
 
     if (allow_out_of_range_) {
       for (int i = 0; i < numel; ++i) {
@@ -52,51 +54,46 @@ struct OneHotV2OpFunctor {
     } else {
       for (int i = 0; i < numel; ++i) {
         PADDLE_ENFORCE_GE(
-            p_in_data[i], 0,
-            platform::errors::InvalidArgument(
+            p_in_data[i],
+            0,
+            phi::errors::InvalidArgument(
                 "Illegal index value, Input(input) value should be at least 0, "
                 "but received input (%d) less than 0",
                 p_in_data[i]));
         PADDLE_ENFORCE_LT(
-            p_in_data[i], depth_,
-            platform::errors::InvalidArgument(
+            p_in_data[i],
+            depth_,
+            phi::errors::InvalidArgument(
                 "Illegal index value, Input(input) value should be less than "
                 "Input(depth), "
                 "but received input (%d) not less than depth (%d)",
-                p_in_data[i], depth_));
+                p_in_data[i],
+                depth_));
         *(p_out_data + i * depth_ + p_in_data[i]) = 1.0;
       }
     }
   }
 };
 
-using LoDTensor = framework::LoDTensor;
-using Tensor = framework::Tensor;
-template <typename DeviceContext, typename T>
-class OneHotV2Kernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* in = context.Input<LoDTensor>("X");
-    auto* out = context.Output<LoDTensor>("Out");
-    int depth = context.Attr<int>("depth");
-    bool allow_out_of_range = context.Attr<bool>("allow_out_of_range");
-    if (context.HasInput("depth_tensor")) {
-      auto* depth_tensor = context.Input<Tensor>("depth_tensor");
-      auto* depth_data = depth_tensor->data<int32_t>();
-      depth = depth_data[0];
-      auto out_dims = out->dims();
-      out_dims[out_dims.size() - 1] = depth;
-      out->Resize(out_dims);
-    }
-
-    framework::VisitDataType(
-        static_cast<framework::proto::VarType::Type>(
-            context.Attr<int>("dtype")),
-        OneHotV2OpFunctor<DeviceContext, T>(
-            in, out, depth, context.template device_context<DeviceContext>(),
-            allow_out_of_range));
+template <typename T, typename Context>
+void OneHotRawKernel(const Context& dev_ctx,
+                     const DenseTensor& x,
+                     int32_t depth,
+                     DataType dtype,
+                     bool allow_out_of_range,
+                     DenseTensor* out) {
+  auto out_dims = out->dims();
+  if (out_dims[out_dims.size() - 1] == -1) {
+    out_dims[out_dims.size() - 1] = depth;
+    out->Resize(out_dims);
   }
-};
 
-}  // namespace operators
-}  // namespace paddle
+  phi::VisitDataType(dtype,
+                     OneHotV2OpFunctor<Context, T>(
+                         &x, out, depth, dev_ctx, allow_out_of_range));
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(
+    one_hot_raw, CPU, ALL_LAYOUT, phi::OneHotRawKernel, int, int64_t) {}
diff --git a/paddle/phi/kernels/cpu/pad_grad_kernel.cc b/paddle/phi/kernels/cpu/pad_grad_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..67e6da7d0e06a572d6d1b5f5353f3fdecc122eaa
--- /dev/null
+++ b/paddle/phi/kernels/cpu/pad_grad_kernel.cc
@@ -0,0 +1,28 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/pad_grad_kernel.h"
+
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/pad_grad_kernel_impl.h"
+
+PD_REGISTER_KERNEL(pad_grad,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::PadGradKernel,
+                   float,
+                   double,
+                   phi::dtype::complex<float>,
+                   phi::dtype::complex<double>) {}
diff --git a/paddle/phi/kernels/cpu/pad_kernel.cc b/paddle/phi/kernels/cpu/pad_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f4a0acdcca267050b04f8063147c140e5631e27b
--- /dev/null
+++ b/paddle/phi/kernels/cpu/pad_kernel.cc
@@ -0,0 +1,30 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/pad_kernel.h"
+
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/pad_kernel_impl.h"
+
+PD_REGISTER_KERNEL(pad,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::PadKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t,
+                   phi::dtype::complex<float>,
+                   phi::dtype::complex<double>) {}
diff --git a/paddle/phi/kernels/cpu/pool_grad_kernel.cc b/paddle/phi/kernels/cpu/pool_grad_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..bb97694d8fc38d92f5290894a2c45dd21e7b1717
--- /dev/null
+++ b/paddle/phi/kernels/cpu/pool_grad_kernel.cc
@@ -0,0 +1,49 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/pool_grad_kernel.h"
+
+#include "paddle/phi/kernels/impl/pool_grad_kernel_impl.h"
+
+#include "paddle/phi/core/kernel_registry.h"
+
+PD_REGISTER_KERNEL(
+    pool2d_grad, CPU, ALL_LAYOUT, phi::Pool2dGradKernel, float, double) {}
+PD_REGISTER_KERNEL(pool2d_double_grad,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::Pool2dDoubleGradKernel,
+                   float,
+                   double) {}
+PD_REGISTER_KERNEL(max_pool2d_with_index_grad,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::MaxPool2dWithIndexGradKernel,
+                   float,
+                   double) {
+  kernel->InputAt(1).SetDataType(
+      paddle::experimental::CppTypeToDataType<int>::Type());
+}
+
+PD_REGISTER_KERNEL(
+    pool3d_grad, CPU, ALL_LAYOUT, phi::Pool3dGradKernel, float, double) {}
+PD_REGISTER_KERNEL(max_pool3d_with_index_grad,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::MaxPool3dWithIndexGradKernel,
+                   float,
+                   double) {
+  kernel->InputAt(1).SetDataType(
+      paddle::experimental::CppTypeToDataType<int>::Type());
+}
diff --git a/paddle/phi/kernels/cpu/pool_kernel.cc b/paddle/phi/kernels/cpu/pool_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..1d57e282c3c8ae85573bf11eff43e6551a808ea0
--- /dev/null
+++ b/paddle/phi/kernels/cpu/pool_kernel.cc
@@ -0,0 +1,41 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/pool_kernel.h"
+
+#include "paddle/phi/kernels/impl/pool_kernel_impl.h"
+
+#include "paddle/phi/core/kernel_registry.h"
+
+PD_REGISTER_KERNEL(pool2d, CPU, ALL_LAYOUT, phi::Pool2dKernel, float, double) {}
+PD_REGISTER_KERNEL(max_pool2d_with_index,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::MaxPool2dWithIndexKernel,
+                   float,
+                   double) {
+  kernel->OutputAt(1).SetDataType(
+      paddle::experimental::CppTypeToDataType<int>::Type());
+}
+
+PD_REGISTER_KERNEL(pool3d, CPU, ALL_LAYOUT, phi::Pool3dKernel, float, double) {}
+PD_REGISTER_KERNEL(max_pool3d_with_index,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::MaxPool3dWithIndexKernel,
+                   float,
+                   double) {
+  kernel->OutputAt(1).SetDataType(
+      paddle::experimental::CppTypeToDataType<int>::Type());
+}
diff --git a/paddle/phi/kernels/cpu/psroi_pool_grad_kernel.cc b/paddle/phi/kernels/cpu/psroi_pool_grad_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..fbed3f1cb133ada68b90a5283fc182373488c565
--- /dev/null
+++ b/paddle/phi/kernels/cpu/psroi_pool_grad_kernel.cc
@@ -0,0 +1,140 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/psroi_pool_grad_kernel.h"
+
+#include <algorithm>
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/math_function.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void PsroiPoolGradKernel(const Context& ctx,
+                         const DenseTensor& x,
+                         const DenseTensor& rois,
+                         paddle::optional<const DenseTensor&> rois_num,
+                         const DenseTensor& dout,
+                         int pooled_height,
+                         int pooled_width,
+                         int output_channels,
+                         float spatial_scale,
+                         DenseTensor* dx) {
+  if (dx) {
+    auto in_dims = x.dims();
+    int input_channels = in_dims[1];
+    int height = in_dims[2];
+    int width = in_dims[3];
+    int rois_num_t = rois.dims()[0];
+
+    // set roi batch id
+    DenseTensor rois_batch_id_list;
+    rois_batch_id_list.Resize({rois_num_t});
+    int* rois_batch_id_data = ctx.template Alloc<int>(&rois_batch_id_list);
+    int rois_batch_size;
+    if (rois_num.get_ptr()) {
+      rois_batch_size = rois_num->numel();
+      auto* rois_num_t_data = rois_num->data<int>();
+      int start = 0;
+      for (int n = 0; n < rois_batch_size; ++n) {
+        for (int i = start; i < start + rois_num_t_data[n]; ++i) {
+          rois_batch_id_data[i] = n;
+        }
+        start += rois_num_t_data[n];
+      }
+    } else {
+      auto rois_lod = rois.lod().back();
+      rois_batch_size = rois_lod.size() - 1;
+      // calculate batch id index for each roi according to LoD
+      for (int n = 0; n < rois_batch_size; ++n) {
+        for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
+          rois_batch_id_data[i] = n;
+        }
+      }
+    }
+    const T* input_rois = rois.data<T>();
+    const T* dout_data = dout.data<T>();
+    T* dx_data = ctx.template Alloc<T>(dx);
+
+    // set gradient of X to be 0. before backpropagate.
+    funcs::SetConstant<Context, T> set_zero;
+    set_zero(ctx, dx, static_cast<T>(0));
+
+    // backpropagate gradient per output pixel
+    int dout_size = dout.numel();
+    for (int i = 0; i < dout_size; ++i) {
+      // The output is in order (n, c, ph, pw)
+      int pw = i % pooled_width;
+      int ph = (i / pooled_width) % pooled_height;
+      int c = (i / pooled_width / pooled_height) % output_channels;
+      int n = i / pooled_width / pooled_height / output_channels;
+
+      // set roi_batch_id
+      int roi_batch_id = rois_batch_id_data[n];
+      int input_channel = (c * pooled_height + ph) * pooled_width + pw;
+      int input_offset =
+          (roi_batch_id * input_channels + input_channel) * height * width;
+      T* offset_dx_data = dx_data + input_offset;
+
+      // [start, end) interval for spatial sampling
+      const T* offset_input_rois = input_rois + n * 4;
+      T roi_start_w =
+          static_cast<T>(round(offset_input_rois[0])) * spatial_scale;
+      T roi_start_h =
+          static_cast<T>(round(offset_input_rois[1])) * spatial_scale;
+      T roi_end_w =
+          static_cast<T>(round(offset_input_rois[2]) + 1.) * spatial_scale;
+      T roi_end_h =
+          static_cast<T>(round(offset_input_rois[3]) + 1.) * spatial_scale;
+
+      // Force too small ROIs to be 1x1
+      T roi_height = std::max(roi_end_h - roi_start_h, (T)0.1);  // avoid 0
+      T roi_width = std::max(roi_end_w - roi_start_w, (T)0.1);
+
+      // Compute w and h at input feature map
+      T bin_size_h = roi_height / static_cast<T>(pooled_height);
+      T bin_size_w = roi_width / static_cast<T>(pooled_width);
+
+      int hstart = floor(bin_size_h * static_cast<T>(ph) + roi_start_h);
+      int wstart = floor(bin_size_w * static_cast<T>(pw) + roi_start_w);
+      int hend = ceil(bin_size_h * static_cast<T>(ph + 1) + roi_start_h);
+      int wend = ceil(bin_size_w * static_cast<T>(pw + 1) + roi_start_w);
+
+      // Add roi offsets and clip to input boundaries
+      hstart = std::min(std::max(hstart, 0), height);
+      hend = std::min(std::max(hend, 0), height);
+      wstart = std::min(std::max(wstart, 0), width);
+      wend = std::min(std::max(wend, 0), width);
+      bool is_empty = (hend <= hstart) || (wend <= wstart);
+
+      // Accumulate diff_val into input data
+      T bin_area = static_cast<T>((hend - hstart) * (wend - wstart));
+      T diff_val = is_empty ? 0. : dout_data[i] / bin_area;
+      for (int ih = hstart; ih < hend; ++ih) {
+        for (int iw = wstart; iw < wend; ++iw) {
+          int input_index = ih * width + iw;
+          offset_dx_data[input_index] += diff_val;
+        }
+      }
+    }
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(
+    psroi_pool_grad, CPU, ALL_LAYOUT, phi::PsroiPoolGradKernel, float, double) {
+  kernel->InputAt(2).SetDataType(
+      paddle::experimental::CppTypeToDataType<int>::Type());
+}
diff --git a/paddle/phi/kernels/cpu/psroi_pool_kernel.cc b/paddle/phi/kernels/cpu/psroi_pool_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..06cd03395d9656614995ef0ad91dad04b27717bf
--- /dev/null
+++ b/paddle/phi/kernels/cpu/psroi_pool_kernel.cc
@@ -0,0 +1,174 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/psroi_pool_kernel.h"
+
+#include <algorithm>
+#include "paddle/phi/core/kernel_registry.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void PsroiPoolKernel(const Context& ctx,
+                     const DenseTensor& x,
+                     const DenseTensor& rois,
+                     paddle::optional<const DenseTensor&> rois_num,
+                     int pooled_height,
+                     int pooled_width,
+                     int output_channels,
+                     float spatial_scale,
+                     DenseTensor* out) {
+  auto in_dims = x.dims();
+  int batch_size = in_dims[0];
+  int input_channels = in_dims[1];
+  int height = in_dims[2];
+  int width = in_dims[3];
+  int rois_num_t = rois.dims()[0];
+
+  PADDLE_ENFORCE_EQ(input_channels,
+                    output_channels * pooled_height * pooled_width,
+                    errors::InvalidArgument(
+                        "the channels of input "
+                        "X should equal the product of "
+                        "output_channels x pooled_height x pooled_width"));
+
+  auto in_stride = stride(in_dims);
+  auto out_stride = stride(out->dims());
+
+  const T* input_data = x.data<T>();
+
+  DenseTensor rois_batch_id_list;
+  rois_batch_id_list.Resize({rois_num_t});
+  int* rois_batch_id_data = ctx.template Alloc<int>(&rois_batch_id_list);
+
+  int rois_batch_size;
+  if (rois_num.get_ptr()) {
+    rois_batch_size = rois_num->numel();
+    auto* rois_num_data = rois_num->data<int>();
+    PADDLE_ENFORCE_EQ(
+        rois_batch_size,
+        batch_size,
+        errors::InvalidArgument(
+            "The batch size of rois and the batch size of images "
+            " must be the same. But received the batch size of rois is %d, "
+            "and the batch size of images is %d",
+            rois_batch_size,
+            batch_size));
+    int rois_num_count = 0;
+    for (int i = 0; i < rois_batch_size; ++i) {
+      rois_num_count += rois_num_data[i];
+    }
+    PADDLE_ENFORCE_EQ(
+        rois_num_count,
+        rois_num_t,
+        errors::InvalidArgument(
+            "the rois_num from input and RoisNum must be the same"));
+    int start = 0;
+    for (int n = 0; n < rois_batch_size; ++n) {
+      for (int i = start; i < start + rois_num_data[n]; ++i) {
+        rois_batch_id_data[i] = n;
+      }
+      start += rois_num_data[n];
+    }
+  } else {
+    auto rois_lod = rois.lod().back();
+    rois_batch_size = rois_lod.size() - 1;
+    PADDLE_ENFORCE_EQ(
+        rois_batch_size,
+        batch_size,
+        errors::InvalidArgument("the rois_batch_size and input(X) "
+                                "batch_size should be the same."));
+    int rois_num_with_lod = rois_lod[rois_batch_size];
+    PADDLE_ENFORCE_EQ(rois_num_with_lod,
+                      rois_num_t,
+                      errors::InvalidArgument(
+                          "the rois_num from input and lod must be the same"));
+    // calculate batch id index for each roi according to LoD
+    for (int n = 0; n < rois_batch_size; ++n) {
+      for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
+        rois_batch_id_data[i] = n;
+      }
+    }
+  }
+  T* output_data = ctx.template Alloc<T>(out);
+  const T* input_rois = rois.data<T>();
+
+  // calculate psroipooling, parallel processing can be implemented per ROI
+  for (int n = 0; n < rois_num_t; ++n) {
+    // set roi batch id
+    int roi_batch_id = rois_batch_id_data[n];
+
+    // [start, end) interval for spatial sampling
+    const T* offset_input_rois = input_rois + n * 4;
+    T roi_start_w = static_cast<T>(round(offset_input_rois[0])) * spatial_scale;
+    T roi_start_h = static_cast<T>(round(offset_input_rois[1])) * spatial_scale;
+    T roi_end_w =
+        static_cast<T>(round(offset_input_rois[2]) + 1.) * spatial_scale;
+    T roi_end_h =
+        static_cast<T>(round(offset_input_rois[3]) + 1.) * spatial_scale;
+    // Force too small rois to be 1 x 1
+    T roi_height = std::max(roi_end_h - roi_start_h, (T)0.1);  // avoid 0
+    T roi_width = std::max(roi_end_w - roi_start_w, (T)0.1);
+
+    // Compute bin size w and h at input feature map
+    T bin_size_h = roi_height / static_cast<T>(pooled_height);
+    T bin_size_w = roi_width / static_cast<T>(pooled_width);
+
+    // calculate each pixel of the output feature map.
+    int out_roi_offset = n * out_stride[0];
+    for (int c = 0; c < output_channels; ++c) {
+      // per category
+      int out_plane_offset = out_roi_offset + c * out_stride[1];
+      for (int ph = 0; ph < pooled_height; ++ph) {
+        int out_row_offset = out_plane_offset + ph * out_stride[2];
+        for (int pw = 0; pw < pooled_width; ++pw) {
+          // calculate w and h at input feature map
+          int hstart = floor(static_cast<T>(ph) * bin_size_h + roi_start_h);
+          int wstart = floor(static_cast<T>(pw) * bin_size_w + roi_start_w);
+          int hend = ceil(static_cast<T>(ph + 1) * bin_size_h + roi_start_h);
+          int wend = ceil(static_cast<T>(pw + 1) * bin_size_w + roi_start_w);
+          //  Add roi offsets and clip to input boundaries
+          hstart = std::min(std::max(hstart, 0), height);
+          wstart = std::min(std::max(wstart, 0), width);
+          hend = std::min(std::max(hend, 0), height);
+          wend = std::min(std::max(wend, 0), width);
+
+          int output_index = out_row_offset + pw;
+          int input_channel = (c * pooled_height + ph) * pooled_width + pw;
+          int input_plane_offset =
+              roi_batch_id * in_stride[0] + input_channel * in_stride[1];
+          const T* offset_input_data = input_data + input_plane_offset;
+          T out_sum = 0.;
+          bool is_empty = (hend <= hstart) || (wend <= wstart);
+          for (int ih = hstart; ih < hend; ++ih) {
+            for (int iw = wstart; iw < wend; ++iw) {
+              int input_index = ih * in_stride[2] + iw;
+              out_sum += offset_input_data[input_index];
+            }
+          }
+          T bin_area = (hend - hstart) * (wend - wstart);
+          output_data[output_index] = is_empty ? 0. : out_sum / bin_area;
+        }
+      }
+    }
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(
+    psroi_pool, CPU, ALL_LAYOUT, phi::PsroiPoolKernel, float, double) {
+  kernel->InputAt(2).SetDataType(
+      paddle::experimental::CppTypeToDataType<int>::Type());
+}
diff --git a/paddle/phi/kernels/cpu/put_along_axis_grad_kernel.cc b/paddle/phi/kernels/cpu/put_along_axis_grad_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e94d09e0337f27df2ee228b0b10e817a6f192803
--- /dev/null
+++ b/paddle/phi/kernels/cpu/put_along_axis_grad_kernel.cc
@@ -0,0 +1,83 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/put_along_axis_grad_kernel.h"
+
+#include "paddle/fluid/framework/convert_utils.h"
+#include "paddle/fluid/operators/gather_scatter_kernel.h"
+#include "paddle/fluid/platform/place.h"
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/copy_kernel.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void PutAlongAxisGradKernel(const Context& dev_ctx,
+                            const DenseTensor& x,
+                            const DenseTensor& index,
+                            const DenseTensor& out_grad,
+                            int axis,
+                            const std::string& reduce,
+                            DenseTensor* x_grad,
+                            DenseTensor* value_grad) {
+  PADDLE_ENFORCE_EQ(
+      paddle::platform::is_cpu_place(dev_ctx.GetPlace()),
+      true,
+      errors::PreconditionNotMet("PutAlongAxisGradOpKernel only runs on CPU."));
+
+  const auto& index_type =
+      paddle::framework::TransToProtoVarType(index.dtype());
+  if (x_grad) {
+    phi::Copy(dev_ctx, out_grad, dev_ctx.GetPlace(), false, x_grad);
+    if (index_type == paddle::framework::proto::VarType::INT32) {
+      paddle::operators::cpu_scatter_input_grad_kernel<T, int32_t>(
+          // Here passing an unused argument out_grad, because it's
+          // convenient to instantiate a bunch of template function with the
+          // same arguments list.
+          out_grad,
+          axis,
+          index,
+          *x_grad,
+          dev_ctx);
+    } else {
+      paddle::operators::cpu_scatter_input_grad_kernel<T, int64_t>(
+          out_grad, axis, index, *x_grad, dev_ctx);
+    }
+  }
+
+  if (value_grad) {
+    value_grad->Resize(index.dims());
+    value_grad->mutable_data<T>(dev_ctx.GetPlace());
+    if (index_type == paddle::framework::proto::VarType::INT32) {
+      paddle::operators::cpu_gather_kernel<T, int32_t>(
+          out_grad, axis, index, *value_grad, dev_ctx);
+    } else if (index_type == paddle::framework::proto::VarType::INT64) {
+      paddle::operators::cpu_gather_kernel<T, int64_t>(
+          out_grad, axis, index, *value_grad, dev_ctx);
+    }
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(put_along_axis_grad,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::PutAlongAxisGradKernel,
+                   float,
+                   double,
+                   int,
+                   uint8_t,
+                   int64_t) {}
diff --git a/paddle/phi/kernels/cpu/put_along_axis_kernel.cc b/paddle/phi/kernels/cpu/put_along_axis_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..83c9a915ee6357c4462f64b1e193e546973560ce
--- /dev/null
+++ b/paddle/phi/kernels/cpu/put_along_axis_kernel.cc
@@ -0,0 +1,87 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/put_along_axis_kernel.h"
+
+#include "paddle/fluid/framework/convert_utils.h"
+#include "paddle/fluid/operators/gather_scatter_kernel.h"
+#include "paddle/fluid/platform/place.h"
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/copy_kernel.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void PutAlongAxisKernel(const Context& dev_ctx,
+                        const DenseTensor& x,
+                        const DenseTensor& index,
+                        const DenseTensor& value,
+                        int axis,
+                        const std::string& reduce,
+                        DenseTensor* out) {
+  PADDLE_ENFORCE_EQ(
+      paddle::platform::is_cpu_place(dev_ctx.GetPlace()),
+      true,
+      errors::PreconditionNotMet("PutAlongAxisOpKernel only runs on CPU."));
+
+  phi::Copy(dev_ctx, x, dev_ctx.GetPlace(), false, out);
+  const auto& index_type =
+      paddle::framework::TransToProtoVarType(index.dtype());
+  if (reduce == "add") {
+    if (index_type == paddle::framework::proto::VarType::INT32) {
+      paddle::operators::cpu_scatter_add_kernel<T, int32_t>(
+          *out, axis, index, value, dev_ctx);
+    } else if (index_type == paddle::framework::proto::VarType::INT64) {
+      paddle::operators::cpu_scatter_add_kernel<T, int64_t>(
+          *out, axis, index, value, dev_ctx);
+    }
+  } else if (reduce == "multiply" || reduce == "mul") {
+    if (index_type == paddle::framework::proto::VarType::INT32) {
+      paddle::operators::cpu_scatter_mul_kernel<T, int32_t>(
+          *out, axis, index, value, dev_ctx);
+    } else if (index_type == paddle::framework::proto::VarType::INT64) {
+      paddle::operators::cpu_scatter_mul_kernel<T, int64_t>(
+          *out, axis, index, value, dev_ctx);
+    }
+  } else if (reduce == "assign") {
+    if (index_type == paddle::framework::proto::VarType::INT32) {
+      paddle::operators::cpu_scatter_assign_kernel<T, int32_t>(
+          *out, axis, index, value, dev_ctx);
+    } else if (index_type == paddle::framework::proto::VarType::INT64) {
+      paddle::operators::cpu_scatter_assign_kernel<T, int64_t>(
+          *out, axis, index, value, dev_ctx);
+    }
+  } else {
+    PADDLE_THROW(errors::InvalidArgument(
+        "can not support reduce: '%s' for scatter kernel, only "
+        "support reduce op: 'add', 'assign', 'mul' and 'multiply', the "
+        "defalut reduce "
+        "op is 'assign' ",
+        reduce));
+    return;
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(put_along_axis,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::PutAlongAxisKernel,
+                   float,
+                   double,
+                   int,
+                   uint8_t,
+                   int64_t) {}
diff --git a/paddle/phi/kernels/cpu/reduce.h b/paddle/phi/kernels/cpu/reduce.h
index 4e268d40038cfb56b1e772e14b0ed7699f9700dd..af67bdf5d624f33fd4ec06db425ec8312b490642 100644
--- a/paddle/phi/kernels/cpu/reduce.h
+++ b/paddle/phi/kernels/cpu/reduce.h
@@ -239,4 +239,29 @@ void Reduce(const DeviceContext& dev_ctx,
   }
 }
 
+template <typename DeviceContext, typename OutT, typename Functor>
+void BoolReduceKernel(const DeviceContext& dev_ctx,
+                      const phi::DenseTensor& input,
+                      const std::vector<int64_t>& dims,
+                      bool keep_dim,
+                      bool reduce_all,
+                      phi::DenseTensor* output) {
+  dev_ctx.template Alloc<OutT>(output);
+
+  // The dims has full dim, set the reduce_all is True
+  const auto& input_dim_size = input.dims().size();
+  std::set<int> dims_set(dims.begin(), dims.end());
+  bool full_dim = true;
+  for (auto i = 0; i < input_dim_size; i++) {
+    if (dims_set.find(i) == dims_set.end()) {
+      full_dim = false;
+      break;
+    }
+  }
+  reduce_all = (reduce_all || full_dim);
+
+  ReduceKernelImpl<DeviceContext, bool, OutT, Functor>(
+      dev_ctx, input, output, dims, keep_dim, reduce_all);
+}
+
 }  // namespace phi
diff --git a/paddle/phi/kernels/cpu/reduce_all_kernel.cc b/paddle/phi/kernels/cpu/reduce_all_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3e8e38ee4447e67359e694700504c1041d0a15e7
--- /dev/null
+++ b/paddle/phi/kernels/cpu/reduce_all_kernel.cc
@@ -0,0 +1,37 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/reduce_all_kernel.h"
+
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/cpu/reduce.h"
+#include "paddle/phi/kernels/funcs/reduce_functor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void AllRawKernel(const Context& dev_ctx,
+                  const DenseTensor& x,
+                  const std::vector<int64_t>& dims,
+                  bool keep_dim,
+                  bool reduce_all,
+                  DenseTensor* out) {
+  phi::BoolReduceKernel<CPUContext, T, phi::funcs::AllFunctor>(
+      dev_ctx, x, dims, keep_dim, reduce_all, out);
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(all_raw, CPU, ALL_LAYOUT, phi::AllRawKernel, bool) {}
diff --git a/paddle/phi/kernels/cpu/reduce_any_kernel.cc b/paddle/phi/kernels/cpu/reduce_any_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4fd71f1d0b169866376664bdf2b0b89b13c120e1
--- /dev/null
+++ b/paddle/phi/kernels/cpu/reduce_any_kernel.cc
@@ -0,0 +1,37 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/reduce_any_kernel.h"
+
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/cpu/reduce.h"
+#include "paddle/phi/kernels/funcs/reduce_functor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void AnyRawKernel(const Context& dev_ctx,
+                  const DenseTensor& x,
+                  const std::vector<int64_t>& dims,
+                  bool keep_dim,
+                  bool reduce_all,
+                  DenseTensor* out) {
+  phi::BoolReduceKernel<CPUContext, T, phi::funcs::AnyFunctor>(
+      dev_ctx, x, dims, keep_dim, reduce_all, out);
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(any_raw, CPU, ALL_LAYOUT, phi::AnyRawKernel, bool) {}
diff --git a/paddle/phi/kernels/cpu/reduce_grad.h b/paddle/phi/kernels/cpu/reduce_grad.h
new file mode 100644
index 0000000000000000000000000000000000000000..f56d3d3ed50f7e72910115f7ec28914a5eade2e8
--- /dev/null
+++ b/paddle/phi/kernels/cpu/reduce_grad.h
@@ -0,0 +1,132 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/kernels/cast_kernel.h"
+#include "paddle/phi/kernels/empty_kernel.h"
+#include "paddle/phi/kernels/funcs/reduce_grad_functions.h"
+
+namespace phi {
+
+template <typename Context,
+          typename T,
+          typename Functor,
+          bool kNoNeedBufferX = false,
+          bool kNoNeedBufferY = false>
+void ComputeFromInput(const Context& dev_ctx,
+                      const DenseTensor& x,
+                      const DenseTensor& out_grad,
+                      const paddle::optional<DenseTensor>& out,
+                      const DenseTensor& input2,
+                      const std::vector<int64_t>& dims,
+                      bool keep_dim,
+                      bool reduce_all,
+                      DataType in_dtype,
+                      DataType out_dtype,
+                      DenseTensor* x_grad) {
+  auto* input0 = &x;
+  auto* input1 = out.get_ptr();
+  auto* output = x_grad;
+  dev_ctx.template Alloc<T>(output);
+
+  // The dims has full dim, set the reduce_all is True
+  const auto& input_dim_size = x.dims().size();
+  std::set<int> dims_set(dims.begin(), dims.end());
+  bool full_dim = true;
+  for (auto i = 0; i < input_dim_size; i++) {
+    if (dims_set.find(i) == dims_set.end()) {
+      full_dim = false;
+      break;
+    }
+  }
+  reduce_all = (reduce_all || full_dim);
+  // NOTE: EigenTensor::From() uses tensor->data()
+  // if op has NoNeedBufferVarsInferer, the corresponding kNoNeedBufferX or
+  // kNoNeedBufferY should set true
+  // and use fake var that has same dims.
+  if (kNoNeedBufferX) {
+    input0 = output;
+  }
+  if (kNoNeedBufferY) {
+    input1 = &input2;
+  }
+
+  const std::vector<int> const_dims{dims.begin(), dims.end()};
+
+  // NOTE(dengkaipeng): Out is unnecessary in some reduce kernel and
+  // not be set as Input in grad Maker, use Out_grad to replace here
+  if (!input1) input1 = &input2;
+  Functor functor;
+
+  funcs::LaunchReduceGradKernel<Context, T, Functor>(dev_ctx,
+                                                     input0,
+                                                     input1,
+                                                     &input2,
+                                                     output,
+                                                     functor,
+                                                     const_dims,
+                                                     reduce_all);
+}
+
+template <typename Context,
+          typename T,
+          typename Functor,
+          bool kNoNeedBufferX = false,
+          bool kNoNeedBufferY = false>
+void ReduceGradKernel(const Context& dev_ctx,
+                      const DenseTensor& x,
+                      const DenseTensor& out_grad,
+                      const paddle::optional<DenseTensor>& out,
+                      const std::vector<int64_t>& dims,
+                      bool keep_dim,
+                      bool reduce_all,
+                      DataType in_dtype,
+                      DataType out_dtype,
+                      DenseTensor* x_grad) {
+  if (in_dtype != DataType::UNDEFINED) {
+    DenseTensorMeta x_grad_meta(out_dtype, x_grad->dims(), x_grad->layout());
+    DenseTensor x_grad_tmp =
+        phi::Empty<Context>(dev_ctx, std::move(x_grad_meta));
+    ComputeFromInput<Context, T, Functor, kNoNeedBufferX, kNoNeedBufferY>(
+        dev_ctx,
+        x,
+        out_grad,
+        out,
+        out_grad,
+        dims,
+        keep_dim,
+        reduce_all,
+        in_dtype,
+        out_dtype,
+        &x_grad_tmp);
+
+    phi::CastKernel<T>(dev_ctx, x_grad_tmp, in_dtype, x_grad);
+  } else {
+    ComputeFromInput<Context, T, Functor, kNoNeedBufferX, kNoNeedBufferY>(
+        dev_ctx,
+        x,
+        out_grad,
+        out,
+        out_grad,
+        dims,
+        keep_dim,
+        reduce_all,
+        in_dtype,
+        out_dtype,
+        x_grad);
+  }
+}
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/cpu/reduce_max_kernel.cc b/paddle/phi/kernels/cpu/reduce_max_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f9ea0aa0faf06918253f9037282b924199e3a314
--- /dev/null
+++ b/paddle/phi/kernels/cpu/reduce_max_kernel.cc
@@ -0,0 +1,39 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/reduce_max_kernel.h"
+
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/cpu/reduce.h"
+#include "paddle/phi/kernels/funcs/reduce_functor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void MaxRawKernel(const Context& dev_ctx,
+                  const DenseTensor& x,
+                  const std::vector<int64_t>& dims,
+                  bool keep_dim,
+                  bool reduce_all,
+                  DenseTensor* out) {
+  auto out_dtype = x.dtype();
+  phi::Reduce<CPUContext, T, phi::funcs::MaxFunctor>(
+      dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out);
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(
+    max_raw, CPU, ALL_LAYOUT, phi::MaxRawKernel, float, double, int, int64_t) {}
diff --git a/paddle/phi/kernels/cpu/reduce_min_kernel.cc b/paddle/phi/kernels/cpu/reduce_min_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0a241c81dbe690493b00caf71c0526bb76206e5e
--- /dev/null
+++ b/paddle/phi/kernels/cpu/reduce_min_kernel.cc
@@ -0,0 +1,39 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/reduce_min_kernel.h"
+
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/cpu/reduce.h"
+#include "paddle/phi/kernels/funcs/reduce_functor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void MinRawKernel(const Context& dev_ctx,
+                  const DenseTensor& x,
+                  const std::vector<int64_t>& dims,
+                  bool keep_dim,
+                  bool reduce_all,
+                  DenseTensor* out) {
+  auto out_dtype = x.dtype();
+  phi::Reduce<CPUContext, T, phi::funcs::MinFunctor>(
+      dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out);
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(
+    min_raw, CPU, ALL_LAYOUT, phi::MinRawKernel, float, double, int, int64_t) {}
diff --git a/paddle/phi/kernels/cpu/reduce_prod_kernel.cc b/paddle/phi/kernels/cpu/reduce_prod_kernel.cc
index cf0179124ebdfcb58a2ac3436fcbd4d5347bb6f2..9a9bf46e948bc52c6a1e9679b4d3e51b10d89e6d 100644
--- a/paddle/phi/kernels/cpu/reduce_prod_kernel.cc
+++ b/paddle/phi/kernels/cpu/reduce_prod_kernel.cc
@@ -13,7 +13,7 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/reduce_prod_kernel.h"
-#include "paddle/phi/backends/cpu/cpu_context.h"
+
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/cpu/reduce.h"
 #include "paddle/phi/kernels/funcs/reduce_functor.h"
diff --git a/paddle/phi/kernels/cpu/reduce_sum_grad_kernel.cc b/paddle/phi/kernels/cpu/reduce_sum_grad_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..efea054555e86be79b5cdb09fe8c4784a1ad0c3b
--- /dev/null
+++ b/paddle/phi/kernels/cpu/reduce_sum_grad_kernel.cc
@@ -0,0 +1,139 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/reduce_sum_grad_kernel.h"
+
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/cast_kernel.h"
+#include "paddle/phi/kernels/cpu/reduce_grad.h"
+#include "paddle/phi/kernels/empty_kernel.h"
+namespace phi {
+
+struct SumGradFunctor {
+  template <typename DeviceContext,
+            typename X,
+            typename Y,
+            typename DX,
+            typename DY,
+            typename Dim>
+  void operator()(const DeviceContext& place,
+                  X* x,
+                  Y* y,
+                  DX* dx,
+                  DY* dy,
+                  const Dim& dim,
+                  int size) {
+    dx->device(place) = dy->broadcast(dim);
+  }
+};
+
+template <typename T, typename Context>
+void ComputeFromInput(const Context& dev_ctx,
+                      const DenseTensor& x,
+                      const DenseTensor& input2,
+                      const std::vector<int64_t>& dims,
+                      DenseTensor* x_grad) {
+  auto* input0 = &x;
+  auto* output = x_grad;
+  dev_ctx.template Alloc<T>(output);
+
+  const auto* input2_d = input2.data<T>();
+  auto* output_d = output->data<T>();
+
+  // handle reduce_all
+  if (input2.dims().size() == 1 && input2.dims()[0] == 1) {
+    for (int64_t i = 0; i < phi::product(input0->dims()); ++i) {
+      output_d[i] = input2_d[0];
+    }
+    return;
+  }
+
+  // handle reduce by one dimension
+  int reduce_dim_index = dims[0];
+  if (reduce_dim_index < 0) {
+    reduce_dim_index += input0->dims().size();
+  }
+
+  auto& input_dim = input0->dims();
+  int64_t before_dim = 1;
+  for (int i = 0; i < reduce_dim_index; ++i) {
+    before_dim *= input_dim[i];
+  }
+  int64_t reduce_dim = input_dim[reduce_dim_index];
+  int64_t after_dim = 1;
+  for (int i = reduce_dim_index + 1; i < input_dim.size(); ++i) {
+    after_dim *= input_dim[i];
+  }
+  for (int64_t i = 0; i < before_dim; ++i) {
+    for (int64_t j = 0; j < reduce_dim; ++j) {
+      for (int64_t k = 0; k < after_dim; ++k) {
+        output_d[i * reduce_dim * after_dim + j * after_dim + k] =
+            input2_d[i * after_dim + k];
+      }
+    }
+  }
+}
+
+template <typename T, typename Context>
+void ReduceSumGradKernel(const Context& dev_ctx,
+                         const DenseTensor& x,
+                         const DenseTensor& out_grad,
+                         const std::vector<int64_t>& dims,
+                         bool keep_dim,
+                         bool reduce_all,
+                         DataType in_dtype,
+                         DataType out_dtype,
+                         DenseTensor* x_grad) {
+  if (dims.size() == 1) {
+    if (out_dtype != DataType::UNDEFINED) {
+      DenseTensorMeta x_grad_meta(out_dtype, x_grad->dims(), x_grad->layout());
+      DenseTensor x_grad_tmp =
+          phi::Empty<Context>(dev_ctx, std::move(x_grad_meta));
+
+      ComputeFromInput<T, Context>(dev_ctx, x, out_grad, dims, &x_grad_tmp);
+
+      phi::CastKernel<T>(dev_ctx, x_grad_tmp, in_dtype, x_grad);
+
+    } else {
+      ComputeFromInput<T, Context>(dev_ctx, x, out_grad, dims, x_grad);
+    }
+  }
+
+  ReduceGradKernel<Context, T, SumGradFunctor, true>(dev_ctx,
+                                                     x,
+                                                     out_grad,
+                                                     paddle::none,
+                                                     dims,
+                                                     keep_dim,
+                                                     reduce_all,
+                                                     in_dtype,
+                                                     out_dtype,
+                                                     x_grad);
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(sum_grad,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::ReduceSumGradKernel,
+                   bool,
+                   float,
+                   double,
+                   phi::dtype::float16,
+                   int,
+                   int64_t,
+                   phi::dtype::complex<float>,
+                   phi::dtype::complex<double>) {}
diff --git a/paddle/phi/kernels/cpu/roi_align_kernel.cc b/paddle/phi/kernels/cpu/roi_align_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..35ab99a98eba7e59853fb311d5b2307b69ae31b2
--- /dev/null
+++ b/paddle/phi/kernels/cpu/roi_align_kernel.cc
@@ -0,0 +1,318 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/roi_align_kernel.h"
+
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/empty_kernel.h"
+
+namespace phi {
+
+constexpr size_t GetOffset(size_t x, size_t y, size_t width) {
+  return y * width + x;
+}
+
+template <class T>
+struct OffsetsAndRatios {
+  OffsetsAndRatios() = default;
+  OffsetsAndRatios(std::size_t xy,
+                   std::size_t xY,
+                   std::size_t Xy,
+                   std::size_t XY,
+                   T xy_ratio,
+                   T xY_ratio,
+                   T Xy_ratio,
+                   T XY_ratio)
+      : xy(xy),
+        xY(xY),
+        Xy(Xy),
+        XY(XY),
+        xy_ratio(xy_ratio),
+        xY_ratio(xY_ratio),
+        Xy_ratio(Xy_ratio),
+        XY_ratio(XY_ratio) {}
+
+  std::size_t xy = 0;
+  std::size_t xY = 0;
+  std::size_t Xy = 0;
+  std::size_t XY = 0;
+  T xy_ratio = 0.0f;
+  T xY_ratio = 0.0f;
+  T Xy_ratio = 0.0f;
+  T XY_ratio = 0.0f;
+};
+
+template <typename T>
+std::vector<OffsetsAndRatios<T>> GetIndexesAndRatios(
+    std::size_t width,
+    std::size_t height,
+    const T roi_width,
+    const T roi_height,
+    const T roi_xmin,
+    const T roi_ymin,
+    std::size_t pooled_width,
+    std::size_t roi_bin_grid_w,
+    std::size_t pooled_height,
+    std::size_t roi_bin_grid_h) {
+  const auto ind_num =
+      pooled_width * roi_bin_grid_w * pooled_height * roi_bin_grid_h;
+
+  std::vector<OffsetsAndRatios<T>> interpolation_cords;
+  interpolation_cords.reserve(ind_num);
+
+  const auto bin_w = roi_width / pooled_width;
+  const auto bin_h = roi_height / pooled_height;
+
+  for (std::size_t py = 0; py < pooled_height; py++) {
+    for (std::size_t px = 0; px < pooled_width; px++) {
+      for (std::size_t iy = 0; iy < roi_bin_grid_h; iy++) {
+        // calculate x of sample points
+        auto y =
+            roi_ymin +
+            bin_h * (py +
+                     static_cast<T>(iy + .5f) / static_cast<T>(roi_bin_grid_h));
+        for (std::size_t ix = 0; ix < roi_bin_grid_w; ix++) {
+          // calculate x of sample points
+          auto x = roi_xmin +
+                   bin_w * (px +
+                            static_cast<T>(ix + .5f) /
+                                static_cast<T>(roi_bin_grid_w));
+
+          // deal with elements out of map
+          if (y < -1.0 || y > height || x < -1.0 || x > width) {
+            interpolation_cords.emplace_back();
+            continue;
+          }
+          y = y <= 0 ? 0 : y;
+          x = x <= 0 ? 0 : x;
+
+          std::size_t x_low_index = static_cast<std::size_t>(x);
+          std::size_t x_high_index;
+          if (x_low_index >= width - 1) {
+            x_high_index = x_low_index = width - 1;
+            x = static_cast<T>(x_low_index);
+          } else {
+            x_high_index = x_low_index + 1;
+          }
+          T x_ratio = x_high_index - x;
+
+          std::size_t y_low_index = static_cast<std::size_t>(y);
+          std::size_t y_high_index;
+          if (y_low_index >= height - 1) {
+            y_high_index = y_low_index = height - 1;
+            y = static_cast<T>(y_low_index);
+          } else {
+            y_high_index = y_low_index + 1;
+          }
+          T y_ratio = y_high_index - y;
+
+          auto xy = GetOffset(x_low_index, y_low_index, width);
+          auto xY = GetOffset(x_low_index, y_high_index, width);
+          auto Xy = GetOffset(x_high_index, y_low_index, width);
+          auto XY = GetOffset(x_high_index, y_high_index, width);
+
+          auto xy_ratio = x_ratio * y_ratio;
+          auto xY_ratio = x_ratio * (1 - y_ratio);
+          auto Xy_ratio = (1 - x_ratio) * y_ratio;
+          auto XY_ratio = (1 - x_ratio) * (1 - y_ratio);
+
+          interpolation_cords.emplace_back(
+              xy, xY, Xy, XY, xy_ratio, xY_ratio, Xy_ratio, XY_ratio);
+        }
+      }
+    }
+  }
+  return interpolation_cords;
+}
+
+template <typename T>
+void Interpolate(std::vector<T>& interpolated_values,  // NOLINT
+                 const std::vector<OffsetsAndRatios<T>>& interpolation_cords,
+                 const T* data) {
+  for (auto& ic : interpolation_cords) {
+    auto xlyl_offset = ic.xy;
+    auto xhyl_offset = ic.Xy;
+    auto xlyh_offset = ic.xY;
+    auto xhyh_offset = ic.XY;
+
+    auto xlyl_ratio = ic.xy_ratio;
+    auto xhyl_ratio = ic.Xy_ratio;
+    auto xlyh_ratio = ic.xY_ratio;
+    auto xhyh_ratio = ic.XY_ratio;
+
+    interpolated_values.emplace_back(
+        xlyl_ratio * data[xlyl_offset] + xhyl_ratio * data[xhyl_offset] +
+        xlyh_ratio * data[xlyh_offset] + xhyh_ratio * data[xhyh_offset]);
+  }
+}
+
+template <typename T>
+void AvgPool(const std::vector<T>& interpolated_values,
+             T* output_data,
+             int roi_bin_grid_w,
+             int roi_bin_grid_h,
+             int pooled_width,
+             int pooled_height) {
+  const auto data_amount = pooled_width * pooled_height;
+  const auto grid_points = roi_bin_grid_w * roi_bin_grid_h;
+  const T count = 1.0 / grid_points;
+  auto val_begin = interpolated_values.cbegin();
+  for (auto i = 0; i < data_amount; ++i) {
+    T sum = 0.0;
+    auto val_end = val_begin + grid_points;
+    sum = std::accumulate(val_begin, val_end, sum);
+    val_begin = val_end;
+    output_data[i] = sum * count;
+  }
+}
+
+template <typename T, typename Context>
+void ROIAlignKernel(const Context& dev_ctx,
+                    const DenseTensor& x,
+                    const DenseTensor& boxes,
+                    paddle::optional<const DenseTensor&> boxes_num,
+                    int pooled_height,
+                    int pooled_width,
+                    float spatial_scale,
+                    int sampling_ratio,
+                    bool aligned,
+                    DenseTensor* out) {
+  auto in_dims = x.dims();
+  int batch_size = in_dims[0];
+  int channels = in_dims[1];
+  int height = in_dims[2];
+  int width = in_dims[3];
+  int rois_num = boxes.dims()[0];
+
+  auto in_stride = phi::stride(in_dims);
+  auto roi_stride = phi::stride(boxes.dims());
+  auto out_stride = phi::stride(out->dims());
+
+  const T* input_data = x.data<T>();
+  DenseTensor roi_batch_id_list = Empty<int>(dev_ctx, {rois_num});
+  int* roi_batch_id_data = roi_batch_id_list.data<int>();
+  int boxes_batch_size;
+  if (boxes_num) {
+    boxes_batch_size = boxes_num->numel();
+    PADDLE_ENFORCE_EQ(
+        boxes_batch_size,
+        batch_size,
+        errors::InvalidArgument(
+            "The batch size of rois and the batch size of images "
+            " must be the same. But received the batch size of rois is %d, "
+            "and the batch size of images is %d",
+            boxes_batch_size,
+            batch_size));
+    auto* boxes_num_data = boxes_num->data<int>();
+    int start = 0;
+    for (int n = 0; n < boxes_batch_size; ++n) {
+      for (int i = start; i < start + boxes_num_data[n]; ++i) {
+        roi_batch_id_data[i] = n;
+      }
+      start += boxes_num_data[n];
+    }
+  } else {
+    auto lod = boxes.lod();
+    PADDLE_ENFORCE_EQ(
+        lod.empty(),
+        false,
+        errors::InvalidArgument("Input(ROIs) Tensor of ROIAlignOp "
+                                "does not contain LoD information."));
+    auto boxes_lod = lod.back();
+    int boxes_batch_size = boxes_lod.size() - 1;
+    PADDLE_ENFORCE_EQ(
+        boxes_batch_size,
+        batch_size,
+        errors::InvalidArgument(
+            "The boxes_batch_size and imgs "
+            "batch_size must be the same. But received boxes_batch_size = %d, "
+            "batch_size = %d",
+            boxes_batch_size,
+            batch_size));
+    int boxes_num_with_lod = boxes_lod[boxes_batch_size];
+    PADDLE_ENFORCE_EQ(
+        rois_num,
+        boxes_num_with_lod,
+        errors::InvalidArgument(
+            "The actual number of rois and the number of rois "
+            "provided from Input(RoIsLoD) in RoIAlign must be the same."
+            " But received actual number of rois is %d, and the number "
+            "of rois from RoIsLoD is %d",
+            rois_num,
+            boxes_num_with_lod));
+    for (int n = 0; n < boxes_batch_size; ++n) {
+      for (std::size_t i = boxes_lod[n]; i < boxes_lod[n + 1]; ++i) {
+        roi_batch_id_data[i] = n;
+      }
+    }
+  }
+  T* output_data = dev_ctx.template Alloc<T>(out);
+  const T* boxes_data = boxes.data<T>();
+  T roi_offset = aligned ? T(0.5) : 0;
+  for (int n = 0; n < rois_num; ++n) {
+    int roi_batch_id = roi_batch_id_data[n];
+    T roi_xmin = boxes_data[0] * spatial_scale - roi_offset;
+    T roi_ymin = boxes_data[1] * spatial_scale - roi_offset;
+    T roi_xmax = boxes_data[2] * spatial_scale - roi_offset;
+    T roi_ymax = boxes_data[3] * spatial_scale - roi_offset;
+
+    T roi_width = roi_xmax - roi_xmin;
+    T roi_height = roi_ymax - roi_ymin;
+    if (!aligned) {
+      roi_width = std::max(roi_width, static_cast<T>(1.));
+      roi_height = std::max(roi_height, static_cast<T>(1.));
+    }
+
+    const T* batch_data = input_data + roi_batch_id * in_stride[0];
+
+    int roi_bin_grid_h = (sampling_ratio > 0)
+                             ? sampling_ratio
+                             : ceil(roi_height / pooled_height);
+    int roi_bin_grid_w =
+        (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
+
+    auto interpolation_cords = GetIndexesAndRatios(width,
+                                                   height,
+                                                   roi_width,
+                                                   roi_height,
+                                                   roi_xmin,
+                                                   roi_ymin,
+                                                   pooled_width,
+                                                   roi_bin_grid_w,
+                                                   pooled_height,
+                                                   roi_bin_grid_h);
+
+    std::vector<T> interpolated_values;
+    interpolated_values.reserve(interpolation_cords.size());
+    for (auto channel = 0; channel < channels; ++channel) {
+      Interpolate(interpolated_values, interpolation_cords, batch_data);
+      AvgPool(interpolated_values,
+              output_data,
+              roi_bin_grid_w,
+              roi_bin_grid_h,
+              pooled_width,
+              pooled_height);
+      batch_data += in_stride[1];
+      output_data += out_stride[1];
+      interpolated_values.clear();
+    }
+    boxes_data += roi_stride[0];
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(
+    roi_align, CPU, ALL_LAYOUT, phi::ROIAlignKernel, float, double, int) {}
diff --git a/paddle/phi/kernels/cpu/scatter_grad_kernel.cc b/paddle/phi/kernels/cpu/scatter_grad_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..62fd58704c4fef7c23cd8255d6958103b9755bff
--- /dev/null
+++ b/paddle/phi/kernels/cpu/scatter_grad_kernel.cc
@@ -0,0 +1,73 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/scatter_grad_kernel.h"
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/copy_kernel.h"
+#include "paddle/phi/kernels/funcs/gather.h"
+#include "paddle/phi/kernels/funcs/scatter.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void ScatterGradKernel(const Context &ctx,
+                       const DenseTensor &index,
+                       const DenseTensor &updates,
+                       const DenseTensor &out_grad,
+                       bool overwrite,
+                       DenseTensor *x_grad,
+                       DenseTensor *updates_grad) {
+  const auto &index_type = index.dtype();
+  bool index_type_match =
+      index_type == phi::DataType::INT32 || index_type == phi::DataType::INT64;
+  PADDLE_ENFORCE_EQ(index_type_match,
+                    true,
+                    phi::errors::InvalidArgument(
+                        "scatter_op index holds the wrong type, it holds [%s],"
+                        "but desires to be [%s] or [%s]",
+                        index_type,
+                        phi::DataType::INT32,
+                        phi::DataType::INT64));
+
+  if (x_grad) {
+    phi::Copy(ctx, out_grad, ctx.GetPlace(), false, x_grad);
+    if (index_type == phi::DataType::INT32) {
+      phi::funcs::CPUScatterGradForX<T, int32_t>(ctx, index, x_grad);
+    } else {
+      phi::funcs::CPUScatterGradForX<T, int64_t>(ctx, index, x_grad);
+    }
+  }
+
+  if (updates_grad) {
+    ctx.template Alloc<T>(updates_grad);
+    // Gradient by Gather: dUpdates = dO[Ids]
+    if (index_type == phi::DataType::INT32) {
+      phi::funcs::CPUGather<T, int32_t>(ctx, out_grad, index, updates_grad);
+    } else {
+      phi::funcs::CPUGather<T, int64_t>(ctx, out_grad, index, updates_grad);
+    }
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(scatter_grad,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::ScatterGradKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t) {}
diff --git a/paddle/phi/kernels/cpu/scatter_kernel.cc b/paddle/phi/kernels/cpu/scatter_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d48ceaf29a08c58de6f06746c36f2a8e8725852f
--- /dev/null
+++ b/paddle/phi/kernels/cpu/scatter_kernel.cc
@@ -0,0 +1,63 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/scatter_kernel.h"
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/copy_kernel.h"
+#include "paddle/phi/kernels/funcs/scatter.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void ScatterKernel(const Context &ctx,
+                   const DenseTensor &x,
+                   const DenseTensor &index,
+                   const DenseTensor &updates,
+                   bool overwrite,
+                   DenseTensor *out) {
+  // In place output: Out = X, Out[Ids] = Updates
+  phi::Copy(ctx, x, ctx.GetPlace(), false, out);
+  // Apply ScatterUpdate: Out[index] = Updates[:]
+  const auto &index_type = index.dtype();
+  bool index_type_match =
+      index_type == phi::DataType::INT32 || index_type == phi::DataType::INT64;
+  PADDLE_ENFORCE_EQ(
+      index_type_match,
+      true,
+      phi::errors::InvalidArgument("Index holds the wrong type, it holds [%s],"
+                                   "but desires to be [%s] or [%s].",
+                                   index_type,
+                                   phi::DataType::INT32,
+                                   phi::DataType::INT64));
+  if (overwrite) {
+    if (index_type == phi::DataType::INT32) {
+      phi::funcs::ScatterAssign<T, int32_t>(ctx, updates, index, out);
+    } else {
+      phi::funcs::ScatterAssign<T, int64_t>(ctx, updates, index, out);
+    }
+  } else {
+    if (index_type == phi::DataType::INT32) {
+      phi::funcs::ScatterAssignAdd<T, int32_t>(ctx, updates, index, out);
+    } else {
+      phi::funcs::ScatterAssignAdd<T, int64_t>(ctx, updates, index, out);
+    }
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(
+    scatter, CPU, ALL_LAYOUT, phi::ScatterKernel, float, double, int, int64_t) {
+}
diff --git a/paddle/phi/kernels/cpu/scatter_nd_add_grad_kernel.cc b/paddle/phi/kernels/cpu/scatter_nd_add_grad_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..cc143ba8d0e4557f8aaf07a4d4606bbf6c2b4d73
--- /dev/null
+++ b/paddle/phi/kernels/cpu/scatter_nd_add_grad_kernel.cc
@@ -0,0 +1,55 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/scatter_nd_add_grad_kernel.h"
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/copy_kernel.h"
+#include "paddle/phi/kernels/funcs/gather.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void ScatterNdAddGradKernel(const Context &ctx,
+                            const DenseTensor &index,
+                            const DenseTensor &updates,
+                            const DenseTensor &out_grad,
+                            DenseTensor *x_grad,
+                            DenseTensor *updates_grad) {
+  if (x_grad) {
+    Copy(ctx, out_grad, ctx.GetPlace(), false, x_grad);
+  }
+  if (updates_grad) {
+    ctx.template Alloc<T>(updates_grad);
+    // Gradient by Gather: dUpdates = dO[Ids]
+    const auto &index_type = index.dtype();
+    if (index_type == phi::DataType::INT32) {
+      phi::funcs::CPUGatherNd<T, int32_t>(ctx, out_grad, index, updates_grad);
+    } else {
+      phi::funcs::CPUGatherNd<T, int64_t>(ctx, out_grad, index, updates_grad);
+    }
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(scatter_nd_add_grad,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::ScatterNdAddGradKernel,
+                   float,
+                   double,
+                   int64_t,
+                   int,
+                   uint8_t) {}
diff --git a/paddle/phi/kernels/cpu/scatter_nd_add_kernel.cc b/paddle/phi/kernels/cpu/scatter_nd_add_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..04ae10f5e8b5d551819a97ea1594140e535e6a12
--- /dev/null
+++ b/paddle/phi/kernels/cpu/scatter_nd_add_kernel.cc
@@ -0,0 +1,60 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/scatter_nd_add_kernel.h"
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/copy_kernel.h"
+#include "paddle/phi/kernels/funcs/scatter.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void ScatterNdAddKernel(const Context &ctx,
+                        const DenseTensor &x,
+                        const DenseTensor &index,
+                        const DenseTensor &updates,
+                        DenseTensor *out) {
+  // In place output: Out = X
+  Copy(ctx, x, ctx.GetPlace(), true, out);
+  const auto &index_type = index.dtype();
+  bool index_type_match =
+      index_type == phi::DataType::INT32 || index_type == phi::DataType::INT64;
+  PADDLE_ENFORCE_EQ(index_type_match,
+                    true,
+                    phi::errors::InvalidArgument(
+                        "Index holds the wrong type, it holds [%s], but "
+                        "desires to be [%s] or [%s].",
+                        index_type,
+                        phi::DataType::INT32,
+                        phi::DataType::INT64));
+
+  if (index_type == phi::DataType::INT32) {
+    phi::funcs::ScatterNdAdd<T, int32_t>(ctx, updates, index, out);
+  } else {
+    phi::funcs::ScatterNdAdd<T, int64_t>(ctx, updates, index, out);
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(scatter_nd_add,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::ScatterNdAddKernel,
+                   float,
+                   double,
+                   int64_t,
+                   int,
+                   uint8_t) {}
diff --git a/paddle/phi/kernels/cpu/searchsorted_kernel.cc b/paddle/phi/kernels/cpu/searchsorted_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c036c2d438a36be779441f8f9aef78c0b5fbb642
--- /dev/null
+++ b/paddle/phi/kernels/cpu/searchsorted_kernel.cc
@@ -0,0 +1,28 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/searchsorted_kernel.h"
+
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/searchsorted_kernel_impl.h"
+
+PD_REGISTER_KERNEL(searchsorted,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::SearchsortedKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t) {}
diff --git a/paddle/phi/kernels/cpu/segment_pool_grad_kernel.cc b/paddle/phi/kernels/cpu/segment_pool_grad_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..585c27bdcec97e11a68cdc536c829f76c000a8df
--- /dev/null
+++ b/paddle/phi/kernels/cpu/segment_pool_grad_kernel.cc
@@ -0,0 +1,26 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/segment_pool_grad_kernel.h"
+#include "paddle/phi/kernels/impl/segment_pool_grad_kernel_impl.h"
+
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+PD_REGISTER_KERNEL(segment_pool_grad,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::SegmentPoolGradKernel,
+                   float,
+                   double) {}
diff --git a/paddle/phi/kernels/cpu/segment_pool_kernel.cc b/paddle/phi/kernels/cpu/segment_pool_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d0413457f8177338aa450211539dc16d0880c74c
--- /dev/null
+++ b/paddle/phi/kernels/cpu/segment_pool_kernel.cc
@@ -0,0 +1,22 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/segment_pool_kernel.h"
+#include "paddle/phi/kernels/impl/segment_pool_kernel_impl.h"
+
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+PD_REGISTER_KERNEL(
+    segment_pool, CPU, ALL_LAYOUT, phi::SegmentPoolKernel, float, double) {}
diff --git a/paddle/phi/kernels/cpu/set_value_grad_kernel.cc b/paddle/phi/kernels/cpu/set_value_grad_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..44df36bb9fd87320db8548815b68a431e46bbcac
--- /dev/null
+++ b/paddle/phi/kernels/cpu/set_value_grad_kernel.cc
@@ -0,0 +1,29 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/set_value_grad_kernel.h"
+
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/set_value_grad_kernel_impl.h"
+
+PD_REGISTER_KERNEL(set_value_grad,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::SetValueGradKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t,
+                   bool) {}
diff --git a/paddle/phi/kernels/cpu/set_value_kernel.cc b/paddle/phi/kernels/cpu/set_value_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..dcf278cd94e65189cd891124c4aa3ab81fa4397d
--- /dev/null
+++ b/paddle/phi/kernels/cpu/set_value_kernel.cc
@@ -0,0 +1,38 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/set_value_kernel.h"
+
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/set_value_kernel_impl.h"
+
+PD_REGISTER_KERNEL(set_value,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::SetValueKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t,
+                   bool) {}
+PD_REGISTER_KERNEL(set_value_with_tensor,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::SetTensorValueKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t,
+                   bool) {}
diff --git a/paddle/phi/kernels/cpu/shape_kernel.cc b/paddle/phi/kernels/cpu/shape_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..073dc63b2a4348d4091af8c285f9ddebd799acc5
--- /dev/null
+++ b/paddle/phi/kernels/cpu/shape_kernel.cc
@@ -0,0 +1,33 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/shape_kernel.h"
+#include "paddle/phi/kernels/impl/shape_kernel_impl.h"
+
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+PD_REGISTER_KERNEL(shape,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::ShapeKernel,
+                   bool,
+                   int,
+                   int8_t,
+                   uint8_t,
+                   int64_t,
+                   float,
+                   double,
+                   phi::dtype::complex<float>,
+                   phi::dtype::complex<double>) {}
diff --git a/paddle/phi/kernels/cpu/shard_index_kernel.cc b/paddle/phi/kernels/cpu/shard_index_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a82bb8ce5929dcae46700dc4bd7013ab1fee0b39
--- /dev/null
+++ b/paddle/phi/kernels/cpu/shard_index_kernel.cc
@@ -0,0 +1,91 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/shard_index_kernel.h"
+
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void ShardIndexKernel(const Context& dev_ctx,
+                      const DenseTensor& in,
+                      int index_num,
+                      int nshards,
+                      int shard_id,
+                      int ignore_value,
+                      DenseTensor* out) {
+  PADDLE_ENFORCE_GT(
+      index_num,
+      0,
+      errors::InvalidArgument(
+          "The value 'index_num' for Op(shard_index) must be greater than 0, "
+          "but the value given is %d.",
+          index_num));
+  PADDLE_ENFORCE_GT(
+      nshards,
+      0,
+      errors::InvalidArgument("The value 'nshard' for Op(shard_index) must be "
+                              "greater than 0, but the value given is %d.",
+                              nshards));
+  PADDLE_ENFORCE_GE(
+      shard_id,
+      0,
+      errors::InvalidArgument(
+          "The value 'shard_id' for Op(shard_index) must be greater or "
+          "equal to 0, but the value given is %d.",
+          shard_id));
+  PADDLE_ENFORCE_LT(
+      shard_id,
+      nshards,
+      errors::InvalidArgument(
+          "The value 'shard_id' for Op(shard_index) must be less than "
+          "nshards (%d), but the value given is %d.",
+          nshards,
+          shard_id));
+
+  int shard_size = (index_num + nshards - 1) / nshards;
+
+  out->Resize(in.dims());
+  out->set_lod(in.lod());
+  auto* in_data = in.data<T>();
+  auto* out_data = dev_ctx.template Alloc<T>(out);
+  int64_t numel = in.numel();
+  for (int64_t i = 0; i < numel; ++i) {
+    PADDLE_ENFORCE_GE(in_data[i],
+                      0,
+                      errors::InvalidArgument(
+                          "The input_index for Op(shard_index) must be "
+                          "greater or equal to 0, but the value given is %d.",
+                          in_data[i]));
+    PADDLE_ENFORCE_LT(in_data[i],
+                      index_num,
+                      errors::InvalidArgument(
+                          "The input_index for Op(shard_index) must be less "
+                          "than index_num (%d), but the value given is %d.",
+                          index_num,
+                          in_data[i]));
+    if (in_data[i] / shard_size == shard_id) {
+      out_data[i] = in_data[i] % shard_size;
+    } else {
+      out_data[i] = ignore_value;
+    }
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(
+    shard_index, CPU, ALL_LAYOUT, phi::ShardIndexKernel, int, int64_t) {}
diff --git a/paddle/phi/kernels/cpu/sigmoid_cross_entropy_with_logits_grad_kernel.cc b/paddle/phi/kernels/cpu/sigmoid_cross_entropy_with_logits_grad_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..468db18aa216714251c70ecd3d15b59897fb642d
--- /dev/null
+++ b/paddle/phi/kernels/cpu/sigmoid_cross_entropy_with_logits_grad_kernel.cc
@@ -0,0 +1,70 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/sigmoid_cross_entropy_with_logits_grad_kernel.h"
+
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void SigmoidCrossEntropyWithLogitsGradKernel(const Context& dev_ctx,
+                                             const DenseTensor& x,
+                                             const DenseTensor& label,
+                                             const DenseTensor& out_grad,
+                                             bool normalize,
+                                             int ignore_index,
+                                             DenseTensor* in_grad) {
+  auto dx_data = dev_ctx.template Alloc<T>(in_grad);
+
+  int limit = in_grad->numel();
+  auto x_data = x.data<T>();
+  auto label_data = label.data<T>();
+  auto dout_data = out_grad.data<T>();
+  for (int idx = 0; idx < limit; ++idx) {
+    T x = x_data[idx];
+    T label = label_data[idx];
+    T dout = dout_data[idx];
+    if (static_cast<int>(label) == ignore_index) {
+      dx_data[idx] = static_cast<T>(0.);
+    } else {
+      T simoid_x = static_cast<T>(1) / (static_cast<T>(1) + std::exp(-x));
+      T diff = simoid_x - label;
+      dx_data[idx] = dout * diff;
+    }
+  }
+  if (normalize) {
+    int norm = 0;
+    T eps = static_cast<T>(1e-6);
+    for (int idx = 0; idx < limit; ++idx) {
+      T diff = label_data[idx] - static_cast<T>(ignore_index);
+      if ((diff < -eps) || (diff > eps)) {
+        norm += 1;
+      }
+    }
+    eps = static_cast<T>(1e-5);
+    norm = norm > eps ? norm : eps;
+    std::for_each(dx_data, dx_data + limit, [norm](T& v) { v = v / norm; });
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(sigmoid_cross_entropy_with_logits_grad,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::SigmoidCrossEntropyWithLogitsGradKernel,
+                   float,
+                   double) {}
diff --git a/paddle/phi/kernels/cpu/sigmoid_cross_entropy_with_logits_kernel.cc b/paddle/phi/kernels/cpu/sigmoid_cross_entropy_with_logits_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..366d300320b9fe599982467ba8979fd4f3e90cb9
--- /dev/null
+++ b/paddle/phi/kernels/cpu/sigmoid_cross_entropy_with_logits_kernel.cc
@@ -0,0 +1,71 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/sigmoid_cross_entropy_with_logits_kernel.h"
+
+#include <algorithm>
+#include <limits>
+
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void SigmoidCrossEntropyWithLogitsKernel(const Context& dev_ctx,
+                                         const DenseTensor& x,
+                                         const DenseTensor& label,
+                                         bool normalize,
+                                         int ignore_index,
+                                         DenseTensor* out) {
+  auto out_data = dev_ctx.template Alloc<T>(out);
+  int limit = out->numel();
+  auto x_data = x.data<T>();
+  auto label_data = label.data<T>();
+  for (int idx = 0; idx < limit; ++idx) {
+    T x = x_data[idx];
+    T label = label_data[idx];
+    if (static_cast<int>(label) == ignore_index) {
+      out_data[idx] = static_cast<T>(0.);
+    } else {
+      T term1 = (x > 0) ? x : 0;
+      T term2 = x * label;
+      T term3 = std::log(static_cast<T>(1) + std::exp(-std::abs(x)));
+      out_data[idx] = term1 - term2 + term3;
+    }
+  }
+
+  if (normalize) {
+    int norm = 0;
+    T eps = static_cast<T>(1e-6);
+    for (int idx = 0; idx < limit; ++idx) {
+      T diff = label_data[idx] - static_cast<T>(ignore_index);
+      if ((diff < -eps) || (diff > eps)) {
+        norm += 1;
+      }
+    }
+    eps = static_cast<T>(1e-5);
+    norm = norm > eps ? norm : eps;
+    std::for_each(out_data, out_data + limit, [norm](T& v) { v = v / norm; });
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(sigmoid_cross_entropy_with_logits,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::SigmoidCrossEntropyWithLogitsKernel,
+                   float,
+                   double) {}
diff --git a/paddle/phi/kernels/cpu/softmax_kernel.cc b/paddle/phi/kernels/cpu/softmax_kernel.cc
index 537b4326681a175fbad7593eed1d8b6caee9d86c..1d28669571f8d095cf53355be26135360008b0ce 100644
--- a/paddle/phi/kernels/cpu/softmax_kernel.cc
+++ b/paddle/phi/kernels/cpu/softmax_kernel.cc
@@ -19,4 +19,4 @@ limitations under the License. */
 #include "paddle/phi/kernels/impl/softmax_kernel_impl.h"
 
 PD_REGISTER_KERNEL(
-    softmax, CPU, ALL_LAYOUT, phi::SoftmaxRawKernel, float, double) {}
+    softmax, CPU, ALL_LAYOUT, phi::SoftmaxKernel, float, double) {}
diff --git a/paddle/phi/kernels/cpu/split_kernel.cc b/paddle/phi/kernels/cpu/split_kernel.cc
index 4acf9b02028f994c38144d716fdd56c6bbb6afa2..ea8e2702c19d6edd9f63d1da647db0ef07a417f1 100644
--- a/paddle/phi/kernels/cpu/split_kernel.cc
+++ b/paddle/phi/kernels/cpu/split_kernel.cc
@@ -28,6 +28,23 @@ void SplitKernel(const Context& dev_ctx,
                  const ScalarArray& num_or_sections,
                  const Scalar& axis_scalar,
                  std::vector<DenseTensor*> outs) {
+  // need to infershape output
+  if (num_or_sections.FromTensor() || axis_scalar.FromTensor()) {
+    std::vector<MetaTensor> out_metas;
+    out_metas.reserve(outs.size());
+    std::vector<MetaTensor*> out_metas_ptr;
+    for (size_t i = 0; i < outs.size(); ++i) {
+      out_metas.push_back(outs[i]);
+      out_metas_ptr.push_back(&out_metas.back());
+    }
+
+    phi::SplitInferMeta(x, num_or_sections, axis_scalar, out_metas_ptr);
+
+    for (size_t i = 0; i < out_metas.size(); ++i) {
+      outs[i]->Resize(out_metas[i].dims());
+    }
+  }
+
   std::vector<const DenseTensor*> shape_refer;
   for (size_t j = 0; j < outs.size(); ++j) {
     dev_ctx.template Alloc<T>(outs[j]);
diff --git a/paddle/phi/kernels/cpu/take_along_axis_grad_kernel.cc b/paddle/phi/kernels/cpu/take_along_axis_grad_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4443383f40262ee3346a3c5e73b4d9dea7031966
--- /dev/null
+++ b/paddle/phi/kernels/cpu/take_along_axis_grad_kernel.cc
@@ -0,0 +1,71 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/take_along_axis_grad_kernel.h"
+
+#include "paddle/fluid/operators/gather_scatter_kernel.h"
+#include "paddle/fluid/platform/place.h"
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/math_function.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void TakeAlongAxisGradKernel(const Context& dev_ctx,
+                             const DenseTensor& x,
+                             const DenseTensor& index,
+                             const DenseTensor& out_grad,
+                             int axis,
+                             DenseTensor* x_grad) {
+  PADDLE_ENFORCE_EQ(
+      paddle::platform::is_cpu_place(dev_ctx.GetPlace()),
+      true,
+      errors::PreconditionNotMet("This kernel only runs on CPU."));
+
+  // We need to know the shape of input matrix to determine the shape of grad
+  // matrix of input.
+  x_grad->Resize(x.dims());
+  dev_ctx.template Alloc<T>(x_grad);
+
+  // Set to zero tensor.
+  phi::funcs::SetConstant<Context, T> functor;
+  functor(dev_ctx, x_grad, static_cast<T>(0));
+
+  const auto& index_type =
+      paddle::framework::TransToProtoVarType(index.dtype());
+  if (index_type == paddle::framework::proto::VarType::INT32) {
+    paddle::operators::cpu_scatter_add_kernel<T, int32_t>(
+        *x_grad,
+        axis,
+        index,
+        out_grad,
+        dev_ctx);  // the gradient of gather is scatter
+  } else if (index_type == paddle::framework::proto::VarType::INT64) {
+    paddle::operators::cpu_scatter_add_kernel<T, int64_t>(
+        *x_grad, axis, index, out_grad, dev_ctx);
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(take_along_axis_grad,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::TakeAlongAxisGradKernel,
+                   float,
+                   double,
+                   int,
+                   uint8_t,
+                   int64_t) {}
diff --git a/paddle/phi/kernels/cpu/take_along_axis_kernel.cc b/paddle/phi/kernels/cpu/take_along_axis_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..502db8a22da0bc9ab475243b4d8f646be51ed9d5
--- /dev/null
+++ b/paddle/phi/kernels/cpu/take_along_axis_kernel.cc
@@ -0,0 +1,60 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/take_along_axis_kernel.h"
+
+#include "paddle/fluid/framework/convert_utils.h"
+#include "paddle/fluid/operators/gather_scatter_kernel.h"
+#include "paddle/fluid/platform/place.h"
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void TakeAlongAxisKernel(const Context& dev_ctx,
+                         const DenseTensor& x,
+                         const DenseTensor& index,
+                         int axis,
+                         DenseTensor* out) {
+  PADDLE_ENFORCE_EQ(
+      paddle::platform::is_cpu_place(dev_ctx.GetPlace()),
+      true,
+      errors::PreconditionNotMet("This kernel only runs on CPU."));
+
+  out->Resize(index.dims());
+  dev_ctx.template Alloc<T>(out);
+
+  const auto& index_type =
+      paddle::framework::TransToProtoVarType(index.dtype());
+  if (index_type == paddle::framework::proto::VarType::INT32) {
+    paddle::operators::cpu_gather_kernel<T, int32_t>(
+        x, axis, index, *out, dev_ctx);
+  } else if (index_type == paddle::framework::proto::VarType::INT64) {
+    paddle::operators::cpu_gather_kernel<T, int64_t>(
+        x, axis, index, *out, dev_ctx);
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(take_along_axis,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::TakeAlongAxisKernel,
+                   float,
+                   double,
+                   int,
+                   uint8_t,
+                   int64_t) {}
diff --git a/paddle/phi/kernels/cpu/tile_grad_kernel.cc b/paddle/phi/kernels/cpu/tile_grad_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..636ade93742da11cf5e875adb27f72930c9a6686
--- /dev/null
+++ b/paddle/phi/kernels/cpu/tile_grad_kernel.cc
@@ -0,0 +1,29 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/tile_grad_kernel.h"
+
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/tile_grad_kernel_impl.h"
+
+PD_REGISTER_KERNEL(tile_grad,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::TileGradKernel,
+                   bool,
+                   float,
+                   double,
+                   int,
+                   int64_t) {}
diff --git a/paddle/phi/kernels/cpu/tile_kernel.cc b/paddle/phi/kernels/cpu/tile_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3b590ed475aa2611d75a0426294cf5e8d30ab6fa
--- /dev/null
+++ b/paddle/phi/kernels/cpu/tile_kernel.cc
@@ -0,0 +1,23 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/tile_kernel.h"
+
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/tile_kernel_impl.h"
+
+PD_REGISTER_KERNEL(
+    tile, CPU, ALL_LAYOUT, phi::TileKernel, bool, float, double, int, int64_t) {
+}
diff --git a/paddle/phi/kernels/cpu/top_k_grad_kernel.cc b/paddle/phi/kernels/cpu/top_k_grad_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..582ee1157cce8bb7a9918464e124b5571b12f176
--- /dev/null
+++ b/paddle/phi/kernels/cpu/top_k_grad_kernel.cc
@@ -0,0 +1,151 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/top_k_grad_kernel.h"
+
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/eigen/common.h"
+#include "paddle/phi/kernels/funcs/math_function.h"
+
+namespace phi {
+
+template <typename T, typename Type>
+static void FullTopKAssign(const Type& input_height,
+                           const Type& input_width,
+                           const int& input_dim,
+                           const DenseTensor* input,
+                           const DenseTensor* indices,
+                           T* output_data,
+                           const int& k) {
+#ifdef PADDLE_WITH_MKLML
+#pragma omp parallel for
+#endif
+  for (Type i = 0; i < input_height; ++i) {
+    if (input_dim == 1) {
+      auto e_input = EigenVector<T>::Flatten(*input);
+      auto e_indices = EigenVector<Type>::Flatten(*indices);
+      for (Type j = 0; j < k; ++j) {
+        output_data[i * input_width + e_indices(j)] = e_input(j);
+      }
+    } else {
+      auto e_input = EigenMatrix<T>::Reshape(*input, input_dim - 1);
+      auto e_indices = EigenMatrix<Type>::Reshape(*indices, input_dim - 1);
+      for (Type j = 0; j < k; ++j) {
+        output_data[i * input_width + e_indices(i, j)] = e_input(i, j);
+      }
+    }
+  }
+}
+
+template <typename T, typename Context>
+void TopkGradKernel(const Context& dev_ctx,
+                    const DenseTensor& out_grad,
+                    const DenseTensor& x,
+                    const DenseTensor& indices,
+                    int k,
+                    int axis,
+                    bool largest,
+                    bool sorted,
+                    DenseTensor* x_grad) {
+  const auto& in_dims = x.dims();
+  const auto& out_dims = indices.dims();
+
+  // axis < 0, get the real axis
+  axis = (axis < 0) ? (in_dims.size() + axis) : axis;
+
+  T* x_grad_data = dev_ctx.template Alloc<T>(x_grad);
+  if (axis + 1 == in_dims.size()) {
+    // allocate the memory for the input_grad
+
+    // assign the out_grad to input_grad directly
+    const int64_t input_height =
+        phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1));
+    const int64_t input_width = in_dims[in_dims.size() - 1];
+
+    // init the output grad with 0, because some input elements has no grad
+    memset(x_grad_data, 0, x_grad->numel() * sizeof(T));
+    // Assign the output_grad to input_grad
+    FullTopKAssign(input_height,
+                   input_width,
+                   in_dims.size(),
+                   &out_grad,
+                   &indices,
+                   x_grad_data,
+                   k);
+  } else {
+    // can not assign grad to input_grad, must do the transpose
+    std::vector<int> trans;
+    for (int i = 0; i < axis; i++) {
+      trans.emplace_back(i);
+    }
+    trans.emplace_back(out_dims.size() - 1);
+    for (int i = axis + 1; i < out_dims.size() - 1; i++) {
+      trans.emplace_back(i);
+    }
+    trans.emplace_back(axis);
+    phi::DDim trans_dims(out_dims);
+    phi::DDim trans_in_dims(in_dims);
+    for (size_t i = 0; i < trans.size(); i++) {
+      trans_dims[i] = out_dims[trans[i]];
+      trans_in_dims[i] = in_dims[trans[i]];
+    }
+    // transpose the out_grad, indices
+    DenseTensor trans_dO;
+    DenseTensor trans_ind;
+    trans_dO.Resize(trans_dims);
+    trans_ind.Resize(trans_dims);
+    dev_ctx.template Alloc<T>(&trans_dO);
+    dev_ctx.template Alloc<int64_t>(&trans_ind);
+    int ndims = trans.size();
+
+    // Do transpose
+    funcs::TransCompute<phi::CPUContext, T>(
+        ndims, dev_ctx, out_grad, &trans_dO, trans);
+    funcs::TransCompute<phi::CPUContext, int64_t>(
+        ndims, dev_ctx, indices, &trans_ind, trans);
+    const int64_t input_height = phi::product(
+        phi::slice_ddim(trans_in_dims, 0, trans_in_dims.size() - 1));
+    const int64_t input_width = trans_in_dims[trans_in_dims.size() - 1];
+
+    // Assign the out_grad to tranpose input_grad
+    DenseTensor tmp_out;
+    tmp_out.Resize(trans_in_dims);
+    T* t_out = dev_ctx.template Alloc<T>(&tmp_out);
+    memset(t_out, 0, x_grad->numel() * sizeof(T));
+
+    FullTopKAssign<T, int64_t>(input_height,
+                               input_width,
+                               in_dims.size(),
+                               &trans_dO,
+                               &trans_ind,
+                               t_out,
+                               k);
+
+    // Transpose back
+    funcs::TransCompute<phi::CPUContext, T>(
+        ndims, dev_ctx, tmp_out, x_grad, trans);
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(top_k_grad,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::TopkGradKernel,
+                   float,
+                   double,
+                   int32_t,
+                   int64_t) {}
diff --git a/paddle/phi/kernels/cpu/top_k_kernel.cc b/paddle/phi/kernels/cpu/top_k_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4ac16667ce2741dcdf0a2cea55b8c1c0a2366781
--- /dev/null
+++ b/paddle/phi/kernels/cpu/top_k_kernel.cc
@@ -0,0 +1,230 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/top_k_kernel.h"
+
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/eigen/common.h"
+#include "paddle/phi/kernels/funcs/math_function.h"
+
+namespace phi {
+
+template <typename T, typename Type>
+static void FullTopK(Type input_height,
+                     Type input_width,
+                     int input_dim,
+                     const DenseTensor* input,
+                     T* t_out,
+                     Type* t_indices,
+                     const int& k,
+                     const bool& largest,
+                     const bool& sorted) {
+  // when the k is small, will the partial sort
+  bool partial_sort_flag = (k * 64) < input_width;
+
+#ifdef PADDLE_WITH_MKLML
+#pragma omp parallel for
+#endif
+  for (Type i = 0; i < input_height; ++i) {
+    std::vector<std::pair<T, Type>> col_vec;
+    col_vec.reserve(input_width);
+    if (input_dim == 1) {
+      auto e_input = EigenVector<T>::Flatten(*input);
+      for (Type j = 0; j < input_width; ++j) {
+        col_vec.emplace_back(std::pair<T, Type>(e_input(j), j));
+      }
+    } else {
+      auto e_input = EigenMatrix<T>::Reshape(*input, input_dim - 1);
+      for (Type j = 0; j < input_width; ++j) {
+        col_vec.emplace_back(std::pair<T, Type>(e_input(i, j), j));
+      }
+    }
+    if (partial_sort_flag) {
+      std::partial_sort(
+          col_vec.begin(),
+          col_vec.begin() + k,
+          col_vec.end(),
+          [&largest](const std::pair<T, Type>& l, const std::pair<T, Type>& r) {
+            if (largest) {
+              return (std::isnan(static_cast<double>(l.first)) &&
+                      !std::isnan(static_cast<double>(r.first))) ||
+                     (l.first > r.first);
+            } else {
+              return (!std::isnan(static_cast<double>(l.first)) &&
+                      std::isnan(static_cast<double>(r.first))) ||
+                     (l.first < r.first);
+            }
+          });
+    } else {
+      // use the nth-element to get the K-larger or K-small element
+      if (largest) {
+        std::nth_element(
+            col_vec.begin(),
+            col_vec.begin() + k - 1,
+            col_vec.end(),
+            [](const std::pair<T, Type>& l, const std::pair<T, Type>& r) {
+              return (std::isnan(static_cast<double>(l.first)) &&
+                      !std::isnan(static_cast<double>(r.first))) ||
+                     (l.first > r.first);
+            });
+        // the nth-element will get the unorder elements, sort the element
+        if (sorted) {
+          std::sort(col_vec.begin(),
+                    col_vec.begin() + k - 1,
+                    [&largest](const std::pair<T, Type>& l,
+                               const std::pair<T, Type>& r) {
+                      return (std::isnan(static_cast<double>(l.first)) &&
+                              !std::isnan(static_cast<double>(r.first))) ||
+                             (l.first > r.first);
+                    });
+        }
+      } else {
+        std::nth_element(
+            col_vec.begin(),
+            col_vec.begin() + k - 1,
+            col_vec.end(),
+            [](const std::pair<T, Type>& l, const std::pair<T, Type>& r) {
+              return (!std::isnan(static_cast<double>(l.first)) &&
+                      std::isnan(static_cast<double>(r.first))) ||
+                     (l.first < r.first);
+            });
+        // the nth-element will get the unorder elements, sort the element
+        if (sorted) {
+          std::sort(
+              col_vec.begin(),
+              col_vec.begin() + k - 1,
+              [](const std::pair<T, Type>& l, const std::pair<T, Type>& r) {
+                return (!std::isnan(static_cast<double>(l.first)) &&
+                        std::isnan(static_cast<double>(r.first))) ||
+                       (l.first < r.first);
+              });
+        }
+      }
+    }
+    for (Type j = 0; j < k; ++j) {
+      t_out[i * k + j] = col_vec[j].first;
+      t_indices[i * k + j] = col_vec[j].second;
+    }
+  }
+}
+
+template <typename T, typename Context>
+void TopkKernel(const Context& dev_ctx,
+                const DenseTensor& x,
+                const Scalar& k_scalar,
+                int axis,
+                bool largest,
+                bool sorted,
+                DenseTensor* out,
+                DenseTensor* indices) {
+  const auto* input = &x;
+  // Get the top k elements of each row of input tensor
+  const auto& in_dims = input->dims();
+
+  // axis < 0, cacluate the real axis
+  if (axis < 0) {
+    axis += in_dims.size();
+  }
+
+  int k = k_scalar.to<int>();
+  if (k_scalar.FromTensor()) {
+    auto out_dims = out->dims();
+    // accroding to axis to set K value in the dim
+    out_dims[axis] = k;
+    out->Resize(out_dims);
+    indices->Resize(out_dims);
+  }
+
+  T* out_data = dev_ctx.template Alloc<T>(out);
+  int64_t* indices_data = dev_ctx.template Alloc<int64_t>(indices);
+  const auto& out_dims = out->dims();
+  if (axis + 1 == in_dims.size()) {
+    const int64_t& input_height =
+        phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1));
+    const int64_t& input_width = in_dims[in_dims.size() - 1];
+    FullTopK<T, int64_t>(input_height,
+                         input_width,
+                         in_dims.size(),
+                         input,
+                         out_data,
+                         indices_data,
+                         k,
+                         largest,
+                         sorted);
+  } else {
+    // if the topk dims is not last dim, will tranpose and do topk
+    std::vector<int> trans;
+    for (int i = 0; i < axis; i++) {
+      trans.emplace_back(i);
+    }
+    trans.push_back(in_dims.size() - 1);
+    for (int i = axis + 1; i < in_dims.size() - 1; i++) {
+      trans.emplace_back(i);
+    }
+    trans.emplace_back(axis);
+
+    // get the trans input_dims, out_dims
+    phi::DDim trans_dims(in_dims);
+    phi::DDim trans_out_dims(out->dims());
+    for (size_t i = 0; i < trans.size(); i++) {
+      trans_dims[i] = in_dims[trans[i]];
+    }
+    for (size_t i = 0; i < trans.size(); i++) {
+      trans_out_dims[i] = out_dims[trans[i]];
+    }
+
+    DenseTensor trans_inp;
+    trans_inp.Resize(trans_dims);
+    dev_ctx.template Alloc<T>(&trans_inp);
+    int ndims = trans.size();
+
+    // transpose the input value
+    funcs::TransCompute<phi::CPUContext, T>(
+        ndims, dev_ctx, *input, &trans_inp, trans);
+
+    const int64_t input_height =
+        phi::product(phi::slice_ddim(trans_dims, 0, trans_dims.size() - 1));
+    const int64_t input_width = trans_dims[trans_dims.size() - 1];
+
+    // Allocate the temp tensor to the save the topk indices, values
+    DenseTensor tmp_out;
+    DenseTensor tmp_indices;
+    tmp_out.Resize(trans_out_dims);
+    tmp_indices.Resize(trans_out_dims);
+    T* t_out = dev_ctx.template Alloc<T>(&tmp_out);
+    auto* t_ind = dev_ctx.template Alloc<int64_t>(&tmp_indices);
+
+    // get the TopK value
+    FullTopK<T, int64_t>(input_height,
+                         input_width,
+                         in_dims.size(),
+                         &trans_inp,
+                         t_out,
+                         t_ind,
+                         k,
+                         largest,
+                         sorted);
+    // transpose back
+    funcs::TransCompute<phi::CPUContext, int64_t>(
+        ndims, dev_ctx, tmp_indices, indices, trans);
+    funcs::TransCompute<phi::CPUContext, T>(
+        ndims, dev_ctx, tmp_out, out, trans);
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(
+    top_k, CPU, ALL_LAYOUT, phi::TopkKernel, float, double, int32_t, int64_t) {}
diff --git a/paddle/phi/kernels/cpu/triangular_solve_grad_kernel.cc b/paddle/phi/kernels/cpu/triangular_solve_grad_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..80b2015f7318ad9a8b46c77460ca70a17f801a6d
--- /dev/null
+++ b/paddle/phi/kernels/cpu/triangular_solve_grad_kernel.cc
@@ -0,0 +1,23 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/triangular_solve_grad_kernel_impl.h"
+
+PD_REGISTER_KERNEL(triangular_solve_grad,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::TriangularSolveGradKernel,
+                   float,
+                   double) {}
diff --git a/paddle/phi/kernels/cpu/triangular_solve_kernel.cc b/paddle/phi/kernels/cpu/triangular_solve_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c91e7475f5b7c4ea7c420eb72cccd8cd82b0aa0c
--- /dev/null
+++ b/paddle/phi/kernels/cpu/triangular_solve_kernel.cc
@@ -0,0 +1,85 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/triangular_solve_kernel.h"
+
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/ddim.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/empty_kernel.h"
+#include "paddle/phi/kernels/expand_kernel.h"
+#include "paddle/phi/kernels/funcs/blas/blas.h"
+#include "paddle/phi/kernels/funcs/common_shape.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void TriangularSolveKernel(const Context& dev_ctx,
+                           const DenseTensor& x,
+                           const DenseTensor& y,
+                           bool upper,
+                           bool transpose,
+                           bool unitriangular,
+                           DenseTensor* out) {
+  // get broadcast dim
+  std::vector<int64_t> x_bst_dims_vec;
+  std::vector<int64_t> y_bst_dims_vec;
+  std::tie(x_bst_dims_vec, y_bst_dims_vec) =
+      funcs::MatrixGetBroadcastDims(x, y);
+  int x_bst_ndim = x_bst_dims_vec.size();
+  int y_bst_ndim = y_bst_dims_vec.size();
+
+  // Tensor broadcast to 'out' and temp 'x_bst'
+  ScalarArray x_bst_dims(x_bst_dims_vec);
+  DenseTensor x_bst = phi::Empty<T, Context>(dev_ctx, x_bst_dims);
+  const T* x_bst_data = x_bst.data<T>();
+  ExpandKernel<T, Context>(dev_ctx, x, x_bst_dims, &x_bst);
+
+  out->Resize(phi::make_ddim(y_bst_dims_vec));
+  T* out_data = dev_ctx.template Alloc<T>(out);
+  ScalarArray y_bst_dims(y_bst_dims_vec);
+  ExpandKernel<T, Context>(dev_ctx, y, y_bst_dims, out);
+
+  // Calculate use blas library
+  int M = static_cast<int>(y_bst_dims_vec[y_bst_ndim - 2]);
+  int N = static_cast<int>(y_bst_dims_vec[y_bst_ndim - 1]);
+  int batch_size = 1;
+  for (int i = 0; i < x_bst_ndim - 2; i++) {
+    batch_size *= x_bst_dims_vec[i];
+  }
+
+  auto blas = phi::funcs::GetBlas<CPUContext, T>(dev_ctx);
+  for (int i = 0; i < batch_size; i++) {
+    blas.TRSM(CblasLeft,
+              upper ? CblasUpper : CblasLower,
+              transpose ? CblasTrans : CblasNoTrans,
+              unitriangular ? CblasUnit : CblasNonUnit,
+              M,
+              N,
+              T(1),
+              x_bst_data + i * M * M,
+              std::max(1, M),
+              out_data + i * N * M,
+              std::max(1, N));
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(triangular_solve,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::TriangularSolveKernel,
+                   float,
+                   double) {}
diff --git a/paddle/phi/kernels/cpu/truncated_gaussian_random_kernel.cc b/paddle/phi/kernels/cpu/truncated_gaussian_random_kernel.cc
index ebc032ef54538188d8e287673c0d31fae9ad197b..ab3d3c2376b8b05b4909f2c44df260299c2fe460 100644
--- a/paddle/phi/kernels/cpu/truncated_gaussian_random_kernel.cc
+++ b/paddle/phi/kernels/cpu/truncated_gaussian_random_kernel.cc
@@ -27,7 +27,7 @@ namespace phi {
 
 template <typename T, typename Context>
 void TruncatedGaussianRandomKernel(const Context& dev_ctx,
-                                   const ScalarArray& shape,
+                                   const std::vector<int>& shape,
                                    float mean,
                                    float std,
                                    int seed,
@@ -37,8 +37,13 @@ void TruncatedGaussianRandomKernel(const Context& dev_ctx,
 
   T* data = dev_ctx.template Alloc<T>(tensor);
 
-  std::uniform_real_distribution<T> dist(std::numeric_limits<float>::min(),
-                                         1.0);
+  auto normal_cdf = [](float x) {
+    return (1.0 + std::erf(x / std::sqrt(2.0))) / 2.0;
+  };
+  float a_normal_cdf = normal_cdf((-2.0 - mean) / std);
+  float b_normal_cdf = normal_cdf((2.0 - mean) / std);
+  std::uniform_real_distribution<float> dist(2.0 * a_normal_cdf - 1.0,
+                                             2.0 * b_normal_cdf - 1.0);
   TruncatedNormal<T> truncated_normal(mean, std);
   int64_t size = tensor->numel();
 
diff --git a/paddle/phi/kernels/cpu/viterbi_decode_kernel.cc b/paddle/phi/kernels/cpu/viterbi_decode_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..fab49f5416048a2655412056e5375b30fdaad923
--- /dev/null
+++ b/paddle/phi/kernels/cpu/viterbi_decode_kernel.cc
@@ -0,0 +1,319 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/viterbi_decode_kernel.h"
+
+#include <algorithm>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/copy_kernel.h"
+#include "paddle/phi/kernels/empty_kernel.h"
+#include "paddle/phi/kernels/funcs/compare_functors.h"
+#include "paddle/phi/kernels/funcs/concat_and_split_functor.h"
+#include "paddle/phi/kernels/funcs/elementwise_functor.h"
+#include "paddle/phi/kernels/funcs/gather.h"
+#include "paddle/phi/kernels/funcs/viterbi_decode_functor.h"
+#include "paddle/phi/kernels/transpose_kernel.h"
+
+namespace phi {
+
+template <typename Context, typename T, typename IndType>
+struct Argmax {
+  void operator()(const Context& dev_ctx,
+                  const DenseTensor& input,
+                  DenseTensor* out_idx,
+                  DenseTensor* out,
+                  int axis) {
+    phi::DDim input_dims = input.dims();
+    int64_t pre = 1;
+    int64_t post = 1;
+    int64_t n = input_dims[axis];
+    for (int i = 0; i < axis; i++) {
+      pre *= input_dims[i];
+    }
+    for (int i = axis + 1; i < input_dims.size(); i++) {
+      post *= input_dims[i];
+    }
+    int64_t height = pre * post;
+    int64_t width = n;
+    const T* in_data = input.data<T>();
+    IndType* out_idx_data = out_idx->data<IndType>();
+    T* out_data = out->data<T>();
+// Reduce
+#ifdef PADDLE_WITH_MKLML
+#pragma omp parallel for
+#endif
+    for (int64_t i = 0; i < height; ++i) {
+      int64_t h = i / post;
+      int64_t w = i % post;
+      IndType max_idx = -1;
+      T max_value = (std::numeric_limits<T>::lowest)();  // for windows compile
+      for (int64_t j = 0; j < width; ++j) {
+        if (in_data[h * width * post + j * post + w] > max_value) {
+          max_value = in_data[h * width * post + j * post + w];
+          max_idx = j;
+        }
+      }
+      out_data[i] = max_value;
+      out_idx_data[i] = max_idx;
+    }
+  }
+};
+
+template <typename Context>
+struct ARange {
+  void operator()(const Context& dev_ctx,
+                  int64_t* data,
+                  int end,
+                  int64_t scale) {
+    for (int i = 0; i < end; ++i) {
+      data[i] = i * scale;
+    }
+  }
+};
+
+template <typename Context, typename T>
+struct GetMaxValue {
+  void operator()(const Context& dev_ctx,
+                  const DenseTensor& input,
+                  T* max_value) {
+    auto input_ptr = input.data<T>();
+    auto num = input.numel();
+    *max_value = *std::max_element(input_ptr, input_ptr + num);
+  }
+};
+
+template <typename Context, typename T, typename IndexT = int>
+struct Gather {
+  void operator()(const Context& dev_ctx,
+                  const DenseTensor& src,
+                  const DenseTensor& index,
+                  DenseTensor* output) {
+    phi::funcs::CPUGather<T, IndexT>(dev_ctx, src, index, output);
+  }
+};
+
+template <typename Context,
+          template <typename InT, typename OutT> typename CompareFunctor,
+          typename T>
+struct GetMask {
+  void operator()(const Context& dev_ctx,
+                  const DenseTensor& lhs,
+                  const DenseTensor& rhs,
+                  DenseTensor* mask) {
+    funcs::SameDimsBinaryOP<int64_t, CompareFunctor<int64_t, T>, T>(
+        lhs, rhs, mask);
+  }
+};
+
+template <typename Context,
+          template <typename T> typename BinaryFunctor,
+          typename T>
+struct BinaryOperation {
+  void operator()(const Context& dev_ctx,
+                  const DenseTensor& lhs,
+                  const DenseTensor& rhs,
+                  DenseTensor* output) {
+    if (lhs.dims() == rhs.dims()) {
+      funcs::SameDimsBinaryOP<T, BinaryFunctor<T>>(lhs, rhs, output);
+    } else {
+      bool is_multi_threads = false;
+#ifdef PADDLE_WITH_MKLML
+      if (omp_get_max_threads() > 1) {
+        is_multi_threads = true;
+      }
+#endif
+      if (is_multi_threads) {
+        funcs::SimpleBroadcastBinaryOP<T, BinaryFunctor<T>, true>(
+            lhs, rhs, output);
+      } else {
+        funcs::SimpleBroadcastBinaryOP<T, BinaryFunctor<T>, false>(
+            lhs, rhs, output);
+      }
+    }
+  }
+};
+
+template <typename T, typename Context>
+void ViterbiDecodeKernel(const Context& dev_ctx,
+                         const DenseTensor& input,
+                         const DenseTensor& transition,
+                         const DenseTensor& length,
+                         bool include_bos_eos_tag,
+                         DenseTensor* scores,
+                         DenseTensor* path) {
+  auto curr_place = dev_ctx.GetPlace();
+  auto batch_size = static_cast<int>(input.dims()[0]);
+  auto seq_len = static_cast<int>(input.dims()[1]);
+  auto n_labels = static_cast<int>(input.dims()[2]);
+  phi::funcs::SetConstant<Context, T> float_functor;
+  phi::funcs::SetConstant<Context, int64_t> int_functor;
+  std::vector<DenseTensor> historys;
+  // We create tensor buffer in order to avoid allocating memory frequently
+  // 10 means allocate 10*batch_size bytes memory, such as int_mask, zero...
+  int buffer_size = batch_size * (n_labels + 1) * seq_len + 10 * batch_size;
+  DenseTensor int_buffer = Empty<int64_t>(dev_ctx, {buffer_size});
+  funcs::TensorBuffer int_tensor_buffer(int_buffer);
+  // create float tensor buffer
+  // 10 means allocate 10*batch_size*n_labels bytes, such as alpha, alpha_max
+  buffer_size = batch_size * (seq_len + 10) * n_labels +
+                (batch_size + 2) * n_labels * n_labels;
+  DenseTensor float_buffer = Empty<T>(dev_ctx, {buffer_size});
+  funcs::TensorBuffer float_tensor_buffer(float_buffer);
+  DenseTensor left_length = int_tensor_buffer.GetBufferBlock({batch_size, 1});
+  phi::Copy(dev_ctx, length, curr_place, false, &left_length);
+  int64_t max_seq_len = 0;
+  GetMaxValue<Context, int64_t> get_max_value;
+  get_max_value(dev_ctx, left_length, &max_seq_len);
+  dev_ctx.template Alloc<T>(scores);
+  path->Resize({batch_size, max_seq_len});
+  dev_ctx.template Alloc<int64_t>(path);
+  DenseTensor tpath =
+      int_tensor_buffer.GetBufferBlock({max_seq_len, batch_size});
+  auto batch_path = funcs::Unbind(tpath);
+  for (auto it = batch_path.begin(); it != batch_path.end(); ++it) {
+    it->Resize({batch_size});
+  }
+  // create and init required tensor
+  DenseTensor input_exp =
+      float_tensor_buffer.GetBufferBlock({seq_len, batch_size, n_labels});
+  TransposeKernel<T, Context>(dev_ctx, input, {1, 0, 2}, &input_exp);
+  DenseTensor trans_exp =
+      float_tensor_buffer.GetBufferBlock({n_labels, n_labels});
+  phi::Copy(dev_ctx, transition, curr_place, false, &trans_exp);
+  trans_exp.Resize({1, n_labels, n_labels});
+  DenseTensor alpha =
+      float_tensor_buffer.GetBufferBlock({batch_size, n_labels});
+  DenseTensor zero = int_tensor_buffer.GetBufferBlock({batch_size, 1});
+  int_functor(dev_ctx, &zero, 0);
+  DenseTensor one = int_tensor_buffer.GetBufferBlock({batch_size, 1});
+  int_functor(dev_ctx, &one, 1);
+  DenseTensor float_one = float_tensor_buffer.GetBufferBlock({batch_size, 1});
+  float_functor(dev_ctx, &float_one, static_cast<T>(1.0));
+  DenseTensor alpha_trn_sum =
+      float_tensor_buffer.GetBufferBlock({batch_size, n_labels, n_labels});
+  DenseTensor alpha_max =
+      float_tensor_buffer.GetBufferBlock({batch_size, n_labels});
+  DenseTensor alpha_argmax =
+      int_tensor_buffer.GetBufferBlock({seq_len, batch_size, n_labels});
+  auto alpha_argmax_unbind = funcs::Unbind(alpha_argmax);
+  DenseTensor alpha_nxt =
+      float_tensor_buffer.GetBufferBlock({batch_size, n_labels});
+  DenseTensor int_mask = int_tensor_buffer.GetBufferBlock({batch_size});
+  DenseTensor zero_len_mask = int_tensor_buffer.GetBufferBlock({batch_size});
+  DenseTensor float_mask = float_tensor_buffer.GetBufferBlock({batch_size, 1});
+  DenseTensor stop_trans = float_tensor_buffer.GetBufferBlock({1, 1, n_labels});
+  DenseTensor start_trans =
+      float_tensor_buffer.GetBufferBlock({1, 1, n_labels});
+  DenseTensor rest_trans =
+      float_tensor_buffer.GetBufferBlock({1, n_labels - 2, n_labels});
+  DenseTensor last_ids = int_tensor_buffer.GetBufferBlock({batch_size});
+  DenseTensor last_ids_tmp = int_tensor_buffer.GetBufferBlock({batch_size});
+  DenseTensor batch_offset = int_tensor_buffer.GetBufferBlock({batch_size});
+  DenseTensor gather_idx = int_tensor_buffer.GetBufferBlock({batch_size});
+  std::vector<const DenseTensor*> shape{&rest_trans, &stop_trans, &start_trans};
+  std::vector<DenseTensor*> outputs{&rest_trans, &stop_trans, &start_trans};
+  phi::funcs::SplitFunctor<Context, T> split_functor;
+  split_functor(dev_ctx, trans_exp, shape, 1, &outputs);
+  stop_trans.Resize({1, n_labels});
+  start_trans.Resize({1, n_labels});
+  auto logit0 = input_exp.Slice(0, 1);
+  logit0.Resize({batch_size, n_labels});
+  BinaryOperation<Context, phi::funcs::AddFunctor, T> AddFloat;
+  BinaryOperation<Context, phi::funcs::AddFunctor, int64_t> AddInt;
+  BinaryOperation<Context, phi::funcs::MultiplyFunctor, T> MulFloat;
+  BinaryOperation<Context, phi::funcs::MultiplyFunctor, int64_t> MulInt;
+  BinaryOperation<Context, phi::funcs::SubtractFunctor, T> SubFloat;
+  BinaryOperation<Context, phi::funcs::SubtractFunctor, int64_t> SubInt;
+  if (include_bos_eos_tag) {
+    AddFloat(dev_ctx, logit0, start_trans, &alpha);
+    GetMask<Context, phi::funcs::EqualFunctor, T>()(
+        dev_ctx, left_length, one, &float_mask);
+    MulFloat(dev_ctx, stop_trans, float_mask, &alpha_nxt);
+    AddFloat(dev_ctx, alpha, alpha_nxt, &alpha);
+  } else {
+    alpha = logit0;
+  }
+  SubInt(dev_ctx, left_length, one, &left_length);
+  Argmax<Context, T, int64_t> argmax;
+  for (int64_t i = 1; i < max_seq_len; ++i) {
+    DenseTensor logit = input_exp.Slice(i, i + 1);
+    logit.Resize({batch_size, n_labels});
+    DenseTensor& alpha_exp = alpha.Resize({batch_size, n_labels, 1});
+    AddFloat(dev_ctx, alpha_exp, trans_exp, &alpha_trn_sum);
+    auto alpha_argmax_temp = alpha_argmax_unbind[i - 1];
+    alpha_argmax_temp.Resize({batch_size, n_labels});
+    argmax(dev_ctx, alpha_trn_sum, &alpha_argmax_temp, &alpha_max, 1);
+    historys.emplace_back(alpha_argmax_temp);
+    AddFloat(dev_ctx, alpha_max, logit, &alpha_nxt);
+    alpha.Resize({batch_size, n_labels});
+    GetMask<Context, phi::funcs::GreaterThanFunctor, T>()(
+        dev_ctx, left_length, zero, &float_mask);
+    MulFloat(dev_ctx, alpha_nxt, float_mask, &alpha_nxt);
+    SubFloat(dev_ctx, float_one, float_mask, &float_mask);
+    MulFloat(dev_ctx, alpha, float_mask, &alpha);
+    AddFloat(dev_ctx, alpha, alpha_nxt, &alpha);
+    if (include_bos_eos_tag) {
+      GetMask<Context, phi::funcs::EqualFunctor, T>()(
+          dev_ctx, left_length, one, &float_mask);
+      MulFloat(dev_ctx, stop_trans, float_mask, &alpha_nxt);
+      AddFloat(dev_ctx, alpha, alpha_nxt, &alpha);
+    }
+    SubInt(dev_ctx, left_length, one, &left_length);
+  }
+  argmax(dev_ctx, alpha, &last_ids, scores, 1);
+  left_length.Resize({batch_size});
+  GetMask<Context, phi::funcs::GreaterEqualFunctor, int64_t>()(
+      dev_ctx, left_length, zero, &int_mask);
+  // last_ids_update = last_ids * tag_mask
+  int last_ids_index = 1;
+  int actual_len = (std::min)(seq_len, static_cast<int>(max_seq_len));
+  MulInt(dev_ctx, last_ids, int_mask, &batch_path[actual_len - last_ids_index]);
+  // The algorithm below can refer to
+  // https://github.com/PaddlePaddle/PaddleNLP/blob/develop/paddlenlp/layers/crf.py#L438
+  ARange<Context> arange;
+  arange(dev_ctx, batch_offset.data<int64_t>(), batch_size, n_labels);
+  Gather<Context, int64_t, int64_t> gather;
+  for (auto hist = historys.rbegin(); hist != historys.rend(); ++hist) {
+    ++last_ids_index;
+    AddInt(dev_ctx, left_length, one, &left_length);
+    AddInt(dev_ctx, batch_offset, last_ids, &gather_idx);
+    DenseTensor& last_ids_update = batch_path[actual_len - last_ids_index];
+    hist->Resize({batch_size * n_labels});
+    gather(dev_ctx, *hist, gather_idx, &last_ids_update);
+    GetMask<Context, phi::funcs::GreaterThanFunctor, int64_t>()(
+        dev_ctx, left_length, zero, &int_mask);
+    MulInt(dev_ctx, last_ids_update, int_mask, &last_ids_update);
+    GetMask<Context, phi::funcs::EqualFunctor, int64_t>()(
+        dev_ctx, left_length, zero, &zero_len_mask);
+    MulInt(dev_ctx, last_ids, zero_len_mask, &last_ids_tmp);
+    SubInt(dev_ctx, one, zero_len_mask, &zero_len_mask);
+    MulInt(dev_ctx, last_ids_update, zero_len_mask, &last_ids_update);
+    AddInt(dev_ctx, last_ids_update, last_ids_tmp, &last_ids_update);
+    GetMask<Context, phi::funcs::LessThanFunctor, int64_t>()(
+        dev_ctx, left_length, zero, &int_mask);
+    MulInt(dev_ctx, last_ids, int_mask, &last_ids);
+    AddInt(dev_ctx, last_ids_update, last_ids, &last_ids);
+  }
+  TransposeKernel<int64_t, Context>(dev_ctx, tpath, {1, 0}, path);
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(
+    viterbi_decode, CPU, ALL_LAYOUT, phi::ViterbiDecodeKernel, float, double) {}
diff --git a/paddle/phi/kernels/cpu/where_index_kernel.cc b/paddle/phi/kernels/cpu/where_index_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..da6eff74011eaaf5ac5c6d89091743be9a866a5b
--- /dev/null
+++ b/paddle/phi/kernels/cpu/where_index_kernel.cc
@@ -0,0 +1,95 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/where_index_kernel.h"
+
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/for_range.h"
+#include "paddle/phi/kernels/funcs/math_function.h"
+
+namespace phi {
+
+template <typename T>
+struct WhereIndexFunctor {
+  WhereIndexFunctor(
+      const T* true_index, int true_num, const T* stride, int rank, T* out)
+      : true_index_(true_index),
+        true_num_(true_num),
+        stride_(stride),
+        rank_(rank),
+        out_ptr_(out) {}
+
+  HOSTDEVICE void operator()(size_t idx) const {
+    T index = true_index_[idx];
+    for (int j = 0; j < rank_; j++) {
+      out_ptr_[idx * rank_ + j] = index / stride_[j];
+      index -= out_ptr_[idx * rank_ + j] * stride_[j];
+    }
+  }
+
+  const T* true_index_;
+  int true_num_;
+  const T* stride_;
+  int rank_;
+  T* out_ptr_;
+};
+
+template <typename T, typename Context>
+void WhereIndexKernel(const Context& dev_ctx,
+                      const DenseTensor& condition,
+                      DenseTensor* out) {
+  const T* cond_data = condition.data<T>();
+  auto numel = condition.numel();
+  auto dims = condition.dims();
+  const int rank = dims.size();
+
+  std::vector<int64_t> true_index;
+  for (auto i = 0; i < numel; i++) {
+    if (static_cast<bool>(cond_data[i])) {
+      true_index.push_back(i);
+    }
+  }
+  auto true_num = true_index.size();
+  out->Resize(phi::make_ddim({static_cast<int64_t>(true_num), rank}));
+  auto* out_ptr = dev_ctx.template Alloc<int64_t>(out);
+
+  if (true_num == 0) {
+    return;
+  }
+
+  std::vector<int64_t> stride(rank);
+  stride[rank - 1] = 1;
+  for (int i = rank - 2; i >= 0; i--) {
+    stride[i] = stride[i + 1] * dims[i + 1];
+  }
+
+  WhereIndexFunctor<int64_t> functor(
+      true_index.data(), true_num, stride.data(), rank, out_ptr);
+  phi::funcs::ForRange<phi::CPUContext> for_range(dev_ctx, true_num);
+  for_range(functor);
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(where_index,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::WhereIndexKernel,
+                   int64_t,
+                   int,
+                   int16_t,
+                   bool,
+                   float,
+                   double) {}
diff --git a/paddle/phi/kernels/cpu/yolo_box_kernel.cc b/paddle/phi/kernels/cpu/yolo_box_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a83bc019fc3af395cedc20edd548b70149a915d5
--- /dev/null
+++ b/paddle/phi/kernels/cpu/yolo_box_kernel.cc
@@ -0,0 +1,128 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/yolo_box_kernel.h"
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/yolo_box_util.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void YoloBoxKernel(const Context& dev_ctx,
+                   const DenseTensor& x,
+                   const DenseTensor& img_size,
+                   const std::vector<int>& anchors,
+                   int class_num,
+                   float conf_thresh,
+                   int downsample_ratio,
+                   bool clip_bbox,
+                   float scale_x_y,
+                   bool iou_aware,
+                   float iou_aware_factor,
+                   DenseTensor* boxes,
+                   DenseTensor* scores) {
+  auto* input = &x;
+  auto* imgsize = &img_size;
+  float scale = scale_x_y;
+  float bias = -0.5 * (scale - 1.);
+
+  const int n = input->dims()[0];
+  const int h = input->dims()[2];
+  const int w = input->dims()[3];
+  const int box_num = boxes->dims()[1];
+  const int an_num = anchors.size() / 2;
+  int input_size_h = downsample_ratio * h;
+  int input_size_w = downsample_ratio * w;
+
+  const int stride = h * w;
+  const int an_stride = (class_num + 5) * stride;
+
+  DenseTensor anchors_;
+  auto anchors_data =
+      anchors_.mutable_data<int>({an_num * 2}, dev_ctx.GetPlace());
+  std::copy(anchors.begin(), anchors.end(), anchors_data);
+
+  const T* input_data = input->data<T>();
+  const int* imgsize_data = imgsize->data<int>();
+  T* boxes_data = boxes->mutable_data<T>({n, box_num, 4}, dev_ctx.GetPlace());
+  memset(boxes_data, 0, boxes->numel() * sizeof(T));
+  T* scores_data =
+      scores->mutable_data<T>({n, box_num, class_num}, dev_ctx.GetPlace());
+  memset(scores_data, 0, scores->numel() * sizeof(T));
+
+  T box[4];
+  for (int i = 0; i < n; i++) {
+    int img_height = imgsize_data[2 * i];
+    int img_width = imgsize_data[2 * i + 1];
+
+    for (int j = 0; j < an_num; j++) {
+      for (int k = 0; k < h; k++) {
+        for (int l = 0; l < w; l++) {
+          int obj_idx = funcs::GetEntryIndex(
+              i, j, k * w + l, an_num, an_stride, stride, 4, iou_aware);
+          T conf = funcs::sigmoid<T>(input_data[obj_idx]);
+          if (iou_aware) {
+            int iou_idx =
+                funcs::GetIoUIndex(i, j, k * w + l, an_num, an_stride, stride);
+            T iou = funcs::sigmoid<T>(input_data[iou_idx]);
+            conf = pow(conf, static_cast<T>(1. - iou_aware_factor)) *
+                   pow(iou, static_cast<T>(iou_aware_factor));
+          }
+          if (conf < conf_thresh) {
+            continue;
+          }
+
+          int box_idx = funcs::GetEntryIndex(
+              i, j, k * w + l, an_num, an_stride, stride, 0, iou_aware);
+          funcs::GetYoloBox<T>(box,
+                               input_data,
+                               anchors_data,
+                               l,
+                               k,
+                               j,
+                               h,
+                               w,
+                               input_size_h,
+                               input_size_w,
+                               box_idx,
+                               stride,
+                               img_height,
+                               img_width,
+                               scale,
+                               bias);
+          box_idx = (i * box_num + j * stride + k * w + l) * 4;
+          funcs::CalcDetectionBox<T>(
+              boxes_data, box, box_idx, img_height, img_width, clip_bbox);
+
+          int label_idx = funcs::GetEntryIndex(
+              i, j, k * w + l, an_num, an_stride, stride, 5, iou_aware);
+          int score_idx = (i * box_num + j * stride + k * w + l) * class_num;
+          funcs::CalcLabelScore<T>(scores_data,
+                                   input_data,
+                                   label_idx,
+                                   score_idx,
+                                   class_num,
+                                   conf,
+                                   stride);
+        }
+      }
+    }
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(
+    yolo_box, CPU, ALL_LAYOUT, phi::YoloBoxKernel, float, double) {}
diff --git a/paddle/phi/kernels/cumprod_grad_kernel.h b/paddle/phi/kernels/cumprod_grad_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..b3cb17b28e07f3d9d4d0a1671acc9d639b855e08
--- /dev/null
+++ b/paddle/phi/kernels/cumprod_grad_kernel.h
@@ -0,0 +1,28 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void CumprodGradKernel(const Context& dev_ctx,
+                       const DenseTensor& x,
+                       const DenseTensor& out,
+                       const DenseTensor& dout,
+                       int dim,
+                       DenseTensor* dx);
+}  // phi
diff --git a/paddle/phi/kernels/cumprod_kernel.h b/paddle/phi/kernels/cumprod_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..96d76cb0f43702cb5798ec9c2d527464ea51ba1f
--- /dev/null
+++ b/paddle/phi/kernels/cumprod_kernel.h
@@ -0,0 +1,26 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void CumprodKernel(const Context& dev_ctx,
+                   const DenseTensor& x,
+                   int dim,
+                   DenseTensor* out);
+}  // phi
diff --git a/paddle/phi/kernels/cumsum_kernel.h b/paddle/phi/kernels/cumsum_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..fd90c7b8f5eee81b517013069ca9c2b366aa7d13
--- /dev/null
+++ b/paddle/phi/kernels/cumsum_kernel.h
@@ -0,0 +1,30 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename Functor, typename Context>
+void CumsumKernel(const Context& dev_ctx,
+                  const DenseTensor& x,
+                  int axis,
+                  bool flatten,
+                  bool exclusive,
+                  bool reverse,
+                  DenseTensor* out);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/depthwise_conv_grad_kernel.h b/paddle/phi/kernels/depthwise_conv_grad_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..b5eff76e90c472f6deda08ebb560b4763337ab53
--- /dev/null
+++ b/paddle/phi/kernels/depthwise_conv_grad_kernel.h
@@ -0,0 +1,19 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {}  // namespace phi
diff --git a/paddle/phi/kernels/depthwise_conv_kernel.h b/paddle/phi/kernels/depthwise_conv_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..b5eff76e90c472f6deda08ebb560b4763337ab53
--- /dev/null
+++ b/paddle/phi/kernels/depthwise_conv_kernel.h
@@ -0,0 +1,19 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {}  // namespace phi
diff --git a/paddle/phi/kernels/diag_grad_kernel.h b/paddle/phi/kernels/diag_grad_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..b9edab9bec44c367db2d36dfce05c425dcb07785
--- /dev/null
+++ b/paddle/phi/kernels/diag_grad_kernel.h
@@ -0,0 +1,28 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void DiagGradKernel(const Context& dev_ctx,
+                    const DenseTensor& x,
+                    const DenseTensor& out_grad,
+                    int offset,
+                    DenseTensor* x_grad);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/digamma_grad_kernel.h b/paddle/phi/kernels/digamma_grad_kernel.h
index 38912a5ccc442b6ea5fb484b708754dd706ae706..ae5346080d30df9836ee55852f0d7469a3cb7438 100644
--- a/paddle/phi/kernels/digamma_grad_kernel.h
+++ b/paddle/phi/kernels/digamma_grad_kernel.h
@@ -20,8 +20,8 @@ namespace phi {
 
 template <typename T, typename Context>
 void DigammaGradKernel(const Context& ctx,
-                       const DenseTensor& out_grad,
                        const DenseTensor& x,
+                       const DenseTensor& out_grad,
                        DenseTensor* x_grad);
 
 }  // namepsace phi
diff --git a/paddle/phi/kernels/dist_grad_kernel.h b/paddle/phi/kernels/dist_grad_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..1f8d7ff21f2fe42de029829a5cfa67df9e0e42b1
--- /dev/null
+++ b/paddle/phi/kernels/dist_grad_kernel.h
@@ -0,0 +1,31 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void DistGradKernel(const Context& dev_ctx,
+                    const DenseTensor& x,
+                    const DenseTensor& y,
+                    const DenseTensor& out,
+                    const DenseTensor& out_grad,
+                    float p,
+                    DenseTensor* x_grad,
+                    DenseTensor* y_grad);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/dist_kernel.h b/paddle/phi/kernels/dist_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..6cb3d6e0e8bef394e3c2808aa9337d784586a5b5
--- /dev/null
+++ b/paddle/phi/kernels/dist_kernel.h
@@ -0,0 +1,28 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void DistKernel(const Context& dev_ctx,
+                const DenseTensor& x,
+                const DenseTensor& y,
+                float p,
+                DenseTensor* out);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/dot_kernel.h b/paddle/phi/kernels/dot_kernel.h
index 9377fba204bea4afea5d5346ee4ad13bb1730586..9c7703440d8aeea4dd518436a5bb62dac2f12519 100644
--- a/paddle/phi/kernels/dot_kernel.h
+++ b/paddle/phi/kernels/dot_kernel.h
@@ -29,7 +29,7 @@ template <typename T, typename Context>
 DenseTensor Dot(const Context& dev_ctx,
                 const DenseTensor& x,
                 const DenseTensor& y) {
-  auto dense_out = phi::Empty<T, Context>(dev_ctx);
+  DenseTensor dense_out;
   MetaTensor meta_out(&dense_out);
   DotInferMeta(x, y, &meta_out);
   DotKernel<T, Context>(dev_ctx, x, y, &dense_out);
diff --git a/paddle/phi/kernels/dropout_grad_kernel.h b/paddle/phi/kernels/dropout_grad_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..ae3f82056632ddde8968b7468eb16030f0c926f5
--- /dev/null
+++ b/paddle/phi/kernels/dropout_grad_kernel.h
@@ -0,0 +1,31 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/common/scalar.h"
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void DropoutGradRawKernel(const Context& dev_ctx,
+                          const DenseTensor& mask,
+                          const DenseTensor& out_grad,
+                          float p,
+                          bool is_test,
+                          const std::string& mode,
+                          DenseTensor* x_grad);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/dropout_kernel.h b/paddle/phi/kernels/dropout_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..dc9f89e08e17ac6c8113f28be6604d638cd6cd3a
--- /dev/null
+++ b/paddle/phi/kernels/dropout_kernel.h
@@ -0,0 +1,34 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/common/scalar.h"
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void DropoutRawKernel(const Context& dev_ctx,
+                      const DenseTensor& x,
+                      paddle::optional<const DenseTensor&> seed_tensor,
+                      float p,
+                      bool is_test,
+                      const std::string& mode,
+                      int seed,
+                      bool fix_seed,
+                      DenseTensor* out,
+                      DenseTensor* mask);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/eigh_grad_kernel.h b/paddle/phi/kernels/eigh_grad_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..73df76e676a8b4a7eee570f466f9e94f619c5443
--- /dev/null
+++ b/paddle/phi/kernels/eigh_grad_kernel.h
@@ -0,0 +1,29 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void EighGardKernel(const Context& dev_ctx,
+                    const DenseTensor& out_w,
+                    const DenseTensor& out_v,
+                    const DenseTensor& dout_w,
+                    const DenseTensor& dout_v,
+                    DenseTensor* dx);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/eigh_kernel.h b/paddle/phi/kernels/eigh_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..19653918302412e2f7e4dfa8caf71b6c9146a83f
--- /dev/null
+++ b/paddle/phi/kernels/eigh_kernel.h
@@ -0,0 +1,28 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void EighKernel(const Context& dev_ctx,
+                const DenseTensor& x,
+                const std::string& uplo,
+                DenseTensor* out_w,
+                DenseTensor* out_v);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/elementwise_grad_kernel.h b/paddle/phi/kernels/elementwise_grad_kernel.h
index a1b296e326f2198a352b7864bcad54d822492d4e..fb2633cc9fcea7c619193ad964ad62247ed654dd 100644
--- a/paddle/phi/kernels/elementwise_grad_kernel.h
+++ b/paddle/phi/kernels/elementwise_grad_kernel.h
@@ -64,4 +64,82 @@ void SubtractDoubleGradKernel(const Context& dev_ctx,
                               int axis,
                               DenseTensor* ddout);
 
+template <typename T, typename Context>
+void DivideGradKernel(const Context& dev_ctx,
+                      const DenseTensor& x,
+                      const DenseTensor& y,
+                      const DenseTensor& out,
+                      const DenseTensor& dout,
+                      int axis,
+                      DenseTensor* dx,
+                      DenseTensor* dy);
+
+template <typename T, typename Context>
+void DivideDoubleGradKernel(const Context& dev_ctx,
+                            const DenseTensor& y,
+                            const DenseTensor& out,
+                            const DenseTensor& dx,
+                            paddle::optional<const DenseTensor&> ddx,
+                            paddle::optional<const DenseTensor&> ddy,
+                            int axis,
+                            DenseTensor* dy,
+                            DenseTensor* dout,
+                            DenseTensor* ddout);
+
+template <typename T, typename Context>
+void MultiplyGradKernel(const Context& dev_ctx,
+                        const DenseTensor& x,
+                        const DenseTensor& y,
+                        const DenseTensor& dout,
+                        int axis,
+                        DenseTensor* dx,
+                        DenseTensor* dy);
+
+template <typename T, typename Context>
+void MultiplyDoubleGradKernel(const Context& dev_ctx,
+                              const DenseTensor& x,
+                              const DenseTensor& y,
+                              const DenseTensor& dout,
+                              paddle::optional<const DenseTensor&> ddx,
+                              paddle::optional<const DenseTensor&> ddy,
+                              int axis,
+                              DenseTensor* dx,
+                              DenseTensor* dy,
+                              DenseTensor* ddout);
+
+template <typename T, typename Context>
+void MultiplyTripleGradKernel(const Context& dev_ctx,
+                              const DenseTensor& x,
+                              const DenseTensor& y,
+                              const DenseTensor& dout,
+                              paddle::optional<const DenseTensor&> ddx,
+                              paddle::optional<const DenseTensor&> ddy,
+                              const DenseTensor& d_dx,
+                              const DenseTensor& d_dy,
+                              paddle::optional<const DenseTensor&> d_ddout,
+                              int axis,
+                              DenseTensor* d_x,
+                              DenseTensor* d_y,
+                              DenseTensor* d_dout,
+                              DenseTensor* d_ddx,
+                              DenseTensor* d_ddy);
+
+template <typename T, typename Context>
+void ElementwiseFMaxGradKernel(const Context& dev_ctx,
+                               const DenseTensor& x,
+                               const DenseTensor& y,
+                               const DenseTensor& out_grad,
+                               int axis,
+                               DenseTensor* x_grad,
+                               DenseTensor* y_grad);
+
+template <typename T, typename Context>
+void ElementwiseFMinGradKernel(const Context& dev_ctx,
+                               const DenseTensor& x,
+                               const DenseTensor& y,
+                               const DenseTensor& out_grad,
+                               int axis,
+                               DenseTensor* x_grad,
+                               DenseTensor* y_grad);
+
 }  // namespace phi
diff --git a/paddle/phi/kernels/elementwise_kernel.h b/paddle/phi/kernels/elementwise_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..c1e73ad91c67d415437829d5fc731ac91a5722f5
--- /dev/null
+++ b/paddle/phi/kernels/elementwise_kernel.h
@@ -0,0 +1,36 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/core/device_context.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void ElementwiseFMaxKernel(const Context& dev_ctx,
+                           const DenseTensor& x,
+                           const DenseTensor& y,
+                           int axis,
+                           DenseTensor* out);
+
+template <typename T, typename Context>
+void ElementwiseFMinKernel(const Context& dev_ctx,
+                           const DenseTensor& x,
+                           const DenseTensor& y,
+                           int axis,
+                           DenseTensor* out);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/empty_kernel.h b/paddle/phi/kernels/empty_kernel.h
index 0b8d95ee94fb5480684023ec6c71698ba06d9c13..f66f4419fd7f5853565d561751d793b8f10c9b46 100644
--- a/paddle/phi/kernels/empty_kernel.h
+++ b/paddle/phi/kernels/empty_kernel.h
@@ -14,9 +14,9 @@
 
 #pragma once
 
-#include "paddle/phi/api/lib/utils/storage.h"
 #include "paddle/phi/common/scalar_array.h"
 #include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/core/device_context.h"
 #include "paddle/phi/infermeta/nullary.h"
 #include "paddle/phi/infermeta/unary.h"
 
@@ -34,28 +34,17 @@ void EmptyLikeKernel(const Context& dev_ctx,
                      DataType dtype,
                      DenseTensor* out);
 
-// TODO(chenweihang): the tensor creation method need to be replaced later,
-// all kernel api call Empty here instead of making tensor self
 template <typename Context>
 DenseTensor Empty(const Context& dev_ctx, DenseTensorMeta&& meta) {
-  phi::DenseTensor dense_out(
-      phi::make_intrusive<paddle::experimental::SharedStorage>(
-          dev_ctx.GetPlace()),
-      std::move(meta));
+  phi::DenseTensor dense_out;
+  dense_out.set_meta(meta);
+  dev_ctx.Alloc(&dense_out, dense_out.dtype());
   return dense_out;
 }
 
-template <typename T, typename Context>
-DenseTensor Empty(const Context& dev_ctx) {
-  return Empty(dev_ctx,
-               {paddle::experimental::CppTypeToDataType<T>::Type(),
-                {-1},
-                DataLayout::NCHW});
-}
-
 template <typename T, typename Context>
 DenseTensor Empty(const Context& dev_ctx, const ScalarArray& shape) {
-  auto dense_out = Empty<T, Context>(dev_ctx);
+  DenseTensor dense_out;
   MetaTensor meta_out(&dense_out);
   DataType dtype = paddle::experimental::CppTypeToDataType<T>::Type();
   CreateInferMeta(shape, dtype, &meta_out);
@@ -65,7 +54,7 @@ DenseTensor Empty(const Context& dev_ctx, const ScalarArray& shape) {
 
 template <typename T, typename Context>
 DenseTensor EmptyLike(const Context& dev_ctx, const DenseTensor& x) {
-  auto dense_out = Empty<T, Context>(dev_ctx);
+  DenseTensor dense_out;
   MetaTensor meta_out(&dense_out);
   DataType dtype = paddle::experimental::CppTypeToDataType<T>::Type();
   CreateLikeInferMeta(x, dtype, &meta_out);
diff --git a/paddle/phi/kernels/erf_grad_kernel.h b/paddle/phi/kernels/erf_grad_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..8957fcaf79b9a0a83323682ffa6efaf28f5362bf
--- /dev/null
+++ b/paddle/phi/kernels/erf_grad_kernel.h
@@ -0,0 +1,27 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void ErfGradKernel(const Context& dev_ctx,
+                   const DenseTensor& x,
+                   const DenseTensor& out_grad,
+                   DenseTensor* x_grad);
+
+}  // namespace phi
diff --git a/paddle/fluid/framework/custom_kernel.h b/paddle/phi/kernels/erf_kernel.h
similarity index 73%
rename from paddle/fluid/framework/custom_kernel.h
rename to paddle/phi/kernels/erf_kernel.h
index 31084a34413ea4324c69062303ef84621a463aaf..1d5c57d2201c749f541cabe74b112073c2dace06 100644
--- a/paddle/fluid/framework/custom_kernel.h
+++ b/paddle/phi/kernels/erf_kernel.h
@@ -14,13 +14,11 @@ limitations under the License. */
 
 #pragma once
 
-#include <string>
+#include "paddle/phi/core/dense_tensor.h"
 
-namespace paddle {
-namespace framework {
+namespace phi {
 
-// Load custom kernel lib and register
-void LoadCustomKernelLib(const std::string& dso_lib_path, void* dso_handle);
+template <typename T, typename Context>
+void ErfKernel(const Context& dev_ctx, const DenseTensor& x, DenseTensor* out);
 
-}  // namespace framework
-}  // namespace paddle
+}  // namespace phi
diff --git a/paddle/phi/kernels/expand_as_grad_kernel.h b/paddle/phi/kernels/expand_as_grad_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..675e03c42a34732bf313a1a497e067170f828e33
--- /dev/null
+++ b/paddle/phi/kernels/expand_as_grad_kernel.h
@@ -0,0 +1,28 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void ExpandAsGradKernel(const Context& ctx,
+                        const DenseTensor& x,
+                        const DenseTensor& out_grad,
+                        const std::vector<int>& target_shape,
+                        DenseTensor* in_grad);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/expand_as_kernel.h b/paddle/phi/kernels/expand_as_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..971ea32310f3eab1635bfbedaa0298015f50ac2b
--- /dev/null
+++ b/paddle/phi/kernels/expand_as_kernel.h
@@ -0,0 +1,28 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void ExpandAsKernel(const Context& ctx,
+                    const DenseTensor& x,
+                    paddle::optional<const DenseTensor&> y,
+                    const std::vector<int>& target_shape,
+                    DenseTensor* out);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/eye_kernel.h b/paddle/phi/kernels/eye_kernel.h
index 8b21b8ae40562c979b23e4292a7591d9c6f10cf7..e9e1abffd143324a12809fe784c4138a77352930 100644
--- a/paddle/phi/kernels/eye_kernel.h
+++ b/paddle/phi/kernels/eye_kernel.h
@@ -22,7 +22,7 @@ template <typename T, typename Context>
 void EyeKernel(const Context& ctx,
                int64_t num_rows,
                int64_t num_columns,
-               int dtype,
+               DataType dtype,
                DenseTensor* out);
 
 }  // namespace phi
diff --git a/paddle/phi/kernels/flatten_kernel.h b/paddle/phi/kernels/flatten_kernel.h
index de57dcf2e8d3a06b027190aabffb7e7d1d8ebcfe..808af7d9b7beedfc01da7ac53234d9b469c5239f 100644
--- a/paddle/phi/kernels/flatten_kernel.h
+++ b/paddle/phi/kernels/flatten_kernel.h
@@ -40,7 +40,7 @@ DenseTensor Flatten(const Context& dev_ctx,
                     const DenseTensor& x,
                     int start_axis,
                     int stop_axis) {
-  auto dense_out = Empty<T, Context>(dev_ctx);
+  DenseTensor dense_out;
   MetaTensor meta_out(&dense_out);
   FlattenInferMeta(x, start_axis, stop_axis, &meta_out);
   FlattenKernel<T, Context>(dev_ctx, x, start_axis, stop_axis, &dense_out);
diff --git a/paddle/phi/kernels/full_kernel.h b/paddle/phi/kernels/full_kernel.h
index c7b1f9af0e3191ec217d2907677ff34edebc551b..41fc96b6db1fae5eb54b24923b68b4491c158d93 100644
--- a/paddle/phi/kernels/full_kernel.h
+++ b/paddle/phi/kernels/full_kernel.h
@@ -17,7 +17,6 @@
 #include "paddle/phi/common/scalar.h"
 #include "paddle/phi/common/scalar_array.h"
 #include "paddle/phi/core/dense_tensor.h"
-#include "paddle/phi/core/selected_rows.h"
 
 #include "paddle/phi/infermeta/nullary.h"
 #include "paddle/phi/kernels/empty_kernel.h"
@@ -31,13 +30,6 @@ void FullKernel(const Context& dev_ctx,
                 DataType dtype,
                 DenseTensor* out);
 
-template <typename T, typename Context>
-void FullSR(const Context& dev_ctx,
-            const ScalarArray& shape,
-            const Scalar& val,
-            DataType dtype,
-            SelectedRows* out);
-
 template <typename T, typename Context>
 void FullLikeKernel(const Context& dev_ctx,
                     const DenseTensor& x,
@@ -45,11 +37,23 @@ void FullLikeKernel(const Context& dev_ctx,
                     DataType dtype,
                     DenseTensor* out);
 
+template <typename T, typename Context>
+void Full(const Context& dev_ctx,
+          const ScalarArray& shape,
+          const Scalar& val,
+          DenseTensor* out) {
+  FullKernel<T, Context>(dev_ctx,
+                         shape,
+                         val,
+                         paddle::experimental::CppTypeToDataType<T>::Type(),
+                         out);
+}
+
 template <typename T, typename Context>
 DenseTensor Full(const Context& dev_ctx,
                  const ScalarArray& shape,
                  const Scalar& val) {
-  auto dense_out = Empty<T, Context>(dev_ctx);
+  DenseTensor dense_out;
   MetaTensor meta_out(&dense_out);
   DataType dtype = paddle::experimental::CppTypeToDataType<T>::Type();
   CreateInferMeta(shape, dtype, &meta_out);
@@ -61,7 +65,7 @@ template <typename T, typename Context>
 DenseTensor FullLike(const Context& dev_ctx,
                      const DenseTensor& x,
                      const Scalar& val) {
-  auto dense_out = Empty<T, Context>(dev_ctx);
+  DenseTensor dense_out;
   MetaTensor meta_out(&dense_out);
   DataType dtype = paddle::experimental::CppTypeToDataType<T>::Type();
   CreateLikeInferMeta(x, dtype, &meta_out);
diff --git a/paddle/phi/kernels/funcs/CMakeLists.txt b/paddle/phi/kernels/funcs/CMakeLists.txt
index 8b8697b6df12cf16ff038c67a8f0079a9dbef5b8..942eecae16837ad37718fef540bd73e154d5e88a 100644
--- a/paddle/phi/kernels/funcs/CMakeLists.txt
+++ b/paddle/phi/kernels/funcs/CMakeLists.txt
@@ -3,8 +3,12 @@ add_subdirectory(blas)
 add_subdirectory(lapack)
 add_subdirectory(detail)
 
-math_library(math_function DEPS blas dense_tensor tensor)
-math_library(sequence2batch)
+math_library(concat_and_split_functor DEPS dense_tensor)
 math_library(gru_compute DEPS activation_functions math_function)
 math_library(lstm_compute DEPS activation_functions)
-math_library(concat_and_split_functor DEPS dense_tensor)
+math_library(math_function DEPS blas dense_tensor tensor)
+math_library(matrix_reduce DEPS dense_tensor)
+math_library(matrix_inverse DEPS dense_tensor eigen3 blas)
+math_library(pooling DEPS dense_tensor)
+math_library(segment_pooling)
+math_library(sequence2batch)
diff --git a/paddle/phi/kernels/funcs/activation_functor.h b/paddle/phi/kernels/funcs/activation_functor.h
new file mode 100644
index 0000000000000000000000000000000000000000..c8fb54bb102d389cf005bac6d0f0edb78fb845ee
--- /dev/null
+++ b/paddle/phi/kernels/funcs/activation_functor.h
@@ -0,0 +1,1220 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <glog/logging.h>
+#include <algorithm>
+#include <memory>
+#include <string>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+
+#include <cmath>
+#ifndef _USE_MATH_DEFINES
+#define _USE_MATH_DEFINES
+#endif
+
+#include <type_traits>
+
+#include "paddle/phi/common/amp_type_traits.h"
+#include "paddle/phi/common/float16.h"
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/core/enforce.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/eigen/common.h"
+
+namespace phi {
+namespace funcs {
+enum ActBwdOpFwdDeps {
+  kNoDeps = 0x00,  // Do not need any forward input/output
+  kDepX = 0x01,    // Only need forward input X
+  kDepOut = 0x02,  // Only need forward output Out
+};
+
+template <typename T>
+struct BaseActivationFunctor {
+  using ELEMENT_TYPE = T;
+
+  using AttrPair = std::vector<std::pair<const char*, float*>>;
+
+  AttrPair GetAttrs() { return AttrPair(); }
+};
+
+template <typename T>
+struct Sine {
+  HOSTDEVICE T operator()(const T& val) const { return sin(val); }
+};
+
+template <>
+struct Sine<dtype::float16> {
+  HOSTDEVICE dtype::float16 operator()(const dtype::float16& val) const {
+    return dtype::float16(sin(static_cast<float>(val)));
+  }
+};
+
+template <typename T>
+struct Cosine {
+  HOSTDEVICE T operator()(const T& val) const { return cos(val); }
+};
+
+template <>
+struct Cosine<dtype::float16> {
+  HOSTDEVICE dtype::float16 operator()(const dtype::float16& val) const {
+    return dtype::float16(cos(static_cast<float>(val)));
+  }
+};
+
+// sine'(x) = cos(x)
+template <typename T>
+struct SinGradFunctor : public BaseActivationFunctor<T> {
+  template <typename Device,
+            typename X,
+            typename Out,
+            typename dOut,
+            typename dX>
+  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
+    dx.device(d) = dout * x.unaryExpr(Cosine<T>());
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
+};
+
+// sine(x) = sin(x)
+template <typename T>
+struct SinFunctor : public BaseActivationFunctor<T> {
+  template <typename Device, typename X, typename Out>
+  void operator()(Device d, X x, Out out) const {
+    out.device(d) = x.unaryExpr(Sine<T>());
+  }
+};
+
+// cosine'(x) = -sin(x)
+template <typename T>
+struct CosGradFunctor : public BaseActivationFunctor<T> {
+  template <typename Device,
+            typename X,
+            typename Out,
+            typename dOut,
+            typename dX>
+  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
+    dx.device(d) = -dout * x.unaryExpr(Sine<T>());
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
+};
+
+// cosine(x) = cos(x)
+template <typename T>
+struct CosFunctor : public BaseActivationFunctor<T> {
+  template <typename Device, typename X, typename Out>
+  void operator()(Device d, X x, Out out) const {
+    out.device(d) = x.unaryExpr(Cosine<T>());
+  }
+};
+
+template <typename T>
+struct Tangent {
+  HOSTDEVICE T operator()(const T& val) const { return tan(val); }
+};
+
+template <>
+struct Tangent<dtype::float16> {
+  HOSTDEVICE dtype::float16 operator()(const dtype::float16& val) const {
+    return dtype::float16(tan(static_cast<float>(val)));
+  }
+};
+
+// Tangent'(x) = -Tangent(x)
+template <typename T>
+struct TanGradFunctor : public BaseActivationFunctor<T> {
+  template <typename Device,
+            typename X,
+            typename Out,
+            typename dOut,
+            typename dX>
+  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
+    dx.device(d) = dout / x.unaryExpr(Cosine<T>()).square();
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
+};
+
+// Tangent(x) = tan(x)
+template <typename T>
+struct TanFunctor : public BaseActivationFunctor<T> {
+  template <typename Device, typename X, typename Out>
+  void operator()(Device d, X x, Out out) const {
+    out.device(d) = x.unaryExpr(Tangent<T>());
+  }
+};
+
+template <typename T>
+struct Sinh {
+  HOSTDEVICE T operator()(const T& val) const { return sinh(val); }
+};
+
+template <>
+struct Sinh<dtype::float16> {
+  HOSTDEVICE dtype::float16 operator()(const dtype::float16& val) const {
+    return dtype::float16(sinhf(static_cast<float>(val)));
+  }
+};
+
+template <typename T>
+struct Cosh {
+  HOSTDEVICE T operator()(const T& val) const { return cosh(val); }
+};
+
+template <>
+struct Cosh<dtype::float16> {
+  HOSTDEVICE dtype::float16 operator()(const dtype::float16& val) const {
+    return dtype::float16(coshf(static_cast<float>(val)));
+  }
+};
+
+// sinh(x) = sinh(x)
+template <typename T>
+struct SinhFunctor : public BaseActivationFunctor<T> {
+  template <typename Device, typename X, typename Out>
+  void operator()(Device d, X x, Out out) const {
+    out.device(d) = x.unaryExpr(Sinh<T>());
+  }
+};
+
+// cosh(x) = cosh(x)
+template <typename T>
+struct CoshFunctor : public BaseActivationFunctor<T> {
+  template <typename Device, typename X, typename Out>
+  void operator()(Device d, X x, Out out) const {
+    out.device(d) = x.unaryExpr(Cosh<T>());
+  }
+};
+
+// sinh'(x) = cosh(x)
+template <typename T>
+struct SinhGradFunctor : public BaseActivationFunctor<T> {
+  template <typename Device,
+            typename X,
+            typename Out,
+            typename dOut,
+            typename dX>
+  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
+    dx.device(d) = dout * x.unaryExpr(Cosh<T>());
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
+};
+
+// cosh'(x) = sinh(x)
+template <typename T>
+struct CoshGradFunctor : public BaseActivationFunctor<T> {
+  template <typename Device,
+            typename X,
+            typename Out,
+            typename dOut,
+            typename dX>
+  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
+    dx.device(d) = dout * x.unaryExpr(Sinh<T>());
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
+};
+
+template <typename T>
+struct Acos {
+  HOSTDEVICE T operator()(const T& val) const { return acos(val); }
+};
+
+template <>
+struct Acos<dtype::float16> {
+  HOSTDEVICE dtype::float16 operator()(const dtype::float16& val) const {
+    return dtype::float16(acos(static_cast<float>(val)));
+  }
+};
+
+// Acos(x) = acos(x)
+template <typename T>
+struct AcosFunctor : public BaseActivationFunctor<T> {
+  template <typename Device, typename X, typename Out>
+  void operator()(Device d, X x, Out out) const {
+    out.device(d) = x.unaryExpr(Acos<T>());
+  }
+};
+
+// acos'(x) = -1/sqrt(1-x^2)
+template <typename T>
+struct AcosGradFunctor : public BaseActivationFunctor<T> {
+  template <typename Device,
+            typename X,
+            typename Out,
+            typename dOut,
+            typename dX>
+  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
+    dx.device(d) =
+        -dout * static_cast<T>(1) / (static_cast<T>(1) - x.square()).sqrt();
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
+};
+
+template <typename T>
+struct Asin {
+  HOSTDEVICE T operator()(const T& val) const { return asin(val); }
+};
+
+template <>
+struct Asin<dtype::float16> {
+  HOSTDEVICE dtype::float16 operator()(const dtype::float16& val) const {
+    return dtype::float16(asin(static_cast<float>(val)));
+  }
+};
+
+// Asin(x) = asin(x)
+template <typename T>
+struct AsinFunctor : public BaseActivationFunctor<T> {
+  template <typename Device, typename X, typename Out>
+  void operator()(Device d, X x, Out out) const {
+    out.device(d) = x.unaryExpr(Asin<T>());
+  }
+};
+
+// asin'(x) = 1/sqrt(1-x^2)
+template <typename T>
+struct AsinGradFunctor : public BaseActivationFunctor<T> {
+  template <typename Device,
+            typename X,
+            typename Out,
+            typename dOut,
+            typename dX>
+  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
+    dx.device(d) =
+        dout * static_cast<T>(1) / (static_cast<T>(1) - x.square()).sqrt();
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
+};
+
+template <typename T>
+struct Atan {
+  HOSTDEVICE T operator()(const T& val) const { return atan(val); }
+};
+
+template <>
+struct Atan<dtype::float16> {
+  HOSTDEVICE dtype::float16 operator()(const dtype::float16& val) const {
+    return dtype::float16(atan(static_cast<float>(val)));
+  }
+};
+
+// Atan(x) = atan(x)
+template <typename T>
+struct AtanFunctor : public BaseActivationFunctor<T> {
+  template <typename Device, typename X, typename Out>
+  void operator()(Device d, X x, Out out) const {
+    out.device(d) = x.unaryExpr(Atan<T>());
+  }
+};
+
+// atan'(x) =  1 / (1 + x^2)
+template <typename T>
+struct AtanGradFunctor : public BaseActivationFunctor<T> {
+  template <typename Device,
+            typename X,
+            typename Out,
+            typename dOut,
+            typename dX>
+  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
+    dx.device(d) = dout * static_cast<T>(1) / (static_cast<T>(1) + x.square());
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
+};
+
+template <typename T>
+struct Acosh {
+  HOSTDEVICE T operator()(const T& val) const { return acosh(val); }
+};
+
+template <>
+struct Acosh<dtype::float16> {
+  HOSTDEVICE dtype::float16 operator()(const dtype::float16& val) const {
+    return dtype::float16(acosh(static_cast<float>(val)));
+  }
+};
+
+// Acosh(x) = acosh(x)
+template <typename T>
+struct AcoshFunctor : public BaseActivationFunctor<T> {
+  template <typename Device, typename X, typename Out>
+  void operator()(Device d, X x, Out out) const {
+    out.device(d) = x.unaryExpr(Acosh<T>());
+  }
+};
+
+// acosh'(x) =  1/sqrt(x^2 - 1)
+template <typename T>
+struct AcoshGradFunctor : public BaseActivationFunctor<T> {
+  template <typename Device,
+            typename X,
+            typename Out,
+            typename dOut,
+            typename dX>
+  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
+    dx.device(d) =
+        dout * static_cast<T>(1) / (x * x - static_cast<T>(1)).sqrt();
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
+};
+
+template <typename T>
+struct Asinh {
+  HOSTDEVICE T operator()(const T& val) const { return asinh(val); }
+};
+
+template <>
+struct Asinh<dtype::float16> {
+  HOSTDEVICE dtype::float16 operator()(const dtype::float16& val) const {
+    return dtype::float16(asinh(static_cast<float>(val)));
+  }
+};
+
+// Asinh(x) = asinh(x)
+template <typename T>
+struct AsinhFunctor : public BaseActivationFunctor<T> {
+  template <typename Device, typename X, typename Out>
+  void operator()(Device d, X x, Out out) const {
+    out.device(d) = x.unaryExpr(Asinh<T>());
+  }
+};
+
+// asinh'(x) =  1/sqrt(x^2 + 1)
+template <typename T>
+struct AsinhGradFunctor : public BaseActivationFunctor<T> {
+  template <typename Device,
+            typename X,
+            typename Out,
+            typename dOut,
+            typename dX>
+  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
+    dx.device(d) =
+        dout * static_cast<T>(1) / (x.square() + static_cast<T>(1)).sqrt();
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
+};
+
+template <typename T>
+struct Atanh {
+  HOSTDEVICE T operator()(const T& val) const { return atanh(val); }
+};
+
+template <>
+struct Atanh<dtype::float16> {
+  HOSTDEVICE dtype::float16 operator()(const dtype::float16& val) const {
+    return dtype::float16(atanh(static_cast<float>(val)));
+  }
+};
+
+// Atanh(x) = atanh(x)
+template <typename T>
+struct AtanhFunctor : public BaseActivationFunctor<T> {
+  template <typename Device, typename X, typename Out>
+  void operator()(Device d, X x, Out out) const {
+    out.device(d) = x.unaryExpr(Atanh<T>());
+  }
+};
+
+// atanh'(x) =  1/(1 - x^2)
+template <typename T>
+struct AtanhGradFunctor : public BaseActivationFunctor<T> {
+  template <typename Device,
+            typename X,
+            typename Out,
+            typename dOut,
+            typename dX>
+  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
+    dx.device(d) = dout * static_cast<T>(1) / (static_cast<T>(1) - x.square());
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
+};
+
+// relu(x) = max(x, 0)
+template <typename T>
+struct ReluCPUFunctor : public BaseActivationFunctor<T> {
+  template <typename Device, typename X, typename Out>
+  void operator()(Device d, X x, Out out) const {
+    out.device(d) = x.unaryExpr([] HOSTDEVICE(T v) {
+      return v > static_cast<T>(0) ? v : static_cast<T>(0);
+    });
+  }
+};
+
+template <typename T>
+struct ReluCUDAFunctor : public BaseActivationFunctor<T> {
+  template <typename Device, typename X, typename Out>
+  void operator()(Device d, X x, Out out) const {
+    out.device(d) = x.cwiseMax(static_cast<T>(0));
+  }
+};
+
+template <typename T>
+struct ReluGradFunctor : public BaseActivationFunctor<T> {
+  template <typename Device,
+            typename X,
+            typename Out,
+            typename dOut,
+            typename dX>
+  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
+    dx.device(d) = dout * (out > static_cast<T>(0)).template cast<T>();
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() {
+    return ActBwdOpFwdDeps::kDepOut;
+  }
+};
+
+template <typename T>
+struct ReluGradGradFunctor : public BaseActivationFunctor<T> {
+  template <typename Device>
+  void operator()(const Device& dev,
+                  const DenseTensor* X,
+                  const DenseTensor* Out,
+                  const DenseTensor* ddX,
+                  DenseTensor* ddOut,
+                  DenseTensor* dOut,
+                  DenseTensor* dX) const {
+    auto* d = dev.eigen_device();
+    auto ddx = EigenVector<T>::Flatten(
+        GET_DATA_SAFELY(ddX, "Input", "DDX", "ReluGradGrad"));
+    auto out = EigenVector<T>::Flatten(
+        GET_DATA_SAFELY(Out, "Output", "Out", "ReluGradGrad"));
+    if (ddOut) {
+      auto ddout = EigenVector<T>::Flatten(
+          GET_DATA_SAFELY(ddOut, "Output", "DDOut", "ReluGradGrad"));
+      ddout.device(*d) = ddx * (out > static_cast<T>(0)).template cast<T>();
+    }
+  }
+  static constexpr ActBwdOpFwdDeps FwdDeps() {
+    return ActBwdOpFwdDeps::kDepOut;
+  }
+};
+
+// tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x))
+template <typename T>
+struct TanhFunctor : public BaseActivationFunctor<T> {
+  template <typename Device, typename X, typename Out>
+  void operator()(Device d, X x, Out out) const {
+    out.device(d) = x.tanh();
+  }
+};
+
+template <typename T>
+struct TanhGradFunctor : public BaseActivationFunctor<T> {
+  template <typename Device,
+            typename X,
+            typename Out,
+            typename dOut,
+            typename dX>
+  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
+    dx.device(d) = dout * (static_cast<T>(1) - out * out);
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() {
+    return ActBwdOpFwdDeps::kDepOut;
+  }
+};
+
+template <typename T>
+struct TanhGradGradFunctor : public BaseActivationFunctor<T> {
+  template <typename Device>
+  void operator()(const Device& dev,
+                  const DenseTensor* Out,
+                  const DenseTensor* ddX,
+                  const DenseTensor* dOut,
+                  DenseTensor* dOutNew,
+                  DenseTensor* ddOut) const {
+    auto* d = dev.eigen_device();
+    auto ddx = EigenVector<T>::Flatten(
+        GET_DATA_SAFELY(ddX, "Input", "DDX", "TanhGradGrad"));
+    auto out = EigenVector<T>::Flatten(
+        GET_DATA_SAFELY(Out, "Input", "Out", "TanhGradGrad"));
+    // tanh grad grad : ddout = (1 - out^2) * ddx, dout = - (dout_old * 2 * out
+    // * ddx)
+    if (dOutNew) {
+      auto dout = EigenVector<T>::Flatten(
+          GET_DATA_SAFELY(dOut, "Input", "DOut", "TanhGradGrad"));
+      auto dout_new = EigenVector<T>::Flatten(
+          GET_DATA_SAFELY(dOutNew, "Output", "DOutNew", "TanhGradGrad"));
+      dout_new.device(*d) =
+          static_cast<T>(-1) * dout * static_cast<T>(2) * out * ddx;
+    }
+    if (ddOut) {
+      auto ddout = EigenVector<T>::Flatten(
+          GET_DATA_SAFELY(ddOut, "Output", "DDOut", "TanhGradGrad"));
+      ddout.device(*d) = (static_cast<T>(1) - out * out) * ddx;
+    }
+  }
+  static constexpr ActBwdOpFwdDeps FwdDeps() {
+    return ActBwdOpFwdDeps::kDepOut;
+  }
+};
+/*
+    Out
+    DOut                            D_Dout
+    DDx     -> TanhTripleGrad ->    D_DDx
+    D_DDout                         d_OutNew
+    D_Dout_new
+
+    D_Dout = (-2) * Out * DDx * D_Dout_new
+    D_DDx = (1-Out^2)*D_DDout + (-2) * Out * DOut * D_Dout_new
+    D_OutNew = (-2) * Out * DDx * D_DDout + (-2) * DOut * DDx * D_Dout_new
+
+    Out, DDX, DOut, D_DDOut, D_DOut_New   // input
+    D_OutNew, D_DOut, D_DDx               // output
+*/
+template <typename T>
+struct TanhTripleGradFunctor : public BaseActivationFunctor<T> {
+  template <typename Device>
+  void operator()(const Device& dev,
+                  const DenseTensor* Out,
+                  const DenseTensor* ddX,
+                  const DenseTensor* dOut,
+                  const DenseTensor* d_DDOut,
+                  const DenseTensor* d_dOut_New,
+                  DenseTensor* d_d_Out,
+                  DenseTensor* d_Out_New,
+                  DenseTensor* d_DDx) const {
+    auto* d = dev.eigen_device();
+    auto ddx = EigenVector<T>::Flatten(
+        GET_DATA_SAFELY(ddX, "Input", "DDX", "TanhTripleGrad"));
+    auto out = EigenVector<T>::Flatten(
+        GET_DATA_SAFELY(Out, "Input", "Out", "TanhTripleGrad"));
+    auto dout = EigenVector<T>::Flatten(
+        GET_DATA_SAFELY(dOut, "Input", "DOut", "TanhTripleGrad"));
+    auto d_ddOut = EigenVector<T>::Flatten(
+        GET_DATA_SAFELY(d_DDOut, "Input", "D_DDOut", "TanhTripleGrad"));
+    auto d_dOutNew = EigenVector<T>::Flatten(
+        GET_DATA_SAFELY(d_dOut_New, "Input", "D_DOut_New", "TanhTripleGrad"));
+
+    if (d_Out_New) {
+      auto d_OutNew = EigenVector<T>::Flatten(
+          GET_DATA_SAFELY(d_Out_New, "Output", "D_OutNew", "TanhTripleGrad"));
+      d_OutNew.device(*d) = (static_cast<T>(-2) * out * ddx * d_ddOut) -
+                            (static_cast<T>(2) * dout * ddx * d_dOutNew);
+    }
+    if (d_d_Out) {
+      auto d_dOut = EigenVector<T>::Flatten(
+          GET_DATA_SAFELY(d_d_Out, "Output", "D_DOut", "TanhTripleGrad"));
+      d_dOut.device(*d) = static_cast<T>(-2) * out * ddx * d_dOutNew;
+    }
+    if (d_DDx) {
+      auto d_ddx = EigenVector<T>::Flatten(
+          GET_DATA_SAFELY(d_DDx, "Output", "D_DDx", "TanhTripleGrad"));
+      d_ddx.device(*d) = (static_cast<T>(1) - (out * out)) * d_ddOut -
+                         static_cast<T>(2) * out * dout * d_dOutNew;
+    }
+  }
+  static constexpr ActBwdOpFwdDeps FwdDeps() {
+    return ActBwdOpFwdDeps::kDepOut;
+  }
+};
+
+template <typename T>
+struct BReluFunctor : public BaseActivationFunctor<T> {
+  float t_min;
+  float t_max;
+
+  // NOTE: Explicit hides the `BaseActivationFunctor<T>::GetAttrs`
+  // not polymorphism for speed.
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"t_min", &t_min}, {"t_max", &t_max}};
+  }
+
+  template <typename Device, typename X, typename Out>
+  void operator()(Device d, X x, Out out) const {
+    out.device(d) =
+        x.cwiseMax(static_cast<T>(t_min)).cwiseMin(static_cast<T>(t_max));
+  }
+};
+
+template <typename T>
+struct BReluGradFunctor : public BaseActivationFunctor<T> {
+  float t_min;
+  float t_max;
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"t_min", &t_min}, {"t_max", &t_max}};
+  }
+  template <typename Device,
+            typename X,
+            typename Out,
+            typename dOut,
+            typename dX>
+  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
+    dx.device(d) = dout *
+                   ((x > static_cast<T>(t_min)) * (x < static_cast<T>(t_max)))
+                       .template cast<T>();
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
+};
+
+template <typename T>
+struct LeakyReluFunctor : public BaseActivationFunctor<T> {
+  float alpha;
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"alpha", &alpha}};
+  }
+
+  template <typename Device, typename X, typename Out>
+  void operator()(Device d, X x, Out out) const {
+    if (alpha < 1.f) {
+      out.device(d) = x.cwiseMax(static_cast<T>(alpha) * x);
+    } else {
+      out.device(d) = x.cwiseMin(static_cast<T>(alpha) * x);
+    }
+  }
+};
+
+template <typename T>
+struct LeakyReluGradFunctor : public BaseActivationFunctor<T> {
+  float alpha;
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"alpha", &alpha}};
+  }
+  template <typename Device,
+            typename X,
+            typename Out,
+            typename dOut,
+            typename dX>
+  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
+    auto temp1 =
+        static_cast<T>(alpha) * (x < static_cast<T>(0)).template cast<T>();
+    auto temp2 = (x >= static_cast<T>(0)).template cast<T>();
+    dx.device(d) = dout * (temp1 + temp2).template cast<T>();
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
+};
+
+template <typename T>
+struct LeakyReluGradGradFunctor : public BaseActivationFunctor<T> {
+  float alpha;
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"alpha", &alpha}};
+  }
+  template <typename Device>
+  void operator()(const Device& dev,
+                  const DenseTensor* X,
+                  const DenseTensor* Out,
+                  const DenseTensor* ddX,
+                  DenseTensor* ddOut,
+                  DenseTensor* dOut,
+                  DenseTensor* dX) const {
+    if (ddOut) {
+      auto* d = dev.eigen_device();
+      auto ddx = EigenVector<T>::Flatten(
+          GET_DATA_SAFELY(ddX, "Input", "DDX", "LeakyReluGradGrad"));
+      auto x = EigenVector<T>::Flatten(
+          GET_DATA_SAFELY(X, "Input", "X", "LeakyReluGradGrad"));
+      auto ddout = EigenVector<T>::Flatten(
+          GET_DATA_SAFELY(ddOut, "Output", "DOut", "LeakyReluGradGrad"));
+      ddout.device(*d) =
+          ddx *
+          ((x > static_cast<T>(0)).template cast<T>() +
+           static_cast<T>(alpha) * (x <= static_cast<T>(0)).template cast<T>())
+              .template cast<T>();
+    }
+  }
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
+};
+
+template <typename T>
+struct ThresholdedReluFunctor : public BaseActivationFunctor<T> {
+  float threshold;
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"threshold", &threshold}};
+  }
+
+  template <typename Device, typename X, typename Out>
+  void operator()(Device d, X x, Out out) const {
+    auto th = static_cast<T>(threshold);
+    out.device(d) = (x > th).template cast<T>() * x;
+  }
+};
+
+template <typename T>
+struct ThresholdedReluGradFunctor : public BaseActivationFunctor<T> {
+  float threshold;
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"threshold", &threshold}};
+  }
+
+  template <typename Device,
+            typename X,
+            typename Out,
+            typename dOut,
+            typename dX>
+  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
+    auto th = static_cast<T>(threshold);
+    dx.device(d) = dout * (x > th).template cast<T>();
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
+};
+
+#if defined(__NVCC__) || defined(__HIPCC__) || defined(__xpu__)
+template <typename T>
+struct CudaReluFunctor : public BaseActivationFunctor<T> {
+  T zero = static_cast<T>(0.0f);
+
+  // relu(x) = max(x, 0)
+  __device__ __forceinline__ T operator()(const T x) const {
+    return x > zero ? x : zero;
+  }
+};
+
+template <typename T>
+struct CudaReluGradFunctor : public BaseActivationFunctor<T> {
+  T zero = static_cast<T>(0.0f);
+
+  // dx = dout * (out > 0)
+  __device__ __forceinline__ T operator()(const T dout, const T out) const {
+    return out > zero ? dout : zero;
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() {
+    return ActBwdOpFwdDeps::kDepOut;
+  }
+};
+
+template <typename T>
+struct CudaCosFunctor : public BaseActivationFunctor<T> {
+  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
+
+  // cos(x) = cos(x)
+  __device__ __forceinline__ T operator()(const T arg_x) const {
+    MPType x = static_cast<MPType>(arg_x);
+    return static_cast<T>(cos(x));
+  }
+};
+
+template <typename T>
+struct CudaCosGradFunctor : public BaseActivationFunctor<T> {
+  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
+
+  // dx = dout * (-sin(x))
+  __device__ __forceinline__ T operator()(const T arg_dout,
+                                          const T arg_x) const {
+    MPType dout = static_cast<MPType>(arg_dout);
+    MPType x = static_cast<MPType>(arg_x);
+    return static_cast<T>(-dout * sin(x));
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
+};
+
+template <typename T>
+struct CudaSinFunctor : public BaseActivationFunctor<T> {
+  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
+
+  // sin(x) = sin(x)
+  __device__ __forceinline__ T operator()(const T arg_x) const {
+    MPType x = static_cast<MPType>(arg_x);
+    return static_cast<T>(sin(x));
+  }
+};
+
+template <typename T>
+struct CudaSinGradFunctor : public BaseActivationFunctor<T> {
+  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
+
+  // dx = dout * cos(x)
+  __device__ __forceinline__ T operator()(const T arg_dout,
+                                          const T arg_x) const {
+    MPType dout = static_cast<MPType>(arg_dout);
+    MPType x = static_cast<MPType>(arg_x);
+    return static_cast<T>(dout * cos(x));
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
+};
+
+template <typename T>
+struct CudaTanFunctor : public BaseActivationFunctor<T> {
+  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
+
+  // tan(x) = tan(x)
+  __device__ __forceinline__ T operator()(const T arg_x) const {
+    MPType x = static_cast<MPType>(arg_x);
+    return static_cast<T>(tan(x));
+  }
+};
+
+template <typename T>
+struct CudaTanGradFunctor : public BaseActivationFunctor<T> {
+  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
+
+  // dx = dout / cos(x)^2
+  __device__ __forceinline__ T operator()(const T arg_dout,
+                                          const T arg_x) const {
+    MPType dout = static_cast<MPType>(arg_dout);
+    MPType x = static_cast<MPType>(arg_x);
+    return static_cast<T>(dout / (cos(x) * cos(x)));
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
+};
+
+template <typename T>
+struct CudaAsinFunctor : public BaseActivationFunctor<T> {
+  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
+
+  // asin(x) = asin(x)
+  __device__ __forceinline__ T operator()(const T arg_x) const {
+    MPType x = static_cast<MPType>(arg_x);
+    return static_cast<T>(asin(x));
+  }
+};
+
+template <typename T>
+struct CudaAsinGradFunctor : public BaseActivationFunctor<T> {
+  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
+  MPType one = static_cast<MPType>(1.0f);
+
+  // dx = dout / sqrt(1 - x^2)
+  __device__ __forceinline__ T operator()(const T arg_dout,
+                                          const T arg_x) const {
+    MPType dout = static_cast<MPType>(arg_dout);
+    MPType x = static_cast<MPType>(arg_x);
+    return static_cast<T>(dout / sqrt(one - x * x));
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
+};
+
+template <typename T>
+struct CudaAcosFunctor : public BaseActivationFunctor<T> {
+  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
+
+  // acos(x) = acos(x)
+  __device__ __forceinline__ T operator()(const T arg_x) const {
+    MPType x = static_cast<MPType>(arg_x);
+    return static_cast<T>(acos(x));
+  }
+};
+
+template <typename T>
+struct CudaAcosGradFunctor : public BaseActivationFunctor<T> {
+  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
+  MPType one = static_cast<MPType>(1.0f);
+
+  // dx = -dout / sqrt(1 - x^2)
+  __device__ __forceinline__ T operator()(const T arg_dout,
+                                          const T arg_x) const {
+    MPType dout = static_cast<MPType>(arg_dout);
+    MPType x = static_cast<MPType>(arg_x);
+    return static_cast<T>(-dout / sqrt(one - x * x));
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
+};
+
+template <typename T>
+struct CudaCoshFunctor : public BaseActivationFunctor<T> {
+  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
+
+  // cosh(x) = cosh(x)
+  __device__ __forceinline__ T operator()(const T arg_x) const {
+    MPType x = static_cast<MPType>(arg_x);
+    return static_cast<T>(cosh(x));
+  }
+};
+
+template <typename T>
+struct CudaCoshGradFunctor : public BaseActivationFunctor<T> {
+  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
+
+  // dx = dout * sinh(x)
+  __device__ __forceinline__ T operator()(const T arg_dout,
+                                          const T arg_x) const {
+    MPType dout = static_cast<MPType>(arg_dout);
+    MPType x = static_cast<MPType>(arg_x);
+    return static_cast<T>(dout * sinh(x));
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
+};
+
+template <typename T>
+struct CudaSinhFunctor : public BaseActivationFunctor<T> {
+  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
+
+  // sinh(x) = sinh(x)
+  __device__ __forceinline__ T operator()(const T arg_x) const {
+    MPType x = static_cast<MPType>(arg_x);
+    return static_cast<T>(sinh(x));
+  }
+};
+
+template <typename T>
+struct CudaSinhGradFunctor : public BaseActivationFunctor<T> {
+  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
+
+  // dx = dout * cosh(x)
+  __device__ __forceinline__ T operator()(const T arg_dout,
+                                          const T arg_x) const {
+    MPType dout = static_cast<MPType>(arg_dout);
+    MPType x = static_cast<MPType>(arg_x);
+    return static_cast<T>(dout * cosh(x));
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
+};
+
+template <typename T>
+struct CudaAcoshFunctor : public BaseActivationFunctor<T> {
+  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
+
+  // Acosh(x) = acosh(x)
+  __device__ __forceinline__ T operator()(const T arg_x) const {
+    MPType x = static_cast<MPType>(arg_x);
+    return static_cast<T>(acosh(x));
+  }
+};
+
+template <typename T>
+struct CudaAcoshGradFunctor : public BaseActivationFunctor<T> {
+  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
+  MPType one = static_cast<MPType>(1.0f);
+  // dx = dout * 1 / sqrt(x^2 - 1)
+  __device__ __forceinline__ T operator()(const T arg_dout,
+                                          const T arg_x) const {
+    MPType dout = static_cast<MPType>(arg_dout);
+    MPType x = static_cast<MPType>(arg_x);
+    return static_cast<T>(dout * one / sqrt(x * x - one));
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
+};
+
+template <typename T>
+struct CudaAsinhFunctor : public BaseActivationFunctor<T> {
+  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
+
+  // Asinh(x) = asinh(x)
+  __device__ __forceinline__ T operator()(const T arg_x) const {
+    MPType x = static_cast<MPType>(arg_x);
+    return static_cast<T>(asinh(x));
+  }
+};
+
+template <typename T>
+struct CudaAsinhGradFunctor : public BaseActivationFunctor<T> {
+  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
+  MPType one = static_cast<MPType>(1.0f);
+
+  // dx = dout * 1/sqrt(x^2 + 1)
+  __device__ __forceinline__ T operator()(const T arg_dout,
+                                          const T arg_x) const {
+    MPType dout = static_cast<MPType>(arg_dout);
+    MPType x = static_cast<MPType>(arg_x);
+    return static_cast<T>(dout * one / sqrt(x * x + one));
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
+};
+
+template <typename T>
+struct CudaAtanhFunctor : public BaseActivationFunctor<T> {
+  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
+
+  // Atanh(x) = atanh(x)
+  __device__ __forceinline__ T operator()(const T arg_x) const {
+    MPType x = static_cast<MPType>(arg_x);
+    return static_cast<T>(atanh(x));
+  }
+};
+
+template <typename T>
+struct CudaAtanhGradFunctor : public BaseActivationFunctor<T> {
+  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
+  MPType one = static_cast<MPType>(1.0f);
+  // dx = dout * 1/(1- x^2)
+  __device__ __forceinline__ T operator()(const T arg_dout,
+                                          const T arg_x) const {
+    MPType dout = static_cast<MPType>(arg_dout);
+    MPType x = static_cast<MPType>(arg_x);
+    return static_cast<T>(dout * one / (one - x * x));
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
+};
+
+template <typename T>
+struct CudaAtanFunctor : public BaseActivationFunctor<T> {
+  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
+
+  // atan(x) = atan(x)
+  __device__ __forceinline__ T operator()(const T arg_x) const {
+    MPType x = static_cast<MPType>(arg_x);
+    return static_cast<T>(atan(x));
+  }
+};
+
+template <typename T>
+struct CudaAtanGradFunctor : public BaseActivationFunctor<T> {
+  T one = static_cast<T>(1.0f);
+
+  // dx = dout / (1 + x^2)
+  __device__ __forceinline__ T operator()(const T dout, const T x) const {
+    return dout / (one + x * x);
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
+};
+
+template <typename T>
+struct CudaTanhFunctor : public BaseActivationFunctor<T> {
+  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
+
+  // tanh(x) = tanh(x)
+  __device__ __forceinline__ T operator()(const T arg_x) const {
+    MPType x = static_cast<MPType>(arg_x);
+    return static_cast<T>(tanh(x));
+  }
+};
+
+template <typename T>
+struct CudaTanhGradFunctor : public BaseActivationFunctor<T> {
+  T one = static_cast<T>(1.0f);
+
+  // dx = dout * (1 - out^2)
+  __device__ __forceinline__ T operator()(const T dout, const T out) const {
+    return dout * (one - out * out);
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() {
+    return ActBwdOpFwdDeps::kDepOut;
+  }
+};
+
+template <typename T>
+struct CudaBReluFunctor : public BaseActivationFunctor<T> {
+  float t_min;
+  float t_max;
+
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"t_min", &t_min}, {"t_max", &t_max}};
+  }
+
+  // brelu(x) = min(max(x, t_min), t_max)
+  __device__ __forceinline__ T operator()(const T x) const {
+    T t_min_cast = static_cast<T>(t_min);
+    T t_max_cast = static_cast<T>(t_max);
+    T temp_max = x > t_min_cast ? x : t_min_cast;
+    T temp_min = temp_max < t_max_cast ? temp_max : t_max_cast;
+    return temp_min;
+  }
+};
+
+template <typename T>
+struct CudaBReluGradFunctor : public BaseActivationFunctor<T> {
+  T zero = static_cast<T>(0.0f);
+  float t_min;
+  float t_max;
+
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"t_min", &t_min}, {"t_max", &t_max}};
+  }
+
+  // dx = (x > t_min && x < t_max) ? dout : 0
+  __device__ __forceinline__ T operator()(const T dout, const T x) const {
+    T t_min_cast = static_cast<T>(t_min);
+    T t_max_cast = static_cast<T>(t_max);
+    return (x > t_min_cast && x < t_max_cast) ? dout : zero;
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
+};
+
+template <typename T>
+struct CudaThresholdedReluFunctor : public BaseActivationFunctor<T> {
+  T zero = static_cast<T>(0.0f);
+  float threshold;
+
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"threshold", &threshold}};
+  }
+
+  // thresholded_relu(x) = x > threshold ? x : 0
+  __device__ __forceinline__ T operator()(const T x) const {
+    return x > static_cast<T>(threshold) ? x : zero;
+  }
+};
+
+template <typename T>
+struct CudaThresholdedReluGradFunctor : public BaseActivationFunctor<T> {
+  T zero = static_cast<T>(0.0f);
+  float threshold;
+
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"threshold", &threshold}};
+  }
+
+  // dx = x > threshold ? dout : 0
+  __device__ __forceinline__ T operator()(const T dout, const T x) const {
+    return x > static_cast<T>(threshold) ? dout : zero;
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
+};
+
+template <typename T>
+struct CudaLeakyReluFunctor : public BaseActivationFunctor<T> {
+  T zero = static_cast<T>(0.0f);
+  float alpha;
+
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"alpha", &alpha}};
+  }
+
+  // leakyrelu(x) = x > 0 ? x : alpha * x
+  __device__ __forceinline__ T operator()(const T x) const {
+    return x > zero ? x : static_cast<T>(alpha) * x;
+  }
+};
+
+template <typename T>
+struct CudaLeakyReluGradFunctor : public BaseActivationFunctor<T> {
+  T zero = static_cast<T>(0.0f);
+  float alpha;
+
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"alpha", &alpha}};
+  }
+
+  // dx = dout * (x > 0 ? 1 : alpha)
+  __device__ __forceinline__ T operator()(const T dout, const T x) const {
+    return x > zero ? dout : static_cast<T>(alpha) * dout;
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
+};
+#endif
+
+}  // namespace funcs
+}  // namespace phi
diff --git a/paddle/phi/kernels/funcs/batch_norm_utils.h b/paddle/phi/kernels/funcs/batch_norm_utils.h
new file mode 100644
index 0000000000000000000000000000000000000000..21ebae8487ffc3588034a8ea5feeab8ac1c47fa8
--- /dev/null
+++ b/paddle/phi/kernels/funcs/batch_norm_utils.h
@@ -0,0 +1,143 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/phi/kernels/funcs/math_function.h"
+
+namespace phi {
+
+using Tensor = DenseTensor;
+
+template <typename DeviceContext, typename T>
+inline void ResizeToChannelFirst(const DeviceContext& context,
+                                 const Tensor* input,
+                                 Tensor* transformed_input) {
+  int dim = input->dims().size() - 2;
+  if (dim == 3) {
+    // input
+    transformed_input->Resize(input->dims());
+
+    auto in_dims_vec = vectorize(input->dims());
+    in_dims_vec[1] = input->dims()[4];
+    in_dims_vec[2] = input->dims()[1];
+    in_dims_vec[3] = input->dims()[2];
+    in_dims_vec[4] = input->dims()[3];
+    transformed_input->Resize(make_ddim(in_dims_vec));
+    transformed_input->mutable_data<T>(context.GetPlace());
+
+  } else if (dim == 2) {
+    // input
+    transformed_input->Resize(input->dims());
+
+    auto in_dims_vec = vectorize(input->dims());
+    in_dims_vec[1] = input->dims()[3];
+    in_dims_vec[2] = input->dims()[1];
+    in_dims_vec[3] = input->dims()[2];
+    transformed_input->Resize(make_ddim(in_dims_vec));
+    transformed_input->mutable_data<T>(context.GetPlace());
+  } else if (dim == 1) {
+    transformed_input->Resize(input->dims());
+
+    auto in_dims_vec = vectorize(input->dims());
+    in_dims_vec[1] = input->dims()[2];
+    in_dims_vec[2] = input->dims()[1];
+    transformed_input->Resize(make_ddim(in_dims_vec));
+    transformed_input->mutable_data<T>(context.GetPlace());
+  }
+}
+
+template <typename DeviceContext, typename T>
+inline void ResizeToChannelLast(const DeviceContext& context,
+                                const Tensor* input,
+                                Tensor* transformed_input) {
+  int dim = input->dims().size() - 2;
+  if (dim == 3) {
+    // input
+    transformed_input->Resize(input->dims());
+
+    auto in_dims_vec = vectorize(input->dims());
+    in_dims_vec[1] = input->dims()[2];
+    in_dims_vec[2] = input->dims()[3];
+    in_dims_vec[3] = input->dims()[4];
+    in_dims_vec[4] = input->dims()[1];
+    transformed_input->Resize(make_ddim(in_dims_vec));
+    transformed_input->mutable_data<T>(context.GetPlace());
+
+  } else if (dim == 2) {
+    // input
+    transformed_input->Resize(input->dims());
+
+    auto in_dims_vec = vectorize(input->dims());
+    in_dims_vec[1] = input->dims()[2];
+    in_dims_vec[2] = input->dims()[3];
+    in_dims_vec[3] = input->dims()[1];
+    transformed_input->Resize(make_ddim(in_dims_vec));
+    transformed_input->mutable_data<T>(context.GetPlace());
+  } else if (dim == 1) {
+    transformed_input->Resize(input->dims());
+
+    auto in_dims_vec = vectorize(input->dims());
+    in_dims_vec[1] = input->dims()[2];
+    in_dims_vec[2] = input->dims()[1];
+    transformed_input->Resize(make_ddim(in_dims_vec));
+    transformed_input->mutable_data<T>(context.GetPlace());
+  }
+}
+
+template <typename DeviceContext, typename T>
+inline void TransToChannelFirst(const DeviceContext& context,
+                                const Tensor* input,
+                                Tensor* transformed_input) {
+  VLOG(5) << "Why am I called?";
+  int dim = input->dims().size() - 2;
+  if (dim == 3) {
+    std::vector<int> axis{0, 4, 1, 2, 3};
+    phi::funcs::Transpose<DeviceContext, T, 5> trans5;
+    trans5(context, *input, transformed_input, axis);
+
+  } else if (dim == 2) {
+    std::vector<int> axis{0, 3, 1, 2};
+    phi::funcs::Transpose<DeviceContext, T, 4> trans4;
+    trans4(context, *input, transformed_input, axis);
+  } else if (dim == 1) {
+    std::vector<int> axis{0, 2, 1};
+    phi::funcs::Transpose<DeviceContext, T, 3> trans3;
+    trans3(context, *input, transformed_input, axis);
+  }
+}
+
+template <typename DeviceContext, typename T>
+inline void TransToChannelLast(const DeviceContext& context,
+                               const Tensor* input,
+                               Tensor* transformed_input) {
+  int dim = input->dims().size() - 2;
+  if (dim == 3) {
+    std::vector<int> axis{0, 2, 3, 4, 1};
+    phi::funcs::Transpose<DeviceContext, T, 5> trans5;
+    trans5(context, *input, transformed_input, axis);
+
+  } else if (dim == 2) {
+    std::vector<int> axis{0, 2, 3, 1};
+    phi::funcs::Transpose<DeviceContext, T, 4> trans4;
+    trans4(context, *input, transformed_input, axis);
+  } else if (dim == 1) {
+    std::vector<int> axis{0, 2, 1};
+    phi::funcs::Transpose<DeviceContext, T, 3> trans3;
+    trans3(context, *input, transformed_input, axis);
+  }
+}
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/funcs/bitwise_functors.h b/paddle/phi/kernels/funcs/bitwise_functors.h
new file mode 100644
index 0000000000000000000000000000000000000000..db1fc59f534bcf752d1b010508b4a1adcb097651
--- /dev/null
+++ b/paddle/phi/kernels/funcs/bitwise_functors.h
@@ -0,0 +1,51 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+namespace phi {
+namespace funcs {
+
+#define BITWISE_BINARY_FUNCTOR(func, expr, bool_expr)                        \
+  template <typename T>                                                      \
+  struct Bitwise##func##Functor {                                            \
+    HOSTDEVICE T operator()(const T a, const T b) const { return a expr b; } \
+  };                                                                         \
+                                                                             \
+  template <>                                                                \
+  struct Bitwise##func##Functor<bool> {                                      \
+    HOSTDEVICE bool operator()(const bool a, const bool b) const {           \
+      return a bool_expr b;                                                  \
+    }                                                                        \
+  };
+
+BITWISE_BINARY_FUNCTOR(And, &, &&)
+BITWISE_BINARY_FUNCTOR(Or, |, ||)
+BITWISE_BINARY_FUNCTOR(Xor, ^, !=)
+#undef BITWISE_BINARY_FUNCTOR
+
+template <typename T>
+struct BitwiseNotFunctor {
+  using ELEM_TYPE = T;
+  HOSTDEVICE T operator()(const T a) const { return ~a; }
+};
+
+template <>
+struct BitwiseNotFunctor<bool> {
+  using ELEM_TYPE = bool;
+  HOSTDEVICE bool operator()(const bool a) const { return !a; }
+};
+
+}  // namespace funcs
+}  // namespace phi
diff --git a/paddle/phi/kernels/funcs/broadcast_function.h b/paddle/phi/kernels/funcs/broadcast_function.h
index e9fd4cf47b834775c03e9b48ff1e3a5096228fb2..7634c2462738b2b3bdb622e851723aef23045dfd 100644
--- a/paddle/phi/kernels/funcs/broadcast_function.h
+++ b/paddle/phi/kernels/funcs/broadcast_function.h
@@ -493,16 +493,14 @@ void BroadcastKernelForDifferentVecSize(
               "%d-th output tensor`s shape is not.",
               i));
       out_vec_size = std::min(
-          paddle::platform::GetVectorizedSize<OutT>((*outs)[i]->data<OutT>()),
-          out_vec_size);
+          phi::GetVectorizedSize<OutT>((*outs)[i]->data<OutT>()), out_vec_size);
     }
   } else {
-    out_vec_size =
-        paddle::platform::GetVectorizedSize<OutT>((*outs)[0]->data<OutT>());
+    out_vec_size = phi::GetVectorizedSize<OutT>((*outs)[0]->data<OutT>());
   }
 
   for (auto *in : ins) {
-    auto temp_size = paddle::platform::GetVectorizedSize<InT>(in->data<InT>());
+    auto temp_size = phi::GetVectorizedSize<InT>(in->data<InT>());
     in_vec_size = in->dims() == (*outs)[0]->dims()
                       ? std::min(temp_size, in_vec_size)
                       : in_vec_size;
@@ -594,5 +592,25 @@ void ElementwiseCompute(const GPUContext &dev_ctx,
 
 #endif
 
+template <typename DeviceContext,
+          typename T,
+          typename Functor,
+          typename InverseFunctor>
+void DefaultElementwiseOperator(const DeviceContext &dev_ctx,
+                                const DenseTensor &x,
+                                const DenseTensor &y,
+                                DenseTensor *z,
+                                int axis = -1) {
+  auto x_dims = x.dims();
+  auto y_dims = y.dims();
+  dev_ctx.template Alloc<T>(z);
+  if (x_dims.size() >= y_dims.size()) {
+    funcs::ElementwiseCompute<Functor, T>(dev_ctx, x, y, axis, Functor(), z);
+  } else {
+    funcs::ElementwiseCompute<InverseFunctor, T>(
+        dev_ctx, x, y, axis, InverseFunctor(), z);
+  }
+}
+
 }  // namespace funcs
 }  // namespace phi
diff --git a/paddle/phi/kernels/funcs/common_shape.h b/paddle/phi/kernels/funcs/common_shape.h
index d5289dcc22cbc546acc4980403e7e4641abe39f1..139341536debf068b82704d5e7d70a3edbe045e0 100644
--- a/paddle/phi/kernels/funcs/common_shape.h
+++ b/paddle/phi/kernels/funcs/common_shape.h
@@ -140,5 +140,96 @@ inline bool CheckDims(const DDim &dims_x, const DDim &dims_y) {
   return true;
 }
 
+// Just For Matrix OP, for example:
+// x's dim = [5, 3, 2, M, M] ; y's dim = [3, 1, M, N]
+// out [5, 3, 2], which is batch_size of matrix
+static inline std::vector<int64_t> MatrixGetBroadcastBatchPortion(
+    std::vector<int64_t> x, std::vector<int64_t> y) {
+  size_t size_x = x.size();
+  size_t size_y = y.size();
+  size_t size = std::max(size_x, size_y);
+  std::vector<int64_t> batchPortion(size);
+
+  ptrdiff_t i = (ptrdiff_t)size - 1;
+  for (; i >= 0; --i) {
+    ptrdiff_t offset = size - i - 1;
+    ptrdiff_t dim_x = size_x - offset - 1;
+    ptrdiff_t dim_y = size_y - offset - 1;
+    int64_t x_size = (dim_x >= 0) ? x[dim_x] : 1;
+    int64_t y_size = (dim_y >= 0) ? y[dim_y] : 1;
+
+    PADDLE_ENFORCE_EQ(
+        (x_size == y_size || x_size == 1 || y_size == 1),
+        true,
+        phi::errors::PreconditionNotMet(
+            "The size of tensor x (%d) must match the size of tensor y "
+            "(%d) at non-singleton dimension %d.",
+            x_size,
+            y_size,
+            i));
+
+    batchPortion[i] = x_size != 1 ? x_size : y_size;
+  }
+  return batchPortion;
+}
+
+// Just For Matrix OP, for example:
+// x's dim = [5, 3, 2, M, M] ; y's dim = [3, 1, M, N]
+// out shoule be [5, 3, 2, M, M] + [5, 3, 2, M, N], and [5, 3, 2] is
+// batch_size of matrix
+static inline std::tuple<std::vector<int64_t>, std::vector<int64_t>>
+MatrixGetBroadcastDims(const DenseTensor &x, const DenseTensor &y) {
+  std::vector<int64_t> x_dims_vec = phi::vectorize(x.dims());
+  std::vector<int64_t> y_dims_vec = phi::vectorize(y.dims());
+
+  std::vector<int64_t>::const_iterator f1 = x_dims_vec.begin();
+  std::vector<int64_t>::const_iterator l1 = x_dims_vec.end() - 2;
+  std::vector<int64_t> x_dims_vec_cut(f1, l1);
+
+  std::vector<int64_t>::const_iterator f2 = y_dims_vec.begin();
+  std::vector<int64_t>::const_iterator l2 = y_dims_vec.end() - 2;
+  std::vector<int64_t> y_dims_vec_cut(f2, l2);
+
+  std::vector<int64_t> expand_batch_portion =
+      MatrixGetBroadcastBatchPortion(x_dims_vec_cut, y_dims_vec_cut);
+
+  std::vector<int64_t> x_expand_size({expand_batch_portion});
+  x_expand_size.insert(x_expand_size.end(),
+                       {x_dims_vec[static_cast<int>(x_dims_vec.size()) - 2],
+                        x_dims_vec[static_cast<int>(x_dims_vec.size()) - 1]});
+
+  std::vector<int64_t> y_expand_size({expand_batch_portion});
+  y_expand_size.insert(y_expand_size.end(),
+                       {y_dims_vec[static_cast<int>(y_dims_vec.size()) - 2],
+                        y_dims_vec[static_cast<int>(y_dims_vec.size()) - 1]});
+
+  return std::make_tuple(x_expand_size, y_expand_size);
+}
+
+inline DDim GetOutputDims(const DDim &s_dims, const DDim &l_dims) {
+  if (s_dims.size() > l_dims.size()) {
+    return GetOutputDims(l_dims, s_dims);
+  }
+  std::vector<int64_t> shapes = phi::vectorize<int64_t>(l_dims);
+  for (int i = s_dims.size() - 1, j = l_dims.size() - 1; i >= 0; --i, --j) {
+    int64_t s = s_dims[i];
+    int64_t l = l_dims[j];
+    if (s != l) {
+      if (l == 1) {
+        shapes[j] = s;
+      } else if (s != 1) {
+        PADDLE_THROW(errors::InvalidArgument(
+            "The shape of tensor a %s:%d must match shape of tensor b "
+            "%s:%d.",
+            s_dims.to_str(),
+            i,
+            l_dims.to_str(),
+            j));
+      }
+    }
+  }
+  return phi::make_ddim(shapes);
+}
+
 }  // namespace funcs
 }  // namespace phi
diff --git a/paddle/phi/kernels/funcs/compare_functors.h b/paddle/phi/kernels/funcs/compare_functors.h
new file mode 100644
index 0000000000000000000000000000000000000000..569fed7b7fbab90e77c02beecbb18b486c801a4f
--- /dev/null
+++ b/paddle/phi/kernels/funcs/compare_functors.h
@@ -0,0 +1,53 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+namespace phi {
+namespace funcs {
+
+#define COMPARE_FUNCTOR(func_name, op)                           \
+  template <typename InT, typename OutT = bool>                  \
+  struct func_name {                                             \
+    HOSTDEVICE OutT operator()(const InT a, const InT b) const { \
+      return static_cast<OutT>(a op b);                          \
+    }                                                            \
+  };
+
+COMPARE_FUNCTOR(LessThanFunctor, <)
+COMPARE_FUNCTOR(LessEqualFunctor, <=)
+COMPARE_FUNCTOR(GreaterThanFunctor, >)
+COMPARE_FUNCTOR(GreaterEqualFunctor, >=)
+#undef COMPARE_FUNCTOR
+
+template <typename InT, typename OutT = bool>
+struct EqualFunctor {
+  HOSTDEVICE OutT operator()(const InT a, const InT b) const {
+    if (std::is_floating_point<InT>::value) {
+      return static_cast<OutT>(fabs(static_cast<double>(a - b)) < 1e-8);
+    } else {
+      return static_cast<OutT>(a == b);
+    }
+  }
+};
+
+template <typename InT, typename OutT = bool>
+struct NotEqualFunctor {
+  HOSTDEVICE bool operator()(const InT a, const InT b) const {
+    return !EqualFunctor<InT, OutT>()(a, b);
+  }
+};
+
+}  // namespace funcs
+}  // namespace phi
diff --git a/paddle/phi/kernels/funcs/concat_and_split_functor.cu b/paddle/phi/kernels/funcs/concat_and_split_functor.cu
index 840c8872f50f83c2859f07be2e0e7242a74004a7..06be592dd9375902cdbd0289caa347bc11015bd2 100644
--- a/paddle/phi/kernels/funcs/concat_and_split_functor.cu
+++ b/paddle/phi/kernels/funcs/concat_and_split_functor.cu
@@ -395,6 +395,8 @@ struct ConcatFunctor<phi::GPUContext, T> {
     auto* data_alloc_released = data_alloc.release();
     auto* col_alloc_released = col_alloc.release();
     context.AddStreamCallback([data_alloc_released, col_alloc_released] {
+      VLOG(4) << "Delete cuda pinned at " << data_alloc_released;
+      VLOG(4) << "Delete cuda pinned at " << col_alloc_released;
       paddle::memory::allocation::Allocator::AllocationDeleter(
           data_alloc_released);
       paddle::memory::allocation::Allocator::AllocationDeleter(
diff --git a/paddle/phi/kernels/funcs/cpu_vec.h b/paddle/phi/kernels/funcs/cpu_vec.h
new file mode 100644
index 0000000000000000000000000000000000000000..7bb2a5fcfb35bac8a14ba236203ecc9fb68736cd
--- /dev/null
+++ b/paddle/phi/kernels/funcs/cpu_vec.h
@@ -0,0 +1,675 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <cmath>
+#include <functional>
+#include <string>
+
+#include "paddle/fluid/platform/cpu_info.h"
+#include "paddle/fluid/platform/enforce.h"
+
+#ifdef PADDLE_WITH_MKLML
+#include "paddle/fluid/platform/dynload/mklml.h"
+#endif
+
+namespace phi {
+namespace funcs {
+
+#define SIGMOID_THRESHOLD_MIN -40.0
+#define SIGMOID_THRESHOLD_MAX 13.0
+
+#define YMM_FLOAT_BLOCK 8
+#define AVX_DOUBLE_BLOCK 4
+#define YMM_FLOAT_BLOCK 8
+#define AVX2_DOUBLE_BLOCK 4
+#define ZMM_FLOAT_BLOCK 16
+#define AVX512_DOUBLE_BLOCK 8
+
+template <typename T>
+inline void vec_exp(const int n, const T* x, T* y) {
+  for (int i = 0; i < n; ++i) {
+    y[i] = std::exp(x[i]);
+  }
+}
+
+template <typename T>
+inline void vec_scal(const int n, const T a, T* x) {
+  for (int i = 0; i < n; ++i) {
+    x[i] = a * x[i];
+  }
+}
+
+#ifdef PADDLE_WITH_MKLML
+template <>
+inline void vec_exp<float>(const int n, const float* x, float* y) {
+  constexpr int small_enough = 128;
+  if (n < small_enough) {
+    for (int i = 0; i < n; ++i) {
+      y[i] = std::exp(x[i]);
+    }
+  } else {
+    paddle::platform::dynload::vsExp(n, x, y);
+  }
+}
+
+template <>
+inline void vec_exp<double>(const int n, const double* x, double* y) {
+  paddle::platform::dynload::vdExp(n, x, y);
+}
+
+template <>
+inline void vec_scal<float>(const int n, const float a, float* x) {
+  paddle::platform::dynload::cblas_sscal(n, a, x, 1);
+}
+
+template <>
+inline void vec_scal<double>(const int n, const double a, double* x) {
+  paddle::platform::dynload::cblas_dscal(n, a, x, 1);
+}
+#endif
+
+// MKL scal only support inplace, choose this if src and dst are not equal
+template <typename T,
+          paddle::platform::cpu_isa_t isa = paddle::platform::isa_any>
+inline void vec_scal(const int n, const T a, const T* x, T* y) {
+  for (int i = 0; i < n; ++i) {
+    y[i] = a * x[i];
+  }
+}
+
+template <>
+inline void vec_scal<float, paddle::platform::avx>(const int n,
+                                                   const float a,
+                                                   const float* x,
+                                                   float* y) {
+#ifdef __AVX__
+  constexpr int block = YMM_FLOAT_BLOCK;
+  if (n < block) {
+    vec_scal<float, paddle::platform::isa_any>(n, a, x, y);
+    return;
+  }
+  const int rest = n % block;
+  const int end = n - rest;
+  int i = 0;
+  __m256 scalar = _mm256_set1_ps(a);
+  __m256 tmp;
+#define MOVE_ONE_STEP               \
+  tmp = _mm256_loadu_ps(x + i);     \
+  tmp = _mm256_mul_ps(tmp, scalar); \
+  _mm256_storeu_ps(y + i, tmp)
+  for (i = 0; i < end; i += block) {
+    MOVE_ONE_STEP;
+  }
+#undef MOVE_ONE_STEP
+  if (rest == 0) {
+    return;
+  }
+  // can not continue move step if src and dst are inplace
+  for (i = n - rest; i < n; ++i) {
+    y[i] = a * x[i];
+  }
+#else
+  vec_scal<float, paddle::platform::isa_any>(n, a, x, y);
+#endif
+}
+
+template <>
+inline void vec_scal<float, paddle::platform::avx2>(const int n,
+                                                    const float a,
+                                                    const float* x,
+                                                    float* y) {
+  vec_scal<float, paddle::platform::avx>(n, a, x, y);
+}
+
+template <>
+inline void vec_scal<float, paddle::platform::avx512f>(const int n,
+                                                       const float a,
+                                                       const float* x,
+                                                       float* y) {
+  // TODO(TJ): enable me
+  vec_scal<float, paddle::platform::avx2>(n, a, x, y);
+}
+
+template <typename T,
+          paddle::platform::cpu_isa_t isa = paddle::platform::isa_any>
+inline void vec_sum(const size_t n, const T* x, T* s) {
+  s[0] = x[0];
+  for (size_t i = 1; i < n; ++i) {
+    s[0] += x[i];
+  }
+}
+
+template <>
+inline void vec_sum<float, paddle::platform::avx>(const size_t n,
+                                                  const float* x,
+                                                  float* s) {
+#ifdef __AVX__
+  constexpr unsigned int block = YMM_FLOAT_BLOCK;
+  if (n < block) {
+    vec_sum<float, paddle::platform::isa_any>(n, x, s);
+    return;
+  }
+
+  unsigned int i, end;
+  i = end = 0;
+  s[0] = 0.f;
+
+  end = n & ~(block - 1);
+  __m256 tmp = _mm256_setzero_ps();
+  for (i = 0; i < end; i += block) {
+    tmp = _mm256_add_ps(tmp, _mm256_loadu_ps(x + i));
+  }
+
+  __m256 hsum = _mm256_hadd_ps(tmp, tmp);
+  hsum = _mm256_add_ps(hsum, _mm256_permute2f128_ps(hsum, hsum, 0x1));
+  _mm_store_ss(
+      s,
+      _mm_hadd_ps(_mm256_castps256_ps128(hsum), _mm256_castps256_ps128(hsum)));
+
+  for (; i < n; i++) {
+    s[0] += x[i];
+  }
+#else
+  vec_sum<float, paddle::platform::isa_any>(n, x, s);
+#endif
+}
+
+template <typename T,
+          paddle::platform::cpu_isa_t isa = paddle::platform::isa_any>
+inline void vec_mul(const size_t n, const T* x, const T* y, T* z) {
+  for (size_t i = 0; i < n; ++i) {
+    z[i] = x[i] * y[i];
+  }
+}
+
+template <>
+inline void vec_mul<float, paddle::platform::avx>(const size_t n,
+                                                  const float* x,
+                                                  const float* y,
+                                                  float* z) {
+#ifdef __AVX__
+  constexpr unsigned int block = YMM_FLOAT_BLOCK;
+  if (n < block) {
+    vec_mul<float, paddle::platform::isa_any>(n, x, y, z);
+    return;
+  }
+
+  unsigned int i = 0, end = 0;
+  end = n & ~(block - 1);
+  for (i = 0; i < end; i += block) {
+    _mm256_storeu_ps(
+        z + i, _mm256_mul_ps(_mm256_loadu_ps(x + i), _mm256_loadu_ps(y + i)));
+  }
+
+  for (; i < n; i++) {
+    z[i] = x[i] * y[i];
+  }
+#else
+  vec_mul<float, paddle::platform::isa_any>(n, x, y, z);
+#endif
+}
+
+template <typename T,
+          paddle::platform::cpu_isa_t isa = paddle::platform::isa_any>
+inline void vec_mul_reduce(const size_t n, const T* x, const T* y, T* z) {
+  z[0] = x[0] * y[0];
+  for (size_t i = 1; i < n; ++i) {
+    z[0] += x[i] * y[i];
+  }
+}
+
+template <>
+inline void vec_mul_reduce<float, paddle::platform::avx>(const size_t n,
+                                                         const float* x,
+                                                         const float* y,
+                                                         float* z) {
+#ifdef __AVX__
+  constexpr unsigned int block = YMM_FLOAT_BLOCK;
+  if (n < block) {
+    vec_mul_reduce<float, paddle::platform::isa_any>(n, x, y, z);
+    return;
+  }
+
+  unsigned int i = 0, end = 0;
+  z[0] = 0.f;
+
+  end = n & ~(block - 1);
+  __m256 tmp = _mm256_setzero_ps();
+  for (i = 0; i < end; i += block) {
+    tmp = _mm256_add_ps(
+        tmp, _mm256_mul_ps(_mm256_loadu_ps(x + i), _mm256_loadu_ps(y + i)));
+  }
+
+  __m256 hsum = _mm256_hadd_ps(tmp, tmp);
+  hsum = _mm256_add_ps(hsum, _mm256_permute2f128_ps(hsum, hsum, 0x1));
+  _mm_store_ss(
+      z,
+      _mm_hadd_ps(_mm256_castps256_ps128(hsum), _mm256_castps256_ps128(hsum)));
+
+  for (; i < n; i++) {
+    z[0] += x[i] * y[i];
+  }
+#else
+  vec_mul_reduce<float, paddle::platform::isa_any>(n, x, y, z);
+#endif
+}
+
+template <typename T,
+          paddle::platform::cpu_isa_t isa = paddle::platform::isa_any>
+inline void vec_bias_sub(const int n, const T a, const T* x, T* y) {
+  for (int i = 0; i < n; ++i) {
+    y[i] = a - x[i];
+  }
+}
+
+template <>
+inline void vec_bias_sub<float, paddle::platform::avx>(const int n,
+                                                       const float a,
+                                                       const float* x,
+                                                       float* y) {
+#ifdef __AVX__
+  constexpr int block = YMM_FLOAT_BLOCK;
+  if (n < block) {
+    vec_bias_sub<float, paddle::platform::isa_any>(n, a, x, y);
+    return;
+  }
+  const int rest = n % block;
+  const int end = n - rest;
+  int i = 0;
+  __m256 bias = _mm256_set1_ps(a);
+  __m256 tmp;
+#define MOVE_ONE_STEP             \
+  tmp = _mm256_loadu_ps(x + i);   \
+  tmp = _mm256_sub_ps(bias, tmp); \
+  _mm256_storeu_ps(y + i, tmp)
+  for (i = 0; i < end; i += block) {
+    MOVE_ONE_STEP;
+  }
+#undef MOVE_ONE_STEP
+  if (rest == 0) {
+    return;
+  }
+  // can not continue move step if src and dst are inplace
+  for (i = n - rest; i < n; ++i) {
+    y[i] = a - x[i];
+  }
+#else
+  vec_bias_sub<float, paddle::platform::isa_any>(n, a, x, y);
+#endif
+}
+
+template <>
+inline void vec_bias_sub<float, paddle::platform::avx2>(const int n,
+                                                        const float a,
+                                                        const float* x,
+                                                        float* y) {
+  vec_bias_sub<float, paddle::platform::avx>(n, a, x, y);
+}
+
+template <>
+inline void vec_bias_sub<float, paddle::platform::avx512f>(const int n,
+                                                           const float a,
+                                                           const float* x,
+                                                           float* y) {
+  // TODO(TJ): enable me
+  vec_bias_sub<float, paddle::platform::avx2>(n, a, x, y);
+}
+
+// out = x*y + (1-x)*z
+template <typename T,
+          paddle::platform::cpu_isa_t isa = paddle::platform::isa_any>
+inline void vec_cross(const int n, const T* x, const T* y, const T* z, T* out) {
+  for (int i = 0; i < n; ++i) {
+    out[i] = x[i] * y[i] + (static_cast<T>(1) - x[i]) * z[i];
+  }
+}
+
+template <>
+inline void vec_cross<float, paddle::platform::avx>(
+    const int n, const float* x, const float* y, const float* z, float* out) {
+#ifdef __AVX__
+  constexpr int block = YMM_FLOAT_BLOCK;
+  if (n < block) {
+    vec_cross<float, paddle::platform::isa_any>(n, x, y, z, out);
+    return;
+  }
+  const int rest = n % block;
+  const int end = n - rest;
+  int i = 0;
+  __m256 bias = _mm256_set1_ps(1.f);
+  __m256 tmpx, tmpy, tmpz;
+  for (i = 0; i < end; i += block) {
+    tmpx = _mm256_loadu_ps(x + i);
+    tmpy = _mm256_loadu_ps(y + i);
+    tmpz = _mm256_loadu_ps(z + i);
+    tmpy = _mm256_mul_ps(tmpx, tmpy);
+    tmpx = _mm256_sub_ps(bias, tmpx);
+    tmpz = _mm256_mul_ps(tmpx, tmpz);
+    tmpz = _mm256_add_ps(tmpy, tmpz);
+    _mm256_storeu_ps(out + i, tmpz);
+  }
+  if (rest == 0) {
+    return;
+  }
+  // can not continue move step if src and dst are inplace
+  for (i = n - rest; i < n; ++i) {
+    out[i] = x[i] * y[i] + (1.f - x[i]) * z[i];
+  }
+#else
+  vec_cross<float, paddle::platform::isa_any>(n, x, y, z, out);
+#endif
+}
+
+template <>
+inline void vec_cross<float, paddle::platform::avx2>(
+    const int n, const float* x, const float* y, const float* z, float* out) {
+  vec_cross<float, paddle::platform::avx>(n, x, y, z, out);
+}
+
+template <>
+inline void vec_cross<float, paddle::platform::avx512f>(
+    const int n, const float* x, const float* y, const float* z, float* out) {
+  // TODO(TJ): enable me
+  vec_cross<float, paddle::platform::avx>(n, x, y, z, out);
+}
+
+template <typename T,
+          paddle::platform::cpu_isa_t isa = paddle::platform::isa_any>
+inline void vec_clip(const size_t n, const T a, const T* x, T* y) {
+  for (size_t i = 0; i < n; ++i) {
+    y[i] = x[i] < a ? a : x[i];
+  }
+}
+
+template <>
+inline void vec_clip<float, paddle::platform::avx>(const size_t n,
+                                                   const float a,
+                                                   const float* x,
+                                                   float* y) {
+#ifdef __AVX__
+  constexpr unsigned int block = YMM_FLOAT_BLOCK;
+  if (n < block) {
+    vec_clip<float, paddle::platform::isa_any>(n, a, x, y);
+    return;
+  }
+
+  unsigned int i = 0, end = 0;
+  end = n & ~(block - 1);
+  __m256 threshold = _mm256_set1_ps(a);
+
+  for (i = 0; i < end; i += block) {
+    _mm256_storeu_ps(y + i, _mm256_max_ps(_mm256_loadu_ps(x + i), threshold));
+  }
+
+  for (; i < n; i++) {
+    y[i] = x[i] < a ? a : x[i];
+  }
+#else
+  vec_clip<float, paddle::platform::isa_any>(n, a, x, y);
+#endif
+}
+
+template <typename T,
+          paddle::platform::cpu_isa_t isa = paddle::platform::isa_any>
+inline void vec_add_bias(const int n, const T a, const T* x, T* y) {
+  for (int i = 0; i < n; ++i) {
+    y[i] = x[i] + a;
+  }
+}
+
+template <>
+inline void vec_add_bias<float, paddle::platform::avx>(const int n,
+                                                       const float a,
+                                                       const float* x,
+                                                       float* y) {
+#ifdef __AVX__
+  constexpr int block = YMM_FLOAT_BLOCK;
+  if (n < block) {
+    vec_add_bias<float, paddle::platform::isa_any>(n, a, x, y);
+    return;
+  }
+  const int rest = n % block;
+  const int end = n - rest;
+  int i = 0;
+  __m256 bias = _mm256_set1_ps(a);
+  __m256 tmp;
+#define MOVE_ONE_STEP             \
+  tmp = _mm256_loadu_ps(x + i);   \
+  tmp = _mm256_add_ps(tmp, bias); \
+  _mm256_storeu_ps(y + i, tmp)
+  for (i = 0; i < end; i += block) {
+    MOVE_ONE_STEP;
+  }
+#undef MOVE_ONE_STEP
+  if (rest == 0) {
+    return;
+  }
+  // can not continue move step if src and dst are inplace
+  for (i = n - rest; i < n; ++i) {
+    y[i] = x[i] + a;
+  }
+#else
+  vec_add_bias<float, paddle::platform::isa_any>(n, a, x, y);
+#endif
+}
+
+template <>
+inline void vec_add_bias<float, paddle::platform::avx2>(const int n,
+                                                        const float a,
+                                                        const float* x,
+                                                        float* y) {
+  vec_add_bias<float, paddle::platform::avx>(n, a, x, y);
+}
+
+template <>
+inline void vec_add_bias<float, paddle::platform::avx512f>(const int n,
+                                                           const float a,
+                                                           const float* x,
+                                                           float* y) {
+  // TODO(TJ): enable me
+  vec_add_bias<float, paddle::platform::avx2>(n, a, x, y);
+}
+
+template <typename T,
+          paddle::platform::cpu_isa_t isa = paddle::platform::isa_any>
+inline void vec_identity(const int n, const T* x, T* y) {
+  // do nothing
+  return;
+}
+
+template <typename T,
+          paddle::platform::cpu_isa_t isa = paddle::platform::isa_any>
+inline void vec_sigmoid(const int n, const T* x, T* y) {
+  const T min = SIGMOID_THRESHOLD_MIN;
+  const T max = SIGMOID_THRESHOLD_MAX;
+  for (int i = 0; i < n; ++i) {
+    y[i] = (x[i] < min) ? min : ((x[i] > max) ? max : x[i]);
+    y[i] = static_cast<T>(0) - y[i];
+  }
+  vec_exp<T>(n, y, y);
+  for (int i = 0; i < n; ++i) {
+    y[i] = static_cast<T>(1) / (static_cast<T>(1) + y[i]);
+  }
+}
+
+template <>
+inline void vec_sigmoid<float, paddle::platform::avx>(const int n,
+                                                      const float* x,
+                                                      float* y) {
+#ifdef __AVX__
+  constexpr int block = YMM_FLOAT_BLOCK;
+  if (n < block) {
+    vec_sigmoid<float, paddle::platform::isa_any>(n, x, y);
+    return;
+  }
+  const int rest = n % block;
+  const int end = n - rest;
+  int i = 0;
+  __m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX);
+  __m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN);
+  __m256 zeros = _mm256_setzero_ps();
+  __m256 tmp;
+#define MOVE_ONE_STEP              \
+  tmp = _mm256_loadu_ps(x + i);    \
+  tmp = _mm256_max_ps(tmp, min);   \
+  tmp = _mm256_min_ps(tmp, max);   \
+  tmp = _mm256_sub_ps(zeros, tmp); \
+  _mm256_storeu_ps(y + i, tmp)
+  for (i = 0; i < end; i += block) {
+    MOVE_ONE_STEP;
+  }
+#undef MOVE_ONE_STEP
+  if (rest != 0) {
+    // can not continue move step since the src and dst address could be equal
+    const float xmin = SIGMOID_THRESHOLD_MIN;
+    const float xmax = SIGMOID_THRESHOLD_MAX;
+    for (i = n - rest; i < n; ++i) {
+      y[i] = 0.f - ((x[i] < xmin) ? xmin : ((x[i] > xmax) ? xmax : x[i]));
+    }
+  }
+
+  vec_exp<float>(n, y, y);
+
+  __m256 ones = _mm256_set1_ps(1.0f);
+#define MOVE_ONE_STEP             \
+  tmp = _mm256_loadu_ps(y + i);   \
+  tmp = _mm256_add_ps(ones, tmp); \
+  tmp = _mm256_div_ps(ones, tmp); \
+  _mm256_storeu_ps(y + i, tmp)
+  for (i = 0; i < end; i += block) {
+    MOVE_ONE_STEP;
+  }
+#undef MOVE_ONE_STEP
+  if (rest == 0) {
+    return;
+  }
+  // can not continue move step
+  for (i = n - rest; i < n; ++i) {
+    y[i] = 1.f / (1.f + y[i]);
+  }
+#else
+  vec_sigmoid<float, paddle::platform::isa_any>(n, x, y);
+#endif
+}
+
+template <>
+inline void vec_sigmoid<float, paddle::platform::avx2>(const int n,
+                                                       const float* x,
+                                                       float* y) {
+  vec_sigmoid<float, paddle::platform::avx>(n, x, y);
+}
+
+template <>
+inline void vec_sigmoid<float, paddle::platform::avx512f>(const int n,
+                                                          const float* x,
+                                                          float* y) {
+  // TODO(TJ): enable me
+  vec_sigmoid<float, paddle::platform::avx2>(n, x, y);
+}
+
+template <typename T,
+          paddle::platform::cpu_isa_t isa = paddle::platform::isa_any>
+inline void vec_tanh(const int n, const T* x, T* y) {
+  vec_scal<T, isa>(n, static_cast<T>(2), x, y);
+  vec_sigmoid<T, isa>(n, y, y);
+  vec_scal<T>(n, static_cast<T>(2), y);
+  vec_add_bias<T, isa>(n, static_cast<T>(-1), y, y);
+}
+
+// TODO(TJ): make relu clip
+template <typename T,
+          paddle::platform::cpu_isa_t isa = paddle::platform::isa_any>
+inline void vec_relu(const int n, const T* x, T* y) {
+  for (int i = 0; i < n; ++i) {
+    y[i] = x[i] > 0 ? x[i] : 0;
+  }
+}
+
+template <>
+inline void vec_relu<float, paddle::platform::avx>(const int n,
+                                                   const float* x,
+                                                   float* y) {
+#ifdef __AVX__
+  constexpr int block = YMM_FLOAT_BLOCK;
+  if (n < block * 4) {
+    vec_relu<float, paddle::platform::isa_any>(n, x, y);
+    return;
+  }
+
+  const int rest = n % block;
+  const int end = n - rest;
+  int i = 0;
+  __m256 zeros = _mm256_setzero_ps();
+  __m256 tmp;
+#define MOVE_ONE_STEP              \
+  tmp = _mm256_loadu_ps(x + i);    \
+  tmp = _mm256_max_ps(tmp, zeros); \
+  _mm256_storeu_ps(y + i, tmp)
+  for (i = 0; i < end; i += block) {
+    MOVE_ONE_STEP;
+  }
+  if (rest == 0) {
+    return;
+  }
+  i = n - block;
+  MOVE_ONE_STEP;
+#undef MOVE_ONE_STEP
+
+#else
+  vec_relu<float, paddle::platform::isa_any>(n, x, y);
+#endif
+}
+
+template <>
+inline void vec_relu<float, paddle::platform::avx2>(const int n,
+                                                    const float* x,
+                                                    float* y) {
+  vec_relu<float, paddle::platform::avx>(n, x, y);
+}
+
+template <>
+inline void vec_relu<float, paddle::platform::avx512f>(const int n,
+                                                       const float* x,
+                                                       float* y) {
+  // TODO(TJ): enable me
+  vec_relu<float, paddle::platform::avx2>(n, x, y);
+}
+
+// TODO(TJ): optimize double of sigmoid, tanh and relu if necessary
+
+template <typename T,
+          paddle::platform::cpu_isa_t isa = paddle::platform::isa_any>
+class VecActivations {
+ public:
+  std::function<void(const int, const T*, T*)> operator()(
+      const std::string& type) {
+    if (type == "sigmoid") {
+      return vec_sigmoid<T, isa>;
+    } else if (type == "relu") {
+      return vec_relu<T, isa>;
+    } else if (type == "tanh") {
+      return vec_tanh<T, isa>;
+    } else if (type == "identity" || type == "") {
+      return vec_identity<T, isa>;
+    }
+    PADDLE_THROW(phi::errors::InvalidArgument(
+        "Expected type should be one of sigmod, relu, tanh, identity. But got "
+        "not support type: %s.",
+        type));
+  }
+};
+
+}  // namespace funcs
+}  // namespace phi
diff --git a/paddle/phi/kernels/funcs/cumprod.h b/paddle/phi/kernels/funcs/cumprod.h
new file mode 100644
index 0000000000000000000000000000000000000000..ac40523c1c4378b3b8b20e9f32c59e664ee1eafc
--- /dev/null
+++ b/paddle/phi/kernels/funcs/cumprod.h
@@ -0,0 +1,52 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include "paddle/phi/core/ddim.h"
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+static void GetCumprodDimInfo(const DDim& dim,
+                              int cumprod_dim,
+                              size_t* outer_dim,
+                              size_t* mid_dim,
+                              size_t* inner_dim) {
+  PADDLE_ENFORCE_GE(
+      cumprod_dim,
+      -dim.size(),
+      phi::errors::InvalidArgument(
+          "The input dim of CumprodOp should be larger than the opposite "
+          "rank of input x which is %d.But received dim=%d",
+          -dim.size(),
+          cumprod_dim));
+  PADDLE_ENFORCE_LT(cumprod_dim,
+                    dim.size(),
+                    phi::errors::InvalidArgument(
+                        "The input dim of CumprodOp should be smaller than the "
+                        "rank of input x which is %d.But received dim=%d",
+                        dim.size(),
+                        cumprod_dim));
+  if (cumprod_dim < 0) cumprod_dim += dim.size();
+
+  *outer_dim = 1;
+  for (int i = 0; i < cumprod_dim; ++i) {
+    *outer_dim *= dim[i];
+  }
+  *mid_dim = dim[cumprod_dim];
+  *inner_dim = 1;
+  for (int i = cumprod_dim + 1; i < dim.size(); ++i) {
+    *inner_dim *= dim[i];
+  }
+}
+}  // namespace phi
diff --git a/paddle/phi/kernels/funcs/data_type_transform.h b/paddle/phi/kernels/funcs/data_type_transform.h
new file mode 100644
index 0000000000000000000000000000000000000000..ad7f2aa192ce4385d537f232398352b4f9d0aaa6
--- /dev/null
+++ b/paddle/phi/kernels/funcs/data_type_transform.h
@@ -0,0 +1,58 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/phi/common/data_type.h"
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/kernels/cast_kernel.h"
+
+namespace phi {
+namespace funcs {
+
+template <typename Context>
+phi::DenseTensor TransDataType(const Context& dev_ctx,
+                               const phi::DenseTensor& x,
+                               DataType dtype) {
+  VLOG(3) << "TransDataType "
+          << "src type:" << x.dtype() << "; dst typoe: " << dtype;
+
+  switch (x.dtype()) {
+    case DataType::FLOAT32:
+      return phi::Cast<float>(dev_ctx, x, dtype);
+    case DataType::FLOAT64:
+      return phi::Cast<double>(dev_ctx, x, dtype);
+    case DataType::INT32:
+      return phi::Cast<int32_t>(dev_ctx, x, dtype);
+    case DataType::INT64:
+      return phi::Cast<int64_t>(dev_ctx, x, dtype);
+    case DataType::FLOAT16:
+      return phi::Cast<phi::dtype::float16>(dev_ctx, x, dtype);
+    case DataType::BFLOAT16:
+      return phi::Cast<phi::dtype::bfloat16>(dev_ctx, x, dtype);
+    case DataType::BOOL:
+      return phi::Cast<bool>(dev_ctx, x, dtype);
+    case DataType::INT16:
+      return phi::Cast<int16_t>(dev_ctx, x, dtype);
+    case DataType::UINT8:
+      return phi::Cast<uint8_t>(dev_ctx, x, dtype);
+    default:
+      PADDLE_THROW(phi::errors::Unimplemented(
+          "Data type (%s) is not supported when casting data type.",
+          x.dtype()));
+  }
+}
+
+}  // namespace funcs
+}  // namespace phi
diff --git a/paddle/phi/kernels/funcs/diag_functor.h b/paddle/phi/kernels/funcs/diag_functor.h
index a806d1583a0b363d44aa9f0cf3b3a64f4a8ea6ff..1862f5ec91b4bc2aeafa9263d89fcbc5b8dd600b 100644
--- a/paddle/phi/kernels/funcs/diag_functor.h
+++ b/paddle/phi/kernels/funcs/diag_functor.h
@@ -14,6 +14,14 @@
 
 #pragma once
 
+#include "paddle/phi/common/type_traits.h"
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/core/hostdevice.h"
+#include "paddle/phi/kernels/full_kernel.h"
+#include "paddle/phi/kernels/funcs/for_range.h"
+
+// TODO(paddle-dev): Remove this file when we can call related Kernel directly
+
 namespace phi {
 namespace funcs {
 
@@ -25,5 +33,96 @@ inline int ComputeStride(int axis, phi::DDim dims) {
   return size;
 }
 
+template <typename T, typename ValueType>
+struct DiagAndFillFunctor {
+  DiagAndFillFunctor(const int m,
+                     const int n,
+                     const int num_lower_diags,
+                     const int num_upper_diags,
+                     const ValueType* scale,
+                     const T* input,
+                     T* output)
+      : m_(m),
+        n_(n),
+        num_lower_diags_(num_lower_diags),
+        num_upper_diags_(num_upper_diags),
+        scale_(scale),
+        input_(input),
+        output_(output) {}
+
+  HOSTDEVICE void operator()(size_t index) const {
+    const int col = index % n_;
+    const int row = (index / n_) % m_;
+    const int band_start = (num_lower_diags_ < 0 ? 0 : row - num_lower_diags_);
+    const int band_end =
+        (num_upper_diags_ < 0 ? n_ : row + num_upper_diags_ + 1);
+    if (col < band_start || col >= band_end) {
+      output_[index] = input_[index];
+    } else if (col == band_end - 1) {
+      output_[index] = static_cast<T>(scale_[index % m_]);
+    } else {
+      output_[index] = input_[index];
+    }
+  }
+
+ private:
+  const int m_, n_, num_lower_diags_, num_upper_diags_;
+  const ValueType* scale_;
+  const T* input_;
+  T* output_;
+};
+
+template <typename T, typename ValueType, typename Context>
+DenseTensor DiagFill(const Context& dev_ctx,
+                     const int m,
+                     const int n,
+                     const int num_lower_diags,
+                     const int num_upper_diags,
+                     const DenseTensor& scale,
+                     const DenseTensor& input) {
+  DenseTensor out;
+  out.Resize(input.dims());
+  dev_ctx.template Alloc<T>(&out);
+  funcs::ForRange<Context> for_range(dev_ctx, input.numel());
+  DiagAndFillFunctor<T, ValueType> diag_and_copy_functor(
+      m,
+      n,
+      num_lower_diags,
+      num_upper_diags,
+      scale.data<ValueType>(),
+      input.data<T>(),
+      out.data<T>());
+  for_range(diag_and_copy_functor);
+  return out;
+}
+
+template <typename T, typename Context>
+DenseTensor BatchDiag(const Context& dev_ctx, const DenseTensor& x, int batch) {
+  DenseTensor out;
+  auto* x_data = x.data<phi::dtype::Real<T>>();
+  auto numel = x.numel();
+  out.Resize(x.dims());
+  auto* out_data = dev_ctx.template HostAlloc<phi::dtype::Real<T>>(
+      &out, static_cast<size_t>(numel * sizeof(phi::dtype::Real<T>)));
+
+  auto x_dims = x.dims();
+  int num_dims = x_dims.size();
+  std::vector<int> out_shape;
+
+  for (int i = 0; i < num_dims - 1; ++i) {
+    out_shape.push_back(x.dims()[i]);
+  }
+  out.Resize(phi::make_ddim(out_shape));
+  int order = x.dims()[num_dims - 1];
+  int stride_out = order * order;
+  int stride_in = order + 1;
+  for (int i = 0; i < batch; ++i) {
+    for (int j = 0; j < order; ++j) {
+      out_data[i * order + j] = x_data[stride_out * i + stride_in * j];
+    }
+  }
+  return out;
+}
+
 }  // namespace funcs
 }  // namespace phi
diff --git a/paddle/phi/kernels/funcs/distribution_helper.h b/paddle/phi/kernels/funcs/distribution_helper.h
index f0793fb9d27db68f22bc2bc27978844072c61616..acc31d68b785908a88fea14e5826ccfde9ff5970 100644
--- a/paddle/phi/kernels/funcs/distribution_helper.h
+++ b/paddle/phi/kernels/funcs/distribution_helper.h
@@ -23,11 +23,13 @@ limitations under the License. */
 
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/backends/gpu/gpu_info.h"
+#include "paddle/phi/common/amp_type_traits.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/generator.h"
 #include "paddle/phi/core/hostdevice.h"
 
 #if defined(__NVCC__) || defined(__HIPCC__)
+#include "paddle/phi/kernels/funcs/index_impl.cu.h"
 #include "paddle/phi/kernels/primitive/kernel_primitives.h"
 #endif
 
@@ -254,11 +256,13 @@ __global__ void DistributionKernel(size_t size,
   using SType = hiprandStatePhilox4_32_10_t;
 #endif
   size_t total_thread = GRID_NUM_X * BLOCK_NUM_X;
-  T args[kCount];
+  using MT = typename phi::dtype::MPTypeTrait<T>::Type;
+  MT args[kCount];
   T result[kCount];
   for (size_t i = idx; i < size; i += total_thread * kCount) {
-    kps::ElementwiseRandom<SType, T, kCount, 1, DistOp>(&args[0], dist, &state);
-    kps::ElementwiseUnary<T, T, kCount, 1, 1, TransformOp>(
+    kps::ElementwiseRandom<SType, MT, kCount, 1, DistOp>(
+        &args[0], dist, &state);
+    kps::ElementwiseUnary<MT, T, kCount, 1, 1, TransformOp>(
         &result[0], &args[0], trans);
     kps::WriteData<T, T, kCount, 1, 1, true>(
         out_data + i, &result[0], size - i, 1, stride, 1);
diff --git a/paddle/phi/kernels/funcs/elementwise_base.h b/paddle/phi/kernels/funcs/elementwise_base.h
index 235dbdd40f6b7db5524251aec80b92cdc22aa819..332ec0b0312da96ca21b2c616440afc57a62edc2 100644
--- a/paddle/phi/kernels/funcs/elementwise_base.h
+++ b/paddle/phi/kernels/funcs/elementwise_base.h
@@ -23,9 +23,9 @@ limitations under the License. */
 #include "paddle/phi/kernels/funcs/math_function.h"
 
 #if defined(__NVCC__) || defined(__HIPCC__) || defined(__xpu__)
-#include "paddle/fluid/platform/aligned_vector.h"
 #include "paddle/fluid/platform/function_traits.h"
 #include "paddle/phi/backends/gpu/gpu_launch_config.h"
+#include "paddle/phi/kernels/funcs/aligned_vector.h"
 #include "paddle/phi/kernels/primitive/kernel_primitives.h"
 
 #define HOSTDEVICE __host__ __device__
@@ -546,9 +546,8 @@ struct VecSizeGetter {
                                const ArgsT &args,
                                int *vec_size) {
     using Type = std::tuple_element_t<Index, ArgsT>;
-    *vec_size = std::min<int>(
-        *vec_size,
-        paddle::platform::GetVectorizedSize(ins[Index]->data<Type>()));
+    *vec_size = std::min<int>(*vec_size,
+                              phi::GetVectorizedSize(ins[Index]->data<Type>()));
   }
 };
 
@@ -563,8 +562,8 @@ int GetVectorizedSizeForTensors(const std::vector<const DenseTensor *> &ins,
   // The Arg VecSize=1 is to match the Unroller template.
   Unroller<VecSizeGetter, 1, Arity>::step(ins, arg, &vec_size);
   for (auto iter = outs.begin(); iter != outs.end(); ++iter) {
-    vec_size = std::min<int>(
-        vec_size, paddle::platform::GetVectorizedSize((*iter)->data<OutT>()));
+    vec_size =
+        std::min<int>(vec_size, phi::GetVectorizedSize((*iter)->data<OutT>()));
   }
   return vec_size;
 }
diff --git a/paddle/phi/kernels/funcs/elementwise_functor.h b/paddle/phi/kernels/funcs/elementwise_functor.h
index c0a3985cd1713b3bf66b278de0fd17cb408e0f63..ac262fe2d571e587e3bdfa6a2d4e58bd5b865e68 100644
--- a/paddle/phi/kernels/funcs/elementwise_functor.h
+++ b/paddle/phi/kernels/funcs/elementwise_functor.h
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #pragma once
 
+#include "paddle/phi/common/complex.h"
 #include "paddle/phi/common/float16.h"
 #include "paddle/phi/core/enforce.h"
 #include "paddle/phi/core/hostdevice.h"
@@ -66,6 +67,11 @@ struct InverseMultiplyFunctor<bool> {
   }
 };
 
+template <typename T>
+struct IsZeroFunctor {
+  HOSTDEVICE bool operator()(T x) const { return x == static_cast<T>(0); }
+};
+
 // Divide
 #define DIV_ERROR_INFO                                             \
   "InvalidArgumentError: Integer division by zero encountered in " \
@@ -92,5 +98,329 @@ struct InverseDivideFunctor {
   inline HOSTDEVICE T operator()(const T a, const T b) const { return b / a; }
 };
 
+template <typename T>
+using ComplexType = phi::dtype::complex<T>;
+
+template <typename InT, typename OutT>
+struct DivGradXYFunctor {
+  inline HOSTDEVICE phi::Array<OutT, 2> operator()(const InT a,
+                                                   const InT b,
+                                                   const InT c) {
+    // dx = dout / y
+    // dy = - dout * out / y
+    phi::Array<OutT, 2> outs;
+    outs[0] = a / c;
+    outs[1] = -a * b / c;
+    return outs;
+  }
+};
+
+template <typename InT, typename OutT>
+struct DivGradXYFunctor<ComplexType<InT>, ComplexType<OutT>> {
+  inline HOSTDEVICE phi::Array<ComplexType<OutT>, 2> operator()(
+      const ComplexType<InT> a,
+      const ComplexType<InT> b,
+      const ComplexType<InT> c) {
+    phi::Array<ComplexType<OutT>, 2> outs;
+    ComplexType<InT> c_conj(c.real, -c.imag);
+    ComplexType<InT> out_div_c_conj((b / c).real, -(b / c).imag);
+    outs[0] = a / c_conj;
+    outs[1] = -a * out_div_c_conj;
+    return outs;
+  }
+};
+
+// Float div grad
+template <typename T>
+struct DivGradXFunctor {
+  inline HOSTDEVICE T operator()(const T a, const T b) const { return a / b; }
+};
+
+// ComplexType div grad
+template <typename T>
+struct DivGradXFunctor<ComplexType<T>> {
+  inline HOSTDEVICE ComplexType<T> operator()(const ComplexType<T> a,
+                                              const ComplexType<T> b) const {
+    ComplexType<T> b_conj(b.real, -b.imag);
+    return a / b_conj;
+  }
+};
+
+// Float mul and div
+template <typename T>
+struct DivGradYFunctor {
+  inline HOSTDEVICE T operator()(const T a, const T b, const T c) const {
+    return -a * b / c;
+  }
+};
+
+// ComplexType mul and div
+template <typename T>
+struct DivGradYFunctor<ComplexType<T>> {
+  inline HOSTDEVICE ComplexType<T> operator()(const ComplexType<T> a,
+                                              const ComplexType<T> b,
+                                              const ComplexType<T> c) const {
+    ComplexType<T> out_div_c_conj((b / c).real, -(b / c).imag);
+    return -a * out_div_c_conj;
+  }
+};
+// Fmin
+template <typename T>
+struct FMinFunctor {
+  inline HOSTDEVICE T operator()(const T a, const T b) const {
+    return std::fmin(a, b);
+  }
+};
+
+template <>
+struct FMinFunctor<dtype::float16> {
+  inline HOSTDEVICE dtype::float16 operator()(const dtype::float16 a,
+                                              const dtype::float16 b) const {
+    float float_a = static_cast<float>(a);
+    float float_b = static_cast<float>(b);
+    auto result = std::fmin(float_a, float_b);
+    return static_cast<dtype::float16>(result);
+  }
+};
+
+template <>
+struct FMinFunctor<int> {
+  inline HOSTDEVICE int operator()(const int a, const int b) const {
+    float float_a = static_cast<float>(a);
+    float float_b = static_cast<float>(b);
+    auto result = std::fmin(float_a, float_b);
+    return std::lrint(result);
+  }
+};
+
+template <>
+struct FMinFunctor<int64_t> {
+  inline HOSTDEVICE int64_t operator()(const int64_t a, const int64_t b) const {
+    double double_a = static_cast<double>(a);
+    double double_b = static_cast<double>(b);
+    auto result = std::fmin(double_a, double_b);
+    return std::llrint(result);
+  }
+};
+
+// Fmax
+template <typename T>
+struct FMaxFunctor {
+  inline HOSTDEVICE T operator()(const T a, const T b) const {
+    return std::fmax(a, b);
+  }
+};
+
+template <>
+struct FMaxFunctor<dtype::float16> {
+  inline HOSTDEVICE dtype::float16 operator()(const dtype::float16 a,
+                                              const dtype::float16 b) const {
+    float float_a = static_cast<float>(a);
+    float float_b = static_cast<float>(b);
+    auto result = std::fmax(float_a, float_b);
+    return static_cast<dtype::float16>(result);
+  }
+};
+
+template <>
+struct FMaxFunctor<int> {
+  inline HOSTDEVICE int operator()(const int a, const int b) const {
+    float float_a = static_cast<float>(a);
+    float float_b = static_cast<float>(b);
+    auto result = std::fmax(float_a, float_b);
+    return std::lrint(result);
+  }
+};
+
+template <>
+struct FMaxFunctor<int64_t> {
+  inline HOSTDEVICE int64_t operator()(const int64_t a, const int64_t b) const {
+    double double_a = static_cast<double>(a);
+    double double_b = static_cast<double>(b);
+    auto result = std::fmax(double_a, double_b);
+    return std::llrint(result);
+  }
+};
+
+template <typename T>
+struct FMaxGradDx {
+  HOSTDEVICE T operator()(T x, T y, T out, T dout) const {
+    return dout * static_cast<T>((x >= y) || isnan(y));
+  }
+};
+
+template <>
+struct FMaxGradDx<dtype::float16> {
+  HOSTDEVICE dtype::float16 operator()(dtype::float16 x,
+                                       dtype::float16 y,
+                                       dtype::float16 out,
+                                       dtype::float16 dout) const {
+    return dout * static_cast<dtype::float16>((x >= y) || dtype::isnan(y));
+  }
+};
+
+template <>
+struct FMaxGradDx<int> {
+  HOSTDEVICE int operator()(int x, int y, int out, int dout) const {
+    return dout * static_cast<int>((x >= y));
+  }
+};
+
+template <>
+struct FMaxGradDx<int64_t> {
+  HOSTDEVICE int64_t operator()(int64_t x,
+                                int64_t y,
+                                int64_t out,
+                                int64_t dout) const {
+    return dout * static_cast<int64_t>((x >= y));
+  }
+};
+
+template <typename T>
+struct FMaxGradDy {
+  HOSTDEVICE T operator()(T x, T y, T out, T dout) const {
+    return dout * static_cast<T>(!((x >= y) || isnan(y)));
+  }
+};
+
+template <>
+struct FMaxGradDy<dtype::float16> {
+  HOSTDEVICE dtype::float16 operator()(dtype::float16 x,
+                                       dtype::float16 y,
+                                       dtype::float16 out,
+                                       dtype::float16 dout) const {
+    return dout * static_cast<dtype::float16>(!((x >= y) || dtype::isnan(y)));
+  }
+};
+
+template <>
+struct FMaxGradDy<int64_t> {
+  HOSTDEVICE int64_t operator()(int64_t x,
+                                int64_t y,
+                                int64_t out,
+                                int64_t dout) const {
+    return dout * static_cast<int64_t>(!((x >= y)));
+  }
+};
+
+template <>
+struct FMaxGradDy<int> {
+  HOSTDEVICE int operator()(int x, int y, int out, int dout) const {
+    return dout * static_cast<int>(!((x >= y)));
+  }
+};
+
+template <typename T>
+struct FMinGradDx {
+  HOSTDEVICE T operator()(T x, T y, T out, T dout) const {
+    return dout * static_cast<T>((x <= y) || isnan(y));
+  }
+};
+
+template <>
+struct FMinGradDx<dtype::float16> {
+  HOSTDEVICE dtype::float16 operator()(dtype::float16 x,
+                                       dtype::float16 y,
+                                       dtype::float16 out,
+                                       dtype::float16 dout) const {
+    return dout * static_cast<dtype::float16>((x <= y) || dtype::isnan(y));
+  }
+};
+
+template <>
+struct FMinGradDx<int> {
+  HOSTDEVICE int operator()(int x, int y, int out, int dout) const {
+    return dout * static_cast<int>((x <= y));
+  }
+};
+
+template <>
+struct FMinGradDx<int64_t> {
+  HOSTDEVICE int64_t operator()(int64_t x,
+                                int64_t y,
+                                int64_t out,
+                                int64_t dout) const {
+    return dout * static_cast<int64_t>((x <= y));
+  }
+};
+
+template <typename T>
+struct FMinGradDy {
+  HOSTDEVICE T operator()(T x, T y, T out, T dout) const {
+    return dout * static_cast<T>(!((x <= y) || isnan(y)));
+  }
+};
+
+template <>
+struct FMinGradDy<dtype::float16> {
+  HOSTDEVICE dtype::float16 operator()(dtype::float16 x,
+                                       dtype::float16 y,
+                                       dtype::float16 out,
+                                       dtype::float16 dout) const {
+    return dout * static_cast<dtype::float16>(!((x <= y) || dtype::isnan(y)));
+  }
+};
+
+template <>
+struct FMinGradDy<int> {
+  HOSTDEVICE int operator()(int x, int y, int out, int dout) const {
+    return dout * static_cast<int>(!((x <= y)));
+  }
+};
+
+template <>
+struct FMinGradDy<int64_t> {
+  HOSTDEVICE int64_t operator()(int64_t x,
+                                int64_t y,
+                                int64_t out,
+                                int64_t dout) const {
+    return dout * static_cast<int64_t>(!((x <= y)));
+  }
+};
+
+template <typename T>
+struct MultiplyGradFunctor {
+  inline HOSTDEVICE T operator()(const T a, const T b) const { return a * b; }
+};
+template <typename T>
+struct MultiplyGradFunctor<ComplexType<T>> {
+  inline HOSTDEVICE ComplexType<T> operator()(const ComplexType<T> a,
+                                              const ComplexType<T> b) const {
+    ComplexType<T> b_conj(b.real, -b.imag);
+    return a * b_conj;
+  }
+};
+
+template <typename InT, typename OutT>
+struct MultiplyGradXYFunctor {
+  inline HOSTDEVICE phi::Array<OutT, 2> operator()(const InT a,
+                                                   const InT b,
+                                                   const InT c) {
+    phi::Array<OutT, 2> outs;
+    // dx = dout * y
+    outs[0] = a * b;
+    // dy = dout * x
+    outs[1] = a * c;
+    return outs;
+  }
+};
+
+template <typename InT, typename OutT>
+struct MultiplyGradXYFunctor<ComplexType<InT>, ComplexType<OutT>> {
+  inline HOSTDEVICE phi::Array<ComplexType<OutT>, 2> operator()(
+      const ComplexType<InT> a,
+      const ComplexType<InT> b,
+      const ComplexType<InT> c) {
+    phi::Array<ComplexType<OutT>, 2> outs;
+    // dx = dout * y
+    ComplexType<InT> b_conj(b.real, -b.imag);
+    outs[0] = a * b_conj;
+    // dy = dout * x
+    ComplexType<InT> c_conj(c.real, -c.imag);
+    outs[1] = a * c_conj;
+    return outs;
+  }
+};
+
 }  // namespace funcs
 }  // namespace phi
diff --git a/paddle/phi/kernels/funcs/elementwise_grad_base.h b/paddle/phi/kernels/funcs/elementwise_grad_base.h
index dff0cfe5b8b90155f62aa6e465c1fd6450d64807..17bf873587381c54efa23883f10e830c296365ec 100644
--- a/paddle/phi/kernels/funcs/elementwise_grad_base.h
+++ b/paddle/phi/kernels/funcs/elementwise_grad_base.h
@@ -24,6 +24,7 @@ limitations under the License. */
 // See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
+#include "paddle/phi/kernels/primitive/kernel_primitives.h"
 
 #endif
 
@@ -1758,5 +1759,31 @@ void ElemwiseGradComputeWithBroadcast(const GPUContext &ctx,
 
 #endif
 
+template <typename DeviceContext,
+          typename T,
+          typename DX_OP,
+          typename DY_OP,
+          typename Tout = T>
+void ElemwiseGradCompute(const DeviceContext &dev_ctx,
+                         const DenseTensor &x,
+                         const DenseTensor &y,
+                         const DenseTensor &out,
+                         const DenseTensor &dout,
+                         int axis,
+                         DenseTensor *dx,
+                         DenseTensor *dy,
+                         DX_OP dx_op,
+                         DY_OP dy_op) {
+  const DDim &x_dim = x.dims();
+  const DDim &y_dim = y.dims();
+  if (x.dims() == y.dims()) {
+    ElemwiseGradComputeNoBroadcast<DeviceContext, T, DX_OP, DY_OP, Tout>(
+        dev_ctx, x_dim, y_dim, x, y, out, dout, axis, dx, dy, dx_op, dy_op);
+  } else {
+    ElemwiseGradComputeWithBroadcast<T, DX_OP, DY_OP, Tout>(
+        dev_ctx, x_dim, y_dim, x, y, out, dout, axis, dx, dy, dx_op, dy_op);
+  }
+}
+
 }  // namespace funcs
 }  // namespace phi
diff --git a/paddle/fluid/operators/isfinite_v2_op.h b/paddle/phi/kernels/funcs/isfinite_functor.h
similarity index 52%
rename from paddle/fluid/operators/isfinite_v2_op.h
rename to paddle/phi/kernels/funcs/isfinite_functor.h
index b646e460ec75b6dfd1dad6f9875040ffc97c99de..c804bee8d4c68c55536a3c5841799887dc9e5bd2 100644
--- a/paddle/fluid/operators/isfinite_v2_op.h
+++ b/paddle/phi/kernels/funcs/isfinite_functor.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -14,39 +14,32 @@
 
 #pragma once
 
-#include <vector>
-
-#include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/tensor_util.h"
-#include "paddle/fluid/operators/isfinite_op.h"
-#include "paddle/fluid/platform/float16.h"
-#include "paddle/fluid/platform/transform.h"
+#include "paddle/phi/common/scalar.h"
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/infermeta/unary.h"
 
 namespace phi {
-class DenseTensor;
-}  // namespace phi
-
-namespace paddle {
-namespace operators {
+namespace funcs {
 
 struct InfinityV2Functor {
-  void operator()(const framework::Tensor& tensor, framework::Tensor* out) {
-    framework::TensorContainsInfV2(tensor, out);
+  void operator()(const DenseTensor& tensor, DenseTensor* out) {
+    paddle::framework::TensorContainsInfV2(tensor, out);
   }
 };
 
 struct NANV2Functor {
-  void operator()(const framework::Tensor& tensor, framework::Tensor* out) {
-    framework::TensorContainsNANV2(tensor, out);
+  void operator()(const DenseTensor& tensor, DenseTensor* out) {
+    paddle::framework::TensorContainsNANV2(tensor, out);
   }
 };
 
 struct IsfiniteV2Functor {
-  void operator()(const framework::Tensor& tensor, framework::Tensor* out) {
-    framework::TensorIsfiniteV2(tensor, out);
+  void operator()(const DenseTensor& tensor, DenseTensor* out) {
+    paddle::framework::TensorIsfiniteV2(tensor, out);
   }
 };
 
-}  // namespace operators
-}  // namespace paddle
+}  // namespace funcs
+}  // namespace phi
diff --git a/paddle/phi/kernels/funcs/lapack/CMakeLists.txt b/paddle/phi/kernels/funcs/lapack/CMakeLists.txt
index ffff5ae8abe2a796fc66c971694cbe819494ab90..1a53470b2e6041948e48452356bfec8685f3e617 100644
--- a/paddle/phi/kernels/funcs/lapack/CMakeLists.txt
+++ b/paddle/phi/kernels/funcs/lapack/CMakeLists.txt
@@ -1 +1 @@
-math_library(lapack_function DEPS dynload_lapack)
+math_library(lapack_function DEPS phi_dynload_lapack)
diff --git a/paddle/phi/kernels/funcs/lapack/lapack_function.cc b/paddle/phi/kernels/funcs/lapack/lapack_function.cc
index 0407b8fd48960df9b674e6537bbaed2e96ef3440..0f887dce4b4dab17643b202f87b588bc270f8d6d 100644
--- a/paddle/phi/kernels/funcs/lapack/lapack_function.cc
+++ b/paddle/phi/kernels/funcs/lapack/lapack_function.cc
@@ -13,7 +13,7 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/funcs/lapack/lapack_function.h"
-#include "paddle/fluid/platform/dynload/lapack.h"
+#include "paddle/phi/backends/dynload/lapack.h"
 #include "paddle/phi/common/complex.h"
 
 namespace phi {
@@ -22,12 +22,12 @@ namespace funcs {
 // LU (for example)
 template <>
 void lapackLu<double>(int m, int n, double *a, int lda, int *ipiv, int *info) {
-  paddle::platform::dynload::dgetrf_(&m, &n, a, &lda, ipiv, info);
+  dynload::dgetrf_(&m, &n, a, &lda, ipiv, info);
 }
 
 template <>
 void lapackLu<float>(int m, int n, float *a, int lda, int *ipiv, int *info) {
-  paddle::platform::dynload::sgetrf_(&m, &n, a, &lda, ipiv, info);
+  dynload::sgetrf_(&m, &n, a, &lda, ipiv, info);
 }
 
 // eigh
@@ -47,7 +47,7 @@ void lapackEigh<float>(char jobz,
                        int *info) {
   (void)rwork;   // unused
   (void)lrwork;  // unused
-  paddle::platform::dynload::ssyevd_(
+  dynload::ssyevd_(
       &jobz, &uplo, &n, a, &lda, w, work, &lwork, iwork, &liwork, info);
 }
 
@@ -67,7 +67,7 @@ void lapackEigh<double>(char jobz,
                         int *info) {
   (void)rwork;   // unused
   (void)lrwork;  // unused
-  paddle::platform::dynload::dsyevd_(
+  dynload::dsyevd_(
       &jobz, &uplo, &n, a, &lda, w, work, &lwork, iwork, &liwork, info);
 }
 
@@ -86,20 +86,19 @@ void lapackEigh<phi::dtype::complex<float>, float>(
     int *iwork,
     int liwork,
     int *info) {
-  paddle::platform::dynload::cheevd_(
-      &jobz,
-      &uplo,
-      &n,
-      reinterpret_cast<std::complex<float> *>(a),
-      &lda,
-      w,
-      reinterpret_cast<std::complex<float> *>(work),
-      &lwork,
-      rwork,
-      &lrwork,
-      iwork,
-      &liwork,
-      info);
+  dynload::cheevd_(&jobz,
+                   &uplo,
+                   &n,
+                   reinterpret_cast<std::complex<float> *>(a),
+                   &lda,
+                   w,
+                   reinterpret_cast<std::complex<float> *>(work),
+                   &lwork,
+                   rwork,
+                   &lrwork,
+                   iwork,
+                   &liwork,
+                   info);
 }
 
 template <>
@@ -117,20 +116,19 @@ void lapackEigh<phi::dtype::complex<double>, double>(
     int *iwork,
     int liwork,
     int *info) {
-  paddle::platform::dynload::zheevd_(
-      &jobz,
-      &uplo,
-      &n,
-      reinterpret_cast<std::complex<double> *>(a),
-      &lda,
-      w,
-      reinterpret_cast<std::complex<double> *>(work),
-      &lwork,
-      rwork,
-      &lrwork,
-      iwork,
-      &liwork,
-      info);
+  dynload::zheevd_(&jobz,
+                   &uplo,
+                   &n,
+                   reinterpret_cast<std::complex<double> *>(a),
+                   &lda,
+                   w,
+                   reinterpret_cast<std::complex<double> *>(work),
+                   &lwork,
+                   rwork,
+                   &lrwork,
+                   iwork,
+                   &liwork,
+                   info);
 }
 
 // Eig
@@ -152,20 +150,20 @@ void lapackEig<double>(char jobvl,
   double *wr = w;
   double *wi = w + n;
   (void)rwork;  // unused
-  paddle::platform::dynload::dgeev_(&jobvl,
-                                    &jobvr,
-                                    &n,
-                                    a,
-                                    &lda,
-                                    wr,
-                                    wi,
-                                    vl,
-                                    &ldvl,
-                                    vr,
-                                    &ldvr,
-                                    work,
-                                    &lwork,
-                                    info);
+  dynload::dgeev_(&jobvl,
+                  &jobvr,
+                  &n,
+                  a,
+                  &lda,
+                  wr,
+                  wi,
+                  vl,
+                  &ldvl,
+                  vr,
+                  &ldvr,
+                  work,
+                  &lwork,
+                  info);
 }
 
 template <>
@@ -186,20 +184,20 @@ void lapackEig<float>(char jobvl,
   float *wr = w;
   float *wi = w + n;
   (void)rwork;  // unused
-  paddle::platform::dynload::sgeev_(&jobvl,
-                                    &jobvr,
-                                    &n,
-                                    a,
-                                    &lda,
-                                    wr,
-                                    wi,
-                                    vl,
-                                    &ldvl,
-                                    vr,
-                                    &ldvr,
-                                    work,
-                                    &lwork,
-                                    info);
+  dynload::sgeev_(&jobvl,
+                  &jobvr,
+                  &n,
+                  a,
+                  &lda,
+                  wr,
+                  wi,
+                  vl,
+                  &ldvl,
+                  vr,
+                  &ldvr,
+                  work,
+                  &lwork,
+                  info);
 }
 
 template <>
@@ -218,21 +216,20 @@ void lapackEig<phi::dtype::complex<double>, double>(
     int lwork,
     double *rwork,
     int *info) {
-  paddle::platform::dynload::zgeev_(
-      &jobvl,
-      &jobvr,
-      &n,
-      reinterpret_cast<std::complex<double> *>(a),
-      &lda,
-      reinterpret_cast<std::complex<double> *>(w),
-      reinterpret_cast<std::complex<double> *>(vl),
-      &ldvl,
-      reinterpret_cast<std::complex<double> *>(vr),
-      &ldvr,
-      reinterpret_cast<std::complex<double> *>(work),
-      &lwork,
-      rwork,
-      info);
+  dynload::zgeev_(&jobvl,
+                  &jobvr,
+                  &n,
+                  reinterpret_cast<std::complex<double> *>(a),
+                  &lda,
+                  reinterpret_cast<std::complex<double> *>(w),
+                  reinterpret_cast<std::complex<double> *>(vl),
+                  &ldvl,
+                  reinterpret_cast<std::complex<double> *>(vr),
+                  &ldvr,
+                  reinterpret_cast<std::complex<double> *>(work),
+                  &lwork,
+                  rwork,
+                  info);
 }
 
 template <>
@@ -251,21 +248,20 @@ void lapackEig<phi::dtype::complex<float>, float>(
     int lwork,
     float *rwork,
     int *info) {
-  paddle::platform::dynload::cgeev_(
-      &jobvl,
-      &jobvr,
-      &n,
-      reinterpret_cast<std::complex<float> *>(a),
-      &lda,
-      reinterpret_cast<std::complex<float> *>(w),
-      reinterpret_cast<std::complex<float> *>(vl),
-      &ldvl,
-      reinterpret_cast<std::complex<float> *>(vr),
-      &ldvr,
-      reinterpret_cast<std::complex<float> *>(work),
-      &lwork,
-      rwork,
-      info);
+  dynload::cgeev_(&jobvl,
+                  &jobvr,
+                  &n,
+                  reinterpret_cast<std::complex<float> *>(a),
+                  &lda,
+                  reinterpret_cast<std::complex<float> *>(w),
+                  reinterpret_cast<std::complex<float> *>(vl),
+                  &ldvl,
+                  reinterpret_cast<std::complex<float> *>(vr),
+                  &ldvr,
+                  reinterpret_cast<std::complex<float> *>(work),
+                  &lwork,
+                  rwork,
+                  info);
 }
 
 template <>
@@ -280,8 +276,7 @@ void lapackGels<double>(char trans,
                         double *work,
                         int lwork,
                         int *info) {
-  paddle::platform::dynload::dgels_(
-      &trans, &m, &n, &nrhs, a, &lda, b, &ldb, work, &lwork, info);
+  dynload::dgels_(&trans, &m, &n, &nrhs, a, &lda, b, &ldb, work, &lwork, info);
 }
 
 template <>
@@ -296,8 +291,7 @@ void lapackGels<float>(char trans,
                        float *work,
                        int lwork,
                        int *info) {
-  paddle::platform::dynload::sgels_(
-      &trans, &m, &n, &nrhs, a, &lda, b, &ldb, work, &lwork, info);
+  dynload::sgels_(&trans, &m, &n, &nrhs, a, &lda, b, &ldb, work, &lwork, info);
 }
 
 template <>
@@ -316,20 +310,20 @@ void lapackGelsd<double>(int m,
                          double *rwork,
                          int *iwork,
                          int *info) {
-  paddle::platform::dynload::dgelsd_(&m,
-                                     &n,
-                                     &nrhs,
-                                     a,
-                                     &lda,
-                                     b,
-                                     &ldb,
-                                     s,
-                                     &rcond,
-                                     rank,
-                                     work,
-                                     &lwork,
-                                     iwork,
-                                     info);
+  dynload::dgelsd_(&m,
+                   &n,
+                   &nrhs,
+                   a,
+                   &lda,
+                   b,
+                   &ldb,
+                   s,
+                   &rcond,
+                   rank,
+                   work,
+                   &lwork,
+                   iwork,
+                   info);
 }
 
 template <>
@@ -348,20 +342,20 @@ void lapackGelsd<float>(int m,
                         float *rwork,
                         int *iwork,
                         int *info) {
-  paddle::platform::dynload::sgelsd_(&m,
-                                     &n,
-                                     &nrhs,
-                                     a,
-                                     &lda,
-                                     b,
-                                     &ldb,
-                                     s,
-                                     &rcond,
-                                     rank,
-                                     work,
-                                     &lwork,
-                                     iwork,
-                                     info);
+  dynload::sgelsd_(&m,
+                   &n,
+                   &nrhs,
+                   a,
+                   &lda,
+                   b,
+                   &ldb,
+                   s,
+                   &rcond,
+                   rank,
+                   work,
+                   &lwork,
+                   iwork,
+                   info);
 }
 
 template <>
@@ -379,7 +373,7 @@ void lapackGelsy<double>(int m,
                          int lwork,
                          double *rwork,
                          int *info) {
-  paddle::platform::dynload::dgelsy_(
+  dynload::dgelsy_(
       &m, &n, &nrhs, a, &lda, b, &ldb, jpvt, &rcond, rank, work, &lwork, info);
 }
 
@@ -398,7 +392,7 @@ void lapackGelsy<float>(int m,
                         int lwork,
                         float *rwork,
                         int *info) {
-  paddle::platform::dynload::sgelsy_(
+  dynload::sgelsy_(
       &m, &n, &nrhs, a, &lda, b, &ldb, jpvt, &rcond, rank, work, &lwork, info);
 }
 
@@ -417,7 +411,7 @@ void lapackGelss<double>(int m,
                          int lwork,
                          double *rwork,
                          int *info) {
-  paddle::platform::dynload::dgelss_(
+  dynload::dgelss_(
       &m, &n, &nrhs, a, &lda, b, &ldb, s, &rcond, rank, work, &lwork, info);
 }
 
@@ -436,7 +430,7 @@ void lapackGelss<float>(int m,
                         int lwork,
                         float *rwork,
                         int *info) {
-  paddle::platform::dynload::sgelss_(
+  dynload::sgelss_(
       &m, &n, &nrhs, a, &lda, b, &ldb, s, &rcond, rank, work, &lwork, info);
 }
 
@@ -450,15 +444,14 @@ void lapackCholeskySolve<phi::dtype::complex<double>>(
     phi::dtype::complex<double> *b,
     int ldb,
     int *info) {
-  paddle::platform::dynload::zpotrs_(
-      &uplo,
-      &n,
-      &nrhs,
-      reinterpret_cast<std::complex<double> *>(a),
-      &lda,
-      reinterpret_cast<std::complex<double> *>(b),
-      &ldb,
-      info);
+  dynload::zpotrs_(&uplo,
+                   &n,
+                   &nrhs,
+                   reinterpret_cast<std::complex<double> *>(a),
+                   &lda,
+                   reinterpret_cast<std::complex<double> *>(b),
+                   &ldb,
+                   info);
 }
 
 template <>
@@ -471,14 +464,14 @@ void lapackCholeskySolve<phi::dtype::complex<float>>(
     phi::dtype::complex<float> *b,
     int ldb,
     int *info) {
-  paddle::platform::dynload::cpotrs_(&uplo,
-                                     &n,
-                                     &nrhs,
-                                     reinterpret_cast<std::complex<float> *>(a),
-                                     &lda,
-                                     reinterpret_cast<std::complex<float> *>(b),
-                                     &ldb,
-                                     info);
+  dynload::cpotrs_(&uplo,
+                   &n,
+                   &nrhs,
+                   reinterpret_cast<std::complex<float> *>(a),
+                   &lda,
+                   reinterpret_cast<std::complex<float> *>(b),
+                   &ldb,
+                   info);
 }
 
 template <>
@@ -490,7 +483,7 @@ void lapackCholeskySolve<double>(char uplo,
                                  double *b,
                                  int ldb,
                                  int *info) {
-  paddle::platform::dynload::dpotrs_(&uplo, &n, &nrhs, a, &lda, b, &ldb, info);
+  dynload::dpotrs_(&uplo, &n, &nrhs, a, &lda, b, &ldb, info);
 }
 
 template <>
@@ -502,7 +495,7 @@ void lapackCholeskySolve<float>(char uplo,
                                 float *b,
                                 int ldb,
                                 int *info) {
-  paddle::platform::dynload::spotrs_(&uplo, &n, &nrhs, a, &lda, b, &ldb, info);
+  dynload::spotrs_(&uplo, &n, &nrhs, a, &lda, b, &ldb, info);
 }
 
 }  // namespace funcs
diff --git a/paddle/phi/kernels/funcs/math_function.h b/paddle/phi/kernels/funcs/math_function.h
index 8e1a4cdd1a9688a12d7f0a8b5ba088f6abfc9512..b735587d3d53df03dfb82f7e3657a6b0c2cabd9b 100644
--- a/paddle/phi/kernels/funcs/math_function.h
+++ b/paddle/phi/kernels/funcs/math_function.h
@@ -125,5 +125,43 @@ struct TensorSetConstantXPU {
 };
 #endif
 
+template <typename Context, typename T>
+inline void TransCompute(const int dim,
+                         const Context& dev_ctx,
+                         const DenseTensor& in,
+                         DenseTensor* out,
+                         const std::vector<int>& axis) {
+  switch (dim) {
+    case 1:
+      Transpose<Context, T, 1> trans1;
+      trans1(dev_ctx, in, out, axis);
+      break;
+    case 2:
+      Transpose<Context, T, 2> trans2;
+      trans2(dev_ctx, in, out, axis);
+      break;
+    case 3:
+      Transpose<Context, T, 3> trans3;
+      trans3(dev_ctx, in, out, axis);
+      break;
+    case 4:
+      Transpose<Context, T, 4> trans4;
+      trans4(dev_ctx, in, out, axis);
+      break;
+    case 5:
+      Transpose<Context, T, 5> trans5;
+      trans5(dev_ctx, in, out, axis);
+      break;
+    case 6:
+      Transpose<Context, T, 6> trans6;
+      trans6(dev_ctx, in, out, axis);
+      break;
+    default:
+      // for dim >= 7 situation
+      TransposeNormal<Context, T> trans_normal;
+      trans_normal(dev_ctx, in, out, axis);
+  }
+}
+
 }  // namespace funcs
 }  // namespace phi
diff --git a/paddle/phi/kernels/funcs/matrix_inverse.cc b/paddle/phi/kernels/funcs/matrix_inverse.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c95e97f8ea81a566972c162663b100ad562608b1
--- /dev/null
+++ b/paddle/phi/kernels/funcs/matrix_inverse.cc
@@ -0,0 +1,37 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/phi/kernels/funcs/matrix_inverse.h"
+
+#include "paddle/phi/kernels/funcs/blas/blas.h"
+
+namespace phi {
+namespace funcs {
+
+template <typename Context, typename T>
+void MatrixInverseFunctor<Context, T>::operator()(const Context& dev_ctx,
+                                                  const DenseTensor& a,
+                                                  DenseTensor* a_inv) {
+  ComputeInverseEigen<Context, T>(dev_ctx, a, a_inv);
+}
+
+template class MatrixInverseFunctor<CPUContext, float>;
+template class MatrixInverseFunctor<CPUContext, double>;
+
+// TODO(chenweihang): remove these instantiations later
+template class MatrixInverseFunctor<paddle::platform::CPUDeviceContext, float>;
+template class MatrixInverseFunctor<paddle::platform::CPUDeviceContext, double>;
+
+}  // namespace funcs
+}  // namespace phi
diff --git a/paddle/phi/kernels/funcs/matrix_inverse.cu.cc b/paddle/phi/kernels/funcs/matrix_inverse.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..686b8405bf7502c58bc07dbfa6d0542401853c0e
--- /dev/null
+++ b/paddle/phi/kernels/funcs/matrix_inverse.cu.cc
@@ -0,0 +1,141 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/phi/kernels/funcs/matrix_inverse.h"
+
+#include "paddle/phi/kernels/funcs/blas/blas.h"
+
+#include "paddle/fluid/memory/malloc.h"
+#include "paddle/fluid/memory/memcpy.h"
+
+namespace phi {
+namespace funcs {
+
+template <typename Context, typename T>
+void MatrixInverseFunctor<Context, T>::operator()(const Context& dev_ctx,
+                                                  const DenseTensor& a,
+                                                  DenseTensor* a_inv) {
+#ifndef PADDLE_WITH_HIP
+  const auto& mat_dims = a.dims();
+  const int rank = mat_dims.size();
+  int n = mat_dims[rank - 1];
+  int batch_size = rank > 2 ? a.numel() / (n * n) : 1;
+
+  paddle::memory::allocation::AllocationPtr tmp_gpu_mat_data;
+  const T* gpu_mat = a.data<T>();
+  if (n >= 32) {
+    // Copy all elements of input matrix A to a temporary memory space to
+    // avoid being overriden by getrf.
+    tmp_gpu_mat_data = paddle::memory::Alloc(dev_ctx, a.numel() * sizeof(T));
+    paddle::memory::Copy(dev_ctx.GetPlace(),
+                         tmp_gpu_mat_data->ptr(),
+                         dev_ctx.GetPlace(),
+                         a.data(),
+                         a.numel() * sizeof(T),
+                         dev_ctx.stream());
+    gpu_mat = reinterpret_cast<const T*>(tmp_gpu_mat_data->ptr());
+  }
+
+  std::vector<const T*> cpu_ptrs(batch_size * 2);
+  for (int i = 0; i < batch_size; ++i) {
+    cpu_ptrs[i] = gpu_mat + i * n * n;
+    cpu_ptrs[i + batch_size] = a_inv->data<T>() + i * n * n;
+  }
+
+  // Copy the addresses of A and A_inv from host to device.
+  paddle::memory::allocation::AllocationPtr tmp_gpu_ptrs_data =
+      paddle::memory::Alloc(dev_ctx, cpu_ptrs.size() * sizeof(T*));
+  paddle::memory::Copy(dev_ctx.GetPlace(),
+                       tmp_gpu_ptrs_data->ptr(),
+                       phi::CPUPlace(),
+                       static_cast<void*>(cpu_ptrs.data()),
+                       cpu_ptrs.size() * sizeof(T*),
+                       dev_ctx.stream());
+  T** gpu_inv_ptrs =
+      reinterpret_cast<T**>(tmp_gpu_ptrs_data->ptr()) + batch_size;
+
+  // Allocate device memory for info and pivots.
+  int num_ints = n < 32 ? batch_size : batch_size * (n + 1);
+  paddle::memory::allocation::AllocationPtr tmp_gpu_info_data =
+      paddle::memory::Alloc(dev_ctx, num_ints * sizeof(int));
+  int* gpu_info_ptr = reinterpret_cast<int*>(tmp_gpu_info_data->ptr());
+
+  auto blas = phi::funcs::GetBlas<Context, T>(dev_ctx);
+
+  std::vector<int> info;  // only for singular checking
+  info.resize(batch_size);
+  // This functions in cuBLAS is intended to be used for matrices of small
+  // sizes where the launch overhead is a significant factor.
+  // TODO(Xreki): call function in cusolver for large matrices.
+  if (n < 32) {
+    // cublas<S/D>matinvBatched is a short cut of cublas<S/D>getrfBatched
+    // plus cublas<S/D>getriBatched.
+    // However it only works if N is less than 32. If not, we need to
+    // go through cublas<S/D>getrfBatched and cublas<S/D>getriBatched.
+    blas.BatchedMatInv(n,
+                       reinterpret_cast<const T**>(tmp_gpu_ptrs_data->ptr()),
+                       gpu_inv_ptrs,
+                       gpu_info_ptr,
+                       batch_size);
+  } else {
+    // This function performs the LU factorization of each matrix A by the
+    // equation P * A = L * U. L and U are written back to original matrix A,
+    // and diagonal elements of L are discarded.
+    int* gpu_pivot_ptr =
+        reinterpret_cast<int*>(tmp_gpu_info_data->ptr()) + batch_size;
+    blas.BatchedGETRF(n,
+                      reinterpret_cast<T**>(tmp_gpu_ptrs_data->ptr()),
+                      gpu_pivot_ptr,
+                      gpu_info_ptr,
+                      batch_size);
+
+    blas.BatchedGETRI(n,
+                      reinterpret_cast<const T**>(tmp_gpu_ptrs_data->ptr()),
+                      gpu_pivot_ptr,
+                      gpu_inv_ptrs,
+                      gpu_info_ptr,
+                      batch_size);
+  }
+  paddle::memory::Copy(phi::CPUPlace(),
+                       info.data(),
+                       dev_ctx.GetPlace(),
+                       gpu_info_ptr,
+                       sizeof(int) * batch_size,
+                       dev_ctx.stream());
+  for (int i = 0; i < batch_size; ++i) {
+    PADDLE_ENFORCE_EQ(info[i],
+                      0,
+                      phi::errors::PreconditionNotMet(
+                          "For batch [%d]: U(%d, %d) is zero, singular U. "
+                          "Please check the matrix value and change it to a "
+                          "non-singular matrix",
+                          i,
+                          info[i],
+                          info[i]));
+  }
+#else
+  ComputeInverseEigen<Context, T>(dev_ctx, a, a_inv);
+#endif
+}
+
+template class MatrixInverseFunctor<GPUContext, float>;
+template class MatrixInverseFunctor<GPUContext, double>;
+
+// TODO(chenweihang): remove these instantiations later
+template class MatrixInverseFunctor<paddle::platform::CUDADeviceContext, float>;
+template class MatrixInverseFunctor<paddle::platform::CUDADeviceContext,
+                                    double>;
+
+}  // namespace funcs
+}  // namespace phi
diff --git a/paddle/fluid/operators/math/matrix_inverse.h b/paddle/phi/kernels/funcs/matrix_inverse.h
similarity index 61%
rename from paddle/fluid/operators/math/matrix_inverse.h
rename to paddle/phi/kernels/funcs/matrix_inverse.h
index fb58b483666526c0b7e745d1e49e308235871d57..1c6756f1720a23ada5bb4ff2fdb4f4840660ed58 100644
--- a/paddle/fluid/operators/math/matrix_inverse.h
+++ b/paddle/phi/kernels/funcs/matrix_inverse.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -17,17 +17,18 @@ limitations under the License. */
 #include <string>
 #include "Eigen/Core"
 #include "Eigen/LU"
-#include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/platform/device_context.h"
 
-namespace paddle {
-namespace operators {
-namespace math {
+#include "paddle/phi/backends/all_context.h"
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/core/enforce.h"
 
-template <typename DeviceContext, typename T>
-void compute_inverse_eigen(const DeviceContext& context,
-                           const framework::Tensor& a,
-                           framework::Tensor* a_inv) {
+namespace phi {
+namespace funcs {
+
+template <typename Context, typename T>
+void ComputeInverseEigen(const Context& dev_ctx,
+                         const DenseTensor& a,
+                         DenseTensor* a_inv) {
   using Matrix =
       Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
   using EigenMatrixMap = Eigen::Map<Matrix>;
@@ -38,7 +39,7 @@ void compute_inverse_eigen(const DeviceContext& context,
   int batch_size = rank > 2 ? a.numel() / (n * n) : 1;
 
   const T* a_ptr = a.data<T>();
-  T* a_inv_ptr = a_inv->mutable_data<T>(context.GetPlace());
+  T* a_inv_ptr = dev_ctx.template Alloc<T>(a_inv);
 
   for (int i = 0; i < batch_size; ++i) {
     ConstEigenMatrixMap mat(a_ptr + i * n * n, n, n);
@@ -47,20 +48,20 @@ void compute_inverse_eigen(const DeviceContext& context,
     lu.compute(mat);
 
     const T min_abs_pivot = lu.matrixLU().diagonal().cwiseAbs().minCoeff();
-    PADDLE_ENFORCE_GT(
-        min_abs_pivot, static_cast<T>(0),
-        platform::errors::InvalidArgument("Input is not invertible."));
+    PADDLE_ENFORCE_GT(min_abs_pivot,
+                      static_cast<T>(0),
+                      errors::InvalidArgument("Input is not invertible."));
     mat_inv.noalias() = lu.inverse();
   }
 }
 
-template <typename DeviceContext, typename T>
+template <typename Context, typename T>
 class MatrixInverseFunctor {
  public:
-  void operator()(const DeviceContext& context, const framework::Tensor& a,
-                  framework::Tensor* a_inv);
+  void operator()(const Context& dev_ctx,
+                  const DenseTensor& a,
+                  DenseTensor* a_inv);
 };
 
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
+}  // namespace funcs
+}  // namespace phi
diff --git a/paddle/phi/kernels/funcs/matrix_reduce.cc b/paddle/phi/kernels/funcs/matrix_reduce.cc
new file mode 100644
index 0000000000000000000000000000000000000000..849fd7a0075a89cedeab4d87c779931f2a14f115
--- /dev/null
+++ b/paddle/phi/kernels/funcs/matrix_reduce.cc
@@ -0,0 +1,59 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/funcs/matrix_reduce.h"
+
+#include "paddle/phi/kernels/cpu/reduce.h"
+#include "paddle/phi/kernels/funcs/reduce_functor.h"
+
+namespace phi {
+namespace funcs {
+
+template <typename T>
+class MatrixReduceSumFunctor<T, CPUContext> {
+ public:
+  void operator()(const CPUContext& dev_ctx,
+                  const DenseTensor& in,
+                  DenseTensor* out) {
+    // For example: in's dim = [5, 3, 2, 7, 3] ; out's dim = [3, 1, 7, 3]
+    // out_reduce_dim should be [0, 2]
+    const std::vector<int64_t> in_dims = phi::vectorize<int64_t>(in.dims());
+    auto in_size = in_dims.size();
+    const std::vector<int64_t> out_dims = phi::vectorize<int64_t>(out->dims());
+    auto out_size = out_dims.size();
+
+    std::vector<int64_t> out_bst_dims(in_size);
+
+    std::fill(out_bst_dims.data(), out_bst_dims.data() + in_size - out_size, 1);
+    std::copy(out_dims.data(),
+              out_dims.data() + out_size,
+              out_bst_dims.data() + in_size - out_size);
+    out->Resize(phi::make_ddim(out_bst_dims));
+
+    std::vector<int64_t> out_reduce_dims;
+    for (size_t idx = 0; idx <= in_size - 3; idx++) {
+      if (in_dims[idx] != 1 && out_bst_dims[idx] == 1) {
+        out_reduce_dims.push_back(idx);
+      }
+    }
+    phi::ReduceKernelImpl<CPUContext, T, T, phi::funcs::SumFunctor>(
+        dev_ctx, in, out, out_reduce_dims, true, false);
+  }
+};
+
+template class MatrixReduceSumFunctor<float, CPUContext>;
+template class MatrixReduceSumFunctor<double, CPUContext>;
+
+}  // namespace funcs
+}  // namespace phi
diff --git a/paddle/phi/kernels/funcs/matrix_reduce.cu b/paddle/phi/kernels/funcs/matrix_reduce.cu
new file mode 100644
index 0000000000000000000000000000000000000000..5c3ebd6bb01671eab670b477c0d97a962b2eaea0
--- /dev/null
+++ b/paddle/phi/kernels/funcs/matrix_reduce.cu
@@ -0,0 +1,57 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/funcs/matrix_reduce.h"
+#include "paddle/phi/kernels/funcs/reduce_function.h"
+
+namespace phi {
+namespace funcs {
+
+template <typename T>
+class MatrixReduceSumFunctor<T, GPUContext> {
+ public:
+  void operator()(const GPUContext& dev_ctx,
+                  const DenseTensor& in,
+                  DenseTensor* out) {
+    // For example: in's dim = [5, 3, 2, 7, 3] ; out's dim = [3, 1, 7, 3]
+    // out_reduce_dim should be [0, 2]
+    const std::vector<int> in_dims = phi::vectorize<int>(in.dims());
+    auto in_size = in_dims.size();
+    const std::vector<int> out_dims = phi::vectorize<int>(out->dims());
+    auto out_size = out_dims.size();
+
+    std::vector<int> out_bst_dims(in_size);
+
+    std::fill(out_bst_dims.data(), out_bst_dims.data() + in_size - out_size, 1);
+    std::copy(out_dims.data(),
+              out_dims.data() + out_size,
+              out_bst_dims.data() + in_size - out_size);
+    out->Resize(phi::make_ddim(out_bst_dims));
+
+    std::vector<int> out_reduce_dims;
+    for (size_t idx = 0; idx <= in_size - 3; idx++) {
+      if (in_dims[idx] != 1 && out_bst_dims[idx] == 1) {
+        out_reduce_dims.push_back(idx);
+      }
+    }
+    ReduceKernel<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
+        dev_ctx, in, out, kps::IdentityFunctor<T>(), out_reduce_dims);
+  }
+};
+
+template class MatrixReduceSumFunctor<float, GPUContext>;
+template class MatrixReduceSumFunctor<double, GPUContext>;
+
+}  // namespace funcs
+}  // namespace phi
diff --git a/paddle/phi/kernels/funcs/matrix_reduce.h b/paddle/phi/kernels/funcs/matrix_reduce.h
new file mode 100644
index 0000000000000000000000000000000000000000..22bddacd43d437e731ff5baf3e3c18c52dc55fd6
--- /dev/null
+++ b/paddle/phi/kernels/funcs/matrix_reduce.h
@@ -0,0 +1,34 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+namespace funcs {
+
+// Use For Matrix OP, reduce_sum 'in' according to out's dim
+// for example: in's dim = [5, 3, 2, M, N] ; out's dim = [3, 1, M, N]
+// axis [0, 2] of DenseTensor 'in' will be reduced
+template <typename T, typename Context>
+class MatrixReduceSumFunctor {
+ public:
+  void operator()(const Context& dev_ctx,
+                  const DenseTensor& in,
+                  DenseTensor* out);
+};
+
+}  // namespace funcs
+}  // namespace phi
diff --git a/paddle/fluid/operators/math/padding.h b/paddle/phi/kernels/funcs/padding.h
similarity index 67%
rename from paddle/fluid/operators/math/padding.h
rename to paddle/phi/kernels/funcs/padding.h
index 529d39c9ba50f016434b0b14c4d85c84483bad7f..e2c4e766b605b570463da12c39c456923c439916 100644
--- a/paddle/fluid/operators/math/padding.h
+++ b/paddle/phi/kernels/funcs/padding.h
@@ -15,21 +15,26 @@ limitations under the License. */
 #pragma once
 #include <utility>
 #include <vector>
-#include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/operators/eigen/eigen_function.h"
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/core/enforce.h"
+#include "paddle/phi/kernels/funcs/eigen/common.h"
+#include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
 
-namespace paddle {
-namespace operators {
-namespace math {
+namespace phi {
+namespace funcs {
 
-template <typename T, size_t D, int MajorType = Eigen::RowMajor,
+template <typename T,
+          size_t D,
+          int MajorType = Eigen::RowMajor,
           typename IndexType = Eigen::DenseIndex>
-using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;
+using EigenTensor = EigenTensor<T, D, MajorType, IndexType>;
 
 template <typename DeviceContext, typename T, size_t D>
-void PadFunction(const framework::ExecutionContext& context,
-                 const std::vector<int>& pads, const framework::Tensor& src,
-                 T pad_value, framework::Tensor* out) {
+void PadFunction(const DeviceContext& context,
+                 const std::vector<int>& pads,
+                 const DenseTensor& src,
+                 T pad_value,
+                 DenseTensor* out) {
   std::array<std::pair<int64_t, int64_t>, D> paddings;
 
   for (size_t i = 0; i < paddings.size(); ++i) {
@@ -40,16 +45,16 @@ void PadFunction(const framework::ExecutionContext& context,
   auto src_tensor = EigenTensor<T, D>::From(src);
   auto out_tensor = EigenTensor<T, D>::From(*out);
 
-  auto& place =
-      *context.template device_context<DeviceContext>().eigen_device();
+  auto& place = *(context.eigen_device());
   EigenPad<std::decay_t<decltype(place)>, T, D>::Eval(
       place, out_tensor, src_tensor, paddings, pad_value);
 }
 
 template <typename DeviceContext, typename T, size_t D>
-void PadGradFunction(const framework::ExecutionContext& context,
-                     const std::vector<int>& pads, const framework::Tensor& src,
-                     framework::Tensor* d_out) {
+void PadGradFunction(const DeviceContext& context,
+                     const std::vector<int>& pads,
+                     const DenseTensor& src,
+                     DenseTensor* d_out) {
   std::array<std::pair<int64_t, int64_t>, D> paddings;
   for (size_t i = 0; i < paddings.size(); ++i) {
     paddings[i].first = -pads[i * 2];
@@ -58,16 +63,18 @@ void PadGradFunction(const framework::ExecutionContext& context,
 
   auto d_out_tensor = EigenTensor<T, D>::From(*d_out);
   auto src_tensor = EigenTensor<T, D>::From(src);
-  auto& place =
-      *context.template device_context<DeviceContext>().eigen_device();
+  auto& place = *(context.eigen_device());
   EigenPad<std::decay_t<decltype(place)>, T, D>::Eval(
       place, d_out_tensor, src_tensor, paddings, static_cast<T>(0));
 }
 
 template <typename DeviceContext, typename T>
-void PaddingFunctor(int rank, const framework::ExecutionContext& context,
-                    const std::vector<int>& pads, T pad_value,
-                    const framework::Tensor& src, framework::Tensor* out) {
+void PaddingFunctor(int rank,
+                    const DeviceContext& context,
+                    const std::vector<int>& pads,
+                    T pad_value,
+                    const DenseTensor& src,
+                    DenseTensor* out) {
   switch (rank) {
     case 1:
       PadFunction<DeviceContext, T, 1>(context, pads, src, pad_value, out);
@@ -88,16 +95,18 @@ void PaddingFunctor(int rank, const framework::ExecutionContext& context,
       PadFunction<DeviceContext, T, 6>(context, pads, src, pad_value, out);
       break;
     default:
-      PADDLE_THROW(platform::errors::Unimplemented(
-          "PadOp only support tensors with no more"
-          " than 6 dimensions currently."));
+      PADDLE_THROW(
+          phi::errors::Unimplemented("PadOp only support tensors with no more"
+                                     " than 6 dimensions currently."));
   }
 }
 
 template <typename DeviceContext, typename T>
-void PaddingGradFunctor(int rank, const framework::ExecutionContext& context,
+void PaddingGradFunctor(int rank,
+                        const DeviceContext& context,
                         const std::vector<int>& pads,
-                        const framework::Tensor& src, framework::Tensor* out) {
+                        const DenseTensor& src,
+                        DenseTensor* out) {
   switch (rank) {
     case 1:
       PadGradFunction<DeviceContext, T, 1>(context, pads, src, out);
@@ -118,9 +127,9 @@ void PaddingGradFunctor(int rank, const framework::ExecutionContext& context,
       PadGradFunction<DeviceContext, T, 6>(context, pads, src, out);
       break;
     default:
-      PADDLE_THROW(platform::errors::Unimplemented(
-          "PadOp only support tensors with no more"
-          " than 6 dimensions currently."));
+      PADDLE_THROW(
+          phi::errors::Unimplemented("PadOp only support tensors with no more"
+                                     " than 6 dimensions currently."));
   }
 }
 
@@ -137,6 +146,5 @@ inline bool IsSymmetricPadding(const std::vector<int>& pads,
   }
   return is_sys_pad;
 }
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
+}  // namespace funcs
+}  // namespace phi
diff --git a/paddle/fluid/operators/math/pooling.cc b/paddle/phi/kernels/funcs/pooling.cc
similarity index 83%
rename from paddle/fluid/operators/math/pooling.cc
rename to paddle/phi/kernels/funcs/pooling.cc
index f2e5e955ec487585deee1cbebba3d2932ee1b05d..10c88b9798c6ff69b755aa2c7423558c35afe859 100644
--- a/paddle/fluid/operators/math/pooling.cc
+++ b/paddle/phi/kernels/funcs/pooling.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -11,11 +11,15 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include "paddle/fluid/operators/math/pooling.h"
 
-namespace paddle {
-namespace operators {
-namespace math {
+#include "paddle/phi/kernels/funcs/pooling.h"
+
+#include <algorithm>
+#include <vector>
+#include "paddle/phi/backends/cpu/cpu_context.h"
+
+namespace phi {
+namespace funcs {
 
 /*
 * Tensors are in NCHW or NHWC format.
@@ -25,13 +29,16 @@ namespace math {
 * height_down, width_left and width_right, respectively.
 */
 template <typename PoolProcess, typename T>
-class Pool2dFunctor<platform::CPUDeviceContext, PoolProcess, T> {
+class Pool2dFunctor<CPUContext, PoolProcess, T> {
  public:
-  void operator()(const platform::CPUDeviceContext& context,
-                  const framework::Tensor& input, const std::vector<int>& ksize,
+  void operator()(const CPUContext& context,
+                  const DenseTensor& input,
+                  const std::vector<int>& ksize,
                   const std::vector<int>& strides,
-                  const std::vector<int>& paddings, bool exclusive,
-                  bool adaptive, framework::Tensor* output,
+                  const std::vector<int>& paddings,
+                  bool exclusive,
+                  bool adaptive,
+                  DenseTensor* output,
                   PoolProcess pool_process) {
     const int batch_size = input.dims()[0];
     const int input_height = input.dims()[2];
@@ -50,7 +57,7 @@ class Pool2dFunctor<platform::CPUDeviceContext, PoolProcess, T> {
     const int output_stride = output_height * output_width;
 
     const T* input_data = input.data<T>();
-    T* output_data = output->mutable_data<T>(context.GetPlace());
+    T* output_data = context.template Alloc<T>(output);
 
     int hstart = 0, hend = 1;
     int wstart = 0, wend = 1;
@@ -101,12 +108,16 @@ class Pool2dFunctor<platform::CPUDeviceContext, PoolProcess, T> {
     }
   }
 
-  void operator()(const platform::CPUDeviceContext& context,
-                  const framework::Tensor& input, const std::vector<int>& ksize,
+  void operator()(const CPUContext& context,
+                  const DenseTensor& input,
+                  const std::vector<int>& ksize,
                   const std::vector<int>& strides,
                   const std::vector<int>& paddings,
-                  const std::string data_format, bool exclusive, bool adaptive,
-                  framework::Tensor* output, PoolProcess pool_process) {
+                  const std::string data_format,
+                  bool exclusive,
+                  bool adaptive,
+                  DenseTensor* output,
+                  PoolProcess pool_process) {
     bool channel_last = (data_format == "NHWC");
 
     const int batch_size = input.dims()[0];
@@ -131,7 +142,7 @@ class Pool2dFunctor<platform::CPUDeviceContext, PoolProcess, T> {
     const int padding_width = paddings[1];
 
     const T* input_data = input.data<T>();
-    T* output_data = output->mutable_data<T>(context.GetPlace());
+    T* output_data = context.template Alloc<T>(output);
 
     int hstart = 0, hend = 1;
     int wstart = 0, wend = 1;
@@ -244,14 +255,19 @@ class Pool2dFunctor<platform::CPUDeviceContext, PoolProcess, T> {
 * height_down, width_left and width_right, respectively.
 */
 template <typename PoolProcess, class T>
-class Pool2dGradFunctor<platform::CPUDeviceContext, PoolProcess, T> {
+class Pool2dGradFunctor<CPUContext, PoolProcess, T> {
  public:
-  void operator()(
-      const platform::CPUDeviceContext& context, const framework::Tensor& input,
-      const framework::Tensor& output, const framework::Tensor& output_grad,
-      const std::vector<int>& ksize, const std::vector<int>& strides,
-      const std::vector<int>& paddings, bool exclusive, bool adaptive,
-      framework::Tensor* input_grad, PoolProcess pool_grad_process) {
+  void operator()(const CPUContext& context,
+                  const DenseTensor& input,
+                  const DenseTensor& output,
+                  const DenseTensor& output_grad,
+                  const std::vector<int>& ksize,
+                  const std::vector<int>& strides,
+                  const std::vector<int>& paddings,
+                  bool exclusive,
+                  bool adaptive,
+                  DenseTensor* input_grad,
+                  PoolProcess pool_grad_process) {
     const int batch_size = input.dims()[0];
     const int input_height = input.dims()[2];
     const int input_width = input.dims()[3];
@@ -270,7 +286,7 @@ class Pool2dGradFunctor<platform::CPUDeviceContext, PoolProcess, T> {
     const T* input_data = input.data<T>();
     const T* output_data = output.data<T>();
     const T* output_grad_data = output_grad.data<T>();
-    T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
+    T* input_grad_data = context.template Alloc<T>(input_grad);
 
     int hstart = 0, hend = 1;
     int wstart = 0, wend = 1;
@@ -324,13 +340,18 @@ class Pool2dGradFunctor<platform::CPUDeviceContext, PoolProcess, T> {
     }
   }
 
-  void operator()(
-      const platform::CPUDeviceContext& context, const framework::Tensor& input,
-      const framework::Tensor& output, const framework::Tensor& output_grad,
-      const std::vector<int>& ksize, const std::vector<int>& strides,
-      const std::vector<int>& paddings, const std::string data_format,
-      bool exclusive, bool adaptive, framework::Tensor* input_grad,
-      PoolProcess pool_grad_process) {
+  void operator()(const CPUContext& context,
+                  const DenseTensor& input,
+                  const DenseTensor& output,
+                  const DenseTensor& output_grad,
+                  const std::vector<int>& ksize,
+                  const std::vector<int>& strides,
+                  const std::vector<int>& paddings,
+                  const std::string data_format,
+                  bool exclusive,
+                  bool adaptive,
+                  DenseTensor* input_grad,
+                  PoolProcess pool_grad_process) {
     bool channel_last = (data_format == "NHWC");
 
     const int batch_size = input.dims()[0];
@@ -357,7 +378,7 @@ class Pool2dGradFunctor<platform::CPUDeviceContext, PoolProcess, T> {
     const T* input_data = input.data<T>();
     const T* output_data = output.data<T>();
     const T* output_grad_data = output_grad.data<T>();
-    T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
+    T* input_grad_data = context.template Alloc<T>(input_grad);
 
     int hstart = 0, hend = 1;
     int wstart = 0, wend = 1;
@@ -451,10 +472,11 @@ class Pool2dGradFunctor<platform::CPUDeviceContext, PoolProcess, T> {
                       h * input_width * input_channels + w * input_channels + c;
                   auto output_idx = ph * output_width * output_channels +
                                     pw * output_channels + c;
-                  pool_grad_process.compute(
-                      input_data[input_idx], output_data[output_idx],
-                      output_grad_data[output_idx], static_cast<T>(scale),
-                      input_grad_data + input_idx);
+                  pool_grad_process.compute(input_data[input_idx],
+                                            output_data[output_idx],
+                                            output_grad_data[output_idx],
+                                            static_cast<T>(scale),
+                                            input_grad_data + input_idx);
                 }
               }
             }
@@ -477,13 +499,16 @@ class Pool2dGradFunctor<platform::CPUDeviceContext, PoolProcess, T> {
 * height_down, width_left and width_right, respectively.
 */
 template <class T>
-class MaxPool2dGradFunctor<platform::CPUDeviceContext, T> {
+class MaxPool2dGradFunctor<CPUContext, T> {
  public:
-  void operator()(
-      const platform::CPUDeviceContext& context, const framework::Tensor& input,
-      const framework::Tensor& output, const framework::Tensor& output_grad,
-      const std::vector<int>& ksize, const std::vector<int>& strides,
-      const std::vector<int>& paddings, framework::Tensor* input_grad) {
+  void operator()(const CPUContext& context,
+                  const DenseTensor& input,
+                  const DenseTensor& output,
+                  const DenseTensor& output_grad,
+                  const std::vector<int>& ksize,
+                  const std::vector<int>& strides,
+                  const std::vector<int>& paddings,
+                  DenseTensor* input_grad) {
     const int batch_size = input.dims()[0];
     const int input_height = input.dims()[2];
     const int input_width = input.dims()[3];
@@ -502,7 +527,7 @@ class MaxPool2dGradFunctor<platform::CPUDeviceContext, T> {
     const T* input_data = input.data<T>();
     const T* output_data = output.data<T>();
     const T* output_grad_data = output_grad.data<T>();
-    T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
+    T* input_grad_data = context.template Alloc<T>(input_grad);
 
     for (int i = 0; i < batch_size; i++) {
       for (int c = 0; c < output_channels; ++c) {
@@ -536,12 +561,15 @@ class MaxPool2dGradFunctor<platform::CPUDeviceContext, T> {
     }
   }
 
-  void operator()(
-      const platform::CPUDeviceContext& context, const framework::Tensor& input,
-      const framework::Tensor& output, const framework::Tensor& output_grad,
-      const std::vector<int>& ksize, const std::vector<int>& strides,
-      const std::vector<int>& paddings, const std::string data_format,
-      framework::Tensor* input_grad) {
+  void operator()(const CPUContext& context,
+                  const DenseTensor& input,
+                  const DenseTensor& output,
+                  const DenseTensor& output_grad,
+                  const std::vector<int>& ksize,
+                  const std::vector<int>& strides,
+                  const std::vector<int>& paddings,
+                  const std::string data_format,
+                  DenseTensor* input_grad) {
     bool channel_last = (data_format == "NHWC");
 
     const int batch_size = input.dims()[0];
@@ -568,7 +596,7 @@ class MaxPool2dGradFunctor<platform::CPUDeviceContext, T> {
     const T* input_data = input.data<T>();
     const T* output_data = output.data<T>();
     const T* output_grad_data = output_grad.data<T>();
-    T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
+    T* input_grad_data = context.template Alloc<T>(input_grad);
 
     if (!channel_last) {
       const int input_stride = input_height * input_width;
@@ -641,29 +669,17 @@ class MaxPool2dGradFunctor<platform::CPUDeviceContext, T> {
     }
   }
 };
-template class MaxPool2dGradFunctor<platform::CPUDeviceContext, float>;
-template class MaxPool2dGradFunctor<platform::CPUDeviceContext, double>;
-
-template class Pool2dFunctor<platform::CPUDeviceContext,
-                             paddle::operators::math::MaxPool<float>, float>;
-template class Pool2dFunctor<platform::CPUDeviceContext,
-                             paddle::operators::math::AvgPool<float>, float>;
-template class Pool2dGradFunctor<platform::CPUDeviceContext,
-                                 paddle::operators::math::MaxPoolGrad<float>,
-                                 float>;
-template class Pool2dGradFunctor<platform::CPUDeviceContext,
-                                 paddle::operators::math::AvgPoolGrad<float>,
-                                 float>;
-template class Pool2dFunctor<platform::CPUDeviceContext,
-                             paddle::operators::math::MaxPool<double>, double>;
-template class Pool2dFunctor<platform::CPUDeviceContext,
-                             paddle::operators::math::AvgPool<double>, double>;
-template class Pool2dGradFunctor<platform::CPUDeviceContext,
-                                 paddle::operators::math::MaxPoolGrad<double>,
-                                 double>;
-template class Pool2dGradFunctor<platform::CPUDeviceContext,
-                                 paddle::operators::math::AvgPoolGrad<double>,
-                                 double>;
+template class MaxPool2dGradFunctor<CPUContext, float>;
+template class MaxPool2dGradFunctor<CPUContext, double>;
+
+template class Pool2dFunctor<CPUContext, MaxPool<float>, float>;
+template class Pool2dFunctor<CPUContext, AvgPool<float>, float>;
+template class Pool2dGradFunctor<CPUContext, MaxPoolGrad<float>, float>;
+template class Pool2dGradFunctor<CPUContext, AvgPoolGrad<float>, float>;
+template class Pool2dFunctor<CPUContext, MaxPool<double>, double>;
+template class Pool2dFunctor<CPUContext, AvgPool<double>, double>;
+template class Pool2dGradFunctor<CPUContext, MaxPoolGrad<double>, double>;
+template class Pool2dGradFunctor<CPUContext, AvgPoolGrad<double>, double>;
 
 /*
 * Tensors are in NCDHW or NDHWC format.
@@ -674,13 +690,16 @@ template class Pool2dGradFunctor<platform::CPUDeviceContext,
 * height_up, height_down, width_left and width_right, respectively.
 */
 template <typename PoolProcess, class T>
-class Pool3dFunctor<platform::CPUDeviceContext, PoolProcess, T> {
+class Pool3dFunctor<CPUContext, PoolProcess, T> {
  public:
-  void operator()(const platform::CPUDeviceContext& context,
-                  const framework::Tensor& input, const std::vector<int>& ksize,
+  void operator()(const CPUContext& context,
+                  const DenseTensor& input,
+                  const std::vector<int>& ksize,
                   const std::vector<int>& strides,
-                  const std::vector<int>& paddings, bool exclusive,
-                  bool adaptive, framework::Tensor* output,
+                  const std::vector<int>& paddings,
+                  bool exclusive,
+                  bool adaptive,
+                  DenseTensor* output,
                   PoolProcess pool_process) {
     const int batch_size = input.dims()[0];
     const int input_depth = input.dims()[2];
@@ -704,7 +723,7 @@ class Pool3dFunctor<platform::CPUDeviceContext, PoolProcess, T> {
     const int output_stride = output_depth * output_height * output_width;
 
     const T* input_data = input.data<T>();
-    T* output_data = output->mutable_data<T>(context.GetPlace());
+    T* output_data = context.template Alloc<T>(output);
 
     int dstart = 0, dend = 1;
     int hstart = 0, hend = 1;
@@ -771,12 +790,16 @@ class Pool3dFunctor<platform::CPUDeviceContext, PoolProcess, T> {
       }
     }
   }
-  void operator()(const platform::CPUDeviceContext& context,
-                  const framework::Tensor& input, const std::vector<int>& ksize,
+  void operator()(const CPUContext& context,
+                  const DenseTensor& input,
+                  const std::vector<int>& ksize,
                   const std::vector<int>& strides,
                   const std::vector<int>& paddings,
-                  const std::string data_format, bool exclusive, bool adaptive,
-                  framework::Tensor* output, PoolProcess pool_process) {
+                  const std::string data_format,
+                  bool exclusive,
+                  bool adaptive,
+                  DenseTensor* output,
+                  PoolProcess pool_process) {
     bool channel_last = (data_format == "NDHWC");
     const int batch_size = input.dims()[0];
 
@@ -807,7 +830,7 @@ class Pool3dFunctor<platform::CPUDeviceContext, PoolProcess, T> {
     const int padding_width = paddings[2];
 
     const T* input_data = input.data<T>();
-    T* output_data = output->mutable_data<T>(context.GetPlace());
+    T* output_data = context.template Alloc<T>(output);
 
     int dstart = 0, dend = 1;
     int hstart = 0, hend = 1;
@@ -966,14 +989,19 @@ class Pool3dFunctor<platform::CPUDeviceContext, PoolProcess, T> {
 * height_up, height_down, width_left and width_right, respectively.
 */
 template <typename PoolProcess, class T>
-class Pool3dGradFunctor<platform::CPUDeviceContext, PoolProcess, T> {
+class Pool3dGradFunctor<CPUContext, PoolProcess, T> {
  public:
-  void operator()(
-      const platform::CPUDeviceContext& context, const framework::Tensor& input,
-      const framework::Tensor& output, const framework::Tensor& output_grad,
-      const std::vector<int>& ksize, const std::vector<int>& strides,
-      const std::vector<int>& paddings, bool exclusive, bool adaptive,
-      framework::Tensor* input_grad, PoolProcess pool_grad_process) {
+  void operator()(const CPUContext& context,
+                  const DenseTensor& input,
+                  const DenseTensor& output,
+                  const DenseTensor& output_grad,
+                  const std::vector<int>& ksize,
+                  const std::vector<int>& strides,
+                  const std::vector<int>& paddings,
+                  bool exclusive,
+                  bool adaptive,
+                  DenseTensor* input_grad,
+                  PoolProcess pool_grad_process) {
     const int batch_size = input.dims()[0];
     const int input_depth = input.dims()[2];
     const int input_height = input.dims()[3];
@@ -997,7 +1025,7 @@ class Pool3dGradFunctor<platform::CPUDeviceContext, PoolProcess, T> {
     const T* input_data = input.data<T>();
     const T* output_data = output.data<T>();
     const T* output_grad_data = output_grad.data<T>();
-    T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
+    T* input_grad_data = context.template Alloc<T>(input_grad);
 
     int dstart = 0, dend = 1;
     int hstart = 0, hend = 1;
@@ -1051,10 +1079,11 @@ class Pool3dGradFunctor<platform::CPUDeviceContext, PoolProcess, T> {
                     int input_idx = (d * input_height + h) * input_width + w;
                     int output_idx =
                         (pd * output_height + ph) * output_width + pw;
-                    pool_grad_process.compute(
-                        input_data[input_idx], output_data[output_idx],
-                        output_grad_data[output_idx], static_cast<T>(scale),
-                        input_grad_data + input_idx);
+                    pool_grad_process.compute(input_data[input_idx],
+                                              output_data[output_idx],
+                                              output_grad_data[output_idx],
+                                              static_cast<T>(scale),
+                                              input_grad_data + input_idx);
                   }
                 }
               }
@@ -1068,13 +1097,18 @@ class Pool3dGradFunctor<platform::CPUDeviceContext, PoolProcess, T> {
       }
     }
   }
-  void operator()(
-      const platform::CPUDeviceContext& context, const framework::Tensor& input,
-      const framework::Tensor& output, const framework::Tensor& output_grad,
-      const std::vector<int>& ksize, const std::vector<int>& strides,
-      const std::vector<int>& paddings, const std::string data_format,
-      bool exclusive, bool adaptive, framework::Tensor* input_grad,
-      PoolProcess pool_grad_process) {
+  void operator()(const CPUContext& context,
+                  const DenseTensor& input,
+                  const DenseTensor& output,
+                  const DenseTensor& output_grad,
+                  const std::vector<int>& ksize,
+                  const std::vector<int>& strides,
+                  const std::vector<int>& paddings,
+                  const std::string data_format,
+                  bool exclusive,
+                  bool adaptive,
+                  DenseTensor* input_grad,
+                  PoolProcess pool_grad_process) {
     bool channel_last = (data_format == "NDHWC");
 
     const int batch_size = input.dims()[0];
@@ -1105,7 +1139,7 @@ class Pool3dGradFunctor<platform::CPUDeviceContext, PoolProcess, T> {
     const T* input_data = input.data<T>();
     const T* output_data = output.data<T>();
     const T* output_grad_data = output_grad.data<T>();
-    T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
+    T* input_grad_data = context.template Alloc<T>(input_grad);
 
     int dstart = 0, dend = 1;
     int hstart = 0, hend = 1;
@@ -1164,10 +1198,11 @@ class Pool3dGradFunctor<platform::CPUDeviceContext, PoolProcess, T> {
                       int input_idx = (d * input_height + h) * input_width + w;
                       int output_idx =
                           (pd * output_height + ph) * output_width + pw;
-                      pool_grad_process.compute(
-                          input_data[input_idx], output_data[output_idx],
-                          output_grad_data[output_idx], static_cast<T>(scale),
-                          input_grad_data + input_idx);
+                      pool_grad_process.compute(input_data[input_idx],
+                                                output_data[output_idx],
+                                                output_grad_data[output_idx],
+                                                static_cast<T>(scale),
+                                                input_grad_data + input_idx);
                     }
                   }
                 }
@@ -1241,10 +1276,11 @@ class Pool3dGradFunctor<platform::CPUDeviceContext, PoolProcess, T> {
                           ((pd * output_height + ph) * output_width + pw) *
                               output_channels +
                           c;
-                      pool_grad_process.compute(
-                          input_data[input_idx], output_data[output_idx],
-                          output_grad_data[output_idx], static_cast<T>(scale),
-                          input_grad_data + input_idx);
+                      pool_grad_process.compute(input_data[input_idx],
+                                                output_data[output_idx],
+                                                output_grad_data[output_idx],
+                                                static_cast<T>(scale),
+                                                input_grad_data + input_idx);
                     }
                   }
                 }
@@ -1270,13 +1306,16 @@ class Pool3dGradFunctor<platform::CPUDeviceContext, PoolProcess, T> {
 * height_up, height_down, width_left and width_right, respectively.
 */
 template <class T>
-class MaxPool3dGradFunctor<platform::CPUDeviceContext, T> {
+class MaxPool3dGradFunctor<CPUContext, T> {
  public:
-  void operator()(
-      const platform::CPUDeviceContext& context, const framework::Tensor& input,
-      const framework::Tensor& output, const framework::Tensor& output_grad,
-      const std::vector<int>& ksize, const std::vector<int>& strides,
-      const std::vector<int>& paddings, framework::Tensor* input_grad) {
+  void operator()(const CPUContext& context,
+                  const DenseTensor& input,
+                  const DenseTensor& output,
+                  const DenseTensor& output_grad,
+                  const std::vector<int>& ksize,
+                  const std::vector<int>& strides,
+                  const std::vector<int>& paddings,
+                  DenseTensor* input_grad) {
     const int batch_size = input.dims()[0];
     const int input_depth = input.dims()[2];
     const int input_height = input.dims()[3];
@@ -1300,7 +1339,7 @@ class MaxPool3dGradFunctor<platform::CPUDeviceContext, T> {
     const T* input_data = input.data<T>();
     const T* output_data = output.data<T>();
     const T* output_grad_data = output_grad.data<T>();
-    T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
+    T* input_grad_data = context.template Alloc<T>(input_grad);
 
     for (int i = 0; i < batch_size; i++) {
       for (int c = 0; c < output_channels; ++c) {
@@ -1342,12 +1381,15 @@ class MaxPool3dGradFunctor<platform::CPUDeviceContext, T> {
       }
     }
   }
-  void operator()(
-      const platform::CPUDeviceContext& context, const framework::Tensor& input,
-      const framework::Tensor& output, const framework::Tensor& output_grad,
-      const std::vector<int>& ksize, const std::vector<int>& strides,
-      const std::vector<int>& paddings, const std::string data_format,
-      framework::Tensor* input_grad) {
+  void operator()(const CPUContext& context,
+                  const DenseTensor& input,
+                  const DenseTensor& output,
+                  const DenseTensor& output_grad,
+                  const std::vector<int>& ksize,
+                  const std::vector<int>& strides,
+                  const std::vector<int>& paddings,
+                  const std::string data_format,
+                  DenseTensor* input_grad) {
     bool channel_last = (data_format == "NDHWC");
     const int batch_size = input.dims()[0];
 
@@ -1378,7 +1420,7 @@ class MaxPool3dGradFunctor<platform::CPUDeviceContext, T> {
     const T* input_data = input.data<T>();
     const T* output_data = output.data<T>();
     const T* output_grad_data = output_grad.data<T>();
-    T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
+    T* input_grad_data = context.template Alloc<T>(input_grad);
 
     if (!channel_last) {
       const int input_stride = input_depth * input_height * input_width;
@@ -1475,29 +1517,17 @@ class MaxPool3dGradFunctor<platform::CPUDeviceContext, T> {
     }
   }
 };
-template class MaxPool3dGradFunctor<platform::CPUDeviceContext, float>;
-template class MaxPool3dGradFunctor<platform::CPUDeviceContext, double>;
-
-template class Pool3dFunctor<platform::CPUDeviceContext,
-                             paddle::operators::math::MaxPool<float>, float>;
-template class Pool3dFunctor<platform::CPUDeviceContext,
-                             paddle::operators::math::AvgPool<float>, float>;
-template class Pool3dGradFunctor<platform::CPUDeviceContext,
-                                 paddle::operators::math::MaxPoolGrad<float>,
-                                 float>;
-template class Pool3dGradFunctor<platform::CPUDeviceContext,
-                                 paddle::operators::math::AvgPoolGrad<float>,
-                                 float>;
-template class Pool3dFunctor<platform::CPUDeviceContext,
-                             paddle::operators::math::MaxPool<double>, double>;
-template class Pool3dFunctor<platform::CPUDeviceContext,
-                             paddle::operators::math::AvgPool<double>, double>;
-template class Pool3dGradFunctor<platform::CPUDeviceContext,
-                                 paddle::operators::math::MaxPoolGrad<double>,
-                                 double>;
-template class Pool3dGradFunctor<platform::CPUDeviceContext,
-                                 paddle::operators::math::AvgPoolGrad<double>,
-                                 double>;
+template class MaxPool3dGradFunctor<CPUContext, float>;
+template class MaxPool3dGradFunctor<CPUContext, double>;
+
+template class Pool3dFunctor<CPUContext, MaxPool<float>, float>;
+template class Pool3dFunctor<CPUContext, AvgPool<float>, float>;
+template class Pool3dGradFunctor<CPUContext, MaxPoolGrad<float>, float>;
+template class Pool3dGradFunctor<CPUContext, AvgPoolGrad<float>, float>;
+template class Pool3dFunctor<CPUContext, MaxPool<double>, double>;
+template class Pool3dFunctor<CPUContext, AvgPool<double>, double>;
+template class Pool3dGradFunctor<CPUContext, MaxPoolGrad<double>, double>;
+template class Pool3dGradFunctor<CPUContext, AvgPoolGrad<double>, double>;
 
 /*
  * All tensors are in NCHW format.
@@ -1505,13 +1535,16 @@ template class Pool3dGradFunctor<platform::CPUDeviceContext,
  * height and width, respectively.
  */
 template <typename T1, typename T2>
-class MaxPool2dWithIndexFunctor<platform::CPUDeviceContext, T1, T2> {
+class MaxPool2dWithIndexFunctor<CPUContext, T1, T2> {
  public:
-  void operator()(const platform::CPUDeviceContext& context,
-                  const framework::Tensor& input, const std::vector<int>& ksize,
+  void operator()(const CPUContext& context,
+                  const DenseTensor& input,
+                  const std::vector<int>& ksize,
                   const std::vector<int>& strides,
-                  const std::vector<int>& paddings, bool adaptive,
-                  framework::Tensor* output, framework::Tensor* mask) {
+                  const std::vector<int>& paddings,
+                  bool adaptive,
+                  DenseTensor* output,
+                  DenseTensor* mask) {
     const int batch_size = input.dims()[0];
     const int input_height = input.dims()[2];
     const int input_width = input.dims()[3];
@@ -1528,8 +1561,8 @@ class MaxPool2dWithIndexFunctor<platform::CPUDeviceContext, T1, T2> {
     const int output_stride = output_height * output_width;
 
     const T1* input_data = input.data<T1>();
-    T1* output_data = output->mutable_data<T1>(context.GetPlace());
-    T2* mask_data = mask->mutable_data<T2>(context.GetPlace());
+    T1* output_data = context.template Alloc<T1>(output);
+    T2* mask_data = context.template Alloc<T2>(mask);
 
     int hstart, hend;
     int wstart, wend;
@@ -1583,14 +1616,16 @@ class MaxPool2dWithIndexFunctor<platform::CPUDeviceContext, T1, T2> {
  * height and width, respectively.
  */
 template <typename T1, typename T2>
-class MaxPool2dWithIndexGradFunctor<platform::CPUDeviceContext, T1, T2> {
+class MaxPool2dWithIndexGradFunctor<CPUContext, T1, T2> {
  public:
-  void operator()(const platform::CPUDeviceContext& context,
-                  const framework::Tensor& output_grad,
-                  const framework::Tensor& mask, const std::vector<int>& ksize,
+  void operator()(const CPUContext& context,
+                  const DenseTensor& output_grad,
+                  const DenseTensor& mask,
+                  const std::vector<int>& ksize,
                   const std::vector<int>& strides,
-                  const std::vector<int>& paddings, bool adaptive,
-                  framework::Tensor* input_grad) {
+                  const std::vector<int>& paddings,
+                  bool adaptive,
+                  DenseTensor* input_grad) {
     const int batch_size = input_grad->dims()[0];
     const int input_height = input_grad->dims()[2];
     const int input_width = input_grad->dims()[3];
@@ -1602,7 +1637,7 @@ class MaxPool2dWithIndexGradFunctor<platform::CPUDeviceContext, T1, T2> {
 
     const T2* mask_data = mask.data<T2>();
     const T1* output_grad_data = output_grad.data<T1>();
-    T1* input_grad_data = input_grad->mutable_data<T1>(context.GetPlace());
+    T1* input_grad_data = context.template Alloc<T1>(input_grad);
 
     for (int n = 0; n < batch_size; ++n) {
       for (int c = 0; c < output_channels; ++c) {
@@ -1622,14 +1657,10 @@ class MaxPool2dWithIndexGradFunctor<platform::CPUDeviceContext, T1, T2> {
   }
 };
 
-template class MaxPool2dWithIndexFunctor<platform::CPUDeviceContext, float,
-                                         int>;
-template class MaxPool2dWithIndexGradFunctor<platform::CPUDeviceContext, float,
-                                             int>;
-template class MaxPool2dWithIndexFunctor<platform::CPUDeviceContext, double,
-                                         int>;
-template class MaxPool2dWithIndexGradFunctor<platform::CPUDeviceContext, double,
-                                             int>;
+template class MaxPool2dWithIndexFunctor<CPUContext, float, int>;
+template class MaxPool2dWithIndexGradFunctor<CPUContext, float, int>;
+template class MaxPool2dWithIndexFunctor<CPUContext, double, int>;
+template class MaxPool2dWithIndexGradFunctor<CPUContext, double, int>;
 
 /*
  * All tensors are in NCDHW format.
@@ -1637,13 +1668,16 @@ template class MaxPool2dWithIndexGradFunctor<platform::CPUDeviceContext, double,
  * depth, height and width, respectively.
  */
 template <typename T1, typename T2>
-class MaxPool3dWithIndexFunctor<platform::CPUDeviceContext, T1, T2> {
+class MaxPool3dWithIndexFunctor<CPUContext, T1, T2> {
  public:
-  void operator()(const platform::CPUDeviceContext& context,
-                  const framework::Tensor& input, const std::vector<int>& ksize,
+  void operator()(const CPUContext& context,
+                  const DenseTensor& input,
+                  const std::vector<int>& ksize,
                   const std::vector<int>& strides,
-                  const std::vector<int>& paddings, bool adaptive,
-                  framework::Tensor* output, framework::Tensor* mask) {
+                  const std::vector<int>& paddings,
+                  bool adaptive,
+                  DenseTensor* output,
+                  DenseTensor* mask) {
     const int batch_size = input.dims()[0];
     const int input_depth = input.dims()[2];
     const int input_height = input.dims()[3];
@@ -1665,8 +1699,8 @@ class MaxPool3dWithIndexFunctor<platform::CPUDeviceContext, T1, T2> {
     const int output_stride = output_depth * output_height * output_width;
 
     const T1* input_data = input.data<T1>();
-    T1* output_data = output->mutable_data<T1>(context.GetPlace());
-    T2* mask_data = mask->mutable_data<T2>(context.GetPlace());
+    T1* output_data = context.template Alloc<T1>(output);
+    T2* mask_data = context.template Alloc<T2>(mask);
 
     int dstart, dend;
     int hstart, hend;
@@ -1735,14 +1769,16 @@ class MaxPool3dWithIndexFunctor<platform::CPUDeviceContext, T1, T2> {
  * depth, height and width, respectively.
  */
 template <typename T1, typename T2>
-class MaxPool3dWithIndexGradFunctor<platform::CPUDeviceContext, T1, T2> {
+class MaxPool3dWithIndexGradFunctor<CPUContext, T1, T2> {
  public:
-  void operator()(const platform::CPUDeviceContext& context,
-                  const framework::Tensor& output_grad,
-                  const framework::Tensor& mask, const std::vector<int>& ksize,
+  void operator()(const CPUContext& context,
+                  const DenseTensor& output_grad,
+                  const DenseTensor& mask,
+                  const std::vector<int>& ksize,
                   const std::vector<int>& strides,
-                  const std::vector<int>& paddings, bool adaptive,
-                  framework::Tensor* input_grad) {
+                  const std::vector<int>& paddings,
+                  bool adaptive,
+                  DenseTensor* input_grad) {
     const int batch_size = input_grad->dims()[0];
     const int input_depth = input_grad->dims()[2];
     const int input_height = input_grad->dims()[3];
@@ -1756,7 +1792,7 @@ class MaxPool3dWithIndexGradFunctor<platform::CPUDeviceContext, T1, T2> {
 
     const T2* mask_data = mask.data<T2>();
     const T1* output_grad_data = output_grad.data<T1>();
-    T1* input_grad_data = input_grad->mutable_data<T1>(context.GetPlace());
+    T1* input_grad_data = context.template Alloc<T1>(input_grad);
 
     for (int n = 0; n < batch_size; ++n) {
       for (int c = 0; c < output_channels; ++c) {
@@ -1779,14 +1815,9 @@ class MaxPool3dWithIndexGradFunctor<platform::CPUDeviceContext, T1, T2> {
   }
 };
 
-template class MaxPool3dWithIndexFunctor<platform::CPUDeviceContext, float,
-                                         int>;
-template class MaxPool3dWithIndexGradFunctor<platform::CPUDeviceContext, float,
-                                             int>;
-template class MaxPool3dWithIndexFunctor<platform::CPUDeviceContext, double,
-                                         int>;
-template class MaxPool3dWithIndexGradFunctor<platform::CPUDeviceContext, double,
-                                             int>;
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
+template class MaxPool3dWithIndexFunctor<CPUContext, float, int>;
+template class MaxPool3dWithIndexGradFunctor<CPUContext, float, int>;
+template class MaxPool3dWithIndexFunctor<CPUContext, double, int>;
+template class MaxPool3dWithIndexGradFunctor<CPUContext, double, int>;
+}  // namespace funcs
+}  // namespace phi
diff --git a/paddle/fluid/operators/math/pooling.cu b/paddle/phi/kernels/funcs/pooling.cu
similarity index 54%
rename from paddle/fluid/operators/math/pooling.cu
rename to paddle/phi/kernels/funcs/pooling.cu
index 9d96345eb1f6dca6fc5eb6cf5847baaf1a9019da..4cf5e1c02c59757ee8bd0ae91c18d0882b702da1 100644
--- a/paddle/fluid/operators/math/pooling.cu
+++ b/paddle/phi/kernels/funcs/pooling.cu
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 paddlepaddle Authors. All Rights Reserved.
+/* Copyright (c) 2022 paddlepaddle Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -12,63 +12,72 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include "paddle/phi/kernels/funcs/pooling.h"
+
 #include <algorithm>
 #include <vector>
-
-#include "paddle/fluid/operators/math/pooling.h"
-#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/fluid/platform/fast_divmod.h"
+#include "paddle/phi/backends/gpu/gpu_launch_config.h"
 
-namespace paddle {
-namespace operators {
-namespace math {
+namespace phi {
+namespace funcs {
 
 struct FastDivModForPooling {
  public:
-  platform::FastDivMod channel;
-  platform::FastDivMod width;
-  platform::FastDivMod height;
+  paddle::platform::FastDivMod channel;
+  paddle::platform::FastDivMod width;
+  paddle::platform::FastDivMod height;
 
   explicit HOSTDEVICE FastDivModForPooling(const int channels,
                                            const int output_width,
                                            const int output_height) {
-    channel = platform::FastDivMod(channels);
-    width = platform::FastDivMod(output_width);
-    height = platform::FastDivMod(output_height);
+    channel = paddle::platform::FastDivMod(channels);
+    width = paddle::platform::FastDivMod(output_width);
+    height = paddle::platform::FastDivMod(output_height);
   }
 };
 
 struct FastDivModForPoolingWithMoreStaff {
  public:
-  platform::FastDivMod channel;
-  platform::FastDivMod width;
-  platform::FastDivMod height;
-  platform::FastDivMod ksize_w;
-  platform::FastDivMod ksize_h;
-  platform::FastDivMod stride_w;
-  platform::FastDivMod stride_h;
+  paddle::platform::FastDivMod channel;
+  paddle::platform::FastDivMod width;
+  paddle::platform::FastDivMod height;
+  paddle::platform::FastDivMod ksize_w;
+  paddle::platform::FastDivMod ksize_h;
+  paddle::platform::FastDivMod stride_w;
+  paddle::platform::FastDivMod stride_h;
 
   explicit HOSTDEVICE FastDivModForPoolingWithMoreStaff(
-      const int channels, const int input_width, const int input_height,
-      const int ksize_width, const int ksize_height, const int stride_width,
+      const int channels,
+      const int input_width,
+      const int input_height,
+      const int ksize_width,
+      const int ksize_height,
+      const int stride_width,
       const int stride_height) {
-    channel = platform::FastDivMod(channels);
-    width = platform::FastDivMod(input_width);
-    height = platform::FastDivMod(input_height);
-    ksize_w = platform::FastDivMod(ksize_width);
-    ksize_h = platform::FastDivMod(ksize_height);
-    stride_w = platform::FastDivMod(stride_width);
-    stride_h = platform::FastDivMod(stride_height);
+    channel = paddle::platform::FastDivMod(channels);
+    width = paddle::platform::FastDivMod(input_width);
+    height = paddle::platform::FastDivMod(input_height);
+    ksize_w = paddle::platform::FastDivMod(ksize_width);
+    ksize_h = paddle::platform::FastDivMod(ksize_height);
+    stride_w = paddle::platform::FastDivMod(stride_width);
+    stride_h = paddle::platform::FastDivMod(stride_height);
   }
 };
 
 template <typename FastDivModForPooling>
-__device__ void OffsetPreparationFor4Dimension(
-    int index, bool channel_last, FastDivModForPooling divmods,
-    const int pad_width, const int pad_height, const int aux_width,
-    const int aux_height, int* w_offset, int* h_offset, int* c_offset,
-    int* stride) {
+__device__ void OffsetPreparationFor4Dimension(int index,
+                                               bool channel_last,
+                                               FastDivModForPooling divmods,
+                                               const int pad_width,
+                                               const int pad_height,
+                                               const int aux_width,
+                                               const int aux_height,
+                                               int* w_offset,
+                                               int* h_offset,
+                                               int* c_offset,
+                                               int* stride) {
   if (!channel_last) { /* NCHW */
     auto input_width_divmod = divmods.width.Divmod(index);
     auto input_height_divmod = divmods.height.Divmod(input_width_divmod.val[0]);
@@ -91,21 +100,40 @@ __device__ void OffsetPreparationFor4Dimension(
 }
 
 template <typename PoolProcess, typename T>
-__global__ void KernelPool2D(
-    const int nthreads, const T* input_data, const int channels,
-    const int input_height, const int input_width, const int output_height,
-    const int output_width, const int ksize_height, const int ksize_width,
-    const int stride_height, const int stride_width, const int padding_height,
-    const int padding_width, FastDivModForPooling divmods,
-    PoolProcess pool_process, bool exclusive, bool adaptive, T* output_data,
-    bool channel_last = false) {
+__global__ void KernelPool2D(const int nthreads,
+                             const T* input_data,
+                             const int channels,
+                             const int input_height,
+                             const int input_width,
+                             const int output_height,
+                             const int output_width,
+                             const int ksize_height,
+                             const int ksize_width,
+                             const int stride_height,
+                             const int stride_width,
+                             const int padding_height,
+                             const int padding_width,
+                             FastDivModForPooling divmods,
+                             PoolProcess pool_process,
+                             bool exclusive,
+                             bool adaptive,
+                             T* output_data,
+                             bool channel_last = false) {
   for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
        index += blockDim.x * gridDim.x) {
     int hstart, hend, wstart, wend;
     int w_offset, h_offset, c_offset, input_offset;
-    OffsetPreparationFor4Dimension<FastDivModForPooling>(
-        index, channel_last, divmods, 0, 0, input_width, input_height,
-        &w_offset, &h_offset, &c_offset, &input_offset);
+    OffsetPreparationFor4Dimension<FastDivModForPooling>(index,
+                                                         channel_last,
+                                                         divmods,
+                                                         0,
+                                                         0,
+                                                         input_width,
+                                                         input_height,
+                                                         &w_offset,
+                                                         &h_offset,
+                                                         &c_offset,
+                                                         &input_offset);
     input_data += input_offset;
 
     if (adaptive) {
@@ -139,25 +167,43 @@ __global__ void KernelPool2D(
 }
 
 template <typename T, typename PoolProcess>
-__global__ void KernelPool2DGrad(
-    const int nthreads, const T* __restrict__ input_data,
-    const T* __restrict__ output_data, const const T* __restrict__ output_grad,
-    const int output_width, const int output_height, const int input_width,
-    const int input_height, const int ksize_width, const int ksize_height,
-    const int stride_width, const int stride_height, const int padding_width,
-    const int padding_height, FastDivModForPoolingWithMoreStaff divmods,
-    PoolProcess pool_process, bool exclusive, bool adaptive,
-    T* __restrict__ input_grad, bool channel_last = false) {
+__global__ void KernelPool2DGrad(const int nthreads,
+                                 const T* __restrict__ input_data,
+                                 const T* __restrict__ output_data,
+                                 const const T* __restrict__ output_grad,
+                                 const int output_width,
+                                 const int output_height,
+                                 const int input_width,
+                                 const int input_height,
+                                 const int ksize_width,
+                                 const int ksize_height,
+                                 const int stride_width,
+                                 const int stride_height,
+                                 const int padding_width,
+                                 const int padding_height,
+                                 FastDivModForPoolingWithMoreStaff divmods,
+                                 PoolProcess pool_process,
+                                 bool exclusive,
+                                 bool adaptive,
+                                 T* __restrict__ input_grad,
+                                 bool channel_last = false) {
   for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
        index += blockDim.x * gridDim.x) {
     T input = static_cast<T>(0);
     T input_grad_data = static_cast<T>(0);
     int phstart, phend, pwstart, pwend;
     int w_offset, h_offset, c_offset, output_offset;
-    OffsetPreparationFor4Dimension<>(index, channel_last, divmods,
-                                     padding_width, padding_height,
-                                     output_width, output_height, &w_offset,
-                                     &h_offset, &c_offset, &output_offset);
+    OffsetPreparationFor4Dimension<>(index,
+                                     channel_last,
+                                     divmods,
+                                     padding_width,
+                                     padding_height,
+                                     output_width,
+                                     output_height,
+                                     &w_offset,
+                                     &h_offset,
+                                     &c_offset,
+                                     &output_offset);
     if (pool_process.use_x) {
       input = input_data[index];
       output_data += output_offset;
@@ -188,7 +234,9 @@ __global__ void KernelPool2DGrad(
                            : tmp_idx;
           T ouput_value = pool_process.use_x ? output_data[output_sub_idx]
                                              : static_cast<T>(0);
-          pool_process.compute(input, ouput_value, output_grad[output_sub_idx],
+          pool_process.compute(input,
+                               ouput_value,
+                               output_grad[output_sub_idx],
                                static_cast<T>(1.0 / pool_size),
                                &input_grad_data);
         }
@@ -217,9 +265,11 @@ __global__ void KernelPool2DGrad(
                              : tmp_idx;
             T ouput_value = pool_process.use_x ? output_data[output_sub_idx]
                                                : static_cast<T>(0);
-            pool_process.compute(
-                input, ouput_value, output_grad[output_sub_idx],
-                static_cast<T>(1.0 / pool_size), &input_grad_data);
+            pool_process.compute(input,
+                                 ouput_value,
+                                 output_grad[output_sub_idx],
+                                 static_cast<T>(1.0 / pool_size),
+                                 &input_grad_data);
           }
         }
       } else {
@@ -232,9 +282,11 @@ __global__ void KernelPool2DGrad(
                              : tmp_idx;
             T ouput_value = pool_process.use_x ? output_data[output_sub_idx]
                                                : static_cast<T>(0);
-            pool_process.compute(
-                input, ouput_value, output_grad[output_sub_idx],
-                static_cast<T>(1.0 / pool_size), &input_grad_data);
+            pool_process.compute(input,
+                                 ouput_value,
+                                 output_grad[output_sub_idx],
+                                 static_cast<T>(1.0 / pool_size),
+                                 &input_grad_data);
           }
         }
       }
@@ -244,19 +296,38 @@ __global__ void KernelPool2DGrad(
 }
 
 template <typename T>
-__global__ void KernelMaxPool2DGrad(
-    const int nthreads, const T* input_data, const T* output_data,
-    const T* output_grad, const int channels, const int input_height,
-    const int input_width, const int output_height, const int output_width,
-    const int ksize_height, const int ksize_width, const int stride_height,
-    const int stride_width, const int padding_height, const int padding_width,
-    T* input_grad, FastDivModForPooling divmods, bool channel_last = false) {
+__global__ void KernelMaxPool2DGrad(const int nthreads,
+                                    const T* input_data,
+                                    const T* output_data,
+                                    const T* output_grad,
+                                    const int channels,
+                                    const int input_height,
+                                    const int input_width,
+                                    const int output_height,
+                                    const int output_width,
+                                    const int ksize_height,
+                                    const int ksize_width,
+                                    const int stride_height,
+                                    const int stride_width,
+                                    const int padding_height,
+                                    const int padding_width,
+                                    T* input_grad,
+                                    FastDivModForPooling divmods,
+                                    bool channel_last = false) {
   for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
        index += blockDim.x * gridDim.x) {
     int w_offset, h_offset, c_offset, input_offset;
-    OffsetPreparationFor4Dimension<FastDivModForPooling>(
-        index, channel_last, divmods, 0, 0, input_width, input_height,
-        &w_offset, &h_offset, &c_offset, &input_offset);
+    OffsetPreparationFor4Dimension<FastDivModForPooling>(index,
+                                                         channel_last,
+                                                         divmods,
+                                                         0,
+                                                         0,
+                                                         input_width,
+                                                         input_height,
+                                                         &w_offset,
+                                                         &h_offset,
+                                                         &c_offset,
+                                                         &input_offset);
     input_data += input_offset;
     input_grad += input_offset;
 
@@ -285,17 +356,24 @@ __global__ void KernelMaxPool2DGrad(
 
     if (maxIndex != -1) {
       // atomic add
-      platform::CudaAtomicAdd(input_grad + maxIndex, output_grad[index]);
+      paddle::platform::CudaAtomicAdd(input_grad + maxIndex,
+                                      output_grad[index]);
     }
   }
 }
 
 template <typename PoolProcess, typename T>
 void Pool2dDirectCUDAFunctor<PoolProcess, T>::operator()(
-    const T* input, const std::vector<int>& input_shape,
-    const std::vector<int>& output_shape, const std::vector<int>& ksize,
-    const std::vector<int>& strides, const std::vector<int>& paddings,
-    bool exclusive, bool adaptive, T* output, gpuStream_t stream,
+    const T* input,
+    const std::vector<int>& input_shape,
+    const std::vector<int>& output_shape,
+    const std::vector<int>& ksize,
+    const std::vector<int>& strides,
+    const std::vector<int>& paddings,
+    bool exclusive,
+    bool adaptive,
+    T* output,
+    gpuStream_t stream,
     PoolProcess pool_compute) {
   const int batch_size = input_shape[0];
   const int input_channels = input_shape[1];
@@ -314,7 +392,7 @@ void Pool2dDirectCUDAFunctor<PoolProcess, T>::operator()(
   int nthreads = batch_size * output_channels * output_height * output_width;
   int thread_num = 1024;
 #ifdef WITH_NV_JETSON
-  // platform::ChangeThreadNum(context, &thread_num);
+  // paddle::platform::ChangeThreadNum(context, &thread_num);
   thread_num = 512;
 #endif
   int blocks = (nthreads + thread_num - 1) / thread_num;
@@ -323,11 +401,24 @@ void Pool2dDirectCUDAFunctor<PoolProcess, T>::operator()(
 
   auto pool_divmods =
       FastDivModForPooling(input_channels, output_width, output_height);
-  KernelPool2D<PoolProcess, T><<<grid, threads, 0, stream>>>(
-      nthreads, input, input_channels, input_height, input_width, output_height,
-      output_width, ksize_height, ksize_width, stride_height, stride_width,
-      padding_height, padding_width, pool_divmods, pool_compute, exclusive,
-      adaptive, output);
+  KernelPool2D<PoolProcess, T><<<grid, threads, 0, stream>>>(nthreads,
+                                                             input,
+                                                             input_channels,
+                                                             input_height,
+                                                             input_width,
+                                                             output_height,
+                                                             output_width,
+                                                             ksize_height,
+                                                             ksize_width,
+                                                             stride_height,
+                                                             stride_width,
+                                                             padding_height,
+                                                             padding_width,
+                                                             pool_divmods,
+                                                             pool_compute,
+                                                             exclusive,
+                                                             adaptive,
+                                                             output);
 }
 
 /*
@@ -338,13 +429,16 @@ void Pool2dDirectCUDAFunctor<PoolProcess, T>::operator()(
  * height_down, width_left and width_right, respectively.
  */
 template <typename PoolProcess, typename T>
-class Pool2dFunctor<platform::CUDADeviceContext, PoolProcess, T> {
+class Pool2dFunctor<phi::GPUContext, PoolProcess, T> {
  public:
-  void operator()(const platform::CUDADeviceContext& context,
-                  const framework::Tensor& input, const std::vector<int>& ksize,
+  void operator()(const phi::GPUContext& context,
+                  const DenseTensor& input,
+                  const std::vector<int>& ksize,
                   const std::vector<int>& strides,
-                  const std::vector<int>& paddings, bool exclusive,
-                  bool adaptive, framework::Tensor* output,
+                  const std::vector<int>& paddings,
+                  bool exclusive,
+                  bool adaptive,
+                  DenseTensor* output,
                   PoolProcess pool_process) {
     const int batch_size = input.dims()[0];
     const int input_channels = input.dims()[1];
@@ -361,12 +455,12 @@ class Pool2dFunctor<platform::CUDADeviceContext, PoolProcess, T> {
     const int padding_width = paddings[1];
 
     const T* input_data = input.data<T>();
-    T* output_data = output->mutable_data<T>(context.GetPlace());
+    T* output_data = context.template Alloc<T>(output);
 
     int nthreads = batch_size * output_channels * output_height * output_width;
     int thread_num = 1024;
 #ifdef WITH_NV_JETSON
-    platform::ChangeThreadNum(context, &thread_num);
+    paddle::platform::ChangeThreadNum(context, &thread_num);
 #endif
     int blocks = (nthreads + thread_num - 1) / thread_num;
     dim3 threads(thread_num, 1);
@@ -375,17 +469,35 @@ class Pool2dFunctor<platform::CUDADeviceContext, PoolProcess, T> {
     auto pool_divmods =
         FastDivModForPooling(input_channels, output_width, output_height);
     KernelPool2D<PoolProcess, T><<<grid, threads, 0, context.stream()>>>(
-        nthreads, input_data, input_channels, input_height, input_width,
-        output_height, output_width, ksize_height, ksize_width, stride_height,
-        stride_width, padding_height, padding_width, pool_divmods, pool_process,
-        exclusive, adaptive, output_data);
+        nthreads,
+        input_data,
+        input_channels,
+        input_height,
+        input_width,
+        output_height,
+        output_width,
+        ksize_height,
+        ksize_width,
+        stride_height,
+        stride_width,
+        padding_height,
+        padding_width,
+        pool_divmods,
+        pool_process,
+        exclusive,
+        adaptive,
+        output_data);
   }
-  void operator()(const platform::CUDADeviceContext& context,
-                  const framework::Tensor& input, const std::vector<int>& ksize,
+  void operator()(const phi::GPUContext& context,
+                  const DenseTensor& input,
+                  const std::vector<int>& ksize,
                   const std::vector<int>& strides,
                   const std::vector<int>& paddings,
-                  const std::string data_format, bool exclusive, bool adaptive,
-                  framework::Tensor* output, PoolProcess pool_process) {
+                  const std::string data_format,
+                  bool exclusive,
+                  bool adaptive,
+                  DenseTensor* output,
+                  PoolProcess pool_process) {
     bool channel_last = (data_format == "NHWC");
     const int batch_size = input.dims()[0];
 
@@ -410,12 +522,12 @@ class Pool2dFunctor<platform::CUDADeviceContext, PoolProcess, T> {
     const int padding_width = paddings[1];
 
     const T* input_data = input.data<T>();
-    T* output_data = output->mutable_data<T>(context.GetPlace());
+    T* output_data = context.template Alloc<T>(output);
 
     int nthreads = batch_size * output_channels * output_height * output_width;
     int thread_num = 1024;
 #ifdef WITH_NV_JETSON
-    platform::ChangeThreadNum(context, &thread_num);
+    paddle::platform::ChangeThreadNum(context, &thread_num);
 #endif
     int blocks = (nthreads + thread_num - 1) / thread_num;
     dim3 threads(thread_num, 1);
@@ -424,10 +536,25 @@ class Pool2dFunctor<platform::CUDADeviceContext, PoolProcess, T> {
     auto pool_divmods =
         FastDivModForPooling(input_channels, output_width, output_height);
     KernelPool2D<PoolProcess, T><<<grid, threads, 0, context.stream()>>>(
-        nthreads, input_data, input_channels, input_height, input_width,
-        output_height, output_width, ksize_height, ksize_width, stride_height,
-        stride_width, padding_height, padding_width, pool_divmods, pool_process,
-        exclusive, adaptive, output_data, channel_last);
+        nthreads,
+        input_data,
+        input_channels,
+        input_height,
+        input_width,
+        output_height,
+        output_width,
+        ksize_height,
+        ksize_width,
+        stride_height,
+        stride_width,
+        padding_height,
+        padding_width,
+        pool_divmods,
+        pool_process,
+        exclusive,
+        adaptive,
+        output_data,
+        channel_last);
   }
 };
 /*
@@ -438,16 +565,18 @@ class Pool2dFunctor<platform::CUDADeviceContext, PoolProcess, T> {
  * height_down, width_left and width_right, respectively.
  */
 template <typename PoolProcess, typename T>
-class Pool2dGradFunctor<platform::CUDADeviceContext, PoolProcess, T> {
+class Pool2dGradFunctor<phi::GPUContext, PoolProcess, T> {
  public:
-  void operator()(const platform::CUDADeviceContext& context,
-                  const framework::Tensor& input,
-                  const framework::Tensor& output,
-                  const framework::Tensor& output_grad,
+  void operator()(const phi::GPUContext& context,
+                  const DenseTensor& input,
+                  const DenseTensor& output,
+                  const DenseTensor& output_grad,
                   const std::vector<int>& ksize,
                   const std::vector<int>& strides,
-                  const std::vector<int>& paddings, bool exclusive,
-                  bool adaptive, framework::Tensor* input_grad,
+                  const std::vector<int>& paddings,
+                  bool exclusive,
+                  bool adaptive,
+                  DenseTensor* input_grad,
                   PoolProcess pool_process) {
     const int batch_size = input.dims()[0];
     const int input_channels = input.dims()[1];
@@ -465,30 +594,53 @@ class Pool2dGradFunctor<platform::CUDADeviceContext, PoolProcess, T> {
     const T* input_data = input.data<T>();
     const T* output_data = output.data<T>();
     const T* output_grad_data = output_grad.data<T>();
-    T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
+    T* input_grad_data = context.template Alloc<T>(input_grad);
 
     int nthreads = batch_size * input_channels * input_height * input_width;
-    auto pool_divmods = FastDivModForPoolingWithMoreStaff(
-        input_channels, input_width, input_height, ksize_width, ksize_height,
-        stride_width, stride_height);
-
-    auto config = GetGpuLaunchConfig1D(context, nthreads);
-    KernelPool2DGrad<T, PoolProcess><<<
-        config.block_per_grid, config.thread_per_block, 0, context.stream()>>>(
-        nthreads, input_data, output_data, output_grad_data, output_width,
-        output_height, input_width, input_height, ksize_width, ksize_height,
-        stride_width, stride_height, padding_width, padding_height,
-        pool_divmods, pool_process, exclusive, adaptive, input_grad_data);
+    auto pool_divmods = FastDivModForPoolingWithMoreStaff(input_channels,
+                                                          input_width,
+                                                          input_height,
+                                                          ksize_width,
+                                                          ksize_height,
+                                                          stride_width,
+                                                          stride_height);
+
+    auto config = phi::backends::gpu::GetGpuLaunchConfig1D(context, nthreads);
+    KernelPool2DGrad<T, PoolProcess><<<config.block_per_grid,
+                                       config.thread_per_block,
+                                       0,
+                                       context.stream()>>>(nthreads,
+                                                           input_data,
+                                                           output_data,
+                                                           output_grad_data,
+                                                           output_width,
+                                                           output_height,
+                                                           input_width,
+                                                           input_height,
+                                                           ksize_width,
+                                                           ksize_height,
+                                                           stride_width,
+                                                           stride_height,
+                                                           padding_width,
+                                                           padding_height,
+                                                           pool_divmods,
+                                                           pool_process,
+                                                           exclusive,
+                                                           adaptive,
+                                                           input_grad_data);
   }
-  void operator()(const platform::CUDADeviceContext& context,
-                  const framework::Tensor& input,
-                  const framework::Tensor& output,
-                  const framework::Tensor& output_grad,
+  void operator()(const phi::GPUContext& context,
+                  const DenseTensor& input,
+                  const DenseTensor& output,
+                  const DenseTensor& output_grad,
                   const std::vector<int>& ksize,
                   const std::vector<int>& strides,
                   const std::vector<int>& paddings,
-                  const std::string data_format, bool exclusive, bool adaptive,
-                  framework::Tensor* input_grad, PoolProcess pool_process) {
+                  const std::string data_format,
+                  bool exclusive,
+                  bool adaptive,
+                  DenseTensor* input_grad,
+                  PoolProcess pool_process) {
     bool channel_last = (data_format == "NHWC");
 
     const int batch_size = input.dims()[0];
@@ -514,21 +666,41 @@ class Pool2dGradFunctor<platform::CUDADeviceContext, PoolProcess, T> {
     const T* input_data = input.data<T>();
     const T* output_data = output.data<T>();
     const T* output_grad_data = output_grad.data<T>();
-    T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
+    T* input_grad_data = context.template Alloc<T>(input_grad);
 
     int nthreads = batch_size * input_channels * input_height * input_width;
-    auto pool_divmods = FastDivModForPoolingWithMoreStaff(
-        input_channels, input_width, input_height, ksize_width, ksize_height,
-        stride_width, stride_height);
-
-    auto config = GetGpuLaunchConfig1D(context, nthreads);
-    KernelPool2DGrad<T, PoolProcess><<<
-        config.block_per_grid, config.thread_per_block, 0, context.stream()>>>(
-        nthreads, input_data, output_data, output_grad_data, output_width,
-        output_height, input_width, input_height, ksize_width, ksize_height,
-        stride_width, stride_height, padding_width, padding_height,
-        pool_divmods, pool_process, exclusive, adaptive, input_grad_data,
-        channel_last);
+    auto pool_divmods = FastDivModForPoolingWithMoreStaff(input_channels,
+                                                          input_width,
+                                                          input_height,
+                                                          ksize_width,
+                                                          ksize_height,
+                                                          stride_width,
+                                                          stride_height);
+
+    auto config = phi::backends::gpu::GetGpuLaunchConfig1D(context, nthreads);
+    KernelPool2DGrad<T, PoolProcess><<<config.block_per_grid,
+                                       config.thread_per_block,
+                                       0,
+                                       context.stream()>>>(nthreads,
+                                                           input_data,
+                                                           output_data,
+                                                           output_grad_data,
+                                                           output_width,
+                                                           output_height,
+                                                           input_width,
+                                                           input_height,
+                                                           ksize_width,
+                                                           ksize_height,
+                                                           stride_width,
+                                                           stride_height,
+                                                           padding_width,
+                                                           padding_height,
+                                                           pool_divmods,
+                                                           pool_process,
+                                                           exclusive,
+                                                           adaptive,
+                                                           input_grad_data,
+                                                           channel_last);
   }
 };
 
@@ -540,16 +712,16 @@ class Pool2dGradFunctor<platform::CUDADeviceContext, PoolProcess, T> {
  * height_down, width_left and width_right, respectively.
  */
 template <typename T>
-class MaxPool2dGradFunctor<platform::CUDADeviceContext, T> {
+class MaxPool2dGradFunctor<phi::GPUContext, T> {
  public:
-  void operator()(const platform::CUDADeviceContext& context,
-                  const framework::Tensor& input,
-                  const framework::Tensor& output,
-                  const framework::Tensor& output_grad,
+  void operator()(const phi::GPUContext& context,
+                  const DenseTensor& input,
+                  const DenseTensor& output,
+                  const DenseTensor& output_grad,
                   const std::vector<int>& ksize,
                   const std::vector<int>& strides,
                   const std::vector<int>& paddings,
-                  framework::Tensor* input_grad) {
+                  DenseTensor* input_grad) {
     const int batch_size = input.dims()[0];
     const int input_channels = input.dims()[1];
     const int input_height = input.dims()[2];
@@ -567,7 +739,7 @@ class MaxPool2dGradFunctor<platform::CUDADeviceContext, T> {
     const T* input_data = input.data<T>();
     const T* output_data = output.data<T>();
     const T* output_grad_data = output_grad.data<T>();
-    T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
+    T* input_grad_data = context.template Alloc<T>(input_grad);
 
     int nthreads = batch_size * output_channels * output_height * output_width;
     int blocks = (nthreads + 1024 - 1) / 1024;
@@ -577,17 +749,33 @@ class MaxPool2dGradFunctor<platform::CUDADeviceContext, T> {
     auto pool_divmods =
         FastDivModForPooling(input_channels, output_width, output_height);
     KernelMaxPool2DGrad<T><<<grid, threads, 0, context.stream()>>>(
-        nthreads, input_data, output_data, output_grad_data, input_channels,
-        input_height, input_width, output_height, output_width, ksize_height,
-        ksize_width, stride_height, stride_width, padding_height, padding_width,
-        input_grad_data, pool_divmods);
+        nthreads,
+        input_data,
+        output_data,
+        output_grad_data,
+        input_channels,
+        input_height,
+        input_width,
+        output_height,
+        output_width,
+        ksize_height,
+        ksize_width,
+        stride_height,
+        stride_width,
+        padding_height,
+        padding_width,
+        input_grad_data,
+        pool_divmods);
   }
-  void operator()(
-      const platform::CUDADeviceContext& context,
-      const framework::Tensor& input, const framework::Tensor& output,
-      const framework::Tensor& output_grad, const std::vector<int>& ksize,
-      const std::vector<int>& strides, const std::vector<int>& paddings,
-      const std::string data_format, framework::Tensor* input_grad) {
+  void operator()(const phi::GPUContext& context,
+                  const DenseTensor& input,
+                  const DenseTensor& output,
+                  const DenseTensor& output_grad,
+                  const std::vector<int>& ksize,
+                  const std::vector<int>& strides,
+                  const std::vector<int>& paddings,
+                  const std::string data_format,
+                  DenseTensor* input_grad) {
     bool channel_last = (data_format == "NHWC");
 
     const int batch_size = input.dims()[0];
@@ -614,7 +802,7 @@ class MaxPool2dGradFunctor<platform::CUDADeviceContext, T> {
     const T* input_data = input.data<T>();
     const T* output_data = output.data<T>();
     const T* output_grad_data = output_grad.data<T>();
-    T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
+    T* input_grad_data = context.template Alloc<T>(input_grad);
 
     int nthreads = batch_size * output_channels * output_height * output_width;
     int blocks = (nthreads + 1024 - 1) / 1024;
@@ -625,71 +813,80 @@ class MaxPool2dGradFunctor<platform::CUDADeviceContext, T> {
         FastDivModForPooling(input_channels, output_width, output_height);
 
     KernelMaxPool2DGrad<T><<<grid, threads, 0, context.stream()>>>(
-        nthreads, input_data, output_data, output_grad_data, input_channels,
-        input_height, input_width, output_height, output_width, ksize_height,
-        ksize_width, stride_height, stride_width, padding_height, padding_width,
-        input_grad_data, pool_divmods, channel_last);
+        nthreads,
+        input_data,
+        output_data,
+        output_grad_data,
+        input_channels,
+        input_height,
+        input_width,
+        output_height,
+        output_width,
+        ksize_height,
+        ksize_width,
+        stride_height,
+        stride_width,
+        padding_height,
+        padding_width,
+        input_grad_data,
+        pool_divmods,
+        channel_last);
   }
 };
 
-template class Pool2dDirectCUDAFunctor<paddle::operators::math::MaxPool<float>,
-                                       float>;
-template class Pool2dDirectCUDAFunctor<paddle::operators::math::AvgPool<float>,
-                                       float>;
-
-template class MaxPool2dGradFunctor<platform::CUDADeviceContext, float>;
-template class MaxPool2dGradFunctor<platform::CUDADeviceContext, double>;
-template class MaxPool2dGradFunctor<platform::CUDADeviceContext,
-                                    paddle::platform::float16>;
-
-template class Pool2dFunctor<platform::CUDADeviceContext,
-                             paddle::operators::math::MaxPool<float>, float>;
-template class Pool2dFunctor<platform::CUDADeviceContext,
-                             paddle::operators::math::AvgPool<float>, float>;
-template class Pool2dGradFunctor<platform::CUDADeviceContext,
-                                 paddle::operators::math::MaxPoolGrad<float>,
-                                 float>;
-template class Pool2dGradFunctor<platform::CUDADeviceContext,
-                                 paddle::operators::math::AvgPoolGrad<float>,
-                                 float>;
-template class Pool2dFunctor<platform::CUDADeviceContext,
-                             paddle::operators::math::MaxPool<double>, double>;
-template class Pool2dFunctor<platform::CUDADeviceContext,
-                             paddle::operators::math::AvgPool<double>, double>;
-template class Pool2dGradFunctor<platform::CUDADeviceContext,
-                                 paddle::operators::math::MaxPoolGrad<double>,
-                                 double>;
-template class Pool2dGradFunctor<platform::CUDADeviceContext,
-                                 paddle::operators::math::AvgPoolGrad<double>,
-                                 double>;
-
-template class Pool2dFunctor<
-    platform::CUDADeviceContext,
-    paddle::operators::math::MaxPool<paddle::platform::float16>,
-    paddle::platform::float16>;
-template class Pool2dFunctor<
-    platform::CUDADeviceContext,
-    paddle::operators::math::AvgPool<paddle::platform::float16>,
-    paddle::platform::float16>;
-template class Pool2dGradFunctor<
-    platform::CUDADeviceContext,
-    paddle::operators::math::MaxPoolGrad<paddle::platform::float16>,
-    paddle::platform::float16>;
-template class Pool2dGradFunctor<
-    platform::CUDADeviceContext,
-    paddle::operators::math::AvgPoolGrad<paddle::platform::float16>,
-    paddle::platform::float16>;
+template class Pool2dDirectCUDAFunctor<MaxPool<float>, float>;
+template class Pool2dDirectCUDAFunctor<AvgPool<float>, float>;
+
+template class MaxPool2dGradFunctor<phi::GPUContext, float>;
+template class MaxPool2dGradFunctor<phi::GPUContext, double>;
+template class MaxPool2dGradFunctor<phi::GPUContext, dtype::float16>;
+
+template class Pool2dFunctor<phi::GPUContext, MaxPool<float>, float>;
+template class Pool2dFunctor<phi::GPUContext, AvgPool<float>, float>;
+template class Pool2dGradFunctor<phi::GPUContext, MaxPoolGrad<float>, float>;
+template class Pool2dGradFunctor<phi::GPUContext, AvgPoolGrad<float>, float>;
+template class Pool2dFunctor<phi::GPUContext, MaxPool<double>, double>;
+template class Pool2dFunctor<phi::GPUContext, AvgPool<double>, double>;
+template class Pool2dGradFunctor<phi::GPUContext, MaxPoolGrad<double>, double>;
+template class Pool2dGradFunctor<phi::GPUContext, AvgPoolGrad<double>, double>;
+
+template class Pool2dFunctor<phi::GPUContext,
+                             MaxPool<dtype::float16>,
+                             dtype::float16>;
+template class Pool2dFunctor<phi::GPUContext,
+                             AvgPool<dtype::float16>,
+                             dtype::float16>;
+template class Pool2dGradFunctor<phi::GPUContext,
+                                 MaxPoolGrad<dtype::float16>,
+                                 dtype::float16>;
+template class Pool2dGradFunctor<phi::GPUContext,
+                                 AvgPoolGrad<dtype::float16>,
+                                 dtype::float16>;
 
 template <typename PoolProcess, typename T>
-__global__ void KernelPool3D(
-    const int nthreads, const T* input_data, const int channels,
-    const int input_depth, const int input_height, const int input_width,
-    const int output_depth, const int output_height, const int output_width,
-    const int ksize_depth, const int ksize_height, const int ksize_width,
-    const int stride_depth, const int stride_height, const int stride_width,
-    const int padding_depth, const int padding_height, const int padding_width,
-    PoolProcess pool_process, bool exclusive, bool adaptive, T* output_data,
-    bool channel_last = false) {
+__global__ void KernelPool3D(const int nthreads,
+                             const T* input_data,
+                             const int channels,
+                             const int input_depth,
+                             const int input_height,
+                             const int input_width,
+                             const int output_depth,
+                             const int output_height,
+                             const int output_width,
+                             const int ksize_depth,
+                             const int ksize_height,
+                             const int ksize_width,
+                             const int stride_depth,
+                             const int stride_height,
+                             const int stride_width,
+                             const int padding_depth,
+                             const int padding_height,
+                             const int padding_width,
+                             PoolProcess pool_process,
+                             bool exclusive,
+                             bool adaptive,
+                             T* output_data,
+                             bool channel_last = false) {
   for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
        index += blockDim.x * gridDim.x) {
     int pw, ph, pd, c, batch_idx;
@@ -764,16 +961,31 @@ __global__ void KernelPool3D(
 }
 
 template <typename T, typename PoolProcess>
-__global__ void KernelPool3DGrad(
-    const int nthreads, const T* __restrict__ input_data,
-    const T* __restrict__ output_data, const T* __restrict__ output_grad,
-    const int channels, const int input_depth, const int input_height,
-    const int input_width, const int output_depth, const int output_height,
-    const int output_width, const int ksize_depth, const int ksize_height,
-    const int ksize_width, const int stride_depth, const int stride_height,
-    const int stride_width, const int padding_depth, const int padding_height,
-    const int padding_width, PoolProcess pool_process, bool exclusive,
-    bool adaptive, T* input_grad, bool channel_last = false) {
+__global__ void KernelPool3DGrad(const int nthreads,
+                                 const T* __restrict__ input_data,
+                                 const T* __restrict__ output_data,
+                                 const T* __restrict__ output_grad,
+                                 const int channels,
+                                 const int input_depth,
+                                 const int input_height,
+                                 const int input_width,
+                                 const int output_depth,
+                                 const int output_height,
+                                 const int output_width,
+                                 const int ksize_depth,
+                                 const int ksize_height,
+                                 const int ksize_width,
+                                 const int stride_depth,
+                                 const int stride_height,
+                                 const int stride_width,
+                                 const int padding_depth,
+                                 const int padding_height,
+                                 const int padding_width,
+                                 PoolProcess pool_process,
+                                 bool exclusive,
+                                 bool adaptive,
+                                 T* input_grad,
+                                 bool channel_last = false) {
   for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
        index += blockDim.x * gridDim.x) {
     int w_offset, h_offset, d_offset, c_offset, batch_idx, output_stride;
@@ -867,7 +1079,9 @@ __global__ void KernelPool3DGrad(
                   : (pd * output_height + ph) * output_width + pw;
           T ouput_value = pool_process.use_x ? output_data[output_sub_idx]
                                              : static_cast<T>(0);
-          pool_process.compute(input, ouput_value, output_grad[output_sub_idx],
+          pool_process.compute(input,
+                               ouput_value,
+                               output_grad[output_sub_idx],
                                static_cast<T>(1.0 / pool_size),
                                &input_grad_data);
         }
@@ -878,15 +1092,28 @@ __global__ void KernelPool3DGrad(
 }
 
 template <typename T>
-__global__ void KernelMaxPool3DGrad(
-    const int nthreads, const T* input_data, const T* output_data,
-    const T* output_grad, const int channels, const int input_depth,
-    const int input_height, const int input_width, const int output_depth,
-    const int output_height, const int output_width, const int ksize_depth,
-    const int ksize_height, const int ksize_width, const int stride_depth,
-    const int stride_height, const int stride_width, const int padding_depth,
-    const int padding_height, const int padding_width, T* input_grad,
-    bool channel_last = false) {
+__global__ void KernelMaxPool3DGrad(const int nthreads,
+                                    const T* input_data,
+                                    const T* output_data,
+                                    const T* output_grad,
+                                    const int channels,
+                                    const int input_depth,
+                                    const int input_height,
+                                    const int input_width,
+                                    const int output_depth,
+                                    const int output_height,
+                                    const int output_width,
+                                    const int ksize_depth,
+                                    const int ksize_height,
+                                    const int ksize_width,
+                                    const int stride_depth,
+                                    const int stride_height,
+                                    const int stride_width,
+                                    const int padding_depth,
+                                    const int padding_height,
+                                    const int padding_width,
+                                    T* input_grad,
+                                    bool channel_last = false) {
   for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
        index += blockDim.x * gridDim.x) {
     int pw, ph, pd, c, batch_idx;
@@ -949,17 +1176,23 @@ __global__ void KernelMaxPool3DGrad(
     }
     if (maxIdx != -1) {
       // atomic add
-      platform::CudaAtomicAdd(input_grad + maxIdx, output_grad[index]);
+      paddle::platform::CudaAtomicAdd(input_grad + maxIdx, output_grad[index]);
     }
   }
 }
 
 template <typename PoolProcess, typename T>
 void Pool3dDirectCUDAFunctor<PoolProcess, T>::operator()(
-    const T* input, const std::vector<int>& input_shape,
-    const std::vector<int>& output_shape, const std::vector<int>& ksize,
-    const std::vector<int>& strides, const std::vector<int>& paddings,
-    bool exclusive, bool adaptive, T* output, gpuStream_t stream,
+    const T* input,
+    const std::vector<int>& input_shape,
+    const std::vector<int>& output_shape,
+    const std::vector<int>& ksize,
+    const std::vector<int>& strides,
+    const std::vector<int>& paddings,
+    bool exclusive,
+    bool adaptive,
+    T* output,
+    gpuStream_t stream,
     PoolProcess pool_compute) {
   const int batch_size = input_shape[0];
   const int input_channels = input_shape[1];
@@ -990,11 +1223,28 @@ void Pool3dDirectCUDAFunctor<PoolProcess, T>::operator()(
   dim3 threads(thread_num, 1);
   dim3 grid(blocks, 1);
 
-  KernelPool3D<PoolProcess, T><<<grid, threads, 0, stream>>>(
-      nthreads, input, input_channels, input_depth, input_height, input_width,
-      output_depth, output_height, output_width, ksize_depth, ksize_height,
-      ksize_width, stride_depth, stride_height, stride_width, padding_depth,
-      padding_height, padding_width, pool_compute, exclusive, adaptive, output);
+  KernelPool3D<PoolProcess, T><<<grid, threads, 0, stream>>>(nthreads,
+                                                             input,
+                                                             input_channels,
+                                                             input_depth,
+                                                             input_height,
+                                                             input_width,
+                                                             output_depth,
+                                                             output_height,
+                                                             output_width,
+                                                             ksize_depth,
+                                                             ksize_height,
+                                                             ksize_width,
+                                                             stride_depth,
+                                                             stride_height,
+                                                             stride_width,
+                                                             padding_depth,
+                                                             padding_height,
+                                                             padding_width,
+                                                             pool_compute,
+                                                             exclusive,
+                                                             adaptive,
+                                                             output);
 }
 
 /*
@@ -1006,13 +1256,16 @@ void Pool3dDirectCUDAFunctor<PoolProcess, T>::operator()(
  * height_up, height_down, width_left and width_right, respectively.
  */
 template <typename PoolProcess, class T>
-class Pool3dFunctor<platform::CUDADeviceContext, PoolProcess, T> {
+class Pool3dFunctor<phi::GPUContext, PoolProcess, T> {
  public:
-  void operator()(const platform::CUDADeviceContext& context,
-                  const framework::Tensor& input, const std::vector<int>& ksize,
+  void operator()(const phi::GPUContext& context,
+                  const DenseTensor& input,
+                  const std::vector<int>& ksize,
                   const std::vector<int>& strides,
-                  const std::vector<int>& paddings, bool exclusive,
-                  bool adaptive, framework::Tensor* output,
+                  const std::vector<int>& paddings,
+                  bool exclusive,
+                  bool adaptive,
+                  DenseTensor* output,
                   PoolProcess pool_process) {
     const int batch_size = input.dims()[0];
     const int input_channels = input.dims()[1];
@@ -1034,31 +1287,52 @@ class Pool3dFunctor<platform::CUDADeviceContext, PoolProcess, T> {
     const int padding_width = paddings[2];
 
     const T* input_data = input.data<T>();
-    T* output_data = output->mutable_data<T>(context.GetPlace());
+    T* output_data = context.template Alloc<T>(output);
 
     int nthreads = batch_size * output_channels * output_depth * output_height *
                    output_width;
     int thread_num = 1024;
 #ifdef WITH_NV_JETSON
-    platform::ChangeThreadNum(context, &thread_num);
+    paddle::platform::ChangeThreadNum(context, &thread_num);
 #endif
     int blocks = (nthreads + thread_num - 1) / thread_num;
     dim3 threads(thread_num, 1);
     dim3 grid(blocks, 1);
 
     KernelPool3D<PoolProcess, T><<<grid, threads, 0, context.stream()>>>(
-        nthreads, input_data, input_channels, input_depth, input_height,
-        input_width, output_depth, output_height, output_width, ksize_depth,
-        ksize_height, ksize_width, stride_depth, stride_height, stride_width,
-        padding_depth, padding_height, padding_width, pool_process, exclusive,
-        adaptive, output_data);
+        nthreads,
+        input_data,
+        input_channels,
+        input_depth,
+        input_height,
+        input_width,
+        output_depth,
+        output_height,
+        output_width,
+        ksize_depth,
+        ksize_height,
+        ksize_width,
+        stride_depth,
+        stride_height,
+        stride_width,
+        padding_depth,
+        padding_height,
+        padding_width,
+        pool_process,
+        exclusive,
+        adaptive,
+        output_data);
   }
-  void operator()(const platform::CUDADeviceContext& context,
-                  const framework::Tensor& input, const std::vector<int>& ksize,
+  void operator()(const phi::GPUContext& context,
+                  const DenseTensor& input,
+                  const std::vector<int>& ksize,
                   const std::vector<int>& strides,
                   const std::vector<int>& paddings,
-                  const std::string data_format, bool exclusive, bool adaptive,
-                  framework::Tensor* output, PoolProcess pool_process) {
+                  const std::string data_format,
+                  bool exclusive,
+                  bool adaptive,
+                  DenseTensor* output,
+                  PoolProcess pool_process) {
     bool channel_last = (data_format == "NDHWC");
     const int batch_size = input.dims()[0];
 
@@ -1089,24 +1363,42 @@ class Pool3dFunctor<platform::CUDADeviceContext, PoolProcess, T> {
     const int padding_width = paddings[2];
 
     const T* input_data = input.data<T>();
-    T* output_data = output->mutable_data<T>(context.GetPlace());
+    T* output_data = context.template Alloc<T>(output);
 
     int nthreads = batch_size * output_channels * output_depth * output_height *
                    output_width;
     int thread_num = 1024;
 #ifdef WITH_NV_JETSON
-    platform::ChangeThreadNum(context, &thread_num);
+    paddle::platform::ChangeThreadNum(context, &thread_num);
 #endif
     int blocks = (nthreads + thread_num - 1) / thread_num;
     dim3 threads(thread_num, 1);
     dim3 grid(blocks, 1);
 
     KernelPool3D<PoolProcess, T><<<grid, threads, 0, context.stream()>>>(
-        nthreads, input_data, input_channels, input_depth, input_height,
-        input_width, output_depth, output_height, output_width, ksize_depth,
-        ksize_height, ksize_width, stride_depth, stride_height, stride_width,
-        padding_depth, padding_height, padding_width, pool_process, exclusive,
-        adaptive, output_data, channel_last);
+        nthreads,
+        input_data,
+        input_channels,
+        input_depth,
+        input_height,
+        input_width,
+        output_depth,
+        output_height,
+        output_width,
+        ksize_depth,
+        ksize_height,
+        ksize_width,
+        stride_depth,
+        stride_height,
+        stride_width,
+        padding_depth,
+        padding_height,
+        padding_width,
+        pool_process,
+        exclusive,
+        adaptive,
+        output_data,
+        channel_last);
   }
 };
 
@@ -1119,16 +1411,18 @@ class Pool3dFunctor<platform::CUDADeviceContext, PoolProcess, T> {
  * height_up, height_down, width_left and width_right, respectively.
  */
 template <typename PoolProcess, class T>
-class Pool3dGradFunctor<platform::CUDADeviceContext, PoolProcess, T> {
+class Pool3dGradFunctor<phi::GPUContext, PoolProcess, T> {
  public:
-  void operator()(const platform::CUDADeviceContext& context,
-                  const framework::Tensor& input,
-                  const framework::Tensor& output,
-                  const framework::Tensor& output_grad,
+  void operator()(const phi::GPUContext& context,
+                  const DenseTensor& input,
+                  const DenseTensor& output,
+                  const DenseTensor& output_grad,
                   const std::vector<int>& ksize,
                   const std::vector<int>& strides,
-                  const std::vector<int>& paddings, bool exclusive,
-                  bool adaptive, framework::Tensor* input_grad,
+                  const std::vector<int>& paddings,
+                  bool exclusive,
+                  bool adaptive,
+                  DenseTensor* input_grad,
                   PoolProcess pool_process) {
     const int batch_size = input.dims()[0];
     const int input_channels = input.dims()[1];
@@ -1152,7 +1446,7 @@ class Pool3dGradFunctor<platform::CUDADeviceContext, PoolProcess, T> {
     const T* input_data = input.data<T>();
     const T* output_data = output.data<T>();
     const T* output_grad_data = output_grad.data<T>();
-    T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
+    T* input_grad_data = context.template Alloc<T>(input_grad);
 
     int nthreads =
         batch_size * input_channels * input_depth * input_height * input_width;
@@ -1161,21 +1455,43 @@ class Pool3dGradFunctor<platform::CUDADeviceContext, PoolProcess, T> {
     dim3 grid(blocks, 1);
 
     KernelPool3DGrad<T, PoolProcess><<<grid, threads, 0, context.stream()>>>(
-        nthreads, input_data, output_data, output_grad_data, input_channels,
-        input_depth, input_height, input_width, output_depth, output_height,
-        output_width, ksize_depth, ksize_height, ksize_width, stride_depth,
-        stride_height, stride_width, padding_depth, padding_height,
-        padding_width, pool_process, exclusive, adaptive, input_grad_data);
+        nthreads,
+        input_data,
+        output_data,
+        output_grad_data,
+        input_channels,
+        input_depth,
+        input_height,
+        input_width,
+        output_depth,
+        output_height,
+        output_width,
+        ksize_depth,
+        ksize_height,
+        ksize_width,
+        stride_depth,
+        stride_height,
+        stride_width,
+        padding_depth,
+        padding_height,
+        padding_width,
+        pool_process,
+        exclusive,
+        adaptive,
+        input_grad_data);
   }
-  void operator()(const platform::CUDADeviceContext& context,
-                  const framework::Tensor& input,
-                  const framework::Tensor& output,
-                  const framework::Tensor& output_grad,
+  void operator()(const phi::GPUContext& context,
+                  const DenseTensor& input,
+                  const DenseTensor& output,
+                  const DenseTensor& output_grad,
                   const std::vector<int>& ksize,
                   const std::vector<int>& strides,
                   const std::vector<int>& paddings,
-                  const std::string data_format, bool exclusive, bool adaptive,
-                  framework::Tensor* input_grad, PoolProcess pool_process) {
+                  const std::string data_format,
+                  bool exclusive,
+                  bool adaptive,
+                  DenseTensor* input_grad,
+                  PoolProcess pool_process) {
     bool channel_last = (data_format == "NDHWC");
 
     const int batch_size = input.dims()[0];
@@ -1206,7 +1522,7 @@ class Pool3dGradFunctor<platform::CUDADeviceContext, PoolProcess, T> {
     const T* input_data = input.data<T>();
     const T* output_data = output.data<T>();
     const T* output_grad_data = output_grad.data<T>();
-    T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
+    T* input_grad_data = context.template Alloc<T>(input_grad);
 
     int nthreads =
         batch_size * input_channels * input_depth * input_height * input_width;
@@ -1215,11 +1531,30 @@ class Pool3dGradFunctor<platform::CUDADeviceContext, PoolProcess, T> {
     dim3 grid(blocks, 1);
 
     KernelPool3DGrad<T, PoolProcess><<<grid, threads, 0, context.stream()>>>(
-        nthreads, input_data, output_data, output_grad_data, input_channels,
-        input_depth, input_height, input_width, output_depth, output_height,
-        output_width, ksize_depth, ksize_height, ksize_width, stride_depth,
-        stride_height, stride_width, padding_depth, padding_height,
-        padding_width, pool_process, exclusive, adaptive, input_grad_data,
+        nthreads,
+        input_data,
+        output_data,
+        output_grad_data,
+        input_channels,
+        input_depth,
+        input_height,
+        input_width,
+        output_depth,
+        output_height,
+        output_width,
+        ksize_depth,
+        ksize_height,
+        ksize_width,
+        stride_depth,
+        stride_height,
+        stride_width,
+        padding_depth,
+        padding_height,
+        padding_width,
+        pool_process,
+        exclusive,
+        adaptive,
+        input_grad_data,
         channel_last);  // add channel_last
   }
 };
@@ -1233,16 +1568,16 @@ class Pool3dGradFunctor<platform::CUDADeviceContext, PoolProcess, T> {
  * height_up, height_down, width_left and width_right, respectively.
  */
 template <class T>
-class MaxPool3dGradFunctor<platform::CUDADeviceContext, T> {
+class MaxPool3dGradFunctor<phi::GPUContext, T> {
  public:
-  void operator()(const platform::CUDADeviceContext& context,
-                  const framework::Tensor& input,
-                  const framework::Tensor& output,
-                  const framework::Tensor& output_grad,
+  void operator()(const phi::GPUContext& context,
+                  const DenseTensor& input,
+                  const DenseTensor& output,
+                  const DenseTensor& output_grad,
                   const std::vector<int>& ksize,
                   const std::vector<int>& strides,
                   const std::vector<int>& paddings,
-                  framework::Tensor* input_grad) {
+                  DenseTensor* input_grad) {
     const int batch_size = input.dims()[0];
     const int input_channels = input.dims()[1];
     const int input_depth = input.dims()[2];
@@ -1265,7 +1600,7 @@ class MaxPool3dGradFunctor<platform::CUDADeviceContext, T> {
     const T* input_data = input.data<T>();
     const T* output_data = output.data<T>();
     const T* output_grad_data = output_grad.data<T>();
-    T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
+    T* input_grad_data = context.template Alloc<T>(input_grad);
 
     int nthreads = batch_size * output_channels * output_depth * output_height *
                    output_width;
@@ -1274,18 +1609,37 @@ class MaxPool3dGradFunctor<platform::CUDADeviceContext, T> {
     dim3 grid(blocks, 1);
 
     KernelMaxPool3DGrad<T><<<grid, threads, 0, context.stream()>>>(
-        nthreads, input_data, output_data, output_grad_data, input_channels,
-        input_depth, input_height, input_width, output_depth, output_height,
-        output_width, ksize_depth, ksize_height, ksize_width, stride_depth,
-        stride_height, stride_width, padding_depth, padding_height,
-        padding_width, input_grad_data);
+        nthreads,
+        input_data,
+        output_data,
+        output_grad_data,
+        input_channels,
+        input_depth,
+        input_height,
+        input_width,
+        output_depth,
+        output_height,
+        output_width,
+        ksize_depth,
+        ksize_height,
+        ksize_width,
+        stride_depth,
+        stride_height,
+        stride_width,
+        padding_depth,
+        padding_height,
+        padding_width,
+        input_grad_data);
   }
-  void operator()(
-      const platform::CUDADeviceContext& context,
-      const framework::Tensor& input, const framework::Tensor& output,
-      const framework::Tensor& output_grad, const std::vector<int>& ksize,
-      const std::vector<int>& strides, const std::vector<int>& paddings,
-      const std::string data_format, framework::Tensor* input_grad) {
+  void operator()(const phi::GPUContext& context,
+                  const DenseTensor& input,
+                  const DenseTensor& output,
+                  const DenseTensor& output_grad,
+                  const std::vector<int>& ksize,
+                  const std::vector<int>& strides,
+                  const std::vector<int>& paddings,
+                  const std::string data_format,
+                  DenseTensor* input_grad) {
     bool channel_last = (data_format == "NDHWC");
     const int batch_size = input.dims()[0];
 
@@ -1316,7 +1670,7 @@ class MaxPool3dGradFunctor<platform::CUDADeviceContext, T> {
     const T* input_data = input.data<T>();
     const T* output_data = output.data<T>();
     const T* output_grad_data = output_grad.data<T>();
-    T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
+    T* input_grad_data = context.template Alloc<T>(input_grad);
 
     int nthreads = batch_size * output_channels * output_depth * output_height *
                    output_width;
@@ -1325,77 +1679,93 @@ class MaxPool3dGradFunctor<platform::CUDADeviceContext, T> {
     dim3 grid(blocks, 1);
 
     KernelMaxPool3DGrad<T><<<grid, threads, 0, context.stream()>>>(
-        nthreads, input_data, output_data, output_grad_data, input_channels,
-        input_depth, input_height, input_width, output_depth, output_height,
-        output_width, ksize_depth, ksize_height, ksize_width, stride_depth,
-        stride_height, stride_width, padding_depth, padding_height,
-        padding_width, input_grad_data, channel_last);  // add channel_last
+        nthreads,
+        input_data,
+        output_data,
+        output_grad_data,
+        input_channels,
+        input_depth,
+        input_height,
+        input_width,
+        output_depth,
+        output_height,
+        output_width,
+        ksize_depth,
+        ksize_height,
+        ksize_width,
+        stride_depth,
+        stride_height,
+        stride_width,
+        padding_depth,
+        padding_height,
+        padding_width,
+        input_grad_data,
+        channel_last);  // add channel_last
   }
 };
 
-template class Pool3dDirectCUDAFunctor<paddle::operators::math::MaxPool<float>,
-                                       float>;
-template class Pool3dDirectCUDAFunctor<paddle::operators::math::AvgPool<float>,
-                                       float>;
-
-template class MaxPool3dGradFunctor<platform::CUDADeviceContext, float>;
-template class MaxPool3dGradFunctor<platform::CUDADeviceContext, double>;
-template class MaxPool3dGradFunctor<platform::CUDADeviceContext,
-                                    paddle::platform::float16>;
-
-template class Pool3dFunctor<platform::CUDADeviceContext,
-                             paddle::operators::math::MaxPool<float>, float>;
-template class Pool3dFunctor<platform::CUDADeviceContext,
-                             paddle::operators::math::AvgPool<float>, float>;
-template class Pool3dGradFunctor<platform::CUDADeviceContext,
-                                 paddle::operators::math::MaxPoolGrad<float>,
-                                 float>;
-template class Pool3dGradFunctor<platform::CUDADeviceContext,
-                                 paddle::operators::math::AvgPoolGrad<float>,
-                                 float>;
-template class Pool3dFunctor<platform::CUDADeviceContext,
-                             paddle::operators::math::MaxPool<double>, double>;
-template class Pool3dFunctor<platform::CUDADeviceContext,
-                             paddle::operators::math::AvgPool<double>, double>;
-template class Pool3dGradFunctor<platform::CUDADeviceContext,
-                                 paddle::operators::math::MaxPoolGrad<double>,
-                                 double>;
-template class Pool3dGradFunctor<platform::CUDADeviceContext,
-                                 paddle::operators::math::AvgPoolGrad<double>,
-                                 double>;
-
-template class Pool3dFunctor<
-    platform::CUDADeviceContext,
-    paddle::operators::math::MaxPool<paddle::platform::float16>,
-    paddle::platform::float16>;
-template class Pool3dFunctor<
-    platform::CUDADeviceContext,
-    paddle::operators::math::AvgPool<paddle::platform::float16>,
-    paddle::platform::float16>;
-template class Pool3dGradFunctor<
-    platform::CUDADeviceContext,
-    paddle::operators::math::MaxPoolGrad<paddle::platform::float16>,
-    paddle::platform::float16>;
-template class Pool3dGradFunctor<
-    platform::CUDADeviceContext,
-    paddle::operators::math::AvgPoolGrad<paddle::platform::float16>,
-    paddle::platform::float16>;
+template class Pool3dDirectCUDAFunctor<MaxPool<float>, float>;
+template class Pool3dDirectCUDAFunctor<AvgPool<float>, float>;
+
+template class MaxPool3dGradFunctor<phi::GPUContext, float>;
+template class MaxPool3dGradFunctor<phi::GPUContext, double>;
+template class MaxPool3dGradFunctor<phi::GPUContext, dtype::float16>;
+
+template class Pool3dFunctor<phi::GPUContext, MaxPool<float>, float>;
+template class Pool3dFunctor<phi::GPUContext, AvgPool<float>, float>;
+template class Pool3dGradFunctor<phi::GPUContext, MaxPoolGrad<float>, float>;
+template class Pool3dGradFunctor<phi::GPUContext, AvgPoolGrad<float>, float>;
+template class Pool3dFunctor<phi::GPUContext, MaxPool<double>, double>;
+template class Pool3dFunctor<phi::GPUContext, AvgPool<double>, double>;
+template class Pool3dGradFunctor<phi::GPUContext, MaxPoolGrad<double>, double>;
+template class Pool3dGradFunctor<phi::GPUContext, AvgPoolGrad<double>, double>;
+
+template class Pool3dFunctor<phi::GPUContext,
+                             MaxPool<dtype::float16>,
+                             dtype::float16>;
+template class Pool3dFunctor<phi::GPUContext,
+                             AvgPool<dtype::float16>,
+                             dtype::float16>;
+template class Pool3dGradFunctor<phi::GPUContext,
+                                 MaxPoolGrad<dtype::float16>,
+                                 dtype::float16>;
+template class Pool3dGradFunctor<phi::GPUContext,
+                                 AvgPoolGrad<dtype::float16>,
+                                 dtype::float16>;
 
 template <typename T1, typename T2>
-__global__ void KernelMaxPool2dWithIdx(
-    const int nthreads, const T1* input_data, const int channels,
-    const int input_height, const int input_width, const int output_height,
-    const int output_width, const int ksize_height, const int ksize_width,
-    const int stride_height, const int stride_width, const int padding_height,
-    const int padding_width, bool adaptive, T1* output_data, T2* mask_data,
-    FastDivModForPooling divmods) {
+__global__ void KernelMaxPool2dWithIdx(const int nthreads,
+                                       const T1* input_data,
+                                       const int channels,
+                                       const int input_height,
+                                       const int input_width,
+                                       const int output_height,
+                                       const int output_width,
+                                       const int ksize_height,
+                                       const int ksize_width,
+                                       const int stride_height,
+                                       const int stride_width,
+                                       const int padding_height,
+                                       const int padding_width,
+                                       bool adaptive,
+                                       T1* output_data,
+                                       T2* mask_data,
+                                       FastDivModForPooling divmods) {
   for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
        index += blockDim.x * gridDim.x) {
     int hstart, hend, wstart, wend;
     int w_offset, h_offset, c_offset, input_offset;
-    OffsetPreparationFor4Dimension<FastDivModForPooling>(
-        index, false, divmods, 0, 0, input_width, input_height, &w_offset,
-        &h_offset, &c_offset, &input_offset);
+    OffsetPreparationFor4Dimension<FastDivModForPooling>(index,
+                                                         false,
+                                                         divmods,
+                                                         0,
+                                                         0,
+                                                         input_width,
+                                                         input_height,
+                                                         &w_offset,
+                                                         &h_offset,
+                                                         &c_offset,
+                                                         &input_offset);
     input_data += input_offset;
 
     if (adaptive) {
@@ -1431,20 +1801,38 @@ __global__ void KernelMaxPool2dWithIdx(
 }
 
 template <typename T1, typename T2>
-__global__ void KernelMaxPool2DWithIdxGrad(
-    const int nthreads, const T1* output_grad, const T2* mask_data,
-    const int channels, const int input_height, const int input_width,
-    const int output_height, const int output_width, const int ksize_height,
-    const int ksize_width, const int stride_height, const int stride_width,
-    const int padding_height, const int padding_width, bool adaptive,
-    T1* input_grad, FastDivModForPooling divmods) {
+__global__ void KernelMaxPool2DWithIdxGrad(const int nthreads,
+                                           const T1* output_grad,
+                                           const T2* mask_data,
+                                           const int channels,
+                                           const int input_height,
+                                           const int input_width,
+                                           const int output_height,
+                                           const int output_width,
+                                           const int ksize_height,
+                                           const int ksize_width,
+                                           const int stride_height,
+                                           const int stride_width,
+                                           const int padding_height,
+                                           const int padding_width,
+                                           bool adaptive,
+                                           T1* input_grad,
+                                           FastDivModForPooling divmods) {
   for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
        index += blockDim.x * gridDim.x) {
     int phstart, phend, pwstart, pwend;
     int w_offset, h_offset, c_offset, output_offset;
-    OffsetPreparationFor4Dimension<FastDivModForPooling>(
-        index, false, divmods, 0, 0, output_width, output_height, &w_offset,
-        &h_offset, &c_offset, &output_offset);
+    OffsetPreparationFor4Dimension<FastDivModForPooling>(index,
+                                                         false,
+                                                         divmods,
+                                                         0,
+                                                         0,
+                                                         output_width,
+                                                         output_height,
+                                                         &w_offset,
+                                                         &h_offset,
+                                                         &c_offset,
+                                                         &output_offset);
     mask_data += output_offset;
     output_grad += output_offset;
 
@@ -1487,13 +1875,16 @@ __global__ void KernelMaxPool2DWithIdxGrad(
  * height and width, respectively.
  */
 template <typename T1, typename T2>
-class MaxPool2dWithIndexFunctor<platform::CUDADeviceContext, T1, T2> {
+class MaxPool2dWithIndexFunctor<phi::GPUContext, T1, T2> {
  public:
-  void operator()(const platform::CUDADeviceContext& context,
-                  const framework::Tensor& input, const std::vector<int>& ksize,
+  void operator()(const phi::GPUContext& context,
+                  const DenseTensor& input,
+                  const std::vector<int>& ksize,
                   const std::vector<int>& strides,
-                  const std::vector<int>& paddings, bool adaptive,
-                  framework::Tensor* output, framework::Tensor* mask) {
+                  const std::vector<int>& paddings,
+                  bool adaptive,
+                  DenseTensor* output,
+                  DenseTensor* mask) {
     const int batch_size = input.dims()[0];
     const int input_channels = input.dims()[1];
     const int input_height = input.dims()[2];
@@ -1509,13 +1900,13 @@ class MaxPool2dWithIndexFunctor<platform::CUDADeviceContext, T1, T2> {
     const int padding_width = paddings[1];
 
     const T1* input_data = input.data<T1>();
-    T1* output_data = output->mutable_data<T1>(context.GetPlace());
-    T2* mask_data = mask->mutable_data<T2>(context.GetPlace());
+    T1* output_data = context.template Alloc<T1>(output);
+    T2* mask_data = context.template Alloc<T2>(mask);
 
     int nthreads = batch_size * output_channels * output_height * output_width;
     int thread_num = 1024;
 #ifdef WITH_NV_JETSON
-    platform::ChangeThreadNum(context, &thread_num);
+    paddle::platform::ChangeThreadNum(context, &thread_num);
 #endif
 
     int blocks = (nthreads + thread_num - 1) / thread_num;
@@ -1525,10 +1916,23 @@ class MaxPool2dWithIndexFunctor<platform::CUDADeviceContext, T1, T2> {
     auto pool_divmods =
         FastDivModForPooling(input_channels, output_width, output_height);
     KernelMaxPool2dWithIdx<T1, T2><<<grid, threads, 0, context.stream()>>>(
-        nthreads, input_data, input_channels, input_height, input_width,
-        output_height, output_width, ksize_height, ksize_width, stride_height,
-        stride_width, padding_height, padding_width, adaptive, output_data,
-        mask_data, pool_divmods);
+        nthreads,
+        input_data,
+        input_channels,
+        input_height,
+        input_width,
+        output_height,
+        output_width,
+        ksize_height,
+        ksize_width,
+        stride_height,
+        stride_width,
+        padding_height,
+        padding_width,
+        adaptive,
+        output_data,
+        mask_data,
+        pool_divmods);
   }
 };
 
@@ -1538,14 +1942,16 @@ class MaxPool2dWithIndexFunctor<platform::CUDADeviceContext, T1, T2> {
  * height and width, respectively.
  */
 template <typename T1, typename T2>
-class MaxPool2dWithIndexGradFunctor<platform::CUDADeviceContext, T1, T2> {
+class MaxPool2dWithIndexGradFunctor<phi::GPUContext, T1, T2> {
  public:
-  void operator()(const platform::CUDADeviceContext& context,
-                  const framework::Tensor& output_grad,
-                  const framework::Tensor& mask, const std::vector<int>& ksize,
+  void operator()(const phi::GPUContext& context,
+                  const DenseTensor& output_grad,
+                  const DenseTensor& mask,
+                  const std::vector<int>& ksize,
                   const std::vector<int>& strides,
-                  const std::vector<int>& paddings, bool adaptive,
-                  framework::Tensor* input_grad) {
+                  const std::vector<int>& paddings,
+                  bool adaptive,
+                  DenseTensor* input_grad) {
     const int batch_size = input_grad->dims()[0];
     const int input_channels = input_grad->dims()[1];
     const int input_height = input_grad->dims()[2];
@@ -1561,7 +1967,7 @@ class MaxPool2dWithIndexGradFunctor<platform::CUDADeviceContext, T1, T2> {
 
     const T2* mask_data = mask.data<T2>();
     const T1* output_grad_data = output_grad.data<T1>();
-    T1* input_grad_data = input_grad->mutable_data<T1>(context.GetPlace());
+    T1* input_grad_data = context.template Alloc<T1>(input_grad);
 
     int nthreads = batch_size * input_channels * input_height * input_width;
     int blocks = (nthreads + 1024 - 1) / 1024;
@@ -1571,31 +1977,53 @@ class MaxPool2dWithIndexGradFunctor<platform::CUDADeviceContext, T1, T2> {
     auto pool_divmods =
         FastDivModForPooling(input_channels, input_width, input_height);
     KernelMaxPool2DWithIdxGrad<T1, T2><<<grid, threads, 0, context.stream()>>>(
-        nthreads, output_grad_data, mask_data, input_channels, input_height,
-        input_width, output_height, output_width, ksize_height, ksize_width,
-        stride_height, stride_width, padding_height, padding_width, adaptive,
-        input_grad_data, pool_divmods);
+        nthreads,
+        output_grad_data,
+        mask_data,
+        input_channels,
+        input_height,
+        input_width,
+        output_height,
+        output_width,
+        ksize_height,
+        ksize_width,
+        stride_height,
+        stride_width,
+        padding_height,
+        padding_width,
+        adaptive,
+        input_grad_data,
+        pool_divmods);
   }
 };
 
-template class MaxPool2dWithIndexFunctor<platform::CUDADeviceContext, float,
-                                         int>;
-template class MaxPool2dWithIndexGradFunctor<platform::CUDADeviceContext, float,
-                                             int>;
-template class MaxPool2dWithIndexFunctor<platform::CUDADeviceContext, double,
-                                         int>;
-template class MaxPool2dWithIndexGradFunctor<platform::CUDADeviceContext,
-                                             double, int>;
+template class MaxPool2dWithIndexFunctor<phi::GPUContext, float, int>;
+template class MaxPool2dWithIndexGradFunctor<phi::GPUContext, float, int>;
+template class MaxPool2dWithIndexFunctor<phi::GPUContext, double, int>;
+template class MaxPool2dWithIndexGradFunctor<phi::GPUContext, double, int>;
 
 template <typename T1, typename T2>
-__global__ void KernelMaxPool3DWithIdx(
-    const int nthreads, const T1* input_data, const int channels,
-    const int input_depth, const int input_height, const int input_width,
-    const int output_depth, const int output_height, const int output_width,
-    const int ksize_depth, const int ksize_height, const int ksize_width,
-    const int stride_depth, const int stride_height, const int stride_width,
-    const int padding_depth, const int padding_height, const int padding_width,
-    bool adaptive, T1* output_data, T2* mask_data) {
+__global__ void KernelMaxPool3DWithIdx(const int nthreads,
+                                       const T1* input_data,
+                                       const int channels,
+                                       const int input_depth,
+                                       const int input_height,
+                                       const int input_width,
+                                       const int output_depth,
+                                       const int output_height,
+                                       const int output_width,
+                                       const int ksize_depth,
+                                       const int ksize_height,
+                                       const int ksize_width,
+                                       const int stride_depth,
+                                       const int stride_height,
+                                       const int stride_width,
+                                       const int padding_depth,
+                                       const int padding_height,
+                                       const int padding_width,
+                                       bool adaptive,
+                                       T1* output_data,
+                                       T2* mask_data) {
   for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
        index += blockDim.x * gridDim.x) {
     int pw = index % output_width;
@@ -1650,14 +2078,27 @@ __global__ void KernelMaxPool3DWithIdx(
 }
 
 template <typename T1, typename T2>
-__global__ void KernelMaxPool3DWithIdxGrad(
-    const int nthreads, const T1* output_grad, const T2* mask,
-    const int channels, const int input_depth, const int input_height,
-    const int input_width, const int output_depth, const int output_height,
-    const int output_width, const int ksize_depth, const int ksize_height,
-    const int ksize_width, const int stride_depth, const int stride_height,
-    const int stride_width, const int padding_depth, const int padding_height,
-    const int padding_width, bool adaptive, T1* input_grad) {
+__global__ void KernelMaxPool3DWithIdxGrad(const int nthreads,
+                                           const T1* output_grad,
+                                           const T2* mask,
+                                           const int channels,
+                                           const int input_depth,
+                                           const int input_height,
+                                           const int input_width,
+                                           const int output_depth,
+                                           const int output_height,
+                                           const int output_width,
+                                           const int ksize_depth,
+                                           const int ksize_height,
+                                           const int ksize_width,
+                                           const int stride_depth,
+                                           const int stride_height,
+                                           const int stride_width,
+                                           const int padding_depth,
+                                           const int padding_height,
+                                           const int padding_width,
+                                           bool adaptive,
+                                           T1* input_grad) {
   for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
        index += blockDim.x * gridDim.x) {
     int w_offset = index % input_width;
@@ -1727,13 +2168,16 @@ __global__ void KernelMaxPool3DWithIdxGrad(
  * depth, height and width, respectively.
  */
 template <typename T1, typename T2>
-class MaxPool3dWithIndexFunctor<platform::CUDADeviceContext, T1, T2> {
+class MaxPool3dWithIndexFunctor<phi::GPUContext, T1, T2> {
  public:
-  void operator()(const platform::CUDADeviceContext& context,
-                  const framework::Tensor& input, const std::vector<int>& ksize,
+  void operator()(const phi::GPUContext& context,
+                  const DenseTensor& input,
+                  const std::vector<int>& ksize,
                   const std::vector<int>& strides,
-                  const std::vector<int>& paddings, bool adaptive,
-                  framework::Tensor* output, framework::Tensor* mask) {
+                  const std::vector<int>& paddings,
+                  bool adaptive,
+                  DenseTensor* output,
+                  DenseTensor* mask) {
     const int batch_size = input.dims()[0];
     const int input_channels = input.dims()[1];
     const int input_depth = input.dims()[2];
@@ -1754,14 +2198,14 @@ class MaxPool3dWithIndexFunctor<platform::CUDADeviceContext, T1, T2> {
     const int padding_width = paddings[2];
 
     const T1* input_data = input.data<T1>();
-    T1* output_data = output->mutable_data<T1>(context.GetPlace());
-    T2* mask_data = mask->mutable_data<T2>(context.GetPlace());
+    T1* output_data = context.template Alloc<T1>(output);
+    T2* mask_data = context.template Alloc<T2>(mask);
 
     int nthreads = batch_size * output_channels * output_depth * output_height *
                    output_width;
     int thread_num = 1024;
 #ifdef WITH_NV_JETSON
-    platform::ChangeThreadNum(context, &thread_num);
+    paddle::platform::ChangeThreadNum(context, &thread_num);
 #endif
 
     int blocks = (nthreads + thread_num - 1) / thread_num;
@@ -1769,10 +2213,26 @@ class MaxPool3dWithIndexFunctor<platform::CUDADeviceContext, T1, T2> {
     dim3 grid(blocks, 1);
 
     KernelMaxPool3DWithIdx<T1, T2><<<grid, threads, 0, context.stream()>>>(
-        nthreads, input_data, input_channels, input_depth, input_height,
-        input_width, output_depth, output_height, output_width, ksize_depth,
-        ksize_height, ksize_width, stride_depth, stride_height, stride_width,
-        padding_depth, padding_height, padding_width, adaptive, output_data,
+        nthreads,
+        input_data,
+        input_channels,
+        input_depth,
+        input_height,
+        input_width,
+        output_depth,
+        output_height,
+        output_width,
+        ksize_depth,
+        ksize_height,
+        ksize_width,
+        stride_depth,
+        stride_height,
+        stride_width,
+        padding_depth,
+        padding_height,
+        padding_width,
+        adaptive,
+        output_data,
         mask_data);
   }
 };
@@ -1783,14 +2243,16 @@ class MaxPool3dWithIndexFunctor<platform::CUDADeviceContext, T1, T2> {
  * depth, height and width, respectively.
  */
 template <typename T1, typename T2>
-class MaxPool3dWithIndexGradFunctor<platform::CUDADeviceContext, T1, T2> {
+class MaxPool3dWithIndexGradFunctor<phi::GPUContext, T1, T2> {
  public:
-  void operator()(const platform::CUDADeviceContext& context,
-                  const framework::Tensor& output_grad,
-                  const framework::Tensor& mask, const std::vector<int>& ksize,
+  void operator()(const phi::GPUContext& context,
+                  const DenseTensor& output_grad,
+                  const DenseTensor& mask,
+                  const std::vector<int>& ksize,
                   const std::vector<int>& strides,
-                  const std::vector<int>& paddings, bool adaptive,
-                  framework::Tensor* input_grad) {
+                  const std::vector<int>& paddings,
+                  bool adaptive,
+                  DenseTensor* input_grad) {
     const int batch_size = input_grad->dims()[0];
     const int input_channels = input_grad->dims()[1];
     const int input_depth = input_grad->dims()[2];
@@ -1811,7 +2273,7 @@ class MaxPool3dWithIndexGradFunctor<platform::CUDADeviceContext, T1, T2> {
 
     const T1* output_grad_data = output_grad.data<T1>();
     const T2* mask_data = mask.data<T2>();
-    T1* input_grad_data = input_grad->mutable_data<T1>(context.GetPlace());
+    T1* input_grad_data = context.template Alloc<T1>(input_grad);
 
     int nthreads =
         batch_size * input_channels * input_depth * input_height * input_width;
@@ -1820,23 +2282,34 @@ class MaxPool3dWithIndexGradFunctor<platform::CUDADeviceContext, T1, T2> {
     dim3 grid(blocks, 1);
 
     KernelMaxPool3DWithIdxGrad<T1, T2><<<grid, threads, 0, context.stream()>>>(
-        nthreads, output_grad_data, mask_data, input_channels, input_depth,
-        input_height, input_width, output_depth, output_height, output_width,
-        ksize_depth, ksize_height, ksize_width, stride_depth, stride_height,
-        stride_width, padding_depth, padding_height, padding_width, adaptive,
+        nthreads,
+        output_grad_data,
+        mask_data,
+        input_channels,
+        input_depth,
+        input_height,
+        input_width,
+        output_depth,
+        output_height,
+        output_width,
+        ksize_depth,
+        ksize_height,
+        ksize_width,
+        stride_depth,
+        stride_height,
+        stride_width,
+        padding_depth,
+        padding_height,
+        padding_width,
+        adaptive,
         input_grad_data);
   }
 };
 
-template class MaxPool3dWithIndexFunctor<platform::CUDADeviceContext, float,
-                                         int>;
-template class MaxPool3dWithIndexGradFunctor<platform::CUDADeviceContext, float,
-                                             int>;
-template class MaxPool3dWithIndexFunctor<platform::CUDADeviceContext, double,
-                                         int>;
-template class MaxPool3dWithIndexGradFunctor<platform::CUDADeviceContext,
-                                             double, int>;
-
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
+template class MaxPool3dWithIndexFunctor<phi::GPUContext, float, int>;
+template class MaxPool3dWithIndexGradFunctor<phi::GPUContext, float, int>;
+template class MaxPool3dWithIndexFunctor<phi::GPUContext, double, int>;
+template class MaxPool3dWithIndexGradFunctor<phi::GPUContext, double, int>;
+
+}  // namespace funcs
+}  // namespace phi
diff --git a/paddle/phi/kernels/funcs/pooling.h b/paddle/phi/kernels/funcs/pooling.h
new file mode 100644
index 0000000000000000000000000000000000000000..19c6d52c4c9018f821c4e7f6ddaebf933aa045e8
--- /dev/null
+++ b/paddle/phi/kernels/funcs/pooling.h
@@ -0,0 +1,469 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <algorithm>
+#include <string>
+#include <vector>
+#include "paddle/fluid/platform/macros.h"  // import FLT_MAX
+#include "paddle/phi/common/amp_type_traits.h"
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/core/hostdevice.h"
+
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+#include "paddle/phi/backends/gpu/gpu_decls.h"
+#endif
+
+namespace phi {
+namespace funcs {
+
+/*
+ * \brief Extracting simple operations from pooling.
+ *        Both MaxPool and AvgPool need "initial", "compute" and "finalize"
+ * operation.
+ *        MaxPool initializes temp variable to the negative maximum to find the
+ * maximum value in the pooling field.
+ *        AvgPool initializes temp variable to the zero to accumulate all values
+ * in pool pooling, and finally takes the average.
+ *        MaxPoolGrad and AvgPoolGrad are gradient operations respectively.
+ */
+template <class T>
+class MaxPool {
+ public:
+  DEVICE inline T initial() { return static_cast<T>(-FLT_MAX); }
+  DEVICE inline void compute(const T& x, T* y) { *y = *y > x ? *y : x; }
+  DEVICE inline void finalize(const T& pool_field, T* y) {}
+};
+
+template <class T>
+class AvgPool {
+  using MT = typename dtype::MPTypeTrait<T>::Type;
+  MT intermediate_res;
+
+ public:
+  DEVICE inline T initial() {
+    intermediate_res = static_cast<MT>(0.0f);
+    return static_cast<T>(0);
+  }
+
+  DEVICE inline void compute(const T& x, T* y) {
+    intermediate_res += static_cast<MT>(x);
+  }
+
+  DEVICE inline void finalize(const T& pool_field, T* y) {
+    *y = static_cast<T>(intermediate_res / (static_cast<MT>(pool_field)));
+  }
+};
+
+template <class T>
+class MaxPoolGrad {
+ public:
+  static constexpr bool use_x = true;
+  HOSTDEVICE inline void compute(
+      const T& x, const T& y, const T& dy, T scale, T* dx) {
+    *dx += dy * static_cast<T>(x == y);
+  }
+};
+
+template <class T>
+class AvgPoolGrad {
+ public:
+  static constexpr bool use_x = false;
+  HOSTDEVICE inline void compute(
+      const T& x, const T& y, const T& dy, T scale, T* dx) {
+    *dx += (scale * dy);
+  }
+};
+
+/* used for adaptive pool to calculate start and end index of each divided grid
+ */
+HOSTDEVICE inline int AdaptStartIndex(int ph, int input_size, int output_size) {
+  return static_cast<int>(
+      floor(static_cast<double>(ph * input_size) / output_size));
+}
+
+HOSTDEVICE inline int AdaptEndIndex(int ph, int input_size, int output_size) {
+  return static_cast<int>(
+      ceil(static_cast<double>((ph + 1) * input_size) / output_size));
+}
+
+/*
+ * \brief Getting pooling results, and calculating gradient.
+ *
+ * In pool2d, all Tensors are in NCHW or NHWC format. Where N is batch size, C
+ * is the number of channels, H and W is the height and width of feature.
+ * In pool3d, all Tensors are in NCDHW or NDHWC format. Where N is batch size, C
+ * is the number of channels, D, H and W is the depth, height and width of
+ * feature.
+ *
+ * In max pooling, it is possible that the pooling region has multiple maximum
+ * elements. In this case, we should compute the gradient of the first maximum
+ * element.
+ * This is different from average pooling. So we rewrite the max_pool_grad:
+ * MaxPool2dGradFunctor, MaxPool3dGradFunctor.
+ */
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+template <typename PoolProcess, typename T>
+class Pool2dDirectCUDAFunctor {
+ public:
+  void operator()(const T* input,
+                  const std::vector<int>& input_shape,
+                  const std::vector<int>& output_shape,
+                  const std::vector<int>& ksize,
+                  const std::vector<int>& strides,
+                  const std::vector<int>& paddings,
+                  bool exclusive,
+                  bool adaptive,
+                  T* output,
+                  gpuStream_t stream,
+                  PoolProcess pool_compute);
+};
+#endif
+
+template <typename Context, typename PoolProcess, typename T>
+class Pool2dFunctor {
+ public:
+  void operator()(const Context& context,
+                  const DenseTensor& input,
+                  const std::vector<int>& ksize,
+                  const std::vector<int>& strides,
+                  const std::vector<int>& paddings,
+                  bool exclusive,
+                  bool adaptive,
+                  DenseTensor* output,
+                  PoolProcess pool_compute);
+
+  // overload operator() to support argument data_format
+  void operator()(const Context& context,
+                  const DenseTensor& input,
+                  const std::vector<int>& ksize,
+                  const std::vector<int>& strides,
+                  const std::vector<int>& paddings,
+                  const std::string data_format,
+                  bool exclusive,
+                  bool adaptive,
+                  DenseTensor* output,
+                  PoolProcess pool_compute);
+};
+
+template <typename Context, typename PoolProcess, typename T>
+class Pool2dGradFunctor {
+ public:
+  void operator()(const Context& context,
+                  const DenseTensor& input,
+                  const DenseTensor& output,
+                  const DenseTensor& output_grad,
+                  const std::vector<int>& ksize,
+                  const std::vector<int>& strides,
+                  const std::vector<int>& paddings,
+                  bool exclusive,
+                  bool adaptive,
+                  DenseTensor* input_grad,
+                  PoolProcess pool_compute);
+  // overload operator() to support argument data_format
+  void operator()(const Context& context,
+                  const DenseTensor& input,
+                  const DenseTensor& output,
+                  const DenseTensor& output_grad,
+                  const std::vector<int>& ksize,
+                  const std::vector<int>& strides,
+                  const std::vector<int>& paddings,
+                  const std::string data_format,
+                  bool exclusive,
+                  bool adaptive,
+                  DenseTensor* input_grad,
+                  PoolProcess pool_compute);
+};
+
+template <typename Context, class T>
+class MaxPool2dGradFunctor {
+ public:
+  void operator()(const Context& context,
+                  const DenseTensor& input,
+                  const DenseTensor& output,
+                  const DenseTensor& output_grad,
+                  const std::vector<int>& ksize,
+                  const std::vector<int>& strides,
+                  const std::vector<int>& paddings,
+                  DenseTensor* input_grad);
+  // overload operator() to support argument data_format
+  void operator()(const Context& context,
+                  const DenseTensor& input,
+                  const DenseTensor& output,
+                  const DenseTensor& output_grad,
+                  const std::vector<int>& ksize,
+                  const std::vector<int>& strides,
+                  const std::vector<int>& paddings,
+                  const std::string data_format,
+                  DenseTensor* input_grad);
+};
+
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+template <typename PoolProcess, typename T>
+class Pool3dDirectCUDAFunctor {
+ public:
+  void operator()(const T* input,
+                  const std::vector<int>& input_shape,
+                  const std::vector<int>& output_shape,
+                  const std::vector<int>& ksize,
+                  const std::vector<int>& strides,
+                  const std::vector<int>& paddings,
+                  bool exclusive,
+                  bool adaptive,
+                  T* output,
+                  gpuStream_t stream,
+                  PoolProcess pool_compute);
+};
+#endif
+
+template <typename Context, typename PoolProcess, typename T>
+class Pool3dFunctor {
+ public:
+  void operator()(const Context& context,
+                  const DenseTensor& input,
+                  const std::vector<int>& ksize,
+                  const std::vector<int>& strides,
+                  const std::vector<int>& paddings,
+                  bool exclusive,
+                  bool adaptive,
+                  DenseTensor* output,
+                  PoolProcess pool_compute);
+  // overload operator() to support argument data_format
+  void operator()(const Context& context,
+                  const DenseTensor& input,
+                  const std::vector<int>& ksize,
+                  const std::vector<int>& strides,
+                  const std::vector<int>& paddings,
+                  const std::string data_format,
+                  bool exclusive,
+                  bool adaptive,
+                  DenseTensor* output,
+                  PoolProcess pool_compute);
+};
+
+template <typename Context, typename PoolProcess, typename T>
+class Pool3dGradFunctor {
+ public:
+  void operator()(const Context& context,
+                  const DenseTensor& input,
+                  const DenseTensor& output,
+                  const DenseTensor& output_grad,
+                  const std::vector<int>& ksize,
+                  const std::vector<int>& strides,
+                  const std::vector<int>& paddings,
+                  bool exclusive,
+                  bool adaptive,
+                  DenseTensor* input_grad,
+                  PoolProcess pool_compute);
+  // overload operator() to support argument data_format
+  void operator()(const Context& context,
+                  const DenseTensor& input,
+                  const DenseTensor& output,
+                  const DenseTensor& output_grad,
+                  const std::vector<int>& ksize,
+                  const std::vector<int>& strides,
+                  const std::vector<int>& paddings,
+                  const std::string data_format,
+                  bool exclusive,
+                  bool adaptive,
+                  DenseTensor* input_grad,
+                  PoolProcess pool_compute);
+};
+
+template <typename Context, class T>
+class MaxPool3dGradFunctor {
+ public:
+  void operator()(const Context& context,
+                  const DenseTensor& input,
+                  const DenseTensor& output,
+                  const DenseTensor& output_grad,
+                  const std::vector<int>& ksize,
+                  const std::vector<int>& strides,
+                  const std::vector<int>& paddings,
+                  DenseTensor* input_grad);
+  // overload operator() to support argument data_format
+  void operator()(const Context& context,
+                  const DenseTensor& input,
+                  const DenseTensor& output,
+                  const DenseTensor& output_grad,
+                  const std::vector<int>& ksize,
+                  const std::vector<int>& strides,
+                  const std::vector<int>& paddings,
+                  const std::string data_format,
+                  DenseTensor* input_grad);
+};
+
+/*
+ * \brief Getting max pooling results and corresponding max index, and
+ * calculating gradient.
+ * In up-sampling-pooling, it is necessary to know max element index.
+ * In pool2d, all tensors are in NCHW format. In pool3d, all tensors are in
+ * NCDHW format.
+ */
+template <typename Context, typename T1, typename T2>
+class MaxPool2dWithIndexFunctor {
+ public:
+  void operator()(const Context& context,
+                  const DenseTensor& input,
+                  const std::vector<int>& ksize,
+                  const std::vector<int>& strides,
+                  const std::vector<int>& paddings,
+                  bool adaptive,
+                  DenseTensor* output,
+                  DenseTensor* mask);
+};
+
+template <typename Context, typename T1, typename T2>
+class MaxPool2dWithIndexGradFunctor {
+ public:
+  void operator()(const Context& context,
+                  const DenseTensor& output_grad,
+                  const DenseTensor& mask,
+                  const std::vector<int>& ksize,
+                  const std::vector<int>& strides,
+                  const std::vector<int>& paddings,
+                  bool adaptive,
+                  DenseTensor* input_grad);
+};
+
+template <typename Context, typename T1, typename T2>
+class MaxPool3dWithIndexFunctor {
+ public:
+  void operator()(const Context& context,
+                  const DenseTensor& input,
+                  const std::vector<int>& ksize,
+                  const std::vector<int>& strides,
+                  const std::vector<int>& paddings,
+                  bool adaptive,
+                  DenseTensor* output,
+                  DenseTensor* mask);
+};
+
+template <typename Context, typename T1, typename T2>
+class MaxPool3dWithIndexGradFunctor {
+ public:
+  void operator()(const Context& context,
+                  const DenseTensor& output_grad,
+                  const DenseTensor& mask,
+                  const std::vector<int>& ksize,
+                  const std::vector<int>& strides,
+                  const std::vector<int>& paddings,
+                  bool adaptive,
+                  DenseTensor* input_grad);
+};
+
+inline int PoolOutputSize(int input_size,
+                          int filter_size,
+                          int padding_1,
+                          int padding_2,
+                          int stride,
+                          bool ceil_mode) {
+  int output_size;
+  if (!ceil_mode) {
+    output_size =
+        (input_size - filter_size + padding_1 + padding_2) / stride + 1;
+  } else {
+    output_size =
+        (input_size - filter_size + padding_1 + padding_2 + stride - 1) /
+            stride +
+        1;
+  }
+  PADDLE_ENFORCE_GT(
+      output_size,
+      0,
+      errors::InvalidArgument(
+          "the output size must be greater than 0. But received: "
+          "output_size = %d due to the settings of input_size(%d), "
+          "padding(%d,%d), "
+          "k_size(%d) and stride(%d). Please check again!",
+          output_size,
+          input_size,
+          padding_1,
+          padding_2,
+          filter_size,
+          stride));
+  return output_size;
+}
+
+inline int MaxPoolOutputSize(int input_size,
+                             int filter_size,
+                             int padding,
+                             int stride) {
+  int output_size = (input_size - filter_size + 2 * padding) / stride + 1;
+  return output_size;
+}
+
+template <typename T = int>
+inline void UpdatePadding(std::vector<T>* paddings,
+                          const bool global_pooling,
+                          const bool adaptive,
+                          const std::string padding_algorithm,
+                          const DDim data_dims,
+                          const std::vector<T>& strides,
+                          const std::vector<T>& kernel_size) {
+  // set padding size == data_dims.size() * 2
+  auto data_shape = vectorize<T>(data_dims);
+  if (static_cast<int>(paddings->size()) == data_dims.size()) {
+    for (int i = 0; i < data_dims.size(); ++i) {
+      T copy_pad = *(paddings->begin() + 2 * i);
+      paddings->insert(paddings->begin() + 2 * i + 1, copy_pad);
+    }
+  } else {
+    PADDLE_ENFORCE_EQ(data_dims.size() * 2,
+                      paddings->size(),
+                      errors::InvalidArgument(
+                          "Paddings size %d should be the same or twice as the "
+                          "pooling size %d.",
+                          paddings->size(),
+                          data_dims.size() * 2));
+  }
+
+  // when padding_algorithm is "VALID" or "SAME"
+  if (padding_algorithm == "SAME") {
+    for (int i = 0; i < data_dims.size(); ++i) {
+      T out_size = (data_dims[i] + strides[i] - 1) / strides[i];
+      T pad_sum =
+          std::max((out_size - 1) * strides[i] + kernel_size[i] - data_shape[i],
+                   static_cast<T>(0));
+      T pad_0 = pad_sum / 2;
+      T pad_1 = pad_sum - pad_0;
+      *(paddings->begin() + i * 2) = pad_0;
+      *(paddings->begin() + i * 2 + 1) = pad_1;
+    }
+  } else if (padding_algorithm == "VALID") {
+    for (auto it = paddings->begin(); it != paddings->end(); it++) {
+      *it = 0;
+    }
+  }
+
+  // if global_pooling == true or adaptive == true, padding will be ignore
+  if (global_pooling || adaptive) {
+    for (auto it = paddings->begin(); it != paddings->end(); it++) {
+      *it = 0;
+    }
+  }
+}
+
+template <typename T = int>
+inline void UpdateKernelSize(std::vector<T>* kernel_size,
+                             const DDim data_dims) {
+  kernel_size->resize(static_cast<size_t>(data_dims.size()));
+  for (size_t i = 0; i < kernel_size->size(); ++i) {
+    *(kernel_size->begin() + i) = static_cast<T>(data_dims[i]);
+  }
+}
+
+}  // namespace funcs
+}  // namespace phi
diff --git a/paddle/phi/kernels/funcs/reduce_function.h b/paddle/phi/kernels/funcs/reduce_function.h
new file mode 100644
index 0000000000000000000000000000000000000000..5834f091d9a4de02afe7488ededc0189ae6f21d0
--- /dev/null
+++ b/paddle/phi/kernels/funcs/reduce_function.h
@@ -0,0 +1,1239 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+// CUDA and HIP use same api
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+
+#include <algorithm>
+#include <cmath>
+#include <numeric>
+#include <set>
+#include <vector>
+
+#ifdef __NVCC__
+#include "cub/cub.cuh"
+#endif
+
+#ifdef __HIPCC__
+#include <hipcub/hipcub.hpp>
+namespace cub = hipcub;
+#endif
+
+#include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
+#include "paddle/fluid/platform/fast_divmod.h"
+#include "paddle/phi/api/ext/dispatch.h"
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/backends/gpu/gpu_info.h"
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/core/enforce.h"
+#include "paddle/phi/core/utils/array.h"
+#include "paddle/phi/kernels/cast_kernel.h"
+#include "paddle/phi/kernels/empty_kernel.h"
+#include "paddle/phi/kernels/funcs/elementwise_base.h"
+#include "paddle/phi/kernels/primitive/kernel_primitives.h"
+#include "paddle/utils/string/string_helper.h"
+
+// Reduce split or not, Whether to use ReduceHigherDim
+#define REDUCE_SPLIT_BOUNDARY 512
+#define REDUCE_VEC_SIZE 4
+
+namespace kps = phi::kps;
+
+namespace phi {
+namespace funcs {
+
+namespace details {
+
+static inline int GetLastPow2(int n) {
+  n |= (n >> 1);
+  n |= (n >> 2);
+  n |= (n >> 4);
+  n |= (n >> 8);
+  n |= (n >> 16);
+  return std::max(1, n - (n >> 1));
+}
+
+static inline int64_t AlignUp(int64_t a, int64_t b) { return (a + b - 1) / b; }
+
+// get strides of x_dim, reduce_dim and left_dim for reduceLastDim and reduceAny
+static inline std::vector<int> GetDimStrides(const std::vector<int>& dims,
+                                             const std::vector<int>& idx) {
+  int n = static_cast<int>(idx.size());
+  if (n == 0) return std::vector<int>();
+  std::vector<int> strides(n);
+  strides.back() = 1;
+  for (int i = n - 2; i >= 0; --i) {
+    strides[i] = strides[i + 1] * dims[idx[i + 1]];
+  }
+  return strides;
+}
+
+// get blockDim for reduceLastDim and reduceAny
+static inline int GetBlockDim(int block_dim) {
+  return block_dim >= kps::details::kReduceMaxThread
+             ? kps::details::kReduceMaxThread
+             : GetLastPow2(block_dim);
+}
+
+// check reduce rand is valid
+static inline void CheckReduceRank(int reduce_rank, int rank) {
+  if (rank % 2 == 0) {
+    PADDLE_ENFORCE_EQ(reduce_rank,
+                      rank / 2,
+                      phi::errors::InvalidArgument(
+                          "ReduceOp: invalid reduce rank. When rank = %d, "
+                          "reduce_rank must be %d, but got %d.",
+                          rank,
+                          rank / 2,
+                          reduce_rank));
+  } else {
+    auto lower_rank = (rank - 1) / 2;
+    auto upper_rank = (rank + 1) / 2;
+    PADDLE_ENFORCE_EQ(
+        reduce_rank == lower_rank || reduce_rank == upper_rank,
+        true,
+        phi::errors::InvalidArgument(
+            "ReduceOp: invalid reduce rank. When rank = %d, reduce_rank "
+            "must be %d or %d, but got %d.",
+            rank,
+            lower_rank,
+            upper_rank,
+            reduce_rank));
+  }
+}
+
+// convert dims from vector to array
+template <typename T, size_t ElementCount, typename VectorLikeType>
+static inline phi::Array<T, ElementCount> VectorToArray(
+    const VectorLikeType& vec) {
+  PADDLE_ENFORCE_LE(
+      vec.size(),
+      ElementCount,
+      phi::errors::InvalidArgument("Cub reduce Array: size not match. Received "
+                                   "vec.size() %d > ElementCount %d.",
+                                   vec.size(),
+                                   ElementCount));
+  size_t n = static_cast<size_t>(vec.size());
+  phi::Array<T, ElementCount> ret;
+  for (size_t i = 0; i < n; ++i) {
+    ret[i] = vec[i];
+  }
+  return ret;
+}
+
+static inline std::vector<int> GetReduceDim(const std::vector<int64_t>& dims,
+                                            int dim_size,
+                                            bool reduce_all) {
+  std::vector<int> reduce_dims;
+  if (reduce_all) {
+    reduce_dims.resize(dim_size);
+    int reduce_size = reduce_dims.size();
+    for (int i = 0; i < reduce_size; ++i) {
+      reduce_dims[i] = i;
+    }
+  } else {
+    for (auto e : dims) {
+      PADDLE_ENFORCE_LT(e,
+                        dim_size,
+                        phi::errors::InvalidArgument(
+                            "ReduceOp: invalid axis, when x_dims is %d, "
+                            "axis[i] should less than x_dims, but got %d.",
+                            dim_size,
+                            e));
+      reduce_dims.push_back(e >= 0 ? e : e + dim_size);
+    }
+  }
+  return reduce_dims;
+}
+
+}  // namespace details
+
+constexpr int kMaxRank = phi::DDim::kMaxRank;
+
+enum ReduceType {
+  kReduceLastDim = 0x01,    // when reduce_dim[0] == x_dim.size() - 1;
+  kReduceHigherDim = 0x02,  // ReduceFirstDim or reduceSecondDim
+  kReduceAny = 0x03,        // when reduce_dim.size() > 1
+};
+
+struct IndexCalculator {
+  IndexCalculator(int dim,
+                  const std::vector<int>& cal_dims,
+                  const std::vector<int>& cal_strides,
+                  const std::vector<int>& full_strides)
+      : dim(dim) {
+    dims = details::VectorToArray<int, kMaxRank>(cal_dims);
+    strides = details::VectorToArray<int, kMaxRank>(full_strides);
+    reduce_strides = details::VectorToArray<int, kMaxRank>(cal_strides);
+#ifndef PADDLE_WITH_XPU_KP
+    std::vector<paddle::platform::FastDivMod> cal_divmoders;
+    // fast divmod
+    for (auto i : cal_strides) {
+      cal_divmoders.push_back(paddle::platform::FastDivMod(i));
+    }
+    divmoders = details::VectorToArray<paddle::platform::FastDivMod, kMaxRank>(
+        cal_divmoders);
+#endif
+  }
+
+  __device__ inline int operator()(int offset) const {
+#ifdef PADDLE_WITH_XPU_KP
+    int index = 0;
+#pragma unroll
+    for (int i = 0; i < kMaxRank; ++i) {
+      if (i == dim) {
+        break;
+      }
+      index += (offset / reduce_strides[i]) * strides[dims[i]];
+      offset = offset % reduce_strides[i];
+    }
+    return index;
+#else
+    int index = 0;
+#pragma unroll
+    for (int i = 0; i < kMaxRank; ++i) {
+      if (i == dim) {
+        break;
+      }
+      auto divmod = divmoders[i].Divmod(offset);
+      index += (divmod.val[0] * strides[dims[i]]);
+      offset = divmod.val[1];
+    }
+    return index;
+#endif
+  }
+
+  int dim;
+  phi::Array<int, kMaxRank> dims;
+  phi::Array<int, kMaxRank> strides;
+  phi::Array<int, kMaxRank> reduce_strides;
+#ifndef PADDLE_WITH_XPU2
+  phi::Array<paddle::platform::FastDivMod, kMaxRank> divmoders;
+#endif
+};
+
+template <bool ReduceLastDim = false>
+struct ReduceIndexMapping {
+  const kps::DimConfig dim;
+  HOSTDEVICE explicit ReduceIndexMapping(const kps::DimConfig& dims)
+      : dim(dims) {}
+
+  __device__ __forceinline__ int BlockIdX() {
+#ifdef PADDLE_WITH_XPU2
+    if (ReduceLastDim) {
+      return (cluster_id() / dim.split_num_x % dim.split_num_y);
+    } else {
+      return cluster_id() % dim.split_num_x;
+    }
+#else
+    return blockIdx.x;
+#endif
+  }
+
+  __device__ __forceinline__ int BlockIdY() {
+#ifdef PADDLE_WITH_XPU2
+    if (ReduceLastDim) {
+      return (cluster_id() % dim.split_num_x);
+    } else {
+      return (cluster_id() / dim.split_num_x % dim.split_num_y);
+    }
+#else
+    return blockIdx.y;
+#endif
+  }
+
+  __device__ __forceinline__ int BlockDimX() {
+#ifdef PADDLE_WITH_XPU2
+    return dim.deal_size_x;
+#else
+    return blockDim.x;
+#endif
+  }
+
+  __device__ __forceinline__ int BlockDimY() {
+#ifdef PADDLE_WITH_XPU2
+    return 1;
+#else
+    return blockDim.y;
+#endif
+  }
+
+  __device__ __forceinline__ int GridDimX() {
+#ifdef PADDLE_WITH_XPU2
+    if (ReduceLastDim) {
+      return dim.split_num_y;
+    } else {
+      return dim.split_num_x;
+    }
+#else
+    return gridDim.x;
+#endif
+  }
+
+  __device__ __forceinline__ int GridDimY() {
+#ifdef PADDLE_WITH_XPU2
+    if (ReduceLastDim) {
+      return dim.split_num_x;
+    } else {
+      return dim.split_num_y;
+    }
+#else
+    return gridDim.y;
+#endif
+  }
+
+  __device__ __forceinline__ int GetLoopSize() {
+#ifdef PADDLE_WITH_XPU2
+    if (ReduceLastDim) {
+      return dim.deal_size_y;
+    } else {
+      return dim.deal_size_x;
+    }
+#else
+    return 1;
+#endif
+  }
+};
+
+// when reduce_type == kReduceLastDim this struct will be used
+// for higher performance
+struct OneDimIndexCal {
+  explicit OneDimIndexCal(int num) : stride(num) {}
+
+  __device__ inline int operator()(int index) const { return index * stride; }
+  int stride;
+};
+
+// reduce config
+template <typename Ty>
+struct ReduceConfig {
+  ReduceConfig(const std::vector<int>& origin_reduce_dims,
+               const std::vector<int>& origin_x_dim)
+      : reduce_dims_origin(origin_reduce_dims), x_dim(origin_x_dim) {}
+
+  // get the parameters of reduceKernel
+  void Run() {
+    // step1: update the reduce_dim left_dim and x_dim
+    SetReduceDim();
+
+    // step2: get the strides of dim for reduceAny and reduceLastDim
+    SetStrides();
+
+    // step3: get the type of reduce
+    SetReduceType();
+
+    // step4: set the block and grid for launch kernel
+    SetBlockDim();
+  }
+
+  // when should_reduce_again is true, we need malloc temp space for temp data
+  void SetOutputData(Ty* y_data,
+                     const phi::GPUContext& dev_ctx,
+                     phi::DenseTensor* tmp) {
+    if (should_reduce_again) {
+      tmp->Resize(phi::make_ddim(
+          {static_cast<int64_t>(left_num * grid.z * grid.y * sizeof(Ty))}));
+      output_data = dev_ctx.Alloc<Ty>(tmp);
+    } else {
+      output_data = y_data;
+    }
+  }
+
+ private:
+  // set reduce_dim, left_dim and update x_dim
+  // eg: x_dim = [2, 4, 6] origin_reduce_dims = [0, 1]
+  //     --SetReduceDim--> x_dim = [8,6], reduce_dim = [0], left_dim = [1]
+  void SetReduceDim() {
+    std::set<int> reduce_set;
+    for (auto e : reduce_dims_origin) {
+      auto pos = e >= 0 ? e : e + x_dim.size();
+      reduce_set.insert(pos);
+    }
+
+    std::vector<int> reduce_dim_temp(reduce_set.begin(), reduce_set.end());
+    std::sort(reduce_dim_temp.begin(), reduce_dim_temp.end());
+
+    // update reduce_dim and x_dim
+    std::vector<int> x_new_dim;
+
+    reduce_dim.push_back(reduce_dim_temp[0]);
+    x_new_dim.push_back(x_dim[0]);
+
+    int idx_reduce = 1;
+    int num = 0;
+
+    if (reduce_dim_temp.size() > 1) {
+      for (int i = 1; i < x_dim.size(); i++) {
+        if ((idx_reduce < reduce_dim_temp.size()) &&
+            (i == reduce_dim_temp[idx_reduce])) {
+          int result =
+              reduce_dim_temp[idx_reduce] - reduce_dim[reduce_dim.size() - 1];
+          bool is_equal = ((result - num) == 1);
+          if (is_equal) {
+            x_new_dim[x_new_dim.size() - 1] *= x_dim[i];
+            num++;
+          } else {
+            reduce_dim.push_back(reduce_dim_temp[idx_reduce] - num);
+            x_new_dim.push_back(x_dim[i]);
+          }
+          idx_reduce++;
+        } else {
+          x_new_dim.push_back(x_dim[i]);
+        }
+      }
+    } else {
+      x_new_dim = x_dim;
+    }
+
+    // update x_dim
+    x_dim = x_new_dim;
+    std::vector<int>().swap(x_new_dim);
+
+    std::vector<int> reduce_dim_new;
+    int is_reduced = 0;
+    for (auto e : reduce_dim) {
+      is_reduced |= 1 << e;
+    }
+
+    std::vector<int>().swap(reduce_dim);
+
+    for (int i = 0; i < x_dim.size(); i++) {
+      if ((i == 0) || (((is_reduced >> i) ^ (is_reduced >> (i - 1))) & 1)) {
+        x_new_dim.push_back(x_dim[i]);
+        if ((is_reduced >> i) & 1)
+          reduce_dim_new.push_back(x_new_dim.size() - 1);
+      } else {
+        x_new_dim[x_new_dim.size() - 1] *= x_dim[i];
+      }
+    }
+
+    x_dim = x_new_dim;
+    reduce_dim = reduce_dim_new;
+
+    int x_rank = static_cast<int>(x_dim.size());
+    std::set<int> left_set;
+
+    for (int i = 0; i < x_rank; ++i) {
+      left_set.insert(i);
+    }
+
+    for (auto e : reduce_dim) {
+      left_set.erase(e);
+    }
+
+    left_dim.assign(left_set.begin(), left_set.end());
+
+    // if the last dim gets involved in reduction
+    reduce_last_dim = (reduce_dim.back() == x_dim.size() - 1);
+  }
+
+  // set x_strides, reduce_strides, left_strides for reduceLastDim and reduceAny
+  // eg: x_dim = [8, 6], reduce_dim = [0], left_dim = [1]
+  //     --SetStrides--> x_strides= [6,1], reduce_strides = [1],
+  //     left_strides = [1]
+  void SetStrides() {
+    std::vector<int> idx_dim;
+    for (int i = 0; i < x_dim.size(); i++) {
+      idx_dim.push_back(i);
+    }
+
+    x_strides = details::GetDimStrides(x_dim, idx_dim);
+    reduce_strides = details::GetDimStrides(x_dim, reduce_dim);
+    left_strides = details::GetDimStrides(x_dim, left_dim);
+    reduce_num = reduce_strides[0] * x_dim[reduce_dim[0]];
+
+    left_num = 1;
+    if (left_dim.size()) {
+      left_num = left_strides[0] * x_dim[left_dim[0]];
+    }
+  }
+
+  // get the reduceType
+  // eg: x_dim = [8, 6] reduce_dim = [0] --> ReduceHigherDim -->reduceFirstDim
+  //     x_dim = [8, 6] reduce_dim = [1] --> reduceLastDim
+  //     x_dim = [8] reduce_dim = [0] --> reduceAll
+  //     x_dim = [8, 6, 4, 2] reduce_dim = [0, 2] --> reduceAny
+  void SetReduceType() {
+    int rank = x_dim.size();
+    int reduce_rank = reduce_dim.size();
+    bool is_last_dim =
+        (rank == 2) && (reduce_rank == 1) && (reduce_dim[0] == 1);
+    if (rank == reduce_rank || is_last_dim) {
+#ifdef PADDLE_WITH_XPU_KP
+      reduce_type = static_cast<int>(ReduceType::kReduceAny);
+#else
+      reduce_type = static_cast<int>(ReduceType::kReduceLastDim);
+#endif
+    } else if (reduce_rank == 1) {
+// ReduceFirstDim and reduceSecondDim
+#ifdef PADDLE_WITH_XPU_KP
+      if (reduce_dim[0] == 0) {
+        reduce_type = static_cast<int>(ReduceType::kReduceHigherDim);
+      } else {
+        reduce_type = static_cast<int>(ReduceType::kReduceAny);
+      }
+#else
+      reduce_type = static_cast<int>(ReduceType::kReduceHigherDim);
+#endif
+    } else {
+      reduce_type = static_cast<int>(ReduceType::kReduceAny);
+    }
+  }
+
+#ifndef PADDLE_WITH_XPU_KP
+  void SetBlockDimForReduceAny(dim3* block_dim, dim3* grid_dim) {
+    constexpr int min_reduce_num_per_thread = 16;
+    constexpr int max_reduce_num_per_thread = 256;
+    constexpr int max_num_threads = kps::details::kReduceMaxThread;
+
+    // set block size.
+    // 1. If reduce_last_dim == true, all the threads whose threadIdx.y are same
+    //    will process the reduction for one output.
+    //    The number of output for one block is blockDim.y;
+    // 2. If reduce_last_dim == false, different threadIdx.x will process
+    //    different reduction and gets the output separately. If it is
+    //    necessary, it should reduce in block y.
+    //    The number of output for one block is blockDim.x;
+    int block_x, block_y;
+    int grid_num, reduce_num_per_thread;
+    if (reduce_last_dim) {
+      block_x = details::GetBlockDim(reduce_num);
+      block_y = details::GetBlockDim(left_num);
+      block_dim->x = block_x;
+      block_dim->y =
+          std::min(block_y, static_cast<int>(max_num_threads / block_dim->x));
+      grid_num = details::AlignUp(left_num, block_dim->y);
+      reduce_num_per_thread = details::AlignUp(reduce_num, block_dim->x);
+    } else {
+      block_x = details::GetBlockDim(left_num);
+      block_y = details::GetBlockDim(reduce_num);
+      block_dim->x = std::min(block_x, 32);
+      block_dim->y =
+          std::min(block_y, static_cast<int>(max_num_threads / block_dim->x));
+      block_dim->x =
+          std::min(block_x, static_cast<int>(max_num_threads / block_dim->y));
+      grid_num = details::AlignUp(left_num, block_dim->x);
+      reduce_num_per_thread = details::AlignUp(reduce_num, block_dim->y);
+    }
+    int device_id = paddle::platform::GetCurrentDeviceId();
+    int max_mp = paddle::platform::GetGPUMultiProcessors(device_id);
+    int max_threads_per_mp =
+        paddle::platform::GetGPUMaxThreadsPerMultiProcessor(device_id);
+    int max_threads = max_threads_per_mp * max_mp;
+    int num_threads = block_dim->x * block_dim->y;
+    int max_num_blocks = max_threads / num_threads;
+
+    // set grid size.
+    // Whether to set grid.y larger than 1, there are 3 following rules:
+    // 1. The number that each thread process should no less than
+    //    min_reduce_num_per_threadbut no more than max_reduce_num_per_thread;
+    // 2. It should maximize the utilization of SM.
+    // So we choose the minimum between input_split_num_1 and input_split_num_3
+    // to make each thread process as mush data as possible. Meanwhile,
+    // the number cannot be larger than max_reduce_num_per_thread, so we
+    // choose the maximum between the result above and input_split_num_2.
+    int input_split_num_1 =
+        details::AlignUp(reduce_num_per_thread, min_reduce_num_per_thread);
+    int input_split_num_2 =
+        details::AlignUp(reduce_num_per_thread, max_reduce_num_per_thread);
+    int input_split_num_3 = details::AlignUp(max_num_blocks, grid_num);
+
+    grid_dim->x = grid_num;
+    grid_dim->y = std::max(std::min(input_split_num_1, input_split_num_3),
+                           input_split_num_2);
+    // if grid.y > 1, we need launch reduce kernel again.
+    if (grid_dim->y > 1) {
+      should_reduce_again = true;
+    }
+  }
+
+  // set block and grid for launch kernel
+  // for ReduceHigherDim: if block is enough -> splite reduce_num
+  //                     else init block(32, 1) grid(block_num, 1)
+  // for others: block(block_num, 1) , grid(left_num, 1)
+  void SetBlockDimForHigher(dim3* block_dim, dim3* grid_dim) {
+    int last_dim_num = x_dim.back();
+    // update left_num
+    int grid_z = left_num / last_dim_num;
+    left_num = last_dim_num;
+    grid_dim->z = grid_z;
+    int device_id = paddle::platform::GetCurrentDeviceId();
+    int max_mp = paddle::platform::GetGPUMultiProcessors(device_id);
+    int max_threads_per_mp =
+        paddle::platform::GetGPUMaxThreadsPerMultiProcessor(device_id);
+    int max_threads = max_threads_per_mp * max_mp;
+    // init
+    int num_block = (max_threads / left_num);
+    block_dim->x = details::GetBlockDim(left_num);
+    grid_dim->x = details::AlignUp(left_num, block_dim->x);
+    blocking_size = reduce_num;
+
+    if (num_block > 1 && reduce_num >= REDUCE_SPLIT_BOUNDARY) {
+      blocking_size = details::GetLastPow2(reduce_num / num_block);
+      if (blocking_size <= 1) {
+        blocking_size = details::GetLastPow2(sqrt(reduce_num));
+      } else if (blocking_size * 2 < reduce_num) {
+        blocking_size *= 2;
+      }
+      should_reduce_again = true;
+      grid_dim->y = details::AlignUp(reduce_num, blocking_size);
+    }
+  }
+#endif
+
+  void SetBlockDim() {
+    // init
+    int block_num = details::GetBlockDim(reduce_num);
+    should_reduce_again = false;
+    dim3 block_dim(block_num, 1, 1);
+    dim3 grid_dim(left_num, 1, 1);
+    blocking_size = reduce_num;
+#ifdef PADDLE_WITH_XPU_KP
+    if (reduce_last_dim) {
+      block_dim.x = 64;
+      block_dim.y = reduce_num;
+      grid_dim.x = 1;
+      grid_dim.y = 8;
+    } else {
+      block_dim.x = 64;
+      block_dim.y = left_num;
+      grid_dim.x = 8;
+      grid_dim.y = 1;
+    }
+#else
+    if (reduce_type == ReduceType::kReduceHigherDim) {
+      SetBlockDimForHigher(&block_dim, &grid_dim);
+    } else {
+      SetBlockDimForReduceAny(&block_dim, &grid_dim);
+    }
+#endif
+
+    block = block_dim;
+    grid = grid_dim;
+  }
+
+ public:
+  std::vector<int> reduce_dims_origin;
+  std::vector<int> reduce_dim;
+  std::vector<int> x_dim;
+  std::vector<int> left_dim;
+  std::vector<int> x_strides;
+  std::vector<int> left_strides;
+  std::vector<int> reduce_strides;
+
+  int reduce_type;
+  int reduce_num;
+  int left_num;
+  int blocking_size;
+  bool should_reduce_again;
+  bool reduce_last_dim;
+
+  Ty* output_data;
+
+  dim3 block;
+  dim3 grid;
+};
+
+// when reduce_dim.size() == 1 and reduce_dim[0] == x_dim.size() - 1, or
+// when reduce_dim.size() != 1 and reduce_dim.size() != x_dim.size(), this
+// function will be used
+template <typename Tx,
+          typename Ty,
+          typename MPType,
+          typename ReduceOp,
+          typename TransformOp,
+          typename Calculator>
+__global__ void ReduceAnyKernel(const Tx* x,
+                                Ty* y,
+                                ReduceOp reducer,
+                                TransformOp transformer,
+                                MPType init,
+                                int reduce_num,
+                                int left_num,
+                                bool reduce_last_dim,
+                                const Calculator reduce_index_calculator,
+                                const Calculator left_index_calculator,
+                                const kps::DimConfig dim) {
+  int input_idx, left_idx, stride;
+  int block_size = 0;
+  bool need_store = true;
+  int loop_left = 0;
+  int tid = 0;
+  // the last dim gets involved in reduction
+  int store_offset = 0;
+  int stride_left = 0;
+  if (reduce_last_dim) {
+    auto block = ReduceIndexMapping<true>(dim);
+    input_idx = block.BlockIdY() * block.BlockDimX();
+    left_idx = block.BlockIdX() * block.BlockDimY() + THREAD_ID_Y;
+    stride = block.GridDimY() * block.BlockDimX();
+    block_size = block.BlockDimX();
+    need_store = (THREAD_ID_X == 0) && (left_idx < left_num);
+    store_offset = block.BlockIdY() * left_num + left_idx;
+    loop_left = min(block.GetLoopSize(), left_num - left_idx);
+    stride_left = 1;
+    tid = THREAD_ID_X;
+  } else {
+    auto block = ReduceIndexMapping<false>(dim);
+    input_idx = block.BlockIdY() * block.BlockDimY();
+    left_idx = block.BlockIdX() * block.BlockDimX() + THREAD_ID_X;
+    stride = block.GridDimY() * block.BlockDimY();
+    block_size = block.BlockDimY();
+    need_store = (THREAD_ID_Y == 0) && (left_idx < left_num);
+    loop_left = min(block.GetLoopSize(), left_num - left_idx);
+    stride_left = block.BlockDimX() * block.GridDimX();
+    store_offset = block.BlockIdY() * left_num + left_idx;
+    tid = THREAD_ID_Y;
+  }
+  // calculate the offset, means the addr where each thread really start.
+  // 1. reduce for each thread
+  MPType input_compute[REDUCE_VEC_SIZE];
+  Tx input_reg[REDUCE_VEC_SIZE];
+  int input_idx_tmp = input_idx;
+  for (int i = 0; i < loop_left; i += stride_left) {
+    int input_offset = left_index_calculator(left_idx + i);
+    const _ptr_ Tx* input = x + input_offset;
+    MPType reduce_var = init;
+    // load REDUCE_VEC_SIZE data once, and then compute
+    int bound = reduce_num - (REDUCE_VEC_SIZE - 1) * stride;
+    input_idx = input_idx_tmp;
+    for (; input_idx + block_size < bound;
+         input_idx += REDUCE_VEC_SIZE * stride) {
+      kps::ReadDataReduce<Tx,
+                          Tx,
+                          1,
+                          REDUCE_VEC_SIZE,
+                          1,
+                          1,
+                          Calculator,
+                          kps::IdentityFunctor<Tx>,
+                          false>(&input_reg[0],
+                                 input,
+                                 input_idx,
+                                 reduce_index_calculator,
+                                 1,
+                                 reduce_num,
+                                 1,
+                                 stride,
+                                 kps::IdentityFunctor<Tx>(),
+                                 reduce_last_dim);
+      kps::ElementwiseUnary<Tx, MPType, REDUCE_VEC_SIZE, 1, 1, TransformOp>(
+          &input_compute[0], &input_reg[0], transformer);
+      kps::Reduce<MPType,
+                  REDUCE_VEC_SIZE,
+                  1,
+                  1,
+                  ReduceOp,
+                  kps::details::ReduceMode::kLocalMode>(
+          &reduce_var, &input_compute[0], reducer, reduce_last_dim);
+    }
+
+    kps::Init<MPType, REDUCE_VEC_SIZE>(&input_compute[0], init);
+    kps::ReadDataReduce<Tx,
+                        MPType,
+                        1,
+                        REDUCE_VEC_SIZE,
+                        1,
+                        1,
+                        Calculator,
+                        TransformOp,
+                        true>(&input_compute[0],
+                              input,
+                              input_idx,
+                              reduce_index_calculator,
+                              1,
+                              reduce_num - input_idx,
+                              1,
+                              stride,
+                              transformer,
+                              reduce_last_dim);
+    kps::Reduce<MPType,
+                REDUCE_VEC_SIZE,
+                1,
+                1,
+                ReduceOp,
+                kps::details::ReduceMode::kLocalMode>(
+        &reduce_var, &input_compute[0], reducer, reduce_last_dim);
+
+    kps::Reduce<MPType, 1, 1, 1, ReduceOp, kps::details::kGlobalMode>(
+        &reduce_var, &reduce_var, reducer, reduce_last_dim);
+    if (need_store) {
+      y[store_offset + i] = static_cast<Ty>(reduce_var);
+    }
+  }
+}
+
+template <typename Tx,
+          typename Ty,
+          typename MPType,
+          typename ReduceOp,
+          typename TransformOp>
+__global__ void ReduceHigherDimKernel(const Tx* x,
+                                      Ty* y,
+                                      ReduceOp reducer,
+                                      TransformOp transformer,
+                                      MPType init,
+                                      int reduce_num,
+                                      int left_num,
+                                      int blocking_size,
+                                      const kps::DimConfig dim) {
+  // when reduce_dim.size() == 1 and reduce_dim[0] != x_dim.size() - 1, this
+  // function will be used
+  auto block = ReduceIndexMapping<false>(dim);
+  int idy = block.BlockIdY() * blocking_size;
+  int idx = block.BlockIdX() * block.BlockDimX();
+  int idz = BLOCK_ID_Z * left_num;
+  int stride = dim.split_num_x * dim.deal_size_x;
+  int size = left_num - dim.rem_x;
+  int loop_size = min(reduce_num - idy, blocking_size);
+  int store_offset = block.BlockIdY() * left_num + idz * block.GridDimY();
+  int block_offset = idy * left_num + idz * reduce_num;
+  const _ptr_ Tx* input = x + block_offset;
+  Tx reduce_input;
+  for (; idx < size; idx += stride) {
+    MPType reduce_var = init;
+    MPType reduce_compute = init;
+    for (int loop_idx = 0; loop_idx < loop_size; ++loop_idx) {
+      kps::ReadData<Tx, Tx, 1, 1, 1, false>(&reduce_input,
+                                            input + loop_idx * left_num + idx,
+                                            block.BlockDimX(),
+                                            1,
+                                            1,
+                                            left_num);
+      kps::ElementwiseUnary<Tx, MPType, REDUCE_VEC_SIZE, 1, 1, TransformOp>(
+          &reduce_compute, &reduce_input, transformer);
+      kps::Reduce<MPType,
+                  1,
+                  1,
+                  1,
+                  ReduceOp,
+                  kps::details::ReduceMode::kLocalMode>(
+          &reduce_var, &reduce_compute, reducer, false);
+    }
+    Ty result = static_cast<Ty>(reduce_var);
+    kps::WriteData<Ty, 1, 1, 1, false>(
+        y + store_offset + idx, &result, block.BlockDimX());
+  }
+
+  if (idx < left_num) {
+    MPType reduce_var = init;
+    MPType reduce_compute = init;
+    for (int loop_idx = 0; loop_idx < loop_size; ++loop_idx) {
+      kps::ReadData<Tx, Tx, 1, 1, 1, true>(&reduce_input,
+                                           input + loop_idx * left_num + idx,
+                                           dim.rem_x,
+                                           1,
+                                           1,
+                                           left_num);
+      kps::ElementwiseUnary<Tx, MPType, REDUCE_VEC_SIZE, 1, 1, TransformOp>(
+          &reduce_compute, &reduce_input, transformer);
+      kps::Reduce<MPType,
+                  1,
+                  1,
+                  1,
+                  ReduceOp,
+                  kps::details::ReduceMode::kLocalMode>(
+          &reduce_var, &reduce_compute, reducer, false);
+    }
+    Ty result = static_cast<Ty>(reduce_var);
+    kps::WriteData<Ty, 1, 1, 1, true>(
+        y + store_offset + idx, &result, dim.rem_x);
+  }
+}
+
+template <typename Tx,
+          typename Ty,
+          typename MPType,
+          typename ReduceOp,
+          typename TransformOp>
+static void LaunchReduceKernel(const Tx* x_data,
+                               Ty* y_data,
+                               const ReduceOp& reducer,
+                               const TransformOp& transform,
+                               MPType init,
+                               KPStream stream,
+                               ReduceConfig<Ty> config) {
+  if (config.reduce_type == kReduceLastDim) {
+    int stride_reduce = 1;
+    int stride_left = config.reduce_num;
+    // for higher performance
+    auto reduce_index_calculator = OneDimIndexCal(stride_reduce);
+    auto left_index_calculator = OneDimIndexCal(stride_left);
+
+    kps::DimConfig dim = kps::DimConfig(config.grid.x,
+                                        config.grid.y,
+                                        config.grid.z,
+                                        config.block.x,
+                                        config.block.y,
+                                        0);
+    dim.SetRem(config.reduce_num % config.block.x, 0, 0);
+
+#ifdef PADDLE_WITH_XPU_KP
+    ReduceAnyKernel<Tx,
+                    Ty,
+                    MPType,
+                    ReduceOp,
+                    TransformOp,
+                    OneDimIndexCal><<<8, 64, 0, stream>>>(
+        x_data,
+        config.output_data,
+        reducer,
+        transform,
+        init,
+        config.reduce_num,
+        config.left_num,
+        config.reduce_last_dim,
+        reduce_index_calculator,
+        left_index_calculator,
+        dim);
+#else
+    ReduceAnyKernel<Tx,
+                    Ty,
+                    MPType,
+                    ReduceOp,
+                    TransformOp,
+                    OneDimIndexCal><<<config.grid, config.block, 0, stream>>>(
+        x_data,
+        config.output_data,
+        reducer,
+        transform,
+        init,
+        config.reduce_num,
+        config.left_num,
+        config.reduce_last_dim,
+        reduce_index_calculator,
+        left_index_calculator,
+        dim);
+#endif
+
+  } else {
+    int reduce_rank = config.reduce_strides.size();
+    int left_rank = config.left_strides.size();
+    auto reduce_index_calculator = IndexCalculator(reduce_rank,
+                                                   config.reduce_dim,
+                                                   config.reduce_strides,
+                                                   config.x_strides);
+    auto left_index_calculator = IndexCalculator(
+        left_rank, config.left_dim, config.left_strides, config.x_strides);
+
+    kps::DimConfig dim = kps::DimConfig(config.grid.x,
+                                        config.grid.y,
+                                        config.grid.z,
+                                        config.block.x,
+                                        config.block.y,
+                                        0);
+    dim.SetRem(config.reduce_num % config.block.x, 0, 0);
+
+#ifdef PADDLE_WITH_XPU_KP
+    ReduceAnyKernel<Tx,
+                    Ty,
+                    MPType,
+                    ReduceOp,
+                    TransformOp,
+                    IndexCalculator><<<8, 64, 0, stream>>>(
+        x_data,
+        config.output_data,
+        reducer,
+        transform,
+        init,
+        config.reduce_num,
+        config.left_num,
+        config.reduce_last_dim,
+        reduce_index_calculator,
+        left_index_calculator,
+        dim);
+#else
+    ReduceAnyKernel<Tx,
+                    Ty,
+                    MPType,
+                    ReduceOp,
+                    TransformOp,
+                    IndexCalculator><<<config.grid, config.block, 0, stream>>>(
+        x_data,
+        config.output_data,
+        reducer,
+        transform,
+        init,
+        config.reduce_num,
+        config.left_num,
+        config.reduce_last_dim,
+        reduce_index_calculator,
+        left_index_calculator,
+        dim);
+#endif
+  }
+
+  if (config.should_reduce_again) {
+    dim3 block;
+    dim3 grid;
+    if (config.reduce_last_dim) {
+      block = dim3(32, 1, 1);
+      grid = dim3(details::AlignUp(config.left_num, 32), 1, 1);
+    } else {
+      block = dim3(config.block.x, 1, 1);
+      grid = dim3(config.grid.x, 1, config.grid.z);
+    }
+
+    auto last_index = OneDimIndexCal(1);
+    auto first_index = OneDimIndexCal(config.left_num);
+    kps::DimConfig dim =
+        kps::DimConfig(grid.x, grid.y, grid.z, block.x, config.grid.y, 0);
+    dim.SetRem(config.left_num % block.x, 0, 0);
+#ifdef PADDLE_WITH_XPU_KP
+    ReduceHigherDimKernel<
+        Ty,
+        Ty,
+        MPType,
+        ReduceOp,
+        kps::IdentityFunctor<Ty, MPType>><<<8, 64, 0, stream>>>(
+        config.output_data,
+        y_data,
+        reducer,
+        kps::IdentityFunctor<Ty, MPType>(),
+        init,
+        config.grid.y,
+        config.left_num,
+        config.grid.y,
+        dim);
+#else
+    ReduceHigherDimKernel<
+        Ty,
+        Ty,
+        MPType,
+        ReduceOp,
+        kps::IdentityFunctor<Ty, MPType>><<<grid, block, 0, stream>>>(
+        config.output_data,
+        y_data,
+        reducer,
+        kps::IdentityFunctor<Ty, MPType>(),
+        init,
+        config.grid.y,
+        config.left_num,
+        config.grid.y,
+        dim);
+#endif
+  }
+}
+
+template <typename Tx,
+          typename Ty,
+          template <typename> class ReduceOp,
+          typename TransformOp>
+static typename std::enable_if<!std::is_same<Tx, phi::dtype::float16>::value,
+                               void>::type
+CubTensorReduceImpl(const Tx* x_data,
+                    Ty* y_data,
+                    const TransformOp& transform,
+                    int reduce_num,
+                    const phi::GPUContext& dev_ctx,
+                    KPStream stream) {
+  auto reducer = ReduceOp<Ty>();
+  cub::TransformInputIterator<Ty, TransformOp, const Tx*> trans_x(x_data,
+                                                                  transform);
+  size_t temp_storage_bytes = 0;
+  cub::DeviceReduce::Reduce(nullptr,
+                            temp_storage_bytes,
+                            trans_x,
+                            y_data,
+                            reduce_num,
+                            reducer,
+                            reducer.initial(),
+                            stream);
+  phi::DenseTensor tmp = phi::Empty<uint8_t, phi::GPUContext>(
+      dev_ctx, {static_cast<int64_t>(temp_storage_bytes)});
+
+  auto* temp_storage = dev_ctx.Alloc<uint8_t>(&tmp);
+
+  cub::DeviceReduce::Reduce(temp_storage,
+                            temp_storage_bytes,
+                            trans_x,
+                            y_data,
+                            reduce_num,
+                            reducer,
+                            reducer.initial(),
+                            stream);
+}
+
+template <typename Tx,
+          typename Ty,
+          template <typename> class ReduceOp,
+          typename TransformOp>
+static typename std::enable_if<std::is_same<Tx, phi::dtype::float16>::value,
+                               void>::type
+CubTensorReduceImpl(const Tx* x_data,
+                    Ty* y_data,
+                    const TransformOp& transform,
+                    int reduce_num,
+                    const phi::GPUContext& dev_ctx,
+                    KPStream stream) {
+  PADDLE_THROW(phi::errors::InvalidArgument(
+      "Tx should not be float16 when using cub::DeviceReduce::Reduce()."));
+}
+
+template <typename Tx,
+          typename Ty,
+          template <typename> class ReduceOp,
+          typename TransformOp>
+void ReduceKernel(const phi::GPUContext& dev_ctx,
+                  const phi::DenseTensor& x,
+                  phi::DenseTensor* y,
+                  const TransformOp& transform,
+                  const std::vector<int>& origin_reduce_dims) {
+  auto stream = dev_ctx.stream();
+  dev_ctx.Alloc<Ty>(y);
+
+  auto x_dim = phi::vectorize<int>(x.dims());
+  auto config = ReduceConfig<Ty>(origin_reduce_dims, x_dim);
+  config.Run();
+  int numel = x.numel();
+  // after config.run()
+  // SetOutputData for ReduceHigherDim when should_reduce_again is true,
+  // temp_output should be stored temp_data in output_data space or stored in
+  // y_data;
+
+  phi::DDim tmp_ddim;
+  phi::DenseTensor tmp;
+
+  auto x_data = x.data<Tx>();
+  auto y_data = y->data<Ty>();
+
+  if (config.reduce_num == 1) {
+    std::vector<const DenseTensor*> inputs = {&x};
+    std::vector<DenseTensor*> outputs = {y};
+    funcs::ElementwiseKernel<Ty>(dev_ctx, inputs, &outputs, transform);
+    return;
+  }
+
+  config.SetOutputData(y_data, dev_ctx, &tmp);
+  constexpr bool kIsTxFP16 = std::is_same<Tx, phi::dtype::float16>::value;
+  bool use_cub_reduce = config.reduce_num == numel && !kIsTxFP16;
+#ifndef PADDLE_WITH_XPU_KP
+  if (use_cub_reduce) {
+    CubTensorReduceImpl<Tx, Ty, ReduceOp, TransformOp>(
+        x_data, y_data, transform, config.reduce_num, dev_ctx, stream);
+    return;
+  }
+#endif
+
+  using MPType = typename kps::details::MPTypeTrait<Ty>::Type;
+  auto reducer = ReduceOp<MPType>();
+  // launch ReduceHigherDimKernel
+  // when reduce_dim.size() == 1 and reduce_dim[0] != x_dim.size() - 1, this
+  // function will be used
+  // eg: x_dim = {nz, ny, nx}, nx != 1, axis can be 0 or 1
+  //     if axis = 1 then grid.z = nz, grid.y = ny / block_size, grid.x = nx /
+  //     32
+  //     else grid.z = 1, grid.y = ny / block_size, grid.x = nx /32
+  if (config.reduce_type == ReduceType::kReduceHigherDim) {
+    kps::DimConfig dim = kps::DimConfig(config.grid.x,
+                                        config.grid.y,
+                                        config.grid.z,
+                                        config.block.x,
+                                        config.blocking_size,
+                                        0);
+    dim.SetRem(config.left_num % config.block.x,
+               config.reduce_num % config.blocking_size,
+               0);
+
+#ifdef PADDLE_WITH_XPU_KP
+    ReduceHigherDimKernel<Tx,
+                          Ty,
+                          MPType,
+                          ReduceOp<MPType>,
+                          TransformOp><<<8, 64, 0, stream>>>(
+        x_data,
+        config.output_data,
+        reducer,
+        transform,
+        reducer.initial(),
+        config.reduce_num,
+        config.left_num,
+        config.blocking_size,
+        dim);
+#else
+    ReduceHigherDimKernel<
+        Tx,
+        Ty,
+        MPType,
+        ReduceOp<MPType>,
+        TransformOp><<<config.grid, config.block, 0, stream>>>(
+        x_data,
+        config.output_data,
+        reducer,
+        transform,
+        reducer.initial(),
+        config.reduce_num,
+        config.left_num,
+        config.blocking_size,
+        dim);
+#endif
+
+    if (config.should_reduce_again) {
+      dim3 block = dim3(config.block.x, 1, 1);
+      dim3 grid = dim3(config.grid.x, 1, config.grid.z);
+      kps::DimConfig dim2 =
+          kps::DimConfig(grid.x, grid.y, grid.z, block.x, config.grid.y, 0);
+      dim2.SetRem(config.left_num % config.block.x, 0, 0);
+
+#ifdef PADDLE_WITH_XPU_KP
+      ReduceHigherDimKernel<
+          Ty,
+          Ty,
+          MPType,
+          ReduceOp<MPType>,
+          kps::IdentityFunctor<Ty, MPType>><<<8, 64, 0, stream>>>(
+          config.output_data,
+          y_data,
+          reducer,
+          kps::IdentityFunctor<Ty, MPType>(config.grid.y),
+          reducer.initial(),
+          config.grid.y,
+          config.left_num,
+          config.grid.y,
+          dim2);
+#else
+      ReduceHigherDimKernel<
+          Ty,
+          Ty,
+          MPType,
+          ReduceOp<MPType>,
+          kps::IdentityFunctor<Ty, MPType>><<<grid, block, 0, stream>>>(
+          config.output_data,
+          y_data,
+          reducer,
+          kps::IdentityFunctor<Ty, MPType>(config.grid.y),
+          reducer.initial(),
+          config.grid.y,
+          config.left_num,
+          config.grid.y,
+          dim2);
+#endif
+    }
+    return;
+  }
+
+  // when reduce_dim.size() == 1 and reduce_dim[0] == x_dim.size() - 1, or
+  // when reduce_dim.size() != 1 and reduce_dim.size() != x_dim.size(), this
+  // function will be used
+  LaunchReduceKernel<Tx, Ty, MPType, ReduceOp<MPType>, TransformOp>(
+      x_data, y_data, reducer, transform, reducer.initial(), stream, config);
+}
+
+}  // namespace funcs
+
+}  // namespace phi
+
+#endif
diff --git a/paddle/phi/kernels/funcs/reduce_functor.h b/paddle/phi/kernels/funcs/reduce_functor.h
index aebd155ac59cb200b16d22b9a4b781b3b98fc9b1..c74880e04322474e28385997b5022ebf52643bf4 100644
--- a/paddle/phi/kernels/funcs/reduce_functor.h
+++ b/paddle/phi/kernels/funcs/reduce_functor.h
@@ -41,5 +41,37 @@ struct ProdFunctor {
   }
 };
 
+//////// Max Functor ///////
+struct MaxFunctor {
+  template <typename DeviceContext, typename X, typename Y, typename Dim>
+  void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
+    y->device(place) = x->maximum(dim);
+  }
+};
+
+//////// Min Functor ///////
+struct MinFunctor {
+  template <typename DeviceContext, typename X, typename Y, typename Dim>
+  void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
+    y->device(place) = x->minimum(dim);
+  }
+};
+
+//////// All Functor ///////
+struct AllFunctor {
+  template <typename DeviceContext, typename X, typename Y, typename Dim>
+  void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
+    y->device(place) = x->all(dim);
+  }
+};
+
+//////// Any Functor ///////
+struct AnyFunctor {
+  template <typename DeviceContext, typename X, typename Y, typename Dim>
+  void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
+    y->device(place) = x->any(dim);
+  }
+};
+
 }  // namespace funcs
 }  // namespace phi
diff --git a/paddle/phi/kernels/funcs/reduce_grad_functions.h b/paddle/phi/kernels/funcs/reduce_grad_functions.h
new file mode 100644
index 0000000000000000000000000000000000000000..3488b6f2f86b20e0b758f3aa75a6739c40cd81db
--- /dev/null
+++ b/paddle/phi/kernels/funcs/reduce_grad_functions.h
@@ -0,0 +1,177 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/fluid/operators/eigen/eigen_function.h"
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/kernels/cpu/reduce.h"
+#include "paddle/phi/kernels/funcs/eigen/common.h"
+namespace phi {
+
+namespace funcs {
+
+// This ReduceGradFunctor is only the CPU implement.
+template <typename Context, typename T, size_t D, typename Functor>
+void ReduceGradFunctor(const Context& dev_ctx,
+                       const DenseTensor& input0,
+                       const DenseTensor& input1,
+                       const DenseTensor& input2,
+                       DenseTensor* output,
+                       Functor functor,
+                       const std::vector<int>& dims) {
+  auto x = phi::EigenTensor<T, D>::From(input0);
+  auto x_grad = phi::EigenTensor<T, D>::From(*output);
+  auto x_rank = static_cast<int>(x.dimensions().size());
+  auto x_dims = input0.dims();
+  auto reduced_dims_v = phi::vectorize(x_dims);
+  std::vector<int> dims_ref = dims;
+  Eigen::array<int, D> broadcast_dim;
+  for (size_t i = 0; i < D; ++i) broadcast_dim[i] = 1;
+
+  int broad_cats_times = 1;
+  for (size_t i = 0; i < dims_ref.size(); ++i) {
+    if (dims_ref[i] < 0) {
+      dims_ref[i] = x_rank + dims_ref[i];
+    }
+    reduced_dims_v[dims_ref[i]] = 1;
+    broadcast_dim[dims_ref[i]] = x_dims[dims_ref[i]];
+    broad_cats_times *= x_dims[dims_ref[i]];
+  }
+  auto reduced_dims = phi::make_ddim(reduced_dims_v);
+  auto x_reduce = EigenTensor<T, D>::From(input1, reduced_dims);
+  auto x_reduce_grad = EigenTensor<T, D>::From(input2, reduced_dims);
+
+  auto& place = *dev_ctx.eigen_device();
+
+  functor(place,
+          &x,
+          &x_reduce,
+          &x_grad,
+          &x_reduce_grad,
+          broadcast_dim,
+          broad_cats_times);
+}
+
+inline void GetOriginDimFromShuffled(const DDim& src_dim,
+                                     const std::vector<int>& dims,
+                                     std::vector<int>* origin_dim) {
+  DDim shuffled_dims(src_dim);
+  size_t n = src_dim.size();
+  std::vector<int> perm_axis(n);
+  std::vector<int64_t> dims_64{dims.begin(), dims.end()};
+  GetShuffledDim(src_dim, &shuffled_dims, dims_64, &perm_axis);
+  for (size_t i = 0; i < n; ++i) {
+    (*origin_dim)[perm_axis[i]] = i;
+  }
+}
+
+template <typename Context, typename T, typename Functor>
+void HandleLargeDimGrad(const Context& dev_ctx,
+                        const DenseTensor* x,
+                        const DenseTensor* out,
+                        const DenseTensor* dout,
+                        DenseTensor* dx,
+                        Functor functor,
+                        const std::vector<int>& dims) {
+  const int64_t unreduced = out->numel();
+  const int64_t reduced = x->numel() / unreduced;
+  DDim out_dim(out->dims());
+  DDim x_dim(x->dims());
+  // transpose and reshape X
+  DenseTensor shuffled_x;
+  std::vector<int64_t> dims_64{dims.begin(), dims.end()};
+  GetShuffledInput<Context, T>(dev_ctx, *x, &shuffled_x, dims_64);
+  DDim shuffled_dim = shuffled_x.dims();
+  shuffled_x.Resize({unreduced, reduced});
+  // reshape dX {unreduced, reduced}
+  dx->Resize({unreduced, reduced});
+  ReduceGradFunctor<Context, T, 2, Functor>(
+      dev_ctx, shuffled_x, *out, *dout, dx, functor, {1});
+  // transpose dX
+  std::vector<int> origin_axis(x_dim.size());
+  GetOriginDimFromShuffled(x_dim, dims, &origin_axis);
+  DenseTensor dx_tmp;
+  paddle::framework::TensorCopy(*dx, dev_ctx.GetPlace(), &dx_tmp);
+  dx_tmp.Resize(shuffled_dim);
+  dx->Resize(x_dim);
+  phi::funcs::TransposeNormal<Context, T> trans;
+  trans(dev_ctx, dx_tmp, dx, origin_axis);
+}
+
+// Only for CPU
+template <typename Context, typename T, typename Functor>
+void LaunchReduceGradKernel(const Context& dev_ctx,
+                            const DenseTensor* input0,
+                            const DenseTensor* input1,
+                            const DenseTensor* input2,
+                            DenseTensor* output,
+                            Functor functor,
+                            const std::vector<int>& dims,
+                            bool reduce_all = false) {
+  if (reduce_all) {
+    auto x = phi::EigenVector<T>::Flatten(*input0);
+    auto x_reduce = phi::EigenVector<T>::Flatten(*input1);
+    auto x_reduce_grad = phi::EigenVector<T>::Flatten(*input2);
+    auto x_grad = phi::EigenVector<T>::Flatten(*output);
+    auto& place = *dev_ctx.eigen_device();
+    // *dev_ctx.eigen_device();
+    auto broadcast_dim =
+        Eigen::array<int, 1>({{static_cast<int>(input0->numel())}});
+    functor(place,
+            &x,
+            &x_reduce,
+            &x_grad,
+            &x_reduce_grad,
+            broadcast_dim,
+            broadcast_dim[0]);
+  } else {
+    int rank = input0->dims().size();
+    switch (rank) {
+      case 1:
+        ReduceGradFunctor<Context, T, 1, Functor>(
+            dev_ctx, *input0, *input1, *input2, output, functor, dims);
+        break;
+      case 2:
+        ReduceGradFunctor<Context, T, 2, Functor>(
+            dev_ctx, *input0, *input1, *input2, output, functor, dims);
+        break;
+      case 3:
+        ReduceGradFunctor<Context, T, 3, Functor>(
+            dev_ctx, *input0, *input1, *input2, output, functor, dims);
+        break;
+      case 4:
+        ReduceGradFunctor<Context, T, 4, Functor>(
+            dev_ctx, *input0, *input1, *input2, output, functor, dims);
+        break;
+      case 5:
+        ReduceGradFunctor<Context, T, 5, Functor>(
+            dev_ctx, *input0, *input1, *input2, output, functor, dims);
+        break;
+      case 6:
+        ReduceGradFunctor<Context, T, 6, Functor>(
+            dev_ctx, *input0, *input1, *input2, output, functor, dims);
+        break;
+      default:
+        HandleLargeDimGrad<Context, T, Functor>(
+            dev_ctx, input0, input1, input2, output, functor, dims);
+        break;
+    }
+  }
+}
+
+}  // namespace funcs
+
+}  // namespace phi
diff --git a/paddle/fluid/operators/math/segment_pooling.cc b/paddle/phi/kernels/funcs/segment_pooling.cc
similarity index 65%
rename from paddle/fluid/operators/math/segment_pooling.cc
rename to paddle/phi/kernels/funcs/segment_pooling.cc
index d16fc570a9fb0dbf0440ca3d1105645056aa13d2..bf4a21f37223dab5a67649406496e9828b0bcf3f 100644
--- a/paddle/fluid/operators/math/segment_pooling.cc
+++ b/paddle/phi/kernels/funcs/segment_pooling.cc
@@ -12,45 +12,52 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/math/segment_pooling.h"
+#include "paddle/phi/kernels/funcs/segment_pooling.h"
 
 #include <string>
-#include "paddle/fluid/framework/eigen.h"
 
-namespace paddle {
-namespace operators {
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/kernels/funcs/eigen/common.h"
 
-using Tensor = framework::Tensor;
+namespace phi {
+namespace funcs {
+
+using Tensor = DenseTensor;
 
 template <typename T, typename IndexT>
-class SegmentPoolFunctor<platform::CPUDeviceContext, T, IndexT> {
+class SegmentPoolFunctor<phi::CPUContext, T, IndexT> {
  public:
-  void operator()(const platform::CPUDeviceContext& context,
-                  const framework::Tensor& input,
-                  const framework::Tensor& segments, framework::Tensor* output,
-                  framework::Tensor* index,
+  void operator()(const phi::CPUContext& dev_ctx,
+                  const DenseTensor& input,
+                  const DenseTensor& segments,
+                  DenseTensor* output,
+                  DenseTensor* index,
                   const std::string pooltype = "SUM") {
     const IndexT* segment_ids = segments.data<IndexT>();
     auto curent_id = segment_ids[0];
     int64_t last_idx = 0;
     int64_t w = input.numel() / input.dims()[0];
-    auto& place = *context.eigen_device();
+    auto& place = *dev_ctx.eigen_device();
     for (int64_t idx = 1; idx <= segments.numel(); ++idx) {
       if (idx < segments.numel()) {
         if (segment_ids[idx] == curent_id) continue;
-        PADDLE_ENFORCE_GE(segment_ids[idx], curent_id,
-                          platform::errors::InvalidArgument(
+        PADDLE_ENFORCE_GE(segment_ids[idx],
+                          curent_id,
+                          phi::errors::InvalidArgument(
                               "The segment ids should be sorted, but got "
                               "segment_ids[%d]:%d > segment_ids[%d]:%d.",
-                              idx - 1, curent_id, idx, segment_ids[idx]));
+                              idx - 1,
+                              curent_id,
+                              idx,
+                              segment_ids[idx]));
       }
 
       Tensor out_t = output->Slice(curent_id, curent_id + 1);
       Tensor in_t = input.Slice(last_idx, idx);
 
       int64_t h = idx - last_idx;
-      auto in_e = framework::EigenMatrix<T>::From(in_t, phi::make_ddim({h, w}));
-      auto out_e = framework::EigenVector<T>::Flatten(out_t);
+      auto in_e = EigenMatrix<T>::From(in_t, phi::make_ddim({h, w}));
+      auto out_e = EigenVector<T>::Flatten(out_t);
 
       auto reduce_dim = Eigen::array<int, 1>({{0}});
       if (pooltype == "MEAN") {
@@ -62,7 +69,7 @@ class SegmentPoolFunctor<platform::CPUDeviceContext, T, IndexT> {
       } else if (pooltype == "MIN") {
         out_e.device(place) = in_e.minimum(reduce_dim);
       } else {
-        PADDLE_THROW(platform::errors::InvalidArgument(
+        PADDLE_THROW(phi::errors::InvalidArgument(
             "Unsupported segment pooling type, only MEAN, SUM, MAX, MIN "
             "available, but got %s.",
             pooltype));
@@ -75,36 +82,41 @@ class SegmentPoolFunctor<platform::CPUDeviceContext, T, IndexT> {
 };
 
 template <typename T, typename IndexT>
-class SegmentPoolGradFunctor<platform::CPUDeviceContext, T, IndexT> {
+class SegmentPoolGradFunctor<phi::CPUContext, T, IndexT> {
  public:
-  void operator()(const platform::CPUDeviceContext& context,
-                  const framework::Tensor& input,
-                  const framework::Tensor& output,
-                  const framework::Tensor& out_grad,
-                  const framework::Tensor& segments, framework::Tensor* in_grad,
-                  const framework::Tensor* index = nullptr,
+  void operator()(const phi::CPUContext& dev_ctx,
+                  const DenseTensor& input,
+                  const DenseTensor& output,
+                  const DenseTensor& out_grad,
+                  const DenseTensor& segments,
+                  DenseTensor* in_grad,
+                  paddle::optional<const DenseTensor&> index,
                   const std::string pooltype = "SUM") {
     const IndexT* segment_ids = segments.data<IndexT>();
-    auto& place = *context.eigen_device();
+    auto& place = *dev_ctx.eigen_device();
     auto curent_id = segment_ids[0];
     int64_t last_idx = 0;
     int64_t w = in_grad->numel() / in_grad->dims()[0];
     for (int64_t idx = 1; idx <= segments.numel(); ++idx) {
       if (idx < segments.numel()) {
         if (segment_ids[idx] == curent_id) continue;
-        PADDLE_ENFORCE_GE(segment_ids[idx], curent_id,
-                          platform::errors::InvalidArgument(
+        PADDLE_ENFORCE_GE(segment_ids[idx],
+                          curent_id,
+                          phi::errors::InvalidArgument(
                               "The segment ids should be sorted, but got "
                               "segment_ids[%d]:%d > segment_ids[%d]:%d.",
-                              idx - 1, curent_id, idx, segment_ids[idx]));
+                              idx - 1,
+                              curent_id,
+                              idx,
+                              segment_ids[idx]));
       }
 
       Tensor out_g_t = out_grad.Slice(curent_id, curent_id + 1);
       Tensor in_g_t = in_grad->Slice(last_idx, idx);
 
       int64_t h = idx - last_idx;
-      auto in_g_e = framework::EigenMatrix<T>::From(in_g_t, {h, w});
-      auto out_g_e = framework::EigenMatrix<T>::From(out_g_t, {1, w});
+      auto in_g_e = EigenMatrix<T>::From(in_g_t, {h, w});
+      auto out_g_e = EigenMatrix<T>::From(out_g_t, {1, w});
       Eigen::DSizes<int, 2> bcast(h, 1);
 
       if (pooltype == "MEAN") {
@@ -114,13 +126,13 @@ class SegmentPoolGradFunctor<platform::CPUDeviceContext, T, IndexT> {
       } else if (pooltype == "MAX" || pooltype == "MIN") {
         Tensor out_t = output.Slice(curent_id, curent_id + 1);
         Tensor in_t = input.Slice(last_idx, idx);
-        auto in_e = framework::EigenMatrix<T>::From(in_t, {h, w});
-        auto out_e = framework::EigenMatrix<T>::From(out_t, {1, w});
+        auto in_e = EigenMatrix<T>::From(in_t, {h, w});
+        auto out_e = EigenMatrix<T>::From(out_t, {1, w});
         in_g_e.device(place) =
             (in_e == out_e.broadcast(bcast)).template cast<T>() *
             out_g_e.broadcast(bcast);
       } else {
-        PADDLE_THROW(platform::errors::InvalidArgument(
+        PADDLE_THROW(phi::errors::InvalidArgument(
             "Unsupported segment pooling type, only MEAN, SUM, MAX, MIN "
             "available, but got %s.",
             pooltype));
@@ -132,7 +144,7 @@ class SegmentPoolGradFunctor<platform::CPUDeviceContext, T, IndexT> {
   }
 };
 
-using CPU = platform::CPUDeviceContext;
+using CPU = phi::CPUContext;
 template class SegmentPoolFunctor<CPU, float, int>;
 template class SegmentPoolFunctor<CPU, float, int64_t>;
 template class SegmentPoolFunctor<CPU, double, int>;
@@ -142,5 +154,5 @@ template class SegmentPoolGradFunctor<CPU, float, int64_t>;
 template class SegmentPoolGradFunctor<CPU, double, int>;
 template class SegmentPoolGradFunctor<CPU, double, int64_t>;
 
-}  // namespace operators
-}  // namespace paddle
+}  // namespace funcs
+}  // namespace phi
diff --git a/paddle/fluid/operators/math/segment_pooling.cu b/paddle/phi/kernels/funcs/segment_pooling.cu
similarity index 54%
rename from paddle/fluid/operators/math/segment_pooling.cu
rename to paddle/phi/kernels/funcs/segment_pooling.cu
index fbdcb99c02ab97335e595d69b6215cd6a018a33a..305cd39f077bc359543b399a8775b5a92a2eb00d 100644
--- a/paddle/fluid/operators/math/segment_pooling.cu
+++ b/paddle/phi/kernels/funcs/segment_pooling.cu
@@ -12,20 +12,24 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include "paddle/phi/kernels/funcs/segment_pooling.h"
+
 #include <algorithm>
-#include "paddle/fluid/operators/math/segment_pooling.h"
-#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
+
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/backends/gpu/gpu_launch_config.h"
 #include "paddle/phi/kernels/funcs/gather.cu.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
-namespace paddle {
-namespace operators {
+namespace phi {
+namespace funcs {
 
-using Tensor = framework::Tensor;
+using Tensor = DenseTensor;
 
 template <typename T, typename Index, int DimTileSize>
-__global__ void SegmentSumIdsKernel(const Index* segment_ids, T* summed_ids,
+__global__ void SegmentSumIdsKernel(const Index* segment_ids,
+                                    T* summed_ids,
                                     const Index input_length_size,
                                     const Index total_stripe_count) {
   CUDA_KERNEL_LOOP(stripe_index, total_stripe_count) {
@@ -45,16 +49,19 @@ __global__ void SegmentSumIdsKernel(const Index* segment_ids, T* summed_ids,
       PADDLE_ENFORCE(current_segment_id >= last_segment_id,
                      "the segment ids should be sorted, but got "
                      "segment_ids[%d]:%d > segment_ids[%d]:%d.",
-                     dim_index_base + j - 1, dim_index_base + j,
-                     last_segment_id, current_segment_id);
+                     dim_index_base + j - 1,
+                     dim_index_base + j,
+                     last_segment_id,
+                     current_segment_id);
       if (current_segment_id > last_segment_id) {
         for (Index interval_id = last_segment_id + 1;
-             interval_id < current_segment_id; ++interval_id) {
+             interval_id < current_segment_id;
+             ++interval_id) {
           *(summed_ids + interval_id) = 0;
         }
         if (j > 0) {
           if (last_segment_id == first_segment_id) {
-            platform::CudaAtomicAdd(summed_ids + last_segment_id, sum);
+            paddle::platform::CudaAtomicAdd(summed_ids + last_segment_id, sum);
           } else {
             *(summed_ids + last_segment_id) = sum;
           }
@@ -64,13 +71,15 @@ __global__ void SegmentSumIdsKernel(const Index* segment_ids, T* summed_ids,
       sum += T(1);
       last_segment_id = current_segment_id;
     }
-    platform::CudaAtomicAdd(summed_ids + last_segment_id, sum);
+    paddle::platform::CudaAtomicAdd(summed_ids + last_segment_id, sum);
   }
 }
 
 template <typename T, typename Index, int DimTileSize>
-__global__ void SegmentMeanKernel(const Index* segment_ids, const T* input,
-                                  T* output, T* summed_ids,
+__global__ void SegmentMeanKernel(const Index* segment_ids,
+                                  const T* input,
+                                  T* output,
+                                  T* summed_ids,
                                   const Index input_length_size,
                                   const Index inner_dim_size,
                                   const Index output_length_size,
@@ -93,7 +102,8 @@ __global__ void SegmentMeanKernel(const Index* segment_ids, const T* input,
       if (current_segment_id > last_segment_id) {
         // reset the interval value which do not have corresponding ids.
         for (Index interval_id = last_segment_id + 1;
-             interval_id < current_segment_id; ++interval_id) {
+             interval_id < current_segment_id;
+             ++interval_id) {
           *(output + interval_id * inner_dim_size + segment_offset) = T(0);
         }
 
@@ -102,8 +112,8 @@ __global__ void SegmentMeanKernel(const Index* segment_ids, const T* input,
               last_segment_id * inner_dim_size + segment_offset;
 
           if (last_segment_id == first_segment_id) {
-            platform::CudaAtomicAdd(output + output_index,
-                                    sum / *(summed_ids + last_segment_id));
+            paddle::platform::CudaAtomicAdd(
+                output + output_index, sum / *(summed_ids + last_segment_id));
           } else {
             *(output + output_index) = sum / *(summed_ids + last_segment_id);
           }
@@ -114,15 +124,14 @@ __global__ void SegmentMeanKernel(const Index* segment_ids, const T* input,
       last_segment_id = current_segment_id;
     }
     Index output_index = last_segment_id * inner_dim_size + segment_offset;
-    platform::CudaAtomicAdd(output + output_index,
-                            sum / *(summed_ids + last_segment_id));
+    paddle::platform::CudaAtomicAdd(output + output_index,
+                                    sum / *(summed_ids + last_segment_id));
   }
 }
 
 template <typename T, typename Index, typename Helper, typename Pool>
-__global__ void __launch_bounds__(1024, 1)
-    SegmentOpsKernel(const Index* segment_ids, const T* input, T* output,
-                     Helper h, Pool pool) {
+__global__ void __launch_bounds__(1024, 1) SegmentOpsKernel(
+    const Index* segment_ids, const T* input, T* output, Helper h, Pool pool) {
   CUDA_KERNEL_LOOP(stripe_index, h.total_stripe_count) {
     Index segment_offset, dim_index_base, actual_height;
     Index inner_dim_size = h.inner_dim_size;
@@ -142,13 +151,16 @@ __global__ void __launch_bounds__(1024, 1)
       PADDLE_ENFORCE(current_segment_id >= last_segment_id,
                      "The segment ids should be sorted, but got "
                      "segment_ids[%d]:%d > segment_ids[%d]:%d.",
-                     dim_index_base + j - 1, dim_index_base + j,
-                     last_segment_id, current_segment_id);
+                     dim_index_base + j - 1,
+                     dim_index_base + j,
+                     last_segment_id,
+                     current_segment_id);
 
       if (current_segment_id > last_segment_id) {
         // reset the interval value which do not have corresponding ids.
         for (Index interval_id = last_segment_id + 1;
-             interval_id < current_segment_id; ++interval_id) {
+             interval_id < current_segment_id;
+             ++interval_id) {
           *(output + interval_id * inner_dim_size + segment_offset) = T(0);
         }
         // don't update result when j=0
@@ -175,9 +187,12 @@ __global__ void __launch_bounds__(1024, 1)
 }
 
 template <typename T, typename Index, typename Helper>
-__global__ void SegmentIndexGradKernel(const Index* segment_ids, const T* input,
-                                       const T* output, const T* out_grad,
-                                       T* in_grad, Helper h) {
+__global__ void SegmentIndexGradKernel(const Index* segment_ids,
+                                       const T* input,
+                                       const T* output,
+                                       const T* out_grad,
+                                       T* in_grad,
+                                       Helper h) {
   CUDA_KERNEL_LOOP(stripe_index, h.total_stripe_count) {
     Index segment_offset, dim_index_base, actual_height;
     h.calculate(stripe_index, &segment_offset, &dim_index_base, &actual_height);
@@ -201,7 +216,7 @@ class MaxPool {
   DEVICE inline T initial() { return static_cast<T>(-FLT_MAX); }
   DEVICE inline void compute(const T& x, T* y) { *y = *y > x ? *y : x; }
   DEVICE inline T atomic(T* address, const T val) {
-    return platform::CudaAtomicMax(address, val);
+    return paddle::platform::CudaAtomicMax(address, val);
   }
 };
 
@@ -211,7 +226,7 @@ class MinPool {
   DEVICE inline T initial() { return static_cast<T>(FLT_MAX); }
   DEVICE inline void compute(const T& x, T* y) { *y = *y < x ? *y : x; }
   DEVICE inline T atomic(T* address, const T val) {
-    return platform::CudaAtomicMin(address, val);
+    return paddle::platform::CudaAtomicMin(address, val);
   }
 };
 
@@ -221,7 +236,7 @@ class SumPool {
   DEVICE inline T initial() { return static_cast<T>(0); }
   DEVICE inline void compute(const T& x, T* y) { *y = *y + x; }
   DEVICE inline T atomic(T* address, const T val) {
-    return platform::CudaAtomicAdd(address, val);
+    return paddle::platform::CudaAtomicAdd(address, val);
   }
 };
 
@@ -243,8 +258,10 @@ class ArrangeHelper {
     total_stripe_count = inner_dim_size * input_outer_dim_num_stripe;
   }
 
-  DEVICE inline void calculate(T stripe_index, T* segment_offset,
-                               T* dim_index_base, T* actual_height) {
+  DEVICE inline void calculate(T stripe_index,
+                               T* segment_offset,
+                               T* dim_index_base,
+                               T* actual_height) {
     *segment_offset = stripe_index % inner_dim_size;
     *dim_index_base = stripe_index / inner_dim_size * DimTileSize;
     *actual_height = min(DimTileSize, input_length_size - *dim_index_base);
@@ -252,23 +269,32 @@ class ArrangeHelper {
 };
 
 template <typename T, typename Index>
-void SegmentPoolCUDAGradFunctor(const platform::CUDADeviceContext& ctx,
-                                const framework::Tensor& input,
-                                const framework::Tensor& segment_ids,
-                                const framework::Tensor& output,
-                                const framework::Tensor& out_grad,
-                                framework::Tensor* in_grad,
+void SegmentPoolCUDAGradFunctor(const phi::GPUContext& ctx,
+                                const DenseTensor& input,
+                                const DenseTensor& segment_ids,
+                                const DenseTensor& output,
+                                const DenseTensor& out_grad,
+                                DenseTensor* in_grad,
                                 const std::string pooltype = "SUM") {
-  auto h = ArrangeHelper<Index>(input.numel(), segment_ids.dims()[0],
-                                output.dims()[0]);
-  auto config = platform::GetGpuLaunchConfig1D(ctx, h.total_stripe_count);
+  auto h = ArrangeHelper<Index>(
+      input.numel(), segment_ids.dims()[0], output.dims()[0]);
+  auto config =
+      phi::backends::gpu::GetGpuLaunchConfig1D(ctx, h.total_stripe_count);
   if (pooltype == "MAX" || pooltype == "MIN") {
-    SegmentIndexGradKernel<T, Index, ArrangeHelper<Index>><<<
-        config.block_per_grid.x, config.thread_per_block.x, 0, ctx.stream()>>>(
-        segment_ids.data<Index>(), input.data<T>(), output.data<T>(),
-        out_grad.data<T>(), in_grad->data<T>(), h);
+    SegmentIndexGradKernel<T,
+                           Index,
+                           ArrangeHelper<Index>><<<config.block_per_grid.x,
+                                                   config.thread_per_block.x,
+                                                   0,
+                                                   ctx.stream()>>>(
+        segment_ids.data<Index>(),
+        input.data<T>(),
+        output.data<T>(),
+        out_grad.data<T>(),
+        in_grad->data<T>(),
+        h);
   } else {
-    PADDLE_THROW(platform::errors::InvalidArgument(
+    PADDLE_THROW(phi::errors::InvalidArgument(
         "Unsupported segment pooling grad operation, Only MAX, MIN "
         "available, but got %s.",
         pooltype));
@@ -291,13 +317,13 @@ __global__ void SimpleDiv(T* x, const T* y, const int len, const int dim) {
 }
 
 template <typename T, typename IndexT>
-class SegmentPoolFunctor<platform::CUDADeviceContext, T, IndexT> {
+class SegmentPoolFunctor<phi::GPUContext, T, IndexT> {
  public:
-  void operator()(const platform::CUDADeviceContext& ctx,
-                  const framework::Tensor& input,
-                  const framework::Tensor& segment_ids,
-                  framework::Tensor* output,
-                  framework::Tensor* summed_ids = nullptr,
+  void operator()(const phi::GPUContext& ctx,
+                  const DenseTensor& input,
+                  const DenseTensor& segment_ids,
+                  DenseTensor* output,
+                  DenseTensor* summed_ids = nullptr,
                   const std::string pooltype = "SUM") {
     if (pooltype == "MEAN") {
       // Sum the segment id num first
@@ -305,50 +331,76 @@ class SegmentPoolFunctor<platform::CUDADeviceContext, T, IndexT> {
       auto input_length_size = segment_ids.numel();
       auto total_stripe_count =
           (input_length_size + DimTileSize - 1) / DimTileSize;
-      auto config = platform::GetGpuLaunchConfig1D(ctx, total_stripe_count);
-      SegmentSumIdsKernel<
-          T, IndexT, IndexT(8)><<<config.block_per_grid.x,
-                                  config.thread_per_block.x, 0, ctx.stream()>>>(
-          segment_ids.data<IndexT>(), summed_ids->data<T>(), input_length_size,
+      auto config =
+          phi::backends::gpu::GetGpuLaunchConfig1D(ctx, total_stripe_count);
+      SegmentSumIdsKernel<T, IndexT, IndexT(8)><<<config.block_per_grid.x,
+                                                  config.thread_per_block.x,
+                                                  0,
+                                                  ctx.stream()>>>(
+          segment_ids.data<IndexT>(),
+          summed_ids->data<T>(),
+          input_length_size,
           total_stripe_count);
     }
 
-    auto h = ArrangeHelper<IndexT>(input.numel(), segment_ids.dims()[0],
-                                   output->dims()[0]);
-    auto config = platform::GetGpuLaunchConfig1D(ctx, h.total_stripe_count);
+    auto h = ArrangeHelper<IndexT>(
+        input.numel(), segment_ids.dims()[0], output->dims()[0]);
+    auto config =
+        phi::backends::gpu::GetGpuLaunchConfig1D(ctx, h.total_stripe_count);
     if (pooltype == "MEAN") {
-      SegmentMeanKernel<
-          T, IndexT, IndexT(8)><<<config.block_per_grid.x,
-                                  config.thread_per_block.x, 0, ctx.stream()>>>(
-          segment_ids.data<IndexT>(), input.data<T>(), output->data<T>(),
-          summed_ids->data<T>(), h.input_length_size, h.inner_dim_size,
-          h.output_length_size, h.total_stripe_count);
+      SegmentMeanKernel<T, IndexT, IndexT(8)><<<config.block_per_grid.x,
+                                                config.thread_per_block.x,
+                                                0,
+                                                ctx.stream()>>>(
+          segment_ids.data<IndexT>(),
+          input.data<T>(),
+          output->data<T>(),
+          summed_ids->data<T>(),
+          h.input_length_size,
+          h.inner_dim_size,
+          h.output_length_size,
+          h.total_stripe_count);
     } else if (pooltype == "SUM") {
       SumPool<T> pool;
-      SegmentOpsKernel<
-          T, IndexT, ArrangeHelper<IndexT>,
-          SumPool<T>><<<config.block_per_grid.x, config.thread_per_block.x, 0,
-                        ctx.stream()>>>(segment_ids.data<IndexT>(),
-                                        input.data<T>(), output->data<T>(), h,
-                                        pool);
+      SegmentOpsKernel<T,
+                       IndexT,
+                       ArrangeHelper<IndexT>,
+                       SumPool<T>><<<config.block_per_grid.x,
+                                     config.thread_per_block.x,
+                                     0,
+                                     ctx.stream()>>>(segment_ids.data<IndexT>(),
+                                                     input.data<T>(),
+                                                     output->data<T>(),
+                                                     h,
+                                                     pool);
     } else if (pooltype == "MAX") {
       MaxPool<T> pool;
-      SegmentOpsKernel<
-          T, IndexT, ArrangeHelper<IndexT>,
-          MaxPool<T>><<<config.block_per_grid.x, config.thread_per_block.x, 0,
-                        ctx.stream()>>>(segment_ids.data<IndexT>(),
-                                        input.data<T>(), output->data<T>(), h,
-                                        pool);
+      SegmentOpsKernel<T,
+                       IndexT,
+                       ArrangeHelper<IndexT>,
+                       MaxPool<T>><<<config.block_per_grid.x,
+                                     config.thread_per_block.x,
+                                     0,
+                                     ctx.stream()>>>(segment_ids.data<IndexT>(),
+                                                     input.data<T>(),
+                                                     output->data<T>(),
+                                                     h,
+                                                     pool);
     } else if (pooltype == "MIN") {
       MinPool<T> pool;
-      SegmentOpsKernel<
-          T, IndexT, ArrangeHelper<IndexT>,
-          MinPool<T>><<<config.block_per_grid.x, config.thread_per_block.x, 0,
-                        ctx.stream()>>>(segment_ids.data<IndexT>(),
-                                        input.data<T>(), output->data<T>(), h,
-                                        pool);
+      SegmentOpsKernel<T,
+                       IndexT,
+                       ArrangeHelper<IndexT>,
+                       MinPool<T>><<<config.block_per_grid.x,
+                                     config.thread_per_block.x,
+                                     0,
+                                     ctx.stream()>>>(segment_ids.data<IndexT>(),
+                                                     input.data<T>(),
+                                                     output->data<T>(),
+                                                     h,
+                                                     pool);
     } else {
-      PADDLE_THROW(platform::errors::InvalidArgument(
+      PADDLE_THROW(phi::errors::InvalidArgument(
           "Unsupported segment pooling operation, Only MEAN, SUM, MAX, MIN "
           "available, but got %s.",
           pooltype));
@@ -357,33 +409,38 @@ class SegmentPoolFunctor<platform::CUDADeviceContext, T, IndexT> {
 };
 
 template <typename T, typename IndexT>
-class SegmentPoolGradFunctor<platform::CUDADeviceContext, T, IndexT> {
+class SegmentPoolGradFunctor<phi::GPUContext, T, IndexT> {
  public:
-  void operator()(const platform::CUDADeviceContext& context,
-                  const framework::Tensor& input,
-                  const framework::Tensor& output,
-                  const framework::Tensor& out_grad,
-                  const framework::Tensor& segments, framework::Tensor* in_grad,
-                  const framework::Tensor* summed_ids = nullptr,
+  void operator()(const phi::GPUContext& dev_ctx,
+                  const DenseTensor& input,
+                  const DenseTensor& output,
+                  const DenseTensor& out_grad,
+                  const DenseTensor& segments,
+                  DenseTensor* in_grad,
+                  paddle::optional<const DenseTensor&> summed_ids,
                   const std::string pooltype = "SUM") {
     if (pooltype == "MAX" || pooltype == "MIN") {
-      SegmentPoolCUDAGradFunctor<T, IndexT>(context, input, segments, output,
-                                            out_grad, in_grad, pooltype);
+      SegmentPoolCUDAGradFunctor<T, IndexT>(
+          dev_ctx, input, segments, output, out_grad, in_grad, pooltype);
     } else if (pooltype == "MEAN") {
-      framework::Tensor mean_grad;
-      mean_grad.mutable_data<T>(input.dims(), context.GetPlace());
-      framework::TensorCopy(out_grad, context.GetPlace(), context, &mean_grad);
+      DenseTensor mean_grad;
+      mean_grad.Resize(input.dims());
+      dev_ctx.template Alloc<T>(&mean_grad);
+      paddle::framework::TensorCopy(
+          out_grad, dev_ctx.GetPlace(), dev_ctx, &mean_grad);
       int len = output.dims()[0];
       int dim = output.numel() / len;
-      auto config = platform::GetGpuLaunchConfig1D(context, len);
-      SimpleDiv<T><<<config.block_per_grid.x, config.thread_per_block.x, 0,
-                     context.stream()>>>(mean_grad.data<T>(),
-                                         summed_ids->data<T>(), len, dim);
-      phi::funcs::GPUGather<T, IndexT>(context, mean_grad, segments, in_grad);
+      auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, len);
+      SimpleDiv<T><<<config.block_per_grid.x,
+                     config.thread_per_block.x,
+                     0,
+                     dev_ctx.stream()>>>(
+          mean_grad.data<T>(), summed_ids->data<T>(), len, dim);
+      phi::funcs::GPUGather<T, IndexT>(dev_ctx, mean_grad, segments, in_grad);
     } else if (pooltype == "SUM") {
-      phi::funcs::GPUGather<T, IndexT>(context, out_grad, segments, in_grad);
+      phi::funcs::GPUGather<T, IndexT>(dev_ctx, out_grad, segments, in_grad);
     } else {
-      PADDLE_THROW(platform::errors::InvalidArgument(
+      PADDLE_THROW(phi::errors::InvalidArgument(
           "Unsupported segment pooling operation, Only MEAN, SUM, MAX, MIN "
           "available, but got %s.",
           pooltype));
@@ -391,15 +448,15 @@ class SegmentPoolGradFunctor<platform::CUDADeviceContext, T, IndexT> {
   }
 };
 
-using CUDA = paddle::platform::CUDADeviceContext;
-template class SegmentPoolFunctor<CUDA, float, int>;
-template class SegmentPoolFunctor<CUDA, float, int64_t>;
-template class SegmentPoolFunctor<CUDA, double, int>;
-template class SegmentPoolFunctor<CUDA, double, int64_t>;
-template class SegmentPoolGradFunctor<CUDA, float, int>;
-template class SegmentPoolGradFunctor<CUDA, float, int64_t>;
-template class SegmentPoolGradFunctor<CUDA, double, int>;
-template class SegmentPoolGradFunctor<CUDA, double, int64_t>;
-
-}  // namespace operators
-}  // namespace paddle
+using GPU = phi::GPUContext;
+template class SegmentPoolFunctor<GPU, float, int>;
+template class SegmentPoolFunctor<GPU, float, int64_t>;
+template class SegmentPoolFunctor<GPU, double, int>;
+template class SegmentPoolFunctor<GPU, double, int64_t>;
+template class SegmentPoolGradFunctor<GPU, float, int>;
+template class SegmentPoolGradFunctor<GPU, float, int64_t>;
+template class SegmentPoolGradFunctor<GPU, double, int>;
+template class SegmentPoolGradFunctor<GPU, double, int64_t>;
+
+}  // namespace funcs
+}  // namespace phi
diff --git a/paddle/fluid/operators/math/segment_pooling.h b/paddle/phi/kernels/funcs/segment_pooling.h
similarity index 51%
rename from paddle/fluid/operators/math/segment_pooling.h
rename to paddle/phi/kernels/funcs/segment_pooling.h
index 561fad6921fe7b9e61f6ea4bc33d820a6af25262..b8281061582ea6cddb2dbcbf490c3dfa50c4f64f 100644
--- a/paddle/fluid/operators/math/segment_pooling.h
+++ b/paddle/phi/kernels/funcs/segment_pooling.h
@@ -14,33 +14,36 @@ limitations under the License. */
 
 #pragma once
 #include <string>
-#include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/platform/device_context.h"
+#include "paddle/phi/core/dense_tensor.h"
 
-namespace paddle {
-namespace operators {
+namespace phi {
+namespace funcs {
 
-template <typename DeviceContext, typename T, typename IndexT>
+template <typename Context, typename T, typename IndexT>
 class SegmentPoolFunctor {
  public:
   /* mean pool has summed_ids output */
-  void operator()(const DeviceContext& context, const framework::Tensor& input,
-                  const framework::Tensor& segments, framework::Tensor* output,
-                  framework::Tensor* summed_ids = nullptr,
+  void operator()(const Context& dev_ctx,
+                  const DenseTensor& input,
+                  const DenseTensor& segments,
+                  DenseTensor* output,
+                  DenseTensor* summed_ids = nullptr,
                   const std::string pooltype = "SUM");
 };
 
-template <typename DeviceContext, typename T, typename IndexT>
+template <typename Context, typename T, typename IndexT>
 class SegmentPoolGradFunctor {
  public:
   /* mean pool has summed_ids output */
-  void operator()(const DeviceContext& context, const framework::Tensor& input,
-                  const framework::Tensor& output,
-                  const framework::Tensor& out_grad,
-                  const framework::Tensor& segments, framework::Tensor* in_grad,
-                  const framework::Tensor* summed_ids = nullptr,
+  void operator()(const Context& dev_ctx,
+                  const DenseTensor& input,
+                  const DenseTensor& output,
+                  const DenseTensor& out_grad,
+                  const DenseTensor& segments,
+                  DenseTensor* in_grad,
+                  paddle::optional<const DenseTensor&> summed_ids,
                   const std::string pooltype = "SUM");
 };
 
-}  // namespace operators
-}  // namespace paddle
+}  // namespace funcs
+}  // namespace phi
diff --git a/paddle/phi/kernels/funcs/slice.h b/paddle/phi/kernels/funcs/slice.h
new file mode 100644
index 0000000000000000000000000000000000000000..0a50dceb0a00758b2b0ad5f92219812083cb5f24
--- /dev/null
+++ b/paddle/phi/kernels/funcs/slice.h
@@ -0,0 +1,127 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/ddim.h"
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/core/enforce.h"
+#include "paddle/phi/kernels/funcs/eigen/common.h"
+#include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
+
+// TODO(paddle-dev): Remove this file when we can call related Kernel directly
+
+namespace phi {
+namespace funcs {
+
+template <typename Context, typename T, size_t D>
+void EigenSliceWrapper(const Context& dev_ctx,
+                       const DenseTensor* in,
+                       const std::vector<int>& start,
+                       const std::vector<int>& end,
+                       DenseTensor* out) {
+  // Slice by call Eigen Tensor Function `.slice()`
+  size_t rank = in->dims().size();
+  PADDLE_ENFORCE_EQ(start.size(),
+                    rank,
+                    errors::InvalidArgument(
+                        "EigenSliceWrapper function start "
+                        "argument must have the same length as input rank."));
+  PADDLE_ENFORCE_EQ(end.size(),
+                    rank,
+                    errors::InvalidArgument(
+                        "EigenSliceWrapper function end "
+                        "argument must have the same length as input rank."));
+  auto eigen_place_ptr = dev_ctx.eigen_device();
+  auto eigen_place = *eigen_place_ptr;
+  auto out_t = phi::EigenTensor<T, D>::From(*out, out->dims());
+  auto in_t = phi::EigenTensor<T, D>::From(*in, in->dims());
+  Eigen::DSizes<int, D> offsets_32bit, extents_32bit;
+  for (size_t i = 0; i < D; i++) {
+    offsets_32bit[i] = start[i];
+    extents_32bit[i] = end[i];
+  }
+  EigenSlice<std::decay_t<decltype(eigen_place)>, T, D>::Eval(
+      eigen_place,
+      phi::To32BitIndex(out_t),
+      phi::To32BitIndex(in_t),
+      offsets_32bit,
+      extents_32bit);
+}
+
+#define SLICE_RANK_CASE(N)                                                \
+  case N: {                                                               \
+    EigenSliceWrapper<Context, T, N>(dev_ctx, &x, offset, extends, &ret); \
+    break;                                                                \
+  }
+
+template <typename T, typename Context>
+DenseTensor Slice(const Context& dev_ctx,
+                  const DenseTensor& x,
+                  std::vector<int> axes,
+                  std::vector<int> starts,
+                  std::vector<int> ends) {
+  DenseTensor ret;
+  std::vector<int> new_axes = axes;
+  std::vector<int> out_shape = phi::vectorize<int>(x.dims());
+  size_t rank = out_shape.size();
+  PADDLE_ENFORCE_EQ(
+      axes.size(),
+      starts.size(),
+      errors::InvalidArgument("Slice Operator Argument Invalided"));
+  PADDLE_ENFORCE_EQ(
+      ends.size(),
+      starts.size(),
+      errors::InvalidArgument("Slice Operator Argument Invalided"));
+  for (unsigned int i = 0; i < axes.size(); ++i) {
+    int axis = axes[i];
+    if (axis < 0) axis = rank + axis;
+    new_axes[i] = axis;  // change negative to positive
+    int st = starts[i];
+    int ed = ends[i];
+    PADDLE_ENFORCE_GT(
+        ed,
+        st,
+        errors::InvalidArgument("C++ Slice Operation Not Support End < Start"));
+    out_shape[axis] = ed - st;
+  }
+  std::vector<int> offset(rank), extends(rank);
+  for (size_t i = 0; i < rank; ++i) {
+    offset[i] = 0;
+    extends[i] = x.dims()[i];
+  }
+  for (size_t i = 0; i < new_axes.size(); ++i) {
+    offset[new_axes[i]] = starts[i];
+    extends[new_axes[i]] = ends[i] - starts[i];
+  }
+  ret.Resize(phi::make_ddim(out_shape));
+  dev_ctx.template Alloc<T>(&ret);
+  switch (rank) {
+    SLICE_RANK_CASE(1);
+    SLICE_RANK_CASE(2);
+    SLICE_RANK_CASE(3);
+    SLICE_RANK_CASE(4);
+    SLICE_RANK_CASE(5);
+    SLICE_RANK_CASE(6);
+    default: {
+      PADDLE_THROW(
+          errors::InvalidArgument("Invalid Rank number, "
+                                  "currently only support rank between 2~6"));
+    }
+  }
+  return ret;
+}
+
+}  // namespace funcs
+}  // namespace phi
diff --git a/paddle/phi/kernels/funcs/sparse/common_shape.h b/paddle/phi/kernels/funcs/sparse/common_shape.h
new file mode 100644
index 0000000000000000000000000000000000000000..3617e3cd2f406d889c0b79ecfc34a68d19259a17
--- /dev/null
+++ b/paddle/phi/kernels/funcs/sparse/common_shape.h
@@ -0,0 +1,45 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <stdint.h>
+
+#include "paddle/phi/core/ddim.h"
+
+namespace phi {
+namespace funcs {
+namespace sparse {
+
+inline const DDim InferDenseDims(const DDim& x_dims,
+                                 const int64_t sparse_dim,
+                                 const int64_t non_zero_num) {
+  auto dense_dim = x_dims.size() - sparse_dim;
+  DDim values_dims;
+  if (dense_dim > 0) {
+    std::vector<int64_t> dense_dim_vec(dense_dim + 1);
+    dense_dim_vec[0] = non_zero_num;
+    memcpy(&dense_dim_vec[1],
+           x_dims.Get() + sparse_dim,
+           dense_dim * sizeof(x_dims[0]));
+    values_dims = phi::make_ddim(dense_dim_vec);
+  } else {
+    values_dims = phi::make_ddim({non_zero_num});
+  }
+  return values_dims;
+}
+
+}  // namespace sparse
+}  // namespace funcs
+}  // namespace phi
diff --git a/paddle/phi/kernels/funcs/sparse/convolution.h b/paddle/phi/kernels/funcs/sparse/convolution.h
new file mode 100644
index 0000000000000000000000000000000000000000..68fe8880a971dd7a56d677a5567bb053f5ba117a
--- /dev/null
+++ b/paddle/phi/kernels/funcs/sparse/convolution.h
@@ -0,0 +1,170 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/phi/core/ddim.h"
+#include "paddle/phi/kernels/funcs/blas/blas.h"
+
+namespace phi {
+namespace funcs {
+namespace sparse {
+
+struct Dims4D {
+  int dims[4];
+  Dims4D(const int batch, const int x, const int y, const int z) {
+    dims[0] = batch;
+    dims[1] = z;
+    dims[2] = y;
+    dims[3] = x;
+  }
+  HOSTDEVICE const int& operator[](int i) const { return dims[i]; }
+};
+
+// Judge whether the current position x is in (lower, upper)
+inline HOSTDEVICE bool Check(const int& x,
+                             const int& kx,
+                             const int& pad,
+                             const int& stride,
+                             const int dilation,
+                             const int kdim,
+                             const int xdim) {
+  const int lower = x - dilation * kx + pad;
+  const int uper = x + (kdim - kx - 1) * dilation - pad;
+  return (lower >= 0 && lower % stride == 0 && uper < xdim);
+}
+
+// Check whether the current position(x, y, z) is legal:
+// Judge the minimum and maximum values at each latitude
+inline HOSTDEVICE bool Check(const Dims4D& dims,
+                             const Dims4D& kernel_dims,
+                             const Dims4D& paddings,
+                             const Dims4D& dilations,
+                             const Dims4D& strides,
+                             const int x,
+                             const int y,
+                             const int z,
+                             const int kx,
+                             const int ky,
+                             const int kz) {
+  bool x_valid = Check(
+      x, kx, paddings[3], strides[3], dilations[3], kernel_dims[3], dims[3]);
+  bool y_valid = Check(
+      y, ky, paddings[2], strides[2], dilations[2], kernel_dims[2], dims[2]);
+  bool z_valid = Check(
+      z, kz, paddings[1], strides[1], dilations[1], kernel_dims[1], dims[1]);
+  return (x_valid && y_valid && z_valid);
+}
+
+template <typename Dim>
+inline HOSTDEVICE int PointToIndex(const int& batch,
+                                   const int& x,
+                                   const int& y,
+                                   const int& z,
+                                   const Dim& dims) {
+  return batch * dims[1] * dims[2] * dims[3] + z * dims[2] * dims[3] +
+         y * dims[3] + x;
+}
+
+// TODO(zhangkaihuo): use division and multiply to optimize
+// modulo operation
+template <typename Dim>
+inline HOSTDEVICE void IndexToPoint(
+    const int index, const Dim& dims, int* batch, int* x, int* y, int* z) {
+  int n = index;
+  *x = n % dims[3];
+  n /= dims[3];
+  *y = n % dims[2];
+  n /= dims[2];
+  *z = n % dims[1];
+  n /= dims[1];
+  *batch = n;
+}
+
+inline void GetOutShape(const DDim& x_dims,
+                        const DDim& kernel_dims,
+                        const std::vector<int>& paddings,
+                        const std::vector<int>& dilations,
+                        const std::vector<int>& strides,
+                        DDim* out_dims) {
+  PADDLE_ENFORCE_EQ(
+      x_dims.size(),
+      5,
+      phi::errors::InvalidArgument("the shape of x should be (N, D, H, W, C)"));
+  PADDLE_ENFORCE_EQ(kernel_dims.size(),
+                    5,
+                    phi::errors::InvalidArgument(
+                        "the shape of kernel should be (D, H, W, C, OC)"));
+
+  // infer out shape
+  (*out_dims)[0] = x_dims[0];
+  (*out_dims)[4] = kernel_dims[4];
+  for (int i = 1; i < 4; i++) {
+    (*out_dims)[i] = (x_dims[i] + 2 * paddings[i - 1] -
+                      dilations[i - 1] * (kernel_dims[i - 1] - 1) - 1) /
+                         strides[i - 1] +
+                     1;
+  }
+}
+
+inline void ResetSubmKernelSizeAndStrides(const DDim& kernel_dims,
+                                          std::vector<int>* paddings,
+                                          std::vector<int>* strides) {
+  for (uint64_t i = 0; i < paddings->size(); i++) {
+    (*paddings)[i] = kernel_dims[i] / 2;
+    (*strides)[i] = 1;
+  }
+}
+
+template <typename T, typename Context>
+inline void SubmPreProcess(const Context& dev_ctx,
+                           const SparseCooTensor& x,
+                           const DenseTensor& kernel,
+                           const SparseCooTensor& out_grad,
+                           const int in_channels,
+                           const int out_channels,
+                           const int half_kernel_size,
+                           DenseTensor* kernel_grad,
+                           DenseTensor* x_grad) {
+  auto blas = phi::funcs::GetBlas<Context, T>(dev_ctx);
+  T* d_kernel_ptr = kernel_grad->data<T>();
+  blas.GEMM(CblasTrans,
+            CblasNoTrans,
+            x.non_zero_elements().dims()[1],
+            out_grad.non_zero_elements().dims()[1],
+            x.non_zero_elements().dims()[0],
+            static_cast<T>(1),
+            x.non_zero_elements().data<T>(),
+            out_grad.non_zero_elements().data<T>(),
+            static_cast<T>(0),
+            d_kernel_ptr + half_kernel_size * in_channels * out_channels);
+
+  // call gemm: d_x = out_grad * transpose(kernel)
+  // (n, out_channels) * (out_channels, in_channels)
+  T* x_grad_ptr = x_grad->data<T>();
+  blas.GEMM(CblasNoTrans,
+            CblasTrans,
+            out_grad.non_zero_elements().dims()[0],
+            in_channels,
+            out_grad.non_zero_elements().dims()[1],
+            static_cast<T>(1),
+            out_grad.non_zero_elements().data<T>(),
+            kernel.data<T>() + half_kernel_size * in_channels * out_channels,
+            static_cast<T>(0),
+            x_grad_ptr);
+}
+
+}  // namespace sparse
+}  // namespace funcs
+}  // namespace phi
diff --git a/paddle/phi/kernels/funcs/unsqueeze.h b/paddle/phi/kernels/funcs/unsqueeze.h
new file mode 100644
index 0000000000000000000000000000000000000000..7b8a81471ef769dc5ddf18889f60813641d86d22
--- /dev/null
+++ b/paddle/phi/kernels/funcs/unsqueeze.h
@@ -0,0 +1,41 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/ddim.h"
+#include "paddle/phi/core/dense_tensor.h"
+
+// TODO(paddle-dev): Remove this file when we can call related Kernel directly
+
+namespace phi {
+namespace funcs {
+
+inline const DenseTensor Unsqueeze(const DenseTensor& x, int axis = 0) {
+  // don't copy data, only change the dims
+  DenseTensor out(x);
+  std::vector<int> out_shape = phi::vectorize<int>(x.dims());
+  if (axis >= 0) {
+    auto index = (out_shape.begin() + axis);
+    out_shape.insert(index, 1);
+  } else if (axis < 0) {
+    auto index = (out_shape.end() + axis + 1);
+    out_shape.insert(index, 1);
+  }
+  out.Resize(phi::make_ddim(out_shape));
+  return out;
+}
+
+}  // namespace funcs
+}  // namespace phi
diff --git a/paddle/phi/kernels/funcs/values_vectors_functor.h b/paddle/phi/kernels/funcs/values_vectors_functor.h
new file mode 100644
index 0000000000000000000000000000000000000000..336e9c809427c68be79bc8eaddd98193462f5405
--- /dev/null
+++ b/paddle/phi/kernels/funcs/values_vectors_functor.h
@@ -0,0 +1,385 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/fluid/memory/memory.h"
+#ifdef PADDLE_WITH_CUDA
+#include "paddle/phi/backends/dynload/cusolver.h"
+#endif  // PADDLE_WITH_CUDA
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/kernels/funcs/complex_functors.h"
+#include "paddle/phi/kernels/funcs/lapack/lapack_function.h"
+#include "paddle/phi/kernels/transpose_kernel.h"
+
+namespace phi {
+namespace funcs {
+
+inline int64_t GetBatchSize(phi::DDim dims) {
+  int64_t batch_size = 1;
+  auto dim_size = dims.size();
+  for (int i = 0; i < dim_size - 2; i++) {
+    batch_size *= dims[i];
+  }
+  return batch_size;
+}
+
+static void CheckEighResult(const int batch, const int info) {
+  PADDLE_ENFORCE_LE(
+      info,
+      0,
+      phi::errors::PreconditionNotMet(
+          "For batch [%d]: the [%d] off-diagonal elements of an intermediate"
+          "tridiagonal form did not converge to zero",
+          batch,
+          info));
+  PADDLE_ENFORCE_GE(
+      info,
+      0,
+      phi::errors::PreconditionNotMet(
+          "For batch [%d]: the [%d] argument had an illegal value",
+          batch,
+          info));
+}
+
+template <typename DeviceContext, typename T>
+struct MatrixEighFunctor {
+  void operator()(const DeviceContext &dev_ctx,
+                  const DenseTensor &input,
+                  DenseTensor *eigen_values,
+                  DenseTensor *eigen_vectors,
+                  bool is_lower,
+                  bool has_vectors);
+};
+
+// Calculates the eigenvalues ​​and eigenvectors of Hermitian or real
+// symmetric matrices, and uses the variable has_vectors to
+// control whether to return the eigenvectors.
+template <typename T>
+struct MatrixEighFunctor<CPUContext, T> {
+ public:
+  void operator()(const CPUContext &dev_ctx,
+                  const DenseTensor &input,
+                  DenseTensor *eigen_values,
+                  DenseTensor *eigen_vectors,
+                  bool is_lower,
+                  bool has_vectors) {
+    using ValueType = phi::dtype::Real<T>;
+    ValueType *out_value = dev_ctx.template Alloc<ValueType>(eigen_values);
+
+    DenseTensor input_trans;
+    // lapack is a column-major storge, transpose make the input to
+    // have a continuous memory layout
+    input_trans = phi::TransposeLast2Dim<T>(dev_ctx, input);
+    T *input_vector = input_trans.data<T>();
+
+    auto dims = input.dims();
+    int dim_size = dims.size();
+    int64_t batch_size = GetBatchSize(dims);
+
+    int vector_stride = dims[dim_size - 1] * dims[dim_size - 2];
+    int values_stride = dims[dim_size - 1];
+    char uplo = is_lower ? 'L' : 'U';
+    char jobz = has_vectors ? 'V' : 'N';
+    int n = dims[dim_size - 1];
+    int64_t lda = std::max<int64_t>(1, n);
+    // if work = -1, it means that you need to use the lapack function to query
+    // the optimal value
+    int lwork = -1;      // The length of the array work
+    int lrwork = -1;     // The dimension of the array rwork,rwork is REAL array
+    int liwork = -1;     // The dimension of the array iwork
+    int iwork_opt = -1;  // The optimal length of the array liwork
+    T lwork_opt = static_cast<T>(-1);  // The optimal length of the array work
+    ValueType rwork_opt =
+        static_cast<ValueType>(-1);  // The optimal length of the array rwork
+
+    int info = 0;
+    // Call lapackEigh to get the optimal size of work data
+    phi::funcs::lapackEigh<T, ValueType>(jobz,
+                                         uplo,
+                                         n,
+                                         input_vector,
+                                         lda,
+                                         out_value,
+                                         &lwork_opt,
+                                         lwork,
+                                         &rwork_opt,
+                                         lrwork,
+                                         &iwork_opt,
+                                         liwork,
+                                         &info);
+    lwork = std::max<int>(1, static_cast<int>(lwork_opt));
+    liwork = std::max<int>(1, iwork_opt);
+
+    DenseTensor rwork_tensor;
+    ValueType *rwork_data = nullptr;
+
+    // complex type
+    if (input.type() == phi::DataType::COMPLEX64 ||
+        input.type() == phi::DataType::COMPLEX128) {
+      lrwork = std::max<int>(1, static_cast<int>(rwork_opt));
+
+      rwork_tensor.Resize(phi::make_ddim({lrwork}));
+      rwork_data = dev_ctx.template Alloc<ValueType>(&rwork_tensor);
+    }
+
+    DenseTensor iwork_tensor, work_tensor;
+
+    iwork_tensor.Resize(phi::make_ddim({liwork}));
+    int *iwork_data = dev_ctx.template Alloc<int>(&iwork_tensor);
+
+    work_tensor.Resize(phi::make_ddim({lwork}));
+    T *work_data = dev_ctx.template Alloc<T>(&work_tensor);
+
+    for (auto i = 0; i < batch_size; i++) {
+      auto *value_data = out_value + i * values_stride;
+      auto *input_data = input_vector + i * vector_stride;
+      phi::funcs::lapackEigh<T, ValueType>(jobz,
+                                           uplo,
+                                           n,
+                                           input_data,
+                                           lda,
+                                           value_data,
+                                           work_data,
+                                           lwork,
+                                           rwork_data,
+                                           lrwork,
+                                           iwork_data,
+                                           liwork,
+                                           &info);
+      CheckEighResult(i, info);
+    }
+    if (has_vectors) {
+      PADDLE_ENFORCE_NOT_NULL(eigen_vectors,
+                              phi::errors::InvalidArgument(
+                                  "When has_vectors is true,"
+                                  "the eigenvectors needs to be calculated, "
+                                  "so the eigenvectors must be provided."));
+      input_trans = phi::TransposeLast2Dim<T>(dev_ctx, input_trans);
+      eigen_vectors->ShareDataWith(input_trans);
+    }
+  }
+};
+
+#ifdef PADDLE_WITH_CUDA
+
+// Calculates the eigenvalues ​​and eigenvectors of Hermitian or real
+// symmetric matrices on GPU, and uses the variable has_vectors
+// to control whether to return the eigenvectors.
+template <typename T>
+struct MatrixEighFunctor<GPUContext, T> {
+ public:
+  void operator()(const GPUContext &dev_ctx,
+                  const DenseTensor &input,
+                  DenseTensor *eigen_values,
+                  DenseTensor *eigen_vectors,
+                  bool is_lower,
+                  bool has_vectors) {
+    using ValueType = phi::dtype::Real<T>;
+    ValueType *out_value = dev_ctx.template Alloc<ValueType>(eigen_values);
+
+    DenseTensor input_trans;
+    input_trans = phi::TransposeLast2Dim<T>(dev_ctx, input);
+    T *input_vector = input_trans.data<T>();
+    auto &dims = input.dims();
+    int dim_size = dims.size();
+    int64_t batch_size = GetBatchSize(dims);
+
+    cublasFillMode_t uplo =
+        is_lower ? CUBLAS_FILL_MODE_LOWER : CUBLAS_FILL_MODE_UPPER;
+    cusolverEigMode_t jobz =
+        has_vectors ? CUSOLVER_EIG_MODE_VECTOR : CUSOLVER_EIG_MODE_NOVECTOR;
+
+    int n = dims[dim_size - 1];
+    int lda = std::max<int>(1, n);
+    auto vector_stride = dims[dim_size - 1] * dims[dim_size - 2];
+    auto values_stride = dims[dim_size - 1];
+    int lwork = 0;
+    auto info = paddle::memory::Alloc(dev_ctx, sizeof(int) * batch_size);
+    auto *info_ptr = reinterpret_cast<int *>(info->ptr());
+
+    // When the input type is float32, and the feature value input dimension
+    // is greater than or equal to [*,32,32]  and less than or equal to
+    // [*,512,512], Syevj has better performance.
+    bool use_syevj = (input.dtype() == phi::DataType::FLOAT32 &&
+                      values_stride >= 32 && values_stride <= 512);
+    syevjInfo_t syevj_params;
+    if (use_syevj) {
+      PADDLE_ENFORCE_GPU_SUCCESS(
+          dynload::cusolverDnCreateSyevjInfo(&syevj_params));
+      PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDnSsyevj_bufferSize(
+          dev_ctx.cusolver_dn_handle(),
+          jobz,
+          uplo,
+          n,
+          reinterpret_cast<const float *>(input_vector),
+          lda,
+          reinterpret_cast<const float *>(out_value),
+          &lwork,
+          syevj_params));
+    } else {
+      EvdBuffer(dev_ctx.cusolver_dn_handle(),
+                jobz,
+                uplo,
+                n,
+                input_vector,
+                lda,
+                out_value,
+                &lwork);
+    }
+    auto work = paddle::memory::Alloc(dev_ctx, sizeof(T) * lwork);
+    auto *work_ptr = reinterpret_cast<T *>(work->ptr());
+    for (auto i = 0; i < batch_size; i++) {
+      auto *input_data = input_vector + i * vector_stride;
+      auto *value_data = out_value + i * values_stride;
+      auto handle = dev_ctx.cusolver_dn_handle();
+      if (use_syevj) {
+        PADDLE_ENFORCE_GPU_SUCCESS(
+            dynload::cusolverDnSsyevj(handle,
+                                      jobz,
+                                      uplo,
+                                      n,
+                                      reinterpret_cast<float *>(input_data),
+                                      lda,
+                                      reinterpret_cast<float *>(value_data),
+                                      reinterpret_cast<float *>(work_ptr),
+                                      lwork,
+                                      info_ptr,
+                                      syevj_params));
+      } else {
+        Evd(handle,
+            jobz,
+            uplo,
+            n,
+            input_data,
+            lda,
+            value_data,
+            work_ptr,
+            lwork,
+            info_ptr);
+      }
+      int error_info = 0;
+      paddle::memory::Copy(phi::CPUPlace(),
+                           &error_info,
+                           dev_ctx.GetPlace(),
+                           info_ptr,
+                           sizeof(int),
+                           dev_ctx.stream());
+      CheckEighResult(i, error_info);
+    }
+
+    if (use_syevj) {
+      PADDLE_ENFORCE_GPU_SUCCESS(
+          dynload::cusolverDnDestroySyevjInfo(syevj_params));
+    }
+    if (has_vectors) {
+      PADDLE_ENFORCE_NOT_NULL(eigen_vectors,
+                              phi::errors::InvalidArgument(
+                                  "When has_vectors is true,"
+                                  "the eigenvectors needs to be calculated,"
+                                  "so the eigenvectors must be provided."));
+      //   input_trans = dito.Transpose(input_trans);
+      input_trans = phi::TransposeLast2Dim<T>(dev_ctx, input_trans);
+      eigen_vectors->ShareDataWith(input_trans);
+    }
+  }
+
+  using ValueType = phi::dtype::Real<T>;
+  inline void EvdBuffer(cusolverDnHandle_t handle,
+                        cusolverEigMode_t jobz,
+                        cublasFillMode_t uplo,
+                        int n,
+                        const T *A,
+                        int lda,
+                        const ValueType *W,
+                        int *lwork) const;
+
+  inline void Evd(cusolverDnHandle_t handle,
+                  cusolverEigMode_t jobz,
+                  cublasFillMode_t uplo,
+                  int n,
+                  T *A,
+                  int lda,
+                  ValueType *W,
+                  T *work,
+                  int lwork,
+                  int *devInfo) const;
+};
+
+using phi::dtype::complex;
+
+#define FUNC_WITH_TYPES(m)                       \
+  m(float, Ssy, float) m(double, Dsy, double) m( \
+      complex<float>, Che, cuComplex) m(complex<double>, Zhe, cuDoubleComplex)
+
+#define EVDBUFFER_INSTANCE(T, C, CastType)                             \
+  template <>                                                          \
+  inline void MatrixEighFunctor<GPUContext, T>::EvdBuffer(             \
+      cusolverDnHandle_t handle,                                       \
+      cusolverEigMode_t jobz,                                          \
+      cublasFillMode_t uplo,                                           \
+      int n,                                                           \
+      const T *A,                                                      \
+      int lda,                                                         \
+      const ValueType *W,                                              \
+      int *lwork) const {                                              \
+    PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDn##C##evd_bufferSize( \
+        handle,                                                        \
+        jobz,                                                          \
+        uplo,                                                          \
+        n,                                                             \
+        reinterpret_cast<const CastType *>(A),                         \
+        lda,                                                           \
+        W,                                                             \
+        lwork));                                                       \
+  }
+
+FUNC_WITH_TYPES(EVDBUFFER_INSTANCE);
+
+#define EVD_INSTANCE(T, C, CastType)                                           \
+  template <>                                                                  \
+  inline void MatrixEighFunctor<GPUContext, T>::Evd(cusolverDnHandle_t handle, \
+                                                    cusolverEigMode_t jobz,    \
+                                                    cublasFillMode_t uplo,     \
+                                                    int n,                     \
+                                                    T *A,                      \
+                                                    int lda,                   \
+                                                    ValueType *W,              \
+                                                    T *work,                   \
+                                                    int lwork,                 \
+                                                    int *devInfo) const {      \
+    PADDLE_ENFORCE_GPU_SUCCESS(                                                \
+        dynload::cusolverDn##C##evd(handle,                                    \
+                                    jobz,                                      \
+                                    uplo,                                      \
+                                    n,                                         \
+                                    reinterpret_cast<CastType *>(A),           \
+                                    lda,                                       \
+                                    W,                                         \
+                                    reinterpret_cast<CastType *>(work),        \
+                                    lwork,                                     \
+                                    devInfo));                                 \
+  }
+
+FUNC_WITH_TYPES(EVD_INSTANCE);
+
+#undef FUNC_WITH_TYPES
+#undef EVDBUFFER_INSTANCE
+#undef EVD_INSTANCE
+
+#endif  // PADDLE_WITH_CUDA
+
+}  // namespace funcs
+}  // namespace phi
diff --git a/paddle/phi/kernels/funcs/viterbi_decode_functor.h b/paddle/phi/kernels/funcs/viterbi_decode_functor.h
new file mode 100644
index 0000000000000000000000000000000000000000..b80fd5356b6e836b49b8868315b6c65342b1d84a
--- /dev/null
+++ b/paddle/phi/kernels/funcs/viterbi_decode_functor.h
@@ -0,0 +1,140 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#ifdef PADDLE_WITH_MKLML
+#include <omp.h>
+#endif
+
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/kernels/funcs/math_function.h"
+
+namespace phi {
+namespace funcs {
+
+static std::vector<DenseTensor> Unbind(const DenseTensor& in) {
+  int64_t size = in.dims()[0];
+  std::vector<DenseTensor> tensors(size);
+  for (int64_t i = 0; i < size; ++i) {
+    tensors[i] = in.Slice(i, i + 1);
+  }
+  return tensors;
+}
+
+template <typename T, typename Functor, typename OutT = T>
+void SameDimsBinaryOP(const DenseTensor& lhs,
+                      const DenseTensor& rhs,
+                      DenseTensor* out) {
+  const T* lhs_ptr = lhs.data<T>();
+  const T* rhs_ptr = rhs.data<T>();
+  OutT* out_ptr = out->data<OutT>();
+  Functor functor;
+#ifdef PADDLE_WITH_MKLML
+#pragma omp parallel for
+#endif
+  for (int i = 0; i < out->numel(); ++i) {
+    out_ptr[i] = functor(lhs_ptr[i], rhs_ptr[i]);
+  }
+}
+
+template <bool is_multi_threads>
+struct GetInputIndex {
+  void operator()(const std::vector<int>& lhs_dims,
+                  const std::vector<int>& rhs_dims,
+                  const std::vector<int>& output_dims,
+                  const std::vector<int>& lhs_strides,
+                  const std::vector<int>& rhs_strides,
+                  const std::vector<int>& output_strides,
+                  int output_idx,
+                  int* index_array,
+                  int* lhs_idx,
+                  int* rhs_idx) {
+    int out_dims_size = output_strides.size();
+    for (int j = 0; j < out_dims_size; ++j) {
+      int curr_idx = output_idx / output_strides[j];
+      output_idx %= output_strides[j];
+      *lhs_idx += (lhs_dims[j] > 1) ? curr_idx * lhs_strides[j] : 0;
+      *rhs_idx += (rhs_dims[j] > 1) ? curr_idx * rhs_strides[j] : 0;
+    }
+  }
+};
+
+template <typename T, typename Functor, bool is_multi_threads = false>
+void SimpleBroadcastBinaryOP(const DenseTensor& lhs,
+                             const DenseTensor& rhs,
+                             DenseTensor* out) {
+  const T* lhs_ptr = lhs.data<T>();
+  const T* rhs_ptr = rhs.data<T>();
+  T* out_ptr = out->data<T>();
+  int out_size = static_cast<int>(out->dims().size());
+  std::vector<int> out_dims(out_size);
+  std::vector<int> lhs_dims(out_size);
+  std::vector<int> rhs_dims(out_size);
+  std::copy(lhs.dims().Get(), lhs.dims().Get() + out_size, lhs_dims.data());
+  std::copy(rhs.dims().Get(), rhs.dims().Get() + out_size, rhs_dims.data());
+  std::copy(out->dims().Get(), out->dims().Get() + out_size, out_dims.data());
+  std::vector<int> output_strides(out_size, 1);
+  std::vector<int> lhs_strides(out_size, 1);
+  std::vector<int> rhs_strides(out_size, 1);
+  std::vector<int> index_array(out_size, 0);
+  // calculate strides
+  for (int i = out_size - 2; i >= 0; --i) {
+    output_strides[i] = output_strides[i + 1] * out_dims[i + 1];
+    lhs_strides[i] = lhs_strides[i + 1] * lhs_dims[i + 1];
+    rhs_strides[i] = rhs_strides[i + 1] * rhs_dims[i + 1];
+  }
+  Functor functor;
+  GetInputIndex<is_multi_threads> get_input_index;
+#ifdef PADDLE_WITH_MKLML
+#pragma omp parallel for
+#endif
+  for (int i = 0; i < out->numel(); ++i) {
+    int lhs_idx = 0;
+    int rhs_idx = 0;
+    get_input_index(lhs_dims,
+                    rhs_dims,
+                    out_dims,
+                    lhs_strides,
+                    rhs_strides,
+                    output_strides,
+                    i,
+                    index_array.data(),
+                    &lhs_idx,
+                    &rhs_idx);
+    out_ptr[i] = functor(lhs_ptr[lhs_idx], rhs_ptr[rhs_idx]);
+  }
+}
+
+class TensorBuffer {
+ public:
+  explicit TensorBuffer(const DenseTensor& in) : buffer_(in), offset_(0) {
+    buffer_.Resize({buffer_.numel()});
+  }
+  DenseTensor GetBufferBlock(std::initializer_list<int64_t> shape) {
+    int64_t size = std::accumulate(
+        shape.begin(), shape.end(), 1, std::multiplies<int64_t>());
+    DenseTensor block = buffer_.Slice(offset_, offset_ + size);
+    offset_ += size;
+    block.Resize(shape);
+    return block;
+  }
+
+ private:
+  DenseTensor buffer_;  // need to resize 1-D Tensor
+  int offset_;
+};
+
+}  // namespace funcs
+}  // namespace phi
diff --git a/paddle/phi/kernels/funcs/yolo_box_util.h b/paddle/phi/kernels/funcs/yolo_box_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..337af2d7a236e9ea93c4eecf835ad9ca446b5276
--- /dev/null
+++ b/paddle/phi/kernels/funcs/yolo_box_util.h
@@ -0,0 +1,112 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+namespace phi {
+namespace funcs {
+
+template <typename T>
+HOSTDEVICE inline T sigmoid(T x) {
+  return 1.0 / (1.0 + std::exp(-x));
+}
+
+template <typename T>
+HOSTDEVICE inline void GetYoloBox(T* box,
+                                  const T* x,
+                                  const int* anchors,
+                                  int i,
+                                  int j,
+                                  int an_idx,
+                                  int grid_size_h,
+                                  int grid_size_w,
+                                  int input_size_h,
+                                  int input_size_w,
+                                  int index,
+                                  int stride,
+                                  int img_height,
+                                  int img_width,
+                                  float scale,
+                                  float bias) {
+  box[0] = (i + sigmoid<T>(x[index]) * scale + bias) * img_width / grid_size_w;
+  box[1] = (j + sigmoid<T>(x[index + stride]) * scale + bias) * img_height /
+           grid_size_h;
+  box[2] = std::exp(x[index + 2 * stride]) * anchors[2 * an_idx] * img_width /
+           input_size_w;
+  box[3] = std::exp(x[index + 3 * stride]) * anchors[2 * an_idx + 1] *
+           img_height / input_size_h;
+}
+
+HOSTDEVICE inline int GetEntryIndex(int batch,
+                                    int an_idx,
+                                    int hw_idx,
+                                    int an_num,
+                                    int an_stride,
+                                    int stride,
+                                    int entry,
+                                    bool iou_aware) {
+  if (iou_aware) {
+    return (batch * an_num + an_idx) * an_stride +
+           (batch * an_num + an_num + entry) * stride + hw_idx;
+  } else {
+    return (batch * an_num + an_idx) * an_stride + entry * stride + hw_idx;
+  }
+}
+
+HOSTDEVICE inline int GetIoUIndex(
+    int batch, int an_idx, int hw_idx, int an_num, int an_stride, int stride) {
+  return batch * an_num * an_stride + (batch * an_num + an_idx) * stride +
+         hw_idx;
+}
+
+template <typename T>
+HOSTDEVICE inline void CalcDetectionBox(T* boxes,
+                                        T* box,
+                                        const int box_idx,
+                                        const int img_height,
+                                        const int img_width,
+                                        bool clip_bbox) {
+  boxes[box_idx] = box[0] - box[2] / 2;
+  boxes[box_idx + 1] = box[1] - box[3] / 2;
+  boxes[box_idx + 2] = box[0] + box[2] / 2;
+  boxes[box_idx + 3] = box[1] + box[3] / 2;
+
+  if (clip_bbox) {
+    boxes[box_idx] = boxes[box_idx] > 0 ? boxes[box_idx] : static_cast<T>(0);
+    boxes[box_idx + 1] =
+        boxes[box_idx + 1] > 0 ? boxes[box_idx + 1] : static_cast<T>(0);
+    boxes[box_idx + 2] = boxes[box_idx + 2] < img_width - 1
+                             ? boxes[box_idx + 2]
+                             : static_cast<T>(img_width - 1);
+    boxes[box_idx + 3] = boxes[box_idx + 3] < img_height - 1
+                             ? boxes[box_idx + 3]
+                             : static_cast<T>(img_height - 1);
+  }
+}
+
+template <typename T>
+HOSTDEVICE inline void CalcLabelScore(T* scores,
+                                      const T* input,
+                                      const int label_idx,
+                                      const int score_idx,
+                                      const int class_num,
+                                      const T conf,
+                                      const int stride) {
+  for (int i = 0; i < class_num; i++) {
+    scores[score_idx + i] = conf * sigmoid<T>(input[label_idx + i * stride]);
+  }
+}
+
+}  // namespace funcs
+}  // namespace phi
diff --git a/paddle/phi/kernels/gather_nd_grad_kernel.h b/paddle/phi/kernels/gather_nd_grad_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..050034714957fe749d6608243dfedc4d30d66e88
--- /dev/null
+++ b/paddle/phi/kernels/gather_nd_grad_kernel.h
@@ -0,0 +1,28 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void GatherGradNdKernel(const Context &ctx,
+                        const DenseTensor &x,
+                        const DenseTensor &index,
+                        const DenseTensor &out_grad,
+                        DenseTensor *x_grad);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/gather_nd_kernel.h b/paddle/phi/kernels/gather_nd_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..d2393eb3b0709345cb2d3ec63d739cb223a79683
--- /dev/null
+++ b/paddle/phi/kernels/gather_nd_kernel.h
@@ -0,0 +1,27 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void GatherNdKernel(const Context &ctx,
+                    const DenseTensor &x,
+                    const DenseTensor &index,
+                    DenseTensor *out);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/gather_tree_kernel.h b/paddle/phi/kernels/gather_tree_kernel.h
index e5a1a684daef099b5da8e7d9b8469b2857c29a6b..b3e6ffbc4297a2ae6a067e6b1ec5f2f88f7ef2ba 100644
--- a/paddle/phi/kernels/gather_tree_kernel.h
+++ b/paddle/phi/kernels/gather_tree_kernel.h
@@ -15,6 +15,7 @@
 #pragma once
 
 #include "paddle/phi/core/dense_tensor.h"
+
 namespace phi {
 
 template <typename T, typename Context>
diff --git a/paddle/phi/kernels/gaussian_random_kernel.h b/paddle/phi/kernels/gaussian_random_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..2903d80d22d46bcdc492009afdac4e6e6572929e
--- /dev/null
+++ b/paddle/phi/kernels/gaussian_random_kernel.h
@@ -0,0 +1,32 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/common/scalar_array.h"
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/core/device_context.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void GaussianRandomKernel(const Context& ctx,
+                          const ScalarArray& shape,
+                          float mean,
+                          float std,
+                          int seed,
+                          DataType dtype,
+                          DenseTensor* out);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/gpu/accuracy_kernel.cu b/paddle/phi/kernels/gpu/accuracy_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..f08fb74e54d8c86f7b54d21c762e30cebedfe967
--- /dev/null
+++ b/paddle/phi/kernels/gpu/accuracy_kernel.cu
@@ -0,0 +1,117 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/accuracy_kernel.h"
+
+#include <thrust/execution_policy.h>
+#include <thrust/reduce.h>
+#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/backends/gpu/gpu_info.h"
+#include "paddle/phi/common/float16.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+namespace phi {
+using paddle::platform::PADDLE_CUDA_NUM_THREADS;
+
+template <int BlockSize>
+__global__ void AccuracyCudaKernel(const int N,
+                                   const int D,
+                                   const int64_t* Xdata,
+                                   const int64_t* labeldata,
+                                   int* correct_data,
+                                   float* accuracy,
+                                   int* total_data) {
+  int count = 0;
+  __shared__ int total[BlockSize];
+
+  // support only 1 block
+  for (int i = threadIdx.x; i < (N); i += BlockSize) {
+    for (int j = 0; j < D; ++j) {
+      if (Xdata[i * D + j] == labeldata[i]) {
+        ++count;
+        break;
+      }
+    }
+  }
+  total[threadIdx.x] = count;
+  __syncthreads();
+
+// reduce the count with init value 0, and output accuracy.
+#ifdef PADDLE_WITH_CUDA
+  int result = thrust::reduce(thrust::device, total, total + BlockSize, 0);
+#else
+  // HIP thrust::reduce not support __device__
+  for (int s = BlockSize / 2; s > 0; s >>= 1) {
+    if (threadIdx.x < s) {
+      total[threadIdx.x] += total[threadIdx.x + s];
+    }
+    __syncthreads();
+  }
+  int result = total[0];
+#endif
+  if (threadIdx.x == 0) {
+    *correct_data = result;
+    *accuracy = static_cast<float>(result) / static_cast<float>(N);
+    *total_data = N;
+  }
+}
+
+template <typename T, typename Context>
+void AccuracyRawKernel(const Context& dev_ctx,
+                       const DenseTensor& inference,
+                       const DenseTensor& indices,
+                       const DenseTensor& label,
+                       DenseTensor* accuracy,
+                       DenseTensor* correct,
+                       DenseTensor* total) {
+  // FIXME(typhoonzero): only support indices currently
+  // if add support for output values, how to detect the data type?
+  const int64_t* indices_data = indices.data<int64_t>();
+  const int64_t* label_data = label.data<int64_t>();
+
+  int* correct_data = dev_ctx.template Alloc<int>(correct);
+  int* total_data = dev_ctx.template Alloc<int>(total);
+  float* accuracy_data = dev_ctx.template Alloc<float>(accuracy);
+
+  int num_samples = static_cast<int>(inference.dims()[0]);
+  size_t infer_width = inference.dims()[1];
+  auto stream = dev_ctx.stream();
+  phi::backends::gpu::GpuMemsetAsync(accuracy_data, 0, sizeof(float), stream);
+
+  if (num_samples == 0) {
+    return;
+  }
+
+  AccuracyCudaKernel<
+      PADDLE_CUDA_NUM_THREADS><<<1, PADDLE_CUDA_NUM_THREADS, 0, stream>>>(
+      num_samples,
+      infer_width,
+      indices_data,
+      label_data,
+      correct_data,
+      accuracy_data,
+      total_data);
+}
+}  // namespace phi
+
+// FIXME(typhoonzero): types of T is for inference data.
+// label data is always int64
+PD_REGISTER_KERNEL(accuracy,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::AccuracyRawKernel,
+                   phi::dtype::float16,
+                   float,
+                   double) {}
diff --git a/paddle/phi/kernels/gpu/activation_grad_kernel.cu b/paddle/phi/kernels/gpu/activation_grad_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..00792b8ab607036112295f2dd4018c69eb78680a
--- /dev/null
+++ b/paddle/phi/kernels/gpu/activation_grad_kernel.cu
@@ -0,0 +1,236 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/phi/kernels/activation_grad_kernel.h"
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/common/bfloat16.h"
+#include "paddle/phi/common/float16.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/elementwise_base.h"
+#include "paddle/phi/kernels/impl/activation_grad_impl.h"
+
+#include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
+
+namespace phi {
+
+template <typename T, typename Context, typename Functor>
+void ActivationGradGPUImpl(const Context& dev_ctx,
+                           const DenseTensor* x,
+                           const DenseTensor* out,
+                           const DenseTensor* d_out,
+                           DenseTensor* d_x,
+                           const Functor& functor) {
+  if (static_cast<int>(Functor::FwdDeps()) &
+      static_cast<int>(funcs::ActBwdOpFwdDeps::kDepOut)) {
+    PADDLE_ENFORCE_NOT_NULL(
+        out, errors::NotFound("The input DenseTensor Out can not be nullptr"));
+  }
+  PADDLE_ENFORCE_NOT_NULL(
+      d_out, errors::NotFound("The input DenseTensor dOut can not be nullptr"));
+  PADDLE_ENFORCE_NOT_NULL(
+      d_x, errors::NotFound("The output DenseTensor dX can not be nullptr"));
+  if (!out) {
+    out = d_out;  // fake out
+  }
+  if (static_cast<int>(Functor::FwdDeps()) &
+      static_cast<int>(funcs::ActBwdOpFwdDeps::kDepX)) {
+    PADDLE_ENFORCE_NOT_NULL(
+        x, errors::NotFound("The input DenseTensor X can not be nullptr"));
+  } else {
+    VLOG(10) << "Inplace activation of Op Functor: " << typeid(Functor).name();
+    x = d_x;
+  }
+
+  dev_ctx.template Alloc<T>(d_x);
+
+  std::vector<const DenseTensor*> ins = {d_out};
+  std::vector<DenseTensor*> outs = {d_x};
+
+  if (static_cast<int>(Functor::FwdDeps()) ==
+      static_cast<int>(funcs::ActBwdOpFwdDeps::kDepOut)) {
+    // Only need forward output Out
+    ins.push_back(out);
+    funcs::ElementwiseKernel<T>(dev_ctx, ins, &outs, functor);
+  } else if (static_cast<int>(Functor::FwdDeps()) ==
+             static_cast<int>(funcs::ActBwdOpFwdDeps::kDepX)) {
+    // Only need forward input X
+    ins.push_back(x);
+    funcs::ElementwiseKernel<T>(dev_ctx, ins, &outs, functor);
+  } else {
+    funcs::ElementwiseKernel<T>(dev_ctx, ins, &outs, functor);
+  }
+}
+
+#define DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(name, functor_class) \
+  template <typename T, typename Context>                           \
+  void name##GradKernel(const Context& dev_ctx,                     \
+                        const DenseTensor& x,                       \
+                        const DenseTensor& dout,                    \
+                        DenseTensor* dx) {                          \
+    funcs::functor_class<T> functor;                                \
+    ActivationGradGPUImpl<T, Context, funcs::functor_class<T>>(     \
+        dev_ctx, &x, nullptr, &dout, dx, functor);                  \
+  }
+
+#define DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepX(         \
+    name, functor_class, attr)                                  \
+  template <typename T, typename Context>                       \
+  void name##GradKernel(const Context& dev_ctx,                 \
+                        const DenseTensor& x,                   \
+                        const DenseTensor& dout,                \
+                        float attr,                             \
+                        DenseTensor* dx) {                      \
+    funcs::functor_class<T> functor;                            \
+    auto attrs = functor.GetAttrs();                            \
+    *(attrs[0].second) = attr;                                  \
+    ActivationGradGPUImpl<T, Context, funcs::functor_class<T>>( \
+        dev_ctx, &x, nullptr, &dout, dx, functor);              \
+  }
+
+#define DEFINE_GPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DepX(         \
+    name, functor_class, attr1, attr2)                          \
+  template <typename T, typename Context>                       \
+  void name##GradKernel(const Context& dev_ctx,                 \
+                        const DenseTensor& x,                   \
+                        const DenseTensor& dout,                \
+                        float attr1,                            \
+                        float attr2,                            \
+                        DenseTensor* dx) {                      \
+    funcs::functor_class<T> functor;                            \
+    auto attrs = functor.GetAttrs();                            \
+    *(attrs[0].second) = attr1;                                 \
+    *(attrs[1].second) = attr2;                                 \
+    ActivationGradGPUImpl<T, Context, funcs::functor_class<T>>( \
+        dev_ctx, &x, nullptr, &dout, dx, functor);              \
+  }
+
+#define DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepOut(name, functor_class) \
+  template <typename T, typename Context>                             \
+  void name##GradKernel(const Context& dev_ctx,                       \
+                        const DenseTensor& out,                       \
+                        const DenseTensor& dout,                      \
+                        DenseTensor* dx) {                            \
+    funcs::functor_class<T> functor;                                  \
+    ActivationGradGPUImpl<T, Context, funcs::functor_class<T>>(       \
+        dev_ctx, nullptr, &out, &dout, dx, functor);                  \
+  }
+
+#define DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepOut(       \
+    name, functor_class, attr)                                  \
+  template <typename T, typename Context>                       \
+  void name##GradKernel(const Context& dev_ctx,                 \
+                        const DenseTensor& out,                 \
+                        const DenseTensor& dout,                \
+                        float attr,                             \
+                        DenseTensor* dx) {                      \
+    funcs::functor_class<T> functor;                            \
+    auto attrs = functor.GetAttrs();                            \
+    *(attrs[0].second) = attr;                                  \
+    ActivationGradGPUImpl<T, Context, funcs::functor_class<T>>( \
+        dev_ctx, nullptr, &out, &dout, dx, functor);            \
+  }
+
+DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepOut(Relu, CudaReluGradFunctor);
+DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepOut(Tanh, CudaTanhGradFunctor);
+DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Cos, CudaCosGradFunctor);
+DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Tan, CudaTanGradFunctor);
+DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Acos, CudaAcosGradFunctor);
+DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Sin, CudaSinGradFunctor);
+DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Asin, CudaAsinGradFunctor);
+DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Atan, CudaAtanGradFunctor);
+DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Sinh, CudaSinhGradFunctor);
+DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Cosh, CudaCoshGradFunctor);
+DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Asinh, CudaAsinhGradFunctor);
+DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Acosh, CudaAcoshGradFunctor);
+DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Atanh, CudaAtanhGradFunctor);
+
+DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepX(LeakyRelu,
+                                               CudaLeakyReluGradFunctor,
+                                               alpha);
+DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepX(ThresholdedRelu,
+                                               CudaThresholdedReluGradFunctor,
+                                               threshold);
+
+DEFINE_GPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DepX(BRelu,
+                                               CudaBReluGradFunctor,
+                                               t_min,
+                                               t_max);
+
+}  // namespace phi
+
+#ifdef PADDLE_WITH_HIP
+PD_REGISTER_KERNEL(relu_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::ReluGradKernel,
+                   float,
+                   double,
+                   phi::dtype::float16) {}
+PD_REGISTER_KERNEL(relu_double_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::ReluDoubleGradKernel,
+                   float,
+                   double,
+                   phi::dtype::float16) {}
+#else
+PD_REGISTER_KERNEL(relu_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::ReluGradKernel,
+                   float,
+                   double,
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {}
+PD_REGISTER_KERNEL(relu_double_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::ReluDoubleGradKernel,
+                   float,
+                   double,
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {}
+#endif
+
+#define PD_REGISTER_ACTIVATION_GRAD_KERNEL(name, func) \
+  PD_REGISTER_KERNEL(name,                             \
+                     GPU,                              \
+                     ALL_LAYOUT,                       \
+                     phi::func,                        \
+                     float,                            \
+                     double,                           \
+                     phi::dtype::float16,              \
+                     phi::dtype::bfloat16) {}
+
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(sin_grad, SinGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(cos_grad, CosGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(tan_grad, TanGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(acos_grad, AcosGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(asin_grad, AsinGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(atan_grad, AtanGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(sinh_grad, SinhGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(cosh_grad, CoshGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(asinh_grad, AsinhGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(acosh_grad, AcoshGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(atanh_grad, AtanhGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(tanh_grad, TanhGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(tanh_double_grad, TanhDoubleGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(tanh_triple_grad, TanhTripleGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(brelu_grad, BReluGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(leaky_relu_grad, LeakyReluGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(leaky_relu_double_grad,
+                                   LeakyReluDoubleGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(thresholded_relu_grad,
+                                   ThresholdedReluGradKernel)
diff --git a/paddle/phi/kernels/gpu/activation_kernel.cu b/paddle/phi/kernels/gpu/activation_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..3c340a89f5746bd8de31826f7639e6ed0b7391f6
--- /dev/null
+++ b/paddle/phi/kernels/gpu/activation_kernel.cu
@@ -0,0 +1,144 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/phi/kernels/activation_kernel.h"
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/common/bfloat16.h"
+#include "paddle/phi/common/float16.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/elementwise_base.h"
+#include "paddle/phi/kernels/impl/activation_grad_impl.h"
+
+#include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
+
+namespace phi {
+
+template <typename T, typename Context, typename Functor>
+void ActivationGPUImpl(const Context& dev_ctx,
+                       const DenseTensor& x,
+                       DenseTensor* out,
+                       const Functor& functor) {
+  PADDLE_ENFORCE_NOT_NULL(out,
+                          errors::NotFound("Output Out should not be nullptr"));
+  dev_ctx.template Alloc<T>(out);
+  std::vector<const DenseTensor*> ins = {&x};
+  std::vector<DenseTensor*> outs = {out};
+  funcs::ElementwiseKernel<T>(dev_ctx, ins, &outs, functor);
+}
+
+#define DEFINE_GPU_ACTIVATION_KERNEL(name, functor_class)                   \
+  template <typename T, typename Context>                                   \
+  void name##Kernel(                                                        \
+      const Context& dev_ctx, const DenseTensor& x, DenseTensor* out) {     \
+    functor_class functor;                                                  \
+    ActivationGPUImpl<T, Context, functor_class>(dev_ctx, x, out, functor); \
+  }
+
+#define DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(name, functor_class, attr) \
+  template <typename T, typename Context>                               \
+  void name##Kernel(const Context& dev_ctx,                             \
+                    const DenseTensor& x,                               \
+                    float attr,                                         \
+                    DenseTensor* out) {                                 \
+    funcs::functor_class<T> functor;                                    \
+    auto attrs = functor.GetAttrs();                                    \
+    *(attrs[0].second) = attr;                                          \
+    ActivationGPUImpl<T, Context, funcs::functor_class<T>>(             \
+        dev_ctx, x, out, functor);                                      \
+  }
+
+#define DEFINE_GPU_ACT_KERNEL_WITH_TWO_ATTRS(               \
+    name, functor_class, attr1, attr2)                      \
+  template <typename T, typename Context>                   \
+  void name##Kernel(const Context& dev_ctx,                 \
+                    const DenseTensor& x,                   \
+                    float attr1,                            \
+                    float attr2,                            \
+                    DenseTensor* out) {                     \
+    funcs::functor_class<T> functor;                        \
+    auto attrs = functor.GetAttrs();                        \
+    *(attrs[0].second) = attr1;                             \
+    *(attrs[1].second) = attr2;                             \
+    ActivationGPUImpl<T, Context, funcs::functor_class<T>>( \
+        dev_ctx, x, out, functor);                          \
+  }
+
+DEFINE_GPU_ACTIVATION_KERNEL(Cos, funcs::CudaCosFunctor<T>)
+DEFINE_GPU_ACTIVATION_KERNEL(Tan, funcs::CudaTanFunctor<T>)
+DEFINE_GPU_ACTIVATION_KERNEL(Acos, funcs::CudaAcosFunctor<T>)
+DEFINE_GPU_ACTIVATION_KERNEL(Sin, funcs::CudaSinFunctor<T>)
+DEFINE_GPU_ACTIVATION_KERNEL(Asin, funcs::CudaAsinFunctor<T>)
+DEFINE_GPU_ACTIVATION_KERNEL(Atan, funcs::CudaAtanFunctor<T>)
+DEFINE_GPU_ACTIVATION_KERNEL(Sinh, funcs::CudaSinhFunctor<T>)
+DEFINE_GPU_ACTIVATION_KERNEL(Cosh, funcs::CudaCoshFunctor<T>)
+DEFINE_GPU_ACTIVATION_KERNEL(Asinh, funcs::CudaAsinhFunctor<T>)
+DEFINE_GPU_ACTIVATION_KERNEL(Acosh, funcs::CudaAcoshFunctor<T>)
+DEFINE_GPU_ACTIVATION_KERNEL(Atanh, funcs::CudaAtanhFunctor<T>)
+DEFINE_GPU_ACTIVATION_KERNEL(Relu, funcs::CudaReluFunctor<T>)
+DEFINE_GPU_ACTIVATION_KERNEL(Tanh, funcs::CudaTanhFunctor<T>)
+
+DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(LeakyRelu, CudaLeakyReluFunctor, alpha)
+DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(ThresholdedRelu,
+                                     CudaThresholdedReluFunctor,
+                                     threshold)
+
+DEFINE_GPU_ACT_KERNEL_WITH_TWO_ATTRS(BRelu, CudaBReluFunctor, t_min, t_max)
+
+}  // namespace phi
+
+#ifdef PADDLE_WITH_HIP
+PD_REGISTER_KERNEL(relu,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::ReluKernel,
+                   float,
+                   double,
+                   phi::dtype::float16) {}
+#else
+PD_REGISTER_KERNEL(relu,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::ReluKernel,
+                   float,
+                   double,
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {}
+#endif
+
+#define PD_REGISTER_ACTIVATION_KERNEL(name, func) \
+  PD_REGISTER_KERNEL(name,                        \
+                     GPU,                         \
+                     ALL_LAYOUT,                  \
+                     phi::func,                   \
+                     float,                       \
+                     double,                      \
+                     phi::dtype::float16,         \
+                     phi::dtype::bfloat16) {}
+
+PD_REGISTER_ACTIVATION_KERNEL(sin, SinKernel)
+PD_REGISTER_ACTIVATION_KERNEL(cos, CosKernel)
+PD_REGISTER_ACTIVATION_KERNEL(tan, TanKernel)
+PD_REGISTER_ACTIVATION_KERNEL(acos, AcosKernel)
+PD_REGISTER_ACTIVATION_KERNEL(asin, AsinKernel)
+PD_REGISTER_ACTIVATION_KERNEL(atan, AtanKernel)
+PD_REGISTER_ACTIVATION_KERNEL(sinh, SinhKernel)
+PD_REGISTER_ACTIVATION_KERNEL(cosh, CoshKernel)
+PD_REGISTER_ACTIVATION_KERNEL(asinh, AsinhKernel)
+PD_REGISTER_ACTIVATION_KERNEL(acosh, AcoshKernel)
+PD_REGISTER_ACTIVATION_KERNEL(atanh, AtanhKernel)
+PD_REGISTER_ACTIVATION_KERNEL(tanh, TanhKernel)
+PD_REGISTER_ACTIVATION_KERNEL(brelu, BReluKernel)
+PD_REGISTER_ACTIVATION_KERNEL(thresholded_relu, ThresholdedReluKernel)
+PD_REGISTER_ACTIVATION_KERNEL(leaky_relu, LeakyReluKernel)
diff --git a/paddle/phi/kernels/gpu/adadelta_kernel.cu b/paddle/phi/kernels/gpu/adadelta_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..7516a277a746f12ec7c6326b7ebd8f64c789fc31
--- /dev/null
+++ b/paddle/phi/kernels/gpu/adadelta_kernel.cu
@@ -0,0 +1,22 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/adadelta_kernel.h"
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/adadelta_kernel_impl.h"
+
+PD_REGISTER_KERNEL(
+    adadelta, GPU, ALL_LAYOUT, phi::AdadeltaKernel, float, double) {}
diff --git a/paddle/phi/kernels/gpu/adamax_kernel.cu b/paddle/phi/kernels/gpu/adamax_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..0817c531318c390ead201b1fda18da511ea5569f
--- /dev/null
+++ b/paddle/phi/kernels/gpu/adamax_kernel.cu
@@ -0,0 +1,21 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/adamax_kernel.h"
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/adamax_kernel_impl.h"
+
+PD_REGISTER_KERNEL(adamax, GPU, ALL_LAYOUT, phi::AdamaxKernel, float, double) {}
diff --git a/paddle/phi/kernels/gpu/allclose_kernel.cu b/paddle/phi/kernels/gpu/allclose_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..af2612bb10c9fe108a471253ff87f2a686059c2a
--- /dev/null
+++ b/paddle/phi/kernels/gpu/allclose_kernel.cu
@@ -0,0 +1,89 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/allclose_kernel.h"
+
+#include "paddle/phi/core/enforce.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+namespace phi {
+
+template <typename T>
+__global__ void AllcloseCUDAKernel(const T* in_data,
+                                   const T* other_data,
+                                   const double rtol,
+                                   const double atol,
+                                   bool equal_nan,
+                                   int num,
+                                   bool* out_data) {
+  unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x;
+  bool val;
+  for (int i = idx; i < num; i += blockDim.x * gridDim.x) {
+    const T a = in_data[i], b = other_data[i];
+    if (isnan(a) || isnan(b)) {
+      val = equal_nan && isnan(a) == isnan(b);
+    } else {
+      T left = (a > b ? a - b : b - a);
+      T right = atol + (b > 0 ? rtol * b : (-rtol) * b);
+      T diff = (left > right ? left - right : right - left);
+      val = a == b || left <= right || diff <= 1e-15;
+    }
+    if (!val) *out_data = false;
+  }
+}
+
+template <typename T, typename Context>
+void AllCloseKernel(const Context& dev_ctx,
+                    const DenseTensor& x,
+                    const DenseTensor& y,
+                    const Scalar& rtol,
+                    const Scalar& atol,
+                    bool equal_nan,
+                    DenseTensor* out) {
+  PADDLE_ENFORCE_EQ(
+      rtol.dtype(),
+      DataType::FLOAT64,
+      phi::errors::InvalidArgument(
+          "Input (Rtol) type must be double, but get %s.", rtol.dtype()));
+  PADDLE_ENFORCE_EQ(
+      atol.dtype(),
+      DataType::FLOAT64,
+      phi::errors::InvalidArgument(
+          "Input (Atol) type must be double, but get %s.", atol.dtype()));
+
+  const T* in_data = x.data<T>();
+  const T* other_data = y.data<T>();
+  auto rtol_v = rtol.to<double>();
+  auto atol_v = atol.to<double>();
+  bool* out_data = dev_ctx.template Alloc<bool>(out);
+
+  int num = x.numel();
+  int block = 1024;
+  int grid = (block - 1 + num) / block;
+  grid = (grid > block) ? block : grid;
+#ifdef PADDLE_WITH_HIP
+  hipMemset(out_data, true, sizeof(bool));
+#else
+  cudaMemset(out_data, true, sizeof(bool));
+#endif
+  AllcloseCUDAKernel<T><<<grid, block, 0, dev_ctx.stream()>>>(
+      in_data, other_data, rtol_v, atol_v, equal_nan, num, out_data);
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(
+    allclose, GPU, ALL_LAYOUT, phi::AllCloseKernel, float, double) {
+  kernel->OutputAt(0).SetDataType(phi::DataType::BOOL);
+}
diff --git a/paddle/phi/kernels/gpu/arg_min_max_kernel.cu b/paddle/phi/kernels/gpu/arg_min_max_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..6feee512cc9f4ec411167d1dc26feed1d766787d
--- /dev/null
+++ b/paddle/phi/kernels/gpu/arg_min_max_kernel.cu
@@ -0,0 +1,278 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/arg_min_max_kernel.h"
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+#if defined(__NVCC__) || defined(__HIPCC__)
+
+#ifdef __NVCC__
+#include "cub/cub.cuh"
+#endif
+#ifdef __HIPCC__
+#include <hipcub/hipcub.hpp>
+namespace cub = hipcub;
+#endif
+#include <limits>
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/phi/core/ddim.h"
+
+namespace phi {
+
+namespace {  // NOLINT
+template <typename K, typename V>
+using KeyValuePair = cub::KeyValuePair<K, V>;
+
+}  // end namespace
+
+#define FIXED_BLOCK_DIM_CASE_BASE(log2_block_dim, ...)  \
+  case (1 << (log2_block_dim)): {                       \
+    constexpr auto kBlockDim = (1 << (log2_block_dim)); \
+    __VA_ARGS__;                                        \
+  } break
+
+#define FIXED_BLOCK_DIM_CASE(...)               \
+  FIXED_BLOCK_DIM_CASE_BASE(10, ##__VA_ARGS__); \
+  FIXED_BLOCK_DIM_CASE_BASE(9, ##__VA_ARGS__);  \
+  FIXED_BLOCK_DIM_CASE_BASE(8, ##__VA_ARGS__);  \
+  FIXED_BLOCK_DIM_CASE_BASE(7, ##__VA_ARGS__);  \
+  FIXED_BLOCK_DIM_CASE_BASE(6, ##__VA_ARGS__);  \
+  FIXED_BLOCK_DIM_CASE_BASE(5, ##__VA_ARGS__);  \
+  FIXED_BLOCK_DIM_CASE_BASE(4, ##__VA_ARGS__);  \
+  FIXED_BLOCK_DIM_CASE_BASE(3, ##__VA_ARGS__);
+
+template <typename T, typename IndType, class Reducer, size_t BlockDim>
+__global__ void ArgCUDAKernel(const int64_t height,     // n * h
+                              const int64_t width,      // c
+                              const int64_t post_size,  // h
+                              const Reducer reducer,
+                              const T init,
+                              const T* in,
+                              IndType* out) {
+  typedef cub::BlockReduce<KeyValuePair<int, T>, BlockDim> BlockReduce;
+  __shared__ typename BlockReduce::TempStorage temp_storage;
+
+  for (int idx = blockIdx.x; idx < height; idx += gridDim.x) {
+    KeyValuePair<int, T> kv_pair = {-1, init};
+    int h = idx / post_size;
+    int w = idx % post_size;
+    for (int k = threadIdx.x; k < width; k += blockDim.x) {
+      kv_pair =
+          reducer({k, in[h * width * post_size + k * post_size + w]}, kv_pair);
+    }
+    kv_pair = BlockReduce(temp_storage).Reduce(kv_pair, reducer);
+    if (threadIdx.x == 0) {
+      out[idx] = static_cast<IndType>(kv_pair.key);
+    }
+    __syncthreads();
+  }
+}
+
+template <typename T, typename IndType, class Reducer>
+void ComputeFullArg(const phi::GPUContext& dev_ctx,
+                    const DenseTensor& input,
+                    DenseTensor* indices,
+                    const int64_t pre,
+                    const int64_t post,
+                    const int64_t n) {
+  auto cu_stream = dev_ctx.stream();
+  auto ComputeBlockSize = [](int64_t col) {
+    auto block_size = 8;
+    if (col > 512)
+      block_size = 1024;
+    else if (col > 256)
+      block_size = 512;
+    else if (col > 128)
+      block_size = 256;
+    else if (col > 64)
+      block_size = 128;
+    else if (col > 32)
+      block_size = 64;
+    else if (col > 16)
+      block_size = 32;
+    else if (col > 8)
+      block_size = 16;
+#ifdef __HIPCC__
+    block_size = std::min(block_size, 256);
+#endif
+    return block_size;
+  };
+
+  int64_t max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize()[0];
+  int64_t height = pre * post;
+  int64_t width = n;
+  int64_t grid_size = height < max_grid_dimx ? height : max_grid_dimx;
+
+  const T* in_data = input.data<T>();
+  IndType* out_data = dev_ctx.template Alloc<IndType>(indices);
+
+  if (typeid(Reducer) == typeid(cub::ArgMax)) {
+    switch (ComputeBlockSize(width)) {
+      FIXED_BLOCK_DIM_CASE(
+          ArgCUDAKernel<T,
+                        IndType,
+                        Reducer,
+                        kBlockDim><<<grid_size, kBlockDim, 0, cu_stream>>>(
+              height,
+              width,
+              post,
+              Reducer(),
+              std::numeric_limits<T>::lowest(),
+              in_data,
+              out_data));
+    }
+  } else {
+    switch (ComputeBlockSize(width)) {
+      FIXED_BLOCK_DIM_CASE(
+          ArgCUDAKernel<T,
+                        IndType,
+                        Reducer,
+                        kBlockDim><<<grid_size, kBlockDim, 0, cu_stream>>>(
+              height,
+              width,
+              post,
+              Reducer(),
+              std::numeric_limits<T>::max(),
+              in_data,
+              out_data));
+    }
+  }
+}
+
+template <typename Context, typename T, class Reducer>
+struct VisitDataCudaArgMinMaxFunctor {
+  const Context& dev_ctx;
+  const DenseTensor& x;
+  int64_t axis;
+  bool keepdims;
+  bool flatten;
+  DenseTensor* out;
+
+  explicit VisitDataCudaArgMinMaxFunctor(const Context& dev_ctx,
+                                         const DenseTensor& x,
+                                         int64_t axis,
+                                         bool keepdims,
+                                         bool flatten,
+                                         DenseTensor* out)
+      : dev_ctx(dev_ctx),
+        x(x),
+        axis(axis),
+        keepdims(keepdims),
+        flatten(flatten),
+        out(out) {}
+
+  template <typename IndType>
+  void apply() const {
+    phi::DDim x_dims;
+    int new_axis = axis;
+    if (flatten) {
+      x_dims = phi::make_ddim({x.numel()});
+      // if flatten, the axis just as 0
+      new_axis = 0;
+    } else {
+      x_dims = x.dims();
+      if (axis < 0) new_axis = axis + x.dims().size();
+    }
+
+    int64_t numel = x.numel();
+    int64_t groups = numel / x_dims[new_axis];
+    int64_t pre = 1;
+    int64_t post = 1;
+    int64_t n = x_dims[new_axis];
+
+    for (int i = 0; i < new_axis; i++) {
+      pre *= x_dims[i];
+    }
+
+    for (int i = new_axis + 1; i < x_dims.size(); i++) {
+      post *= x_dims[i];
+    }
+
+    ComputeFullArg<T, IndType, Reducer>(dev_ctx, x, out, pre, post, n);
+  }
+};
+
+template <typename Context, typename T, class Reducer>
+void ArgMinMaxOpCUDAKernel(const Context& dev_ctx,
+                           const DenseTensor& x,
+                           int64_t axis,
+                           bool keepdims,
+                           bool flatten,
+                           int dtype,
+                           DenseTensor* out) {
+  if (dtype < 0) {
+    paddle::framework::VisitDataTypeTiny(
+        static_cast<paddle::framework::proto::VarType::Type>(
+            paddle::framework::proto::VarType::INT64),
+        VisitDataCudaArgMinMaxFunctor<Context, T, Reducer>(
+            dev_ctx, x, axis, keepdims, flatten, out));
+    return;
+  }
+  paddle::framework::VisitDataTypeTiny(
+      static_cast<paddle::framework::proto::VarType::Type>(dtype),
+      VisitDataCudaArgMinMaxFunctor<Context, T, Reducer>(
+          dev_ctx, x, axis, keepdims, flatten, out));
+}
+
+template <typename T, typename Context>
+void ArgMinKernel(const Context& dev_ctx,
+                  const DenseTensor& x,
+                  int64_t axis,
+                  bool keepdims,
+                  bool flatten,
+                  int dtype,
+                  DenseTensor* out) {
+  ArgMinMaxOpCUDAKernel<Context, T, cub::ArgMin>(
+      dev_ctx, x, axis, keepdims, flatten, dtype, out);
+}
+
+template <typename T, typename Context>
+void ArgMaxKernel(const Context& dev_ctx,
+                  const DenseTensor& x,
+                  int64_t axis,
+                  bool keepdims,
+                  bool flatten,
+                  int dtype,
+                  DenseTensor* out) {
+  ArgMinMaxOpCUDAKernel<Context, T, cub::ArgMax>(
+      dev_ctx, x, axis, keepdims, flatten, dtype, out);
+}
+
+#endif
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(arg_min,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::ArgMinKernel,
+                   float,
+                   double,
+                   int32_t,
+                   int64_t,
+                   int16_t,
+                   uint8_t) {}
+
+PD_REGISTER_KERNEL(arg_max,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::ArgMaxKernel,
+                   float,
+                   double,
+                   int32_t,
+                   int64_t,
+                   int16_t,
+                   uint8_t) {}
diff --git a/paddle/phi/kernels/gpu/argsort_grad_kernel.cu b/paddle/phi/kernels/gpu/argsort_grad_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..15bca474f58c38edc3faf3c4f55e5066439eb0ad
--- /dev/null
+++ b/paddle/phi/kernels/gpu/argsort_grad_kernel.cu
@@ -0,0 +1,217 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/argsort_kernel.h"
+
+#include <thrust/copy.h>
+#include <thrust/execution_policy.h>
+#include <thrust/sequence.h>
+#include <thrust/sort.h>
+#ifdef __NVCC__
+#include "cub/cub.cuh"
+#endif
+#ifdef __HIPCC__
+#include <hipcub/hipcub.hpp>
+namespace cub = hipcub;
+#endif
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/primitive/functor_primitives.h"
+#include "paddle/phi/kernels/transpose_kernel.h"
+
+#ifdef __HIPCC__
+namespace rocprim {
+namespace detail {
+template <>
+struct radix_key_codec_base<phi::dtype::float16>
+    : radix_key_codec_integral<phi::dtype::float16, uint16_t> {};
+}  // namespace detail
+}  // namespace rocprim
+#else
+// set cub base traits in order to handle float16
+namespace cub {
+template <>
+struct NumericTraits<phi::dtype::float16>
+    : BaseTraits<FLOATING_POINT, true, false, uint16_t, phi::dtype::float16> {};
+}  // namespace cub
+#endif
+
+namespace phi {
+
+template <typename T, typename IndType>
+static __global__ void FillFlattenGrad(const T* dO,
+                                       const IndType* indices,
+                                       int64_t size,
+                                       T* dX) {
+  int index = threadIdx.x + blockIdx.x * blockDim.x;
+  int stride = blockDim.x * gridDim.x;
+  for (int i = index; i < size; i += stride) {
+    dX[indices[i]] = dO[i];
+  }
+}
+
+template <typename T, typename IndType>
+static __global__ void FillGrad(const T* dO,
+                                const IndType* indices,
+                                T* dX,
+                                IndType num_rows,
+                                IndType num_cols) {
+  int col_id = threadIdx.x;
+  int row_id = blockIdx.x;
+
+  for (IndType j = row_id; j < num_rows; j += gridDim.x) {
+    for (IndType i = col_id; i < num_cols; i += blockDim.x) {
+      dX[j * num_cols + indices[j * num_cols + i]] = dO[j * num_cols + i];
+    }
+  }
+}
+
+template <typename T, typename IndType>
+void ArgFullAssign(const phi::GPUContext& ctx,
+                   const DenseTensor* dO,
+                   const DenseTensor* indices,
+                   DenseTensor* dX,
+                   const IndType num_rows,
+                   const IndType num_cols) {
+  auto cu_stream = ctx.stream();
+
+  auto ComputeBlockSize = [](IndType col) {
+    if (col > 512)
+      return 1024;
+    else if (col > 256 && col <= 512)
+      return 512;
+    else if (col > 128 && col <= 256)
+      return 256;
+    else if (col > 64 && col <= 128)
+      return 128;
+    else
+      return 64;
+  };
+
+  int block_size = ComputeBlockSize(num_cols);
+
+  int maxGridDimX = ctx.GetCUDAMaxGridDimSize()[0];
+  // actually, int num_rows < max_grid_size
+  int grid_size = num_rows < maxGridDimX ? num_rows : maxGridDimX;
+  FillGrad<<<grid_size, block_size, 0, cu_stream>>>(dO->data<T>(),
+                                                    indices->data<IndType>(),
+                                                    dX->data<T>(),
+                                                    num_rows,
+                                                    num_cols);
+}
+
+template <typename T>
+void ArgFlattenAssign(const phi::GPUContext& ctx,
+                      const DenseTensor* dO,
+                      const DenseTensor* indices,
+                      int64_t size,
+                      DenseTensor* dX) {
+  auto cu_stream = ctx.stream();
+
+  const int64_t block_size =
+      std::min(size, static_cast<int64_t>(ctx.GetMaxThreadsPerBlock()));
+  int64_t max_threads = ctx.GetMaxPhysicalThreadCount();
+  const int64_t max_blocks =
+      std::max(((max_threads - 1) / block_size + 1), static_cast<int64_t>(1));
+  const int64_t grid_size =
+      std::min(max_blocks, (size + block_size - 1) / block_size);
+
+  FillFlattenGrad<<<grid_size, block_size, 0, cu_stream>>>(
+      dO->data<T>(), indices->data<int64_t>(), size, dX->data<T>());
+}
+
+template <typename T, typename Context>
+void ArgsortGradKernel(const Context& dev_ctx,
+                       const DenseTensor& indices,
+                       const DenseTensor& input,
+                       const DenseTensor& out_grad,
+                       int axis,
+                       bool descending,
+                       DenseTensor* in_grad) {
+  dev_ctx.template Alloc<T>(in_grad);
+  if (out_grad.numel() == 0) return;
+  auto in_dims = in_grad->dims();
+  axis = (axis < 0) ? (in_dims.size() + axis) : axis;
+  int64_t size = in_grad->numel();
+
+  // Parallel acceleration when the input size is equal to the length of the
+  // ‘axis’ dimension.
+  // Compared to 'special case for full sort' below, the gradient calculation
+  // is 10 times faster.
+  if (size == in_dims[axis]) {
+    ArgFlattenAssign<T>(dev_ctx, &out_grad, &indices, size, in_grad);
+    return;
+  }
+
+  // Special case for full sort, speedup ~190x.
+  if (axis == -1 || axis + 1 == in_dims.size()) {
+    const int64_t input_height =
+        phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1));
+    const int64_t input_width = in_dims[in_dims.size() - 1];
+    ArgFullAssign<T, int64_t>(
+        dev_ctx, &out_grad, &indices, in_grad, input_height, input_width);
+  } else {
+    // if not full sort, do transpose first
+    std::vector<int> trans;
+    for (int i = 0; i < axis; i++) {
+      trans.push_back(i);
+    }
+    trans.push_back(in_dims.size() - 1);
+    for (int i = axis + 1; i < in_dims.size() - 1; i++) {
+      trans.push_back(i);
+    }
+    trans.push_back(axis);
+    phi::DDim trans_dims(in_dims);
+    for (int i = 0; i < trans.size(); i++) {
+      trans_dims[i] = in_dims[trans[i]];
+    }
+
+    DenseTensor trans_dO;
+    trans_dO.Resize(trans_dims);
+    dev_ctx.template Alloc<T>(&trans_dO);
+    DenseTensor trans_ind;
+    trans_ind.Resize(trans_dims);
+    dev_ctx.template Alloc<int64_t>(&trans_ind);
+    TransposeKernel<T, Context>(dev_ctx, out_grad, trans, &trans_dO);
+    TransposeKernel<int64_t, Context>(dev_ctx, indices, trans, &trans_ind);
+
+    const int64_t input_height =
+        phi::product(phi::slice_ddim(trans_dims, 0, trans_dims.size() - 1));
+    const int64_t input_width = trans_dims[trans_dims.size() - 1];
+
+    DenseTensor tmp_out;
+    tmp_out.Resize(trans_dims);
+    dev_ctx.template Alloc<T>(&tmp_out);
+
+    ArgFullAssign<T, int64_t>(
+        dev_ctx, &trans_dO, &trans_ind, &tmp_out, input_height, input_width);
+
+    // transpose back
+    TransposeKernel<T, Context>(dev_ctx, tmp_out, trans, in_grad);
+    return;
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(argsort_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::ArgsortGradKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t,
+                   phi::dtype::float16) {}
diff --git a/paddle/phi/kernels/gpu/argsort_kernel.cu b/paddle/phi/kernels/gpu/argsort_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..6a9c1e275998b882e3146e8969dbfe266aebc7a5
--- /dev/null
+++ b/paddle/phi/kernels/gpu/argsort_kernel.cu
@@ -0,0 +1,310 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/argsort_kernel.h"
+
+#include <thrust/copy.h>
+#include <thrust/execution_policy.h>
+#include <thrust/sequence.h>
+#include <thrust/sort.h>
+#ifdef __NVCC__
+#include "cub/cub.cuh"
+#endif
+#ifdef __HIPCC__
+#include <hipcub/hipcub.hpp>
+namespace cub = hipcub;
+#endif
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/backends/gpu/gpu_info.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/blas/blas.h"
+#include "paddle/phi/kernels/primitive/functor_primitives.h"
+#include "paddle/phi/kernels/transpose_kernel.h"
+
+#ifdef __HIPCC__
+namespace rocprim {
+namespace detail {
+template <>
+struct radix_key_codec_base<phi::dtype::float16>
+    : radix_key_codec_integral<phi::dtype::float16, uint16_t> {};
+}  // namespace detail
+}  // namespace rocprim
+#else
+// set cub base traits in order to handle float16
+namespace cub {
+template <>
+struct NumericTraits<phi::dtype::float16>
+    : BaseTraits<FLOATING_POINT, true, false, uint16_t, phi::dtype::float16> {};
+}  // namespace cub
+#endif
+
+namespace phi {
+
+// Iter for move to next row
+struct SegmentOffsetIter {
+  EIGEN_DEVICE_FUNC
+  explicit SegmentOffsetIter(int num_cols) : num_cols_(num_cols) {}
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator()(int idx) const {
+    return idx * num_cols_;
+  }
+
+  int num_cols_;
+};
+
+template <typename T>
+static __global__ void FillIndex(T* indices, T num_rows, T num_cols) {
+  int col_id = threadIdx.x;
+  int row_id = blockIdx.x;
+
+  for (T j = row_id; j < num_rows; j += gridDim.x) {
+    for (T i = col_id; i < num_cols; i += blockDim.x) {
+      indices[j * num_cols + i] = i;
+    }
+  }
+}
+
+// Sort by flag descending, True: descending. False: Ascending.
+// Default is false.
+template <typename T, typename IndType>
+void ArgFullSort(const phi::GPUContext& ctx,
+                 const DenseTensor* input,
+                 DenseTensor* output,
+                 DenseTensor* indices,
+                 const IndType num_rows,
+                 const IndType num_cols,
+                 const bool descending) {
+  auto cu_stream = ctx.stream();
+  DenseTensor input_indices;
+  const std::vector<IndType> dims = {num_rows, num_cols};
+  auto dim = phi::make_ddim(dims);
+  input_indices.Resize(dim);
+  ctx.template Alloc<IndType>(&input_indices);
+  size_t temp_storage_bytes = -1;
+
+  auto ComputeBlockSize = [](IndType col) {
+    if (col > 512)
+      return 1024;
+    else if (col > 256 && col <= 512)
+      return 512;
+    else if (col > 128 && col <= 256)
+      return 256;
+    else if (col > 64 && col <= 128)
+      return 128;
+    else
+      return 64;
+  };
+
+  int block_size = ComputeBlockSize(num_cols);
+  int maxGridDimX = ctx.GetCUDAMaxGridDimSize()[0];
+  // actually, int num_rows < max_grid_size
+  int grid_size = num_rows < maxGridDimX ? num_rows : maxGridDimX;
+  // Init a index array
+  FillIndex<<<grid_size, block_size, 0, cu_stream>>>(
+      input_indices.data<IndType>(), num_rows, num_cols);
+
+  T* sorted_out_ptr;
+  IndType* sorted_indices_ptr;
+  const T* inp = input->data<T>();
+  T* out = ctx.template Alloc<T>(output);
+  IndType* ind = ctx.template Alloc<IndType>(indices);
+  sorted_out_ptr = out;
+  sorted_indices_ptr = ind;
+
+  // create iter for counting input
+  cub::CountingInputIterator<IndType> counting_iter(0);
+  // segment_offset is used for move to next row
+  cub::TransformInputIterator<IndType,
+                              SegmentOffsetIter,
+                              cub::CountingInputIterator<IndType>>
+      segment_offsets_t(counting_iter, SegmentOffsetIter(num_cols));
+
+  gpuError_t err;
+  if (descending) {
+    err = cub::DeviceSegmentedRadixSort::SortPairsDescending(
+        nullptr,
+        temp_storage_bytes,
+        inp,
+        sorted_out_ptr,
+        input_indices.data<IndType>(),
+        sorted_indices_ptr,
+        num_cols * num_rows,
+        num_rows,
+        segment_offsets_t,
+        segment_offsets_t + 1,
+        0,
+        sizeof(T) * 8,
+        cu_stream);
+  } else {
+    err =
+        cub::DeviceSegmentedRadixSort::SortPairs(nullptr,
+                                                 temp_storage_bytes,
+                                                 inp,
+                                                 sorted_out_ptr,
+                                                 input_indices.data<IndType>(),
+                                                 sorted_indices_ptr,
+                                                 num_cols * num_rows,
+                                                 num_rows,
+                                                 segment_offsets_t,
+                                                 segment_offsets_t + 1,
+                                                 0,
+                                                 sizeof(T) * 8,
+                                                 cu_stream);
+  }
+  PADDLE_ENFORCE_GPU_SUCCESS(err);
+
+  DenseTensor temp_storage;
+  int64_t temp_size = temp_storage_bytes;
+  temp_storage.Resize({temp_size});
+  ctx.template Alloc<uint8_t>(&temp_storage);
+
+  if (descending) {
+    err = cub::DeviceSegmentedRadixSort::SortPairsDescending(
+        temp_storage.data<uint8_t>(),
+        temp_storage_bytes,
+        inp,
+        sorted_out_ptr,
+        input_indices.data<IndType>(),
+        sorted_indices_ptr,
+        num_cols * num_rows,
+        num_rows,
+        segment_offsets_t,
+        segment_offsets_t + 1,
+        0,
+        sizeof(T) * 8,
+        cu_stream);
+  } else {
+    err =
+        cub::DeviceSegmentedRadixSort::SortPairs(temp_storage.data<uint8_t>(),
+                                                 temp_storage_bytes,
+                                                 inp,
+                                                 sorted_out_ptr,
+                                                 input_indices.data<IndType>(),
+                                                 sorted_indices_ptr,
+                                                 num_cols * num_rows,
+                                                 num_rows,
+                                                 segment_offsets_t,
+                                                 segment_offsets_t + 1,
+                                                 0,
+                                                 sizeof(T) * 8,
+                                                 cu_stream);
+  }
+
+  PADDLE_ENFORCE_GPU_SUCCESS(err);
+}
+
+template <typename T, typename Context>
+void ArgsortKernel(const Context& dev_ctx,
+                   const DenseTensor& input,
+                   int axis,
+                   bool descending,
+                   DenseTensor* output,
+                   DenseTensor* indices) {
+  auto in_dims = input.dims();
+  axis = (axis < 0) ? (in_dims.size() + axis) : axis;
+  const T* in_data = input.data<T>();
+  auto size = input.numel();
+  T* out_data = dev_ctx.template Alloc<T>(output);
+  int64_t* ids_data = dev_ctx.template Alloc<int64_t>(indices);
+
+  // Use thrust for parallel acceleration when the input size is equal to the
+  // length of the ‘axis’ dimension.
+  // Compared to the following 'Special case for full sort', ascending sort is
+  // 34 times faster and descending sort is 31 times faster.
+  if (size == in_dims[axis]) {
+    thrust::sequence(thrust::device, ids_data, ids_data + size);
+    thrust::copy(thrust::device, in_data, in_data + size, out_data);
+    thrust::sort_by_key(thrust::device, out_data, out_data + size, ids_data);
+    if (descending) {
+      thrust::reverse(thrust::device, out_data, out_data + size);
+      thrust::reverse(thrust::device, ids_data, ids_data + size);
+    }
+    return;
+  }
+
+  // Special case for full sort, speedup ~190x.
+  if (axis == -1 || axis + 1 == in_dims.size()) {
+    const int64_t input_height =
+        phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1));
+    const int64_t input_width = in_dims[in_dims.size() - 1];
+    ArgFullSort<T, int64_t>(dev_ctx,
+                            &input,
+                            output,
+                            indices,
+                            input_height,
+                            input_width,
+                            descending);
+  } else {
+    // if not full sort, do transpose first
+    std::vector<int> trans;
+    for (int i = 0; i < axis; i++) {
+      trans.push_back(i);
+    }
+    trans.push_back(in_dims.size() - 1);
+    for (int i = axis + 1; i < in_dims.size() - 1; i++) {
+      trans.push_back(i);
+    }
+    trans.push_back(axis);
+    phi::DDim trans_dims(in_dims);
+    for (int i = 0; i < trans.size(); i++) {
+      trans_dims[i] = in_dims[trans[i]];
+    }
+
+    DenseTensor trans_inp;
+    trans_inp.Resize(trans_dims);
+    T* trans_inp_data = dev_ctx.template Alloc<T>(&trans_inp);
+    // Do transpose
+    TransposeKernel<T, Context>(dev_ctx, input, trans, &trans_inp);
+
+    const int64_t input_height =
+        phi::product(phi::slice_ddim(trans_dims, 0, trans_dims.size() - 1));
+    const int64_t input_width = trans_dims[trans_dims.size() - 1];
+
+    DenseTensor tmp_out;
+    tmp_out.Resize(trans_dims);
+    dev_ctx.template Alloc<T>(&tmp_out);
+
+    DenseTensor tmp_indices;
+    // temp indices for sorting
+    tmp_indices.Resize(trans_dims);
+    dev_ctx.template Alloc<int64_t>(&tmp_indices);
+    dev_ctx.template Alloc<int64_t>(indices);
+
+    ArgFullSort<T, int64_t>(dev_ctx,
+                            &trans_inp,
+                            &tmp_out,
+                            &tmp_indices,
+                            input_height,
+                            input_width,
+                            descending);
+
+    TransposeKernel<int64_t, Context>(dev_ctx, tmp_indices, trans, indices);
+    // transpose back
+    TransposeKernel<T, Context>(dev_ctx, tmp_out, trans, output);
+    return;
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(argsort,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::ArgsortKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t,
+                   phi::dtype::float16) {}
diff --git a/paddle/phi/kernels/gpu/auc_kernel.cu b/paddle/phi/kernels/gpu/auc_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..5a1bb9874fe19a21c46994b06779e6ed2ef5dc4f
--- /dev/null
+++ b/paddle/phi/kernels/gpu/auc_kernel.cu
@@ -0,0 +1,258 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/auc_kernel.h"
+
+#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+namespace phi {
+
+using paddle::platform::PADDLE_CUDA_NUM_THREADS;
+
+__global__ void ClearObsoleteDataKernel(int64_t *pos,
+                                        int64_t *neg,
+                                        const int bucket_length,
+                                        const int slide_steps) {
+  int cur_step_index =
+      static_cast<int>(pos[(slide_steps + 1) * bucket_length]) % slide_steps;
+  int cur_step_begin = cur_step_index * bucket_length;
+  int sum_step_begin = slide_steps * bucket_length;
+  CUDA_KERNEL_LOOP(i, bucket_length) {
+    pos[sum_step_begin + i] -= pos[cur_step_begin + i];
+    neg[sum_step_begin + i] -= neg[cur_step_begin + i];
+    pos[cur_step_begin + i] = neg[cur_step_begin + i] = 0;
+  }
+}
+
+__global__ void UpdateSumDataKernel(int64_t *pos,
+                                    int64_t *neg,
+                                    const int bucket_length,
+                                    const int slide_steps) {
+  int cur_step_index =
+      static_cast<int>(pos[(slide_steps + 1) * bucket_length]) % slide_steps;
+  int cur_step_begin = cur_step_index * bucket_length;
+  int sum_step_begin = slide_steps * bucket_length;
+  CUDA_KERNEL_LOOP(i, bucket_length) {
+    pos[sum_step_begin + i] += pos[cur_step_begin + i];
+    neg[sum_step_begin + i] += neg[cur_step_begin + i];
+  }
+}
+
+template <typename T>
+__global__ void AddDataKernel(const int64_t *label_data,
+                              const T *pred_data,
+                              const int inference_width,
+                              const int num_thresholds,
+                              int64_t *pos,
+                              int64_t *neg,
+                              const int numel,
+                              const int slide_steps) {
+  int cur_step_begin = 0;
+  if (slide_steps > 0) {
+    int cur_step_index =
+        static_cast<int>(pos[(slide_steps + 1) * (1 + num_thresholds)]) %
+        slide_steps;
+    cur_step_begin = cur_step_index * (1 + num_thresholds);
+  }
+  CUDA_KERNEL_LOOP(i, numel) {
+    auto predict_data = pred_data[i * inference_width + (inference_width - 1)];
+    PADDLE_ENFORCE(predict_data <= 1, "The predict data must less or equal 1.");
+    PADDLE_ENFORCE(predict_data >= 0,
+                   "The predict data must gather or equal 0.");
+    uint32_t binIdx = static_cast<uint32_t>(predict_data * num_thresholds);
+    if (label_data[i]) {
+      paddle::platform::CudaAtomicAdd(pos + cur_step_begin + binIdx, 1);
+    } else {
+      paddle::platform::CudaAtomicAdd(neg + cur_step_begin + binIdx, 1);
+    }
+  }
+}
+
+__global__ void CalcAucKernel(int64_t *stat_pos,
+                              int64_t *stat_neg,
+                              int num_thresholds,
+                              double *auc,
+                              bool need_add_batch_num) {
+  *auc = 0.0f;
+  double totPos = 0.0;
+  double totNeg = 0.0;
+  double totPosPrev = 0.0;
+  double totNegPrev = 0.0;
+
+  int idx = num_thresholds;
+
+  while (idx >= 0) {
+    totPosPrev = totPos;
+    totNegPrev = totNeg;
+    totPos += stat_pos[idx];
+    totNeg += stat_neg[idx];
+    *auc += (totNeg - totNegPrev) * (totPos + totPosPrev) / 2.0;
+    --idx;
+  }
+
+  if (totPos > 0.0 && totNeg > 0.0) {
+    *auc = *auc / totPos / totNeg;
+  }
+  if (need_add_batch_num) {
+    stat_pos[num_thresholds + 1] += 1;
+    stat_neg[num_thresholds + 1] += 1;
+  }
+}
+
+inline static double trapezoidArea(double X1, double X2, double Y1, double Y2) {
+  return (X1 > X2 ? (X1 - X2) : (X2 - X1)) * (Y1 + Y2) / 2.0;
+}
+
+template <typename T, typename Context>
+void statAuc(const Context &dev_ctx,
+             const DenseTensor &label,
+             const DenseTensor &predict,
+             const int num_thresholds,
+             const int slide_steps,
+             int64_t *origin_stat_pos,
+             int64_t *origin_stat_neg) {
+  size_t batch_size = predict.dims()[0];
+  size_t inference_width = predict.dims()[1];
+  const T *inference_data = predict.data<T>();
+  const auto *label_data = label.data<int64_t>();
+  const int bucket_length = num_thresholds + 1;
+
+  if (slide_steps == 0) {
+    AddDataKernel<<<(batch_size + PADDLE_CUDA_NUM_THREADS - 1) /
+                        PADDLE_CUDA_NUM_THREADS,
+                    PADDLE_CUDA_NUM_THREADS,
+                    0,
+                    dev_ctx.stream()>>>(label_data,
+                                        inference_data,
+                                        inference_width,
+                                        num_thresholds,
+                                        origin_stat_pos,
+                                        origin_stat_neg,
+                                        batch_size,
+                                        slide_steps);
+    return;
+  }
+  // the last number of origin_stat_pos store the index should be used in
+  // current step
+  int cur_step_index =
+      static_cast<int>(origin_stat_pos[(slide_steps + 1) * bucket_length]) %
+      slide_steps;
+  int cur_step_begin = cur_step_index * bucket_length;
+  int sum_step_begin = slide_steps * bucket_length;
+
+  ClearObsoleteDataKernel<<<(bucket_length + PADDLE_CUDA_NUM_THREADS - 1) /
+                                PADDLE_CUDA_NUM_THREADS,
+                            PADDLE_CUDA_NUM_THREADS,
+                            0,
+                            dev_ctx.stream()>>>(
+      origin_stat_pos, origin_stat_neg, bucket_length, slide_steps);
+
+  AddDataKernel<<<(batch_size + PADDLE_CUDA_NUM_THREADS - 1) /
+                      PADDLE_CUDA_NUM_THREADS,
+                  PADDLE_CUDA_NUM_THREADS,
+                  0,
+                  dev_ctx.stream()>>>(label_data,
+                                      inference_data,
+                                      inference_width,
+                                      num_thresholds,
+                                      origin_stat_pos,
+                                      origin_stat_neg,
+                                      batch_size,
+                                      slide_steps);
+  UpdateSumDataKernel<<<(bucket_length + PADDLE_CUDA_NUM_THREADS - 1) /
+                            PADDLE_CUDA_NUM_THREADS,
+                        PADDLE_CUDA_NUM_THREADS,
+                        0,
+                        dev_ctx.stream()>>>(
+      origin_stat_pos, origin_stat_neg, bucket_length, slide_steps);
+}
+
+template <typename T, typename Context>
+void AucKernel(const Context &dev_ctx,
+               const DenseTensor &input,
+               const DenseTensor &label,
+               const DenseTensor &stat_pos,
+               const DenseTensor &stat_neg,
+               const std::string &curve,
+               int num_thresholds,
+               int slide_steps,
+               DenseTensor *auc,
+               DenseTensor *stat_pos_out,
+               DenseTensor *stat_neg_out) {
+  // Only use output var for now, make sure it's persistable and
+  // not cleaned up for each batch.
+  auto *origin_stat_pos = dev_ctx.template Alloc<int64_t>(stat_pos_out);
+  auto *origin_stat_neg = dev_ctx.template Alloc<int64_t>(stat_neg_out);
+  auto *auc_value = dev_ctx.template Alloc<double>(auc);
+
+  auto *stat_pos_in_tensor = &stat_pos;
+  auto *stat_neg_in_tensor = &stat_neg;
+  auto *pos_in_data = stat_pos.data<int64_t>();
+  auto *neg_in_data = stat_neg.data<int64_t>();
+#ifdef PADDLE_WITH_CUDA
+  if (stat_pos_in_tensor != stat_pos_out) {
+    cudaMemcpy(
+        origin_stat_pos,
+        pos_in_data,
+        ((1 + slide_steps) * (num_thresholds + 1) + (slide_steps > 0 ? 1 : 0)) *
+            sizeof(int64_t),
+        cudaMemcpyDeviceToDevice);
+  }
+  if (stat_neg_in_tensor != stat_neg_out) {
+    cudaMemcpy(
+        origin_stat_neg,
+        neg_in_data,
+        ((1 + slide_steps) * (num_thresholds + 1) + (slide_steps > 0 ? 1 : 0)) *
+            sizeof(int64_t),
+        cudaMemcpyDeviceToDevice);
+  }
+#else
+  if (stat_pos_in_tensor != stat_pos_out) {
+    hipMemcpy(
+        origin_stat_pos,
+        pos_in_data,
+        ((1 + slide_steps) * (num_thresholds + 1) + (slide_steps > 0 ? 1 : 0)) *
+            sizeof(int64_t),
+        hipMemcpyDeviceToDevice);
+  }
+  if (stat_neg_in_tensor != stat_neg_out) {
+    hipMemcpy(
+        origin_stat_neg,
+        neg_in_data,
+        ((1 + slide_steps) * (num_thresholds + 1) + (slide_steps > 0 ? 1 : 0)) *
+            sizeof(int64_t),
+        hipMemcpyDeviceToDevice);
+  }
+#endif
+
+  statAuc<T, Context>(dev_ctx,
+                      label,
+                      input,
+                      num_thresholds,
+                      slide_steps,
+                      origin_stat_pos,
+                      origin_stat_neg);
+  int sum_offset = slide_steps * (num_thresholds + 1);
+  CalcAucKernel<<<1, 1, 0, dev_ctx.stream()>>>(origin_stat_pos + sum_offset,
+                                               origin_stat_neg + sum_offset,
+                                               num_thresholds,
+                                               auc_value,
+                                               slide_steps > 0);
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(auc, GPU, ALL_LAYOUT, phi::AucKernel, float) {}
diff --git a/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu b/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..2c9ee5ede010367697bb9477a536f807625fd02b
--- /dev/null
+++ b/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu
@@ -0,0 +1,1038 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/batch_norm_kernel.h"
+#include "paddle/phi/kernels/funcs/eigen/common.h"
+
+#include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
+
+#include "paddle/fluid/operators/norm_utils.cu.h"
+#include "paddle/fluid/operators/norm_utils.h"
+
+#include "paddle/fluid/framework/data_layout.h"
+#include "paddle/fluid/operators/layout_utils.h"
+#include "paddle/fluid/platform/enforce.h"
+
+#include "paddle/fluid/platform/flags.h"
+#include "paddle/phi/kernels/gpu/batch_norm_utils.h"
+
+#ifdef __HIPCC__
+#define LAUNCH_BOUNDS(BlockDim) __launch_bounds__(BlockDim)
+#else
+#define LAUNCH_BOUNDS(BlockDim)
+#endif
+
+DECLARE_bool(cudnn_batchnorm_spatial_persistent);
+namespace phi {
+
+template <typename T>
+using CudnnDataType = paddle::platform::CudnnDataType<T>;
+template <typename T>
+using BatchNormParamType = typename CudnnDataType<T>::BatchNormParamType;
+
+template <typename T, int BlockDim, phi::DataLayout layout>
+static __global__ LAUNCH_BOUNDS(BlockDim) void KeBNBackwardScaleBias(
+    const T *dy,
+    const T *x,
+    const BatchNormParamType<T> *mean,
+    const BatchNormParamType<T> *variance,
+    const double epsilon,
+    const int N,
+    const int C,
+    const int HxW,
+    BatchNormParamType<T> *dscale,
+    BatchNormParamType<T> *dbias) {
+  const int outer_size = C;
+  const int inner_size = N * HxW;
+  typedef cub::BlockReduce<BatchNormParamType<T>, BlockDim> BlockReduce;
+  __shared__ typename BlockReduce::TempStorage ds_storage;
+  __shared__ typename BlockReduce::TempStorage db_storage;
+
+  for (int i = blockIdx.x; i < outer_size; i += gridDim.x) {
+    BatchNormParamType<T> ds_sum = static_cast<BatchNormParamType<T>>(0);
+    BatchNormParamType<T> db_sum = static_cast<BatchNormParamType<T>>(0);
+
+    BatchNormParamType<T> inv_var_i = 1.0 / sqrt(variance[i] + epsilon);
+    BatchNormParamType<T> mean_i = mean[i];
+    for (int j = threadIdx.x; j < inner_size; j += blockDim.x) {
+      const int index = layout == phi::DataLayout::kNCHW
+                            ? (j / HxW * C + i) * HxW + j % HxW
+                            : j * outer_size + i;
+      ds_sum += static_cast<BatchNormParamType<T>>(dy[index]) *
+                (static_cast<BatchNormParamType<T>>(x[index]) - mean_i);
+      db_sum += static_cast<BatchNormParamType<T>>(dy[index]);
+    }
+    ds_sum = BlockReduce(ds_storage).Reduce(ds_sum, cub::Sum());
+    db_sum = BlockReduce(db_storage).Reduce(db_sum, cub::Sum());
+    if (threadIdx.x == 0) {
+      dscale[i] = ds_sum * inv_var_i;
+      dbias[i] = db_sum;
+    }
+    __syncthreads();
+  }
+}
+
+template <typename T, phi::DataLayout layout>
+static __global__ void KeBNBackwardData(const T *dy,
+                                        const BatchNormParamType<T> *scale,
+                                        const BatchNormParamType<T> *variance,
+                                        const double epsilon,
+                                        const int C,
+                                        const int HxW,
+                                        const int num,
+                                        T *dx) {
+  int gid = blockIdx.x * blockDim.x + threadIdx.x;
+  int stride = blockDim.x * gridDim.x;
+  for (int i = gid; i < num; i += stride) {
+    const int c = layout == phi::DataLayout::kNCHW ? i / HxW % C : i % C;
+    BatchNormParamType<T> inv_var = 1.0 / sqrt(variance[c] + epsilon);
+    dx[i] = static_cast<T>(static_cast<BatchNormParamType<T>>(dy[i]) *
+                           scale[c] * inv_var);
+  }
+}
+
+template <typename T>
+static __global__ void KeBNRestoreData(const phi::DataLayout layout,
+                                       T *x,
+                                       const BatchNormParamType<T> *scale,
+                                       const BatchNormParamType<T> *bias,
+                                       const BatchNormParamType<T> *mean,
+                                       const BatchNormParamType<T> *variance,
+                                       double epsilon,
+                                       int C,
+                                       int M,
+                                       const int num,
+                                       const T *y) {
+  int gid = blockIdx.x * blockDim.x + threadIdx.x;
+  int stride = blockDim.x * gridDim.x;
+  for (int i = gid; i < num; i += stride) {
+    const int c = layout == phi::DataLayout::kNCHW ? (i / M) % C : i % C;
+    auto y_i = static_cast<BatchNormParamType<T>>(y[i]);
+    auto x_i = (y_i - bias[c]) / scale[c] / variance[c] + mean[c];
+    x[i] = static_cast<T>(x_i);
+  }
+}
+
+template <typename T>
+class InplaceHelper {
+ public:
+  void operator()(const phi::DataLayout layout,
+                  T *x,
+                  const BatchNormParamType<T> *scale,
+                  const BatchNormParamType<T> *bias,
+                  const BatchNormParamType<T> *mean,
+                  const BatchNormParamType<T> *variance,
+                  double epsilon,
+                  int C,
+                  int M,
+                  const int num,
+                  const T *y,
+                  int grid2,
+                  const int block,
+                  const gpuStream_t &stream) {
+    PADDLE_ENFORCE_EQ(x,
+                      y,
+                      phi::errors::InvalidArgument(
+                          "X and Y should be inplaced in inplace mode"));
+    KeBNRestoreData<<<grid2, block, 0, stream>>>(
+        layout, x, scale, bias, mean, variance, epsilon, C, M, num, y);
+  }
+};
+
+template <typename T, int BlockDim, phi::DataLayout layout>
+static __global__ LAUNCH_BOUNDS(BlockDim) void BNBackward(
+    const T *dy,
+    const T *x,
+    const BatchNormParamType<T> *scale,
+    const BatchNormParamType<T> *saved_mean,
+    const BatchNormParamType<T> *saved_inv_variance,
+    const int C,
+    const int N,
+    const int HxW,
+    const double epsilon,
+    T *dx,
+    BatchNormParamType<T> *dscale,
+    BatchNormParamType<T> *dbias) {
+  const int outer_size = C;
+  const int inner_size = N * HxW;
+  typedef cub::BlockReduce<BatchNormParamType<T>, BlockDim> BlockReduce;
+  __shared__ typename BlockReduce::TempStorage ds_storage;
+  __shared__ typename BlockReduce::TempStorage db_storage;
+  __shared__ typename BlockReduce::TempStorage mean_storage;
+  __shared__ typename BlockReduce::TempStorage variance_storeage;
+  __shared__ BatchNormParamType<T> inv_var_val;
+  __shared__ BatchNormParamType<T> mean_val;
+  __shared__ BatchNormParamType<T> dscale_val;
+  __shared__ BatchNormParamType<T> dbias_val;
+
+  for (int i = blockIdx.x; i < outer_size; i += gridDim.x) {
+    BatchNormParamType<T> ds_sum = static_cast<BatchNormParamType<T>>(0);
+    BatchNormParamType<T> db_sum = static_cast<BatchNormParamType<T>>(0);
+
+    if (saved_mean && saved_inv_variance) {
+      if (threadIdx.x == 0) {
+        inv_var_val = saved_inv_variance[i];
+        mean_val = saved_mean[i];
+      }
+    } else {
+      BatchNormParamType<T> x_sum = static_cast<BatchNormParamType<T>>(0);
+      BatchNormParamType<T> x_square_sum =
+          static_cast<BatchNormParamType<T>>(0);
+
+      for (int j = threadIdx.x; j < inner_size; j += blockDim.x) {
+        const int index = layout == phi::DataLayout::kNCHW
+                              ? (j / HxW * C + i) * HxW + j % HxW
+                              : j * outer_size + i;
+        BatchNormParamType<T> x_i =
+            static_cast<BatchNormParamType<T>>(x[index]);
+        x_sum += x_i;
+        x_square_sum += x_i * x_i;
+      }
+      x_sum = BlockReduce(mean_storage).Reduce(x_sum, cub::Sum());
+      x_square_sum =
+          BlockReduce(variance_storeage).Reduce(x_square_sum, cub::Sum());
+      if (threadIdx.x == 0) {
+        mean_val = x_sum / inner_size;
+        inv_var_val =
+            1 / sqrt(x_square_sum / inner_size - mean_val * mean_val + epsilon);
+      }
+    }
+    __syncthreads();
+
+    for (int j = threadIdx.x; j < inner_size; j += blockDim.x) {
+      const int index = layout == phi::DataLayout::kNCHW
+                            ? (j / HxW * C + i) * HxW + j % HxW
+                            : j * outer_size + i;
+      BatchNormParamType<T> dy_i =
+          static_cast<BatchNormParamType<T>>(dy[index]);
+      ds_sum +=
+          dy_i * (static_cast<BatchNormParamType<T>>(x[index]) - mean_val);
+      db_sum += dy_i;
+    }
+    ds_sum = BlockReduce(ds_storage).Reduce(ds_sum, cub::Sum());
+    db_sum = BlockReduce(db_storage).Reduce(db_sum, cub::Sum());
+    if (threadIdx.x == 0) {
+      dscale_val = ds_sum * inv_var_val;
+      dbias_val = db_sum;
+      dscale[i] = dscale_val;
+      dbias[i] = dbias_val;
+    }
+    __syncthreads();
+
+    for (int j = threadIdx.x; j < inner_size; j += blockDim.x) {
+      const int index = layout == phi::DataLayout::kNCHW
+                            ? (j / HxW * C + i) * HxW + j % HxW
+                            : j * outer_size + i;
+      dx[index] = scale[i] * inv_var_val *
+                  (static_cast<BatchNormParamType<T>>(dy[index]) -
+                   dbias_val / static_cast<BatchNormParamType<T>>(inner_size) -
+                   (static_cast<BatchNormParamType<T>>(x[index]) - mean_val) *
+                       inv_var_val * dscale_val / inner_size);
+    }
+  }
+}
+
+template <typename T, int BlockDim, phi::DataLayout layout>
+static __global__ LAUNCH_BOUNDS(BlockDim) void BNBackwardData(
+    const T *dy,
+    const BatchNormParamType<T> *scale,
+    const BatchNormParamType<T> *mean,
+    const T *x,
+    const BatchNormParamType<T> *variance,
+    const int C,
+    const int N,
+    const int HxW,
+    T *dx) {
+  const int outer_size = C;
+  const int inner_size = N * HxW;
+  typedef cub::BlockReduce<BatchNormParamType<T>, BlockDim> BlockReduce;
+  __shared__ typename BlockReduce::TempStorage dy_storage;
+  __shared__ typename BlockReduce::TempStorage dy_x_sub_mean_storage;
+  __shared__ BatchNormParamType<T> dy_sum_val;
+  __shared__ BatchNormParamType<T> dy_x_sub_mean_sum_val;
+
+  for (int i = blockIdx.x; i < outer_size; i += gridDim.x) {
+    BatchNormParamType<T> inv_var_i = variance[i];
+    BatchNormParamType<T> mean_i = mean[i];
+    BatchNormParamType<T> dy_sum = static_cast<BatchNormParamType<T>>(0);
+    BatchNormParamType<T> dy_x_sub_mean_sum =
+        static_cast<BatchNormParamType<T>>(0);
+    for (int j = threadIdx.x; j < inner_size; j += blockDim.x) {
+      const int index = layout == phi::DataLayout::kNCHW
+                            ? (j / HxW * C + i) * HxW + j % HxW
+                            : j * outer_size + i;
+      BatchNormParamType<T> dy_i =
+          static_cast<BatchNormParamType<T>>(dy[index]);
+      dy_sum += dy_i;
+      dy_x_sub_mean_sum +=
+          dy_i * (static_cast<BatchNormParamType<T>>(x[index]) - mean_i);
+    }
+
+    dy_sum = BlockReduce(dy_storage).Reduce(dy_sum, cub::Sum());
+    dy_x_sub_mean_sum = BlockReduce(dy_x_sub_mean_storage)
+                            .Reduce(dy_x_sub_mean_sum, cub::Sum());
+
+    if (threadIdx.x == 0) {
+      dy_sum_val = dy_sum;
+      dy_x_sub_mean_sum_val = dy_x_sub_mean_sum;
+    }
+    __syncthreads();
+    for (int j = threadIdx.x; j < inner_size; j += blockDim.x) {
+      const int index = layout == phi::DataLayout::kNCHW
+                            ? (j / HxW * C + i) * HxW + j % HxW
+                            : j * outer_size + i;
+      dx[index] =
+          (static_cast<BatchNormParamType<T>>(dy[index]) -
+           dy_sum_val / static_cast<BatchNormParamType<T>>(inner_size) -
+           (static_cast<BatchNormParamType<T>>(x[index]) - mean_i) *
+               dy_x_sub_mean_sum_val * inv_var_i * inv_var_i / inner_size) *
+          scale[i] * inv_var_i;
+    }
+  }
+}
+
+template <typename T, typename Context>
+void BatchNormGradRawKernel(const Context &ctx,
+                            const DenseTensor &y_grad,
+                            const DenseTensor &x,
+                            const DenseTensor &scale,
+                            const DenseTensor &bias,
+                            const DenseTensor &saved_mean,
+                            const DenseTensor &saved_variance,
+                            paddle::optional<const DenseTensor &> reserve_space,
+                            paddle::optional<const DenseTensor &> mean,
+                            paddle::optional<const DenseTensor &> variance,
+                            float momentum,
+                            float epsilon_f,
+                            const std::string &data_layout_str,
+                            bool is_test,
+                            bool use_global_stats,
+                            bool trainable_statistics,
+                            bool fuse_with_relu,
+                            bool is_inplace,
+                            DenseTensor *x_grad,
+                            DenseTensor *scale_grad,
+                            DenseTensor *bias_grad) {
+  double epsilon = static_cast<double>(epsilon_f);
+
+  const DataLayout data_layout =
+      paddle::framework::StringToDataLayout(data_layout_str);
+
+  const auto *d_y = &y_grad;
+
+  auto *d_x = x_grad;
+  auto *d_scale = scale_grad;
+  auto *d_bias = bias_grad;
+
+  use_global_stats = is_test || use_global_stats;
+
+  const auto &x_dims = x.dims();
+
+  PADDLE_ENFORCE_EQ(
+      x_dims.size() >= 2 && x_dims.size() <= 5,
+      true,
+      phi::errors::InvalidArgument(
+          "The size of input's dimensions should be between 2 and 5."
+          "But received: the size of input's dimensions is [%d],"
+          "the dimensions of input is [%s]",
+          x_dims.size(),
+          x_dims));
+  int N, C, H, W, D;
+  paddle::operators::ExtractNCWHD(x_dims, data_layout, &N, &C, &H, &W, &D);
+
+  // init output
+  if (d_x) {
+    ctx.template Alloc<T>(d_x);
+  }
+
+  if (d_scale && d_bias) {
+    d_scale->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
+    d_bias->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
+  }
+
+  PADDLE_ENFORCE_EQ(
+      scale.dims().size(),
+      1UL,
+      phi::errors::InvalidArgument(
+          "The size of scale's dimensions must equal to 1. But received: "
+          "the size of scale's dimensions is [%d], the dimensions of scale "
+          "is [%s].",
+          scale.dims().size(),
+          scale.dims()));
+  PADDLE_ENFORCE_EQ(
+      scale.dims()[0],
+      C,
+      phi::errors::InvalidArgument(
+          "The first dimension of scale must equal to Channels[%d]. But "
+          "received: the first dimension of scale is [%d]",
+          C,
+          scale.dims()[0]));
+
+  auto dtype = paddle::platform::CudnnDataType<T>::type;
+#ifdef PADDLE_WITH_HIP
+  auto compute_format =
+      data_layout == DataLayout::kNHWC ? DataLayout::kNHWC : DataLayout::kNCHW;
+
+// TODO(wangran16): wait for MIOpen to improve the performance of BN
+// HIP do not support compute format of NHWC
+// auto compute_format = DataLayout::kNCHW;
+#else
+  const bool fast_nhwc_batch_norm = dtype == CUDNN_DATA_HALF &&
+                                    FLAGS_cudnn_batchnorm_spatial_persistent &&
+                                    (reserve_space.get_ptr() != nullptr);
+  auto compute_format = fast_nhwc_batch_norm && data_layout == DataLayout::kNHWC
+                            ? DataLayout::kNHWC
+                            : DataLayout::kNCHW;
+#endif
+
+  DenseTensor transformed_x(x.type());
+  DenseTensor transformed_d_y(d_y->type());
+  DenseTensor transformed_d_x;
+  if (data_layout == DataLayout::kNHWC && compute_format == DataLayout::kNCHW &&
+      x_dims.size() > 2) {
+    VLOG(3) << "Transform input tensor from NHWC to NCHW.";
+    ResizeToChannelFirst<Context, T>(ctx, &x, &transformed_x);
+    TransToChannelFirst<Context, T>(ctx, &x, &transformed_x);
+    ResizeToChannelFirst<Context, T>(ctx, d_y, &transformed_d_y);
+    TransToChannelFirst<Context, T>(ctx, d_y, &transformed_d_y);
+    if (d_x) {
+      ResizeToChannelFirst<Context, T>(ctx, d_x, &transformed_d_x);
+    }
+  } else {
+    transformed_x.ShareDataWith(x);
+    transformed_d_y.ShareDataWith(*d_y);
+    if (d_x) {
+      transformed_d_x.ShareDataWith(*d_x);
+    }
+  }
+
+  std::vector<int> dims;
+  std::vector<int> strides;
+  if (compute_format == DataLayout::kNCHW) {
+    dims = {N, C, H, W, D};
+    strides = {C * H * W * D, H * W * D, W * D, D, 1};
+  } else {
+    dims = {N, C, H, W, D};
+    strides = {H * W * C * D, 1, W * D * C, D * C, C};
+  }
+
+  const int num = transformed_x.numel();
+#ifdef HIPCC
+  const int block = 256;
+#else
+  const int block = 512;
+#endif
+  int max_threads = ctx.GetMaxPhysicalThreadCount();
+  const int max_blocks = std::max(max_threads / block, 1);
+  int grid1 = (num + block - 1) / block;
+  int grid2 = std::min(C, max_blocks);
+  auto stream = ctx.stream();
+  InplaceHelper<T> inplace_functor;
+
+  if (!use_global_stats) {
+    if ((N * H * W * D) == 1) {
+      if (d_x) {
+        paddle::framework::TensorCopy(*d_y, ctx.GetPlace(), d_x);
+      }
+      phi::funcs::SetConstant<Context, BatchNormParamType<T>> functor;
+      functor(ctx, d_scale, static_cast<BatchNormParamType<T>>(0));
+      functor(ctx, d_bias, static_cast<BatchNormParamType<T>>(0));
+      return;
+    }
+
+// ------------------- cudnn descriptors ---------------------
+#ifdef PADDLE_WITH_HIP
+// TODO(wangran16): wait for MIOpen to improve the performance of BN
+// miopenTensorDescriptor_t data_desc_;
+// miopenTensorDescriptor_t bn_param_desc_;
+// miopenBatchNormMode_t mode_;
+
+// PADDLE_ENFORCE_GPU_SUCCESS(
+//     platform::dynload::miopenCreateTensorDescriptor(&data_desc_));
+// PADDLE_ENFORCE_GPU_SUCCESS(
+//     platform::dynload::miopenCreateTensorDescriptor(&bn_param_desc_));
+#else
+    cudnnTensorDescriptor_t data_desc_;
+    cudnnTensorDescriptor_t bn_param_desc_;
+    cudnnBatchNormMode_t mode_;
+
+    PADDLE_ENFORCE_GPU_SUCCESS(
+        paddle::platform::dynload::cudnnCreateTensorDescriptor(&data_desc_));
+    PADDLE_ENFORCE_GPU_SUCCESS(
+        paddle::platform::dynload::cudnnCreateTensorDescriptor(
+            &bn_param_desc_));
+#endif
+    if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) {
+      LOG(ERROR) << "Provided epsilon is smaller than "
+                 << "CUDNN_BN_MIN_EPSILON. Setting it to "
+                 << "CUDNN_BN_MIN_EPSILON instead.";
+    }
+    epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON);
+#ifdef PADDLE_WITH_HIP
+// TODO(wangran16): wait for MIOpen to improve the performance of BN
+// mode_ = miopenBNSpatial;
+#elif CUDNN_VERSION_MIN(7, 0, 1)
+    if (FLAGS_cudnn_batchnorm_spatial_persistent) {
+      mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT;
+    } else if (H == 1 && W == 1) {
+      mode_ = CUDNN_BATCHNORM_PER_ACTIVATION;
+    } else {
+      mode_ = CUDNN_BATCHNORM_SPATIAL;
+    }
+#else
+    if (H == 1 && W == 1) {
+      mode_ = CUDNN_BATCHNORM_PER_ACTIVATION;
+    } else {
+      mode_ = CUDNN_BATCHNORM_SPATIAL;
+    }
+#endif  // CUDNN_VERSION_MIN(7, 0, 1)
+
+#ifdef PADDLE_WITH_HIP
+// TODO(wangran16): wait for MIOpen to improve the performance of BN
+// PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSetTensorDescriptor(
+//     data_desc_, CudnnDataType<T>::type,
+//     x_dims.size() > 3 ? x_dims.size() : 4, const_cast<int *>(dims.data()),
+//     const_cast<int *>(strides.data())));
+// PADDLE_ENFORCE_GPU_SUCCESS(
+//     platform::dynload::miopenDeriveBNTensorDescriptor(bn_param_desc_,
+//                                                       data_desc_, mode_));
+#else
+    PADDLE_ENFORCE_GPU_SUCCESS(
+        paddle::platform::dynload::cudnnSetTensorNdDescriptor(
+            data_desc_,
+            CudnnDataType<T>::type,
+            x_dims.size() > 3 ? x_dims.size() : 4,
+            dims.data(),
+            strides.data()));
+    PADDLE_ENFORCE_GPU_SUCCESS(
+        paddle::platform::dynload::cudnnDeriveBNTensorDescriptor(
+            bn_param_desc_, data_desc_, mode_));
+#endif
+
+    const auto *saved_mean_data =
+        saved_mean.template data<BatchNormParamType<T>>();
+    const auto *saved_var_data =
+        saved_variance.template data<BatchNormParamType<T>>();
+
+    if (is_inplace) {
+      inplace_functor(compute_format,
+                      transformed_x.data<T>(),
+                      scale.template data<BatchNormParamType<T>>(),
+                      bias.template data<BatchNormParamType<T>>(),
+                      saved_mean_data,
+                      saved_var_data,
+                      epsilon,
+                      C,
+                      H * W * D,
+                      num,
+                      transformed_x.data<T>(),
+                      grid2,
+                      block,
+                      stream);
+    }
+
+    // This branch calls CUDNN APIs
+    if (d_x && d_scale && d_bias) {
+      bool called = false;
+#if CUDNN_VERSION_MIN(7, 4, 1)
+      called = true;
+      size_t workspace_size = 0;
+      void *workspace_ptr = nullptr;
+      DenseTensor workspace_tensor;
+      auto reserve_space_size = reserve_space->memory_size();
+      // --------------- cudnn batchnorm workspace ---------------
+      PADDLE_ENFORCE_GPU_SUCCESS(
+          paddle::platform::dynload::
+              cudnnGetBatchNormalizationBackwardExWorkspaceSize(
+                  /*handle=*/ctx.cudnn_handle(),
+                  /*mode=*/mode_,
+                  /*bnIps=*/CUDNN_BATCHNORM_OPS_BN,
+                  /*xDesc=*/data_desc_,
+                  /*yDesc=*/data_desc_,
+                  /*dyDesc=*/data_desc_,
+                  /*dzDesc=*/nullptr,
+                  /*dxDesc=*/data_desc_,
+                  /*bnScaleBiasMeanVarDesc=*/bn_param_desc_,
+                  /*activationDesc=*/nullptr,
+                  /*sizeInBytes=*/&workspace_size));
+
+      workspace_ptr = workspace_tensor.mutable_data(
+          ctx.GetPlace(), transformed_x.type(), workspace_size);
+
+      PADDLE_ENFORCE_GPU_SUCCESS(
+          paddle::platform::dynload::cudnnBatchNormalizationBackwardEx(
+              /*handle=*/ctx.cudnn_handle(),
+              /*mode=*/mode_,
+              /*bnOps=*/CUDNN_BATCHNORM_OPS_BN,
+              /*alphaDataDiff=*/CudnnDataType<T>::kOne(),
+              /*betaDataDiff=*/CudnnDataType<T>::kZero(),
+              /*alphaParamDiff=*/CudnnDataType<T>::kOne(),
+              /*betaParamDiff=*/CudnnDataType<T>::kZero(),
+              /*xDesc=*/data_desc_,
+              /*xData=*/transformed_x.template data<T>(),
+              /*yDesc=*/nullptr,
+              /*yData=*/nullptr,
+              /*dyDesc=*/data_desc_,
+              /*dyData=*/transformed_d_y.template data<T>(),
+              /*dzDesc=*/nullptr,
+              /*dzData=*/nullptr,
+              /*dxDesc=*/data_desc_,
+              /*dxData=*/ctx.template Alloc<T>(&transformed_d_x),
+              /*dBnScaleBiasDesc=*/bn_param_desc_,
+              /*bnScaleData=*/scale.template data<BatchNormParamType<T>>(),
+              /*bnBiasData=*/nullptr,
+              /*dBnScaleData=*/d_scale
+                  ->template mutable_data<BatchNormParamType<T>>(
+                      ctx.GetPlace()),
+              /*dBnBiasData=*/d_bias
+                  ->template mutable_data<BatchNormParamType<T>>(
+                      ctx.GetPlace()),
+              /*epsilon=*/epsilon,
+              /*savedMean=*/saved_mean_data,
+              /*savedInvVariance=*/saved_var_data,
+              /*activationDesc=*/nullptr,
+              /*workspace=*/workspace_ptr,
+              /*workSpaceSizeInBytes=*/workspace_size,
+              /*reserveSpace=*/const_cast<T *>(
+                  reserve_space->template data<T>()),
+              /*reserveSpaceSizeInBytes=*/reserve_space_size));
+#endif  // CUDNN_VERSION_MIN(7, 4, 1)
+      if (!called) {
+#ifdef PADDLE_WITH_HIP
+        if (compute_format == DataLayout::kNCHW) {
+          BNBackward<T,
+                     block,
+                     DataLayout::kNCHW><<<grid2, block, 0, ctx.stream()>>>(
+              transformed_d_y.template data<T>(),
+              transformed_x.template data<T>(),
+              scale.template data<BatchNormParamType<T>>(),
+              saved_mean_data,
+              saved_var_data,
+              C,
+              N,
+              H * W * D,
+              epsilon,
+              transformed_d_x.template data<T>(),
+              d_scale->template mutable_data<BatchNormParamType<T>>(
+                  ctx.GetPlace()),
+              d_bias->template mutable_data<BatchNormParamType<T>>(
+                  ctx.GetPlace()));
+        } else {
+          BNBackward<T,
+                     block,
+                     DataLayout::kNHWC><<<grid2, block, 0, ctx.stream()>>>(
+              transformed_d_y.template data<T>(),
+              transformed_x.template data<T>(),
+              scale.template data<BatchNormParamType<T>>(),
+              saved_mean_data,
+              saved_var_data,
+              C,
+              N,
+              H * W * D,
+              epsilon,
+              transformed_d_x.template data<T>(),
+              d_scale->template mutable_data<BatchNormParamType<T>>(
+                  ctx.GetPlace()),
+              d_bias->template mutable_data<BatchNormParamType<T>>(
+                  ctx.GetPlace()));
+        }
+
+// TODO(wangran16): wait for MIOpen to improve the performance of BN
+// PADDLE_ENFORCE_GPU_SUCCESS(
+//     platform::dynload::miopenBatchNormalizationBackward(
+//         dev_ctx.cudnn_handle(), mode_, CudnnDataType<T>::kOne(),
+//         CudnnDataType<T>::kZero(), CudnnDataType<T>::kOne(),
+//         CudnnDataType<T>::kZero(), data_desc_,
+//         transformed_x.template data<T>(), data_desc_,
+//         transformed_d_y.template data<T>(), data_desc_,
+//         transformed_d_x.template mutable_data<T>(ctx.GetPlace()),
+//         bn_param_desc_, scale->template data<BatchNormParamType<T>>(),
+//         d_scale->template mutable_data<BatchNormParamType<T>>(
+//             ctx.GetPlace()),
+//         d_bias->template mutable_data<BatchNormParamType<T>>(
+//             ctx.GetPlace()),
+//         epsilon, saved_mean_data, saved_var_data));
+#else
+        PADDLE_ENFORCE_GPU_SUCCESS(
+            paddle::platform::dynload::cudnnBatchNormalizationBackward(
+                ctx.cudnn_handle(),
+                mode_,
+                CudnnDataType<T>::kOne(),
+                CudnnDataType<T>::kZero(),
+                CudnnDataType<T>::kOne(),
+                CudnnDataType<T>::kZero(),
+                data_desc_,
+                transformed_x.template data<T>(),
+                data_desc_,
+                transformed_d_y.template data<T>(),
+                data_desc_,
+                ctx.template Alloc<T>(&transformed_d_x),
+                bn_param_desc_,
+                scale.template data<BatchNormParamType<T>>(),
+                d_scale->template mutable_data<BatchNormParamType<T>>(
+                    ctx.GetPlace()),
+                d_bias->template mutable_data<BatchNormParamType<T>>(
+                    ctx.GetPlace()),
+                epsilon,
+                saved_mean_data,
+                saved_var_data));
+#endif
+      }
+
+      if (data_layout == DataLayout::kNHWC &&
+          compute_format == DataLayout::kNCHW) {
+        VLOG(3) << "Transform batchnorm output from NCHW to NHWC";
+        TransToChannelLast<Context, T>(ctx, &transformed_d_x, d_x);
+      }
+    } else {
+      // This branch call CUDA kernels
+      if (compute_format == DataLayout::kNCHW) {
+        if (d_x) {
+          BNBackwardData<
+              T,
+              block,
+              phi::DataLayout::kNCHW><<<grid2, block, 0, ctx.stream()>>>(
+              d_y->data<T>(),
+              scale.data<BatchNormParamType<T>>(),
+              saved_mean_data,
+              x.data<T>(),
+              saved_var_data,
+              C,
+              N,
+              H * W * D,
+              d_x->data<T>());
+        }
+        if (d_scale && d_bias) {
+          KeBNBackwardScaleBias<
+              T,
+              block,
+              phi::DataLayout::kNCHW><<<grid2, block, 0, stream>>>(
+              d_y->data<T>(),
+              x.data<T>(),
+              saved_mean_data,
+              saved_var_data,
+              epsilon,
+              N,
+              C,
+              H * W * D,
+              d_scale->data<BatchNormParamType<T>>(),
+              d_bias->data<BatchNormParamType<T>>());
+        }
+      } else {
+        if (d_x) {
+          BNBackwardData<
+              T,
+              block,
+              phi::DataLayout::kNHWC><<<grid2, block, 0, ctx.stream()>>>(
+              d_y->data<T>(),
+              scale.data<BatchNormParamType<T>>(),
+              saved_mean_data,
+              x.data<T>(),
+              saved_var_data,
+              C,
+              N,
+              H * W * D,
+              d_x->data<T>());
+        }
+        if (d_scale && d_bias) {
+          KeBNBackwardScaleBias<
+              T,
+              block,
+              phi::DataLayout::kNHWC><<<grid2, block, 0, stream>>>(
+              d_y->data<T>(),
+              x.data<T>(),
+              saved_mean_data,
+              saved_var_data,
+              epsilon,
+              N,
+              C,
+              H * W * D,
+              d_scale->data<BatchNormParamType<T>>(),
+              d_bias->data<BatchNormParamType<T>>());
+        }
+      }
+    }
+
+#ifdef PADDLE_WITH_HIP
+// TODO(wangran16): wait for MIOpen to improve the performance of BN
+// clean when exit.
+// PADDLE_ENFORCE_GPU_SUCCESS(
+//     platform::dynload::miopenDestroyTensorDescriptor(data_desc_));
+// PADDLE_ENFORCE_GPU_SUCCESS(
+//     platform::dynload::miopenDestroyTensorDescriptor(bn_param_desc_));
+#else
+    // clean when exit.
+    PADDLE_ENFORCE_GPU_SUCCESS(
+        paddle::platform::dynload::cudnnDestroyTensorDescriptor(data_desc_));
+    PADDLE_ENFORCE_GPU_SUCCESS(
+        paddle::platform::dynload::cudnnDestroyTensorDescriptor(
+            bn_param_desc_));
+#endif
+  } else {
+    const auto *running_mean = mean.get_ptr();
+    const auto *running_var = variance.get_ptr();
+
+    const auto *running_mean_data =
+        running_mean->template data<BatchNormParamType<T>>();
+    const auto *running_var_data =
+        running_var->template data<BatchNormParamType<T>>();
+
+    if (is_inplace) {
+      auto px = x;
+      inplace_functor(data_layout,
+                      ctx.template Alloc<T>(&px),
+                      scale.template data<BatchNormParamType<T>>(),
+                      bias.template data<BatchNormParamType<T>>(),
+                      running_mean_data,
+                      running_var_data,
+                      epsilon,
+                      C,
+                      H * W * D,
+                      num,
+                      x.data<T>(),
+                      grid2,
+                      block,
+                      stream);
+    }
+
+    if (compute_format == DataLayout::kNCHW) {
+      if (d_x) {
+        KeBNBackwardData<T,
+                         phi::DataLayout::kNCHW><<<grid1, block, 0, stream>>>(
+            d_y->data<T>(),
+            scale.data<BatchNormParamType<T>>(),
+            running_var_data,
+            epsilon,
+            C,
+            H * W,
+            num,
+            d_x->data<T>());
+      }
+      if (d_scale && d_bias) {
+        KeBNBackwardScaleBias<
+            T,
+            block,
+            phi::DataLayout::kNCHW><<<grid2, block, 0, stream>>>(
+            d_y->data<T>(),
+            x.data<T>(),
+            running_mean_data,
+            running_var_data,
+            epsilon,
+            N,
+            C,
+            H * W * D,
+            d_scale->data<BatchNormParamType<T>>(),
+            d_bias->data<BatchNormParamType<T>>());
+      }
+    } else {
+      if (d_x) {
+        KeBNBackwardData<T,
+                         phi::DataLayout::kNHWC><<<grid1, block, 0, stream>>>(
+            d_y->data<T>(),
+            scale.data<BatchNormParamType<T>>(),
+            running_var_data,
+            epsilon,
+            C,
+            H * W,
+            num,
+            d_x->data<T>());
+      }
+      if (d_scale && d_bias) {
+        KeBNBackwardScaleBias<
+            T,
+            block,
+            phi::DataLayout::kNHWC><<<grid2, block, 0, stream>>>(
+            d_y->data<T>(),
+            x.data<T>(),
+            running_mean_data,
+            running_var_data,
+            epsilon,
+            N,
+            C,
+            H * W * D,
+            d_scale->data<BatchNormParamType<T>>(),
+            d_bias->data<BatchNormParamType<T>>());
+      }
+    }
+  }
+}
+
+template <typename T, typename Context>
+void BatchNormGradKernel(const Context &dev_ctx,
+                         const DenseTensor &y_grad,
+                         const DenseTensor &x,
+                         const DenseTensor &scale,
+                         const DenseTensor &bias,
+                         const DenseTensor &saved_mean,
+                         const DenseTensor &saved_variance,
+                         paddle::optional<const DenseTensor &> reserve_space,
+                         paddle::optional<const DenseTensor &> mean,
+                         paddle::optional<const DenseTensor &> variance,
+                         float momentum,
+                         float epsilon,
+                         const std::string &data_layout,
+                         bool is_test,
+                         bool use_global_stats,
+                         bool trainable_statistics,
+                         bool fuse_with_relu,
+                         DenseTensor *x_grad,
+                         DenseTensor *scale_grad,
+                         DenseTensor *bias_grad) {
+  BatchNormGradRawKernel<T, Context>(dev_ctx,
+                                     y_grad,
+                                     x,
+                                     scale,
+                                     bias,
+                                     saved_mean,
+                                     saved_variance,
+                                     reserve_space,
+                                     mean,
+                                     variance,
+                                     momentum,
+                                     epsilon,
+                                     data_layout,
+                                     is_test,
+                                     use_global_stats,
+                                     trainable_statistics,
+                                     fuse_with_relu,
+                                     false,
+                                     x_grad,
+                                     scale_grad,
+                                     bias_grad);
+}
+
+template <typename T, typename Context>
+void BatchNormDoubleGradKernel(const Context &ctx,
+                               const DenseTensor &x_grad_grad,
+                               const DenseTensor &scale_grad_grad,
+                               const DenseTensor &bias_grad_grad,
+                               const DenseTensor &y_grad,
+                               const DenseTensor &x,
+                               const DenseTensor &scale,
+                               const DenseTensor &saved_mean,
+                               const DenseTensor &saved_variance,
+                               paddle::optional<const DenseTensor &> mean,
+                               paddle::optional<const DenseTensor &> variance,
+                               float momentum,
+                               float epsilon,
+                               const std::string &data_layout_str,
+                               bool is_test,
+                               bool use_global_stats,
+                               bool trainable_statistics,
+                               bool fuse_with_relu,
+                               DenseTensor *x_grad,
+                               DenseTensor *scale_grad,
+                               DenseTensor *y_grad_grad) {
+  PADDLE_ENFORCE_EQ(is_test,
+                    false,
+                    phi::errors::InvalidArgument(
+                        "`is_test = True` CANNOT be used in train program. If "
+                        "you want to use global status in pre_train model, "
+                        "please set `use_global_stats = True`"));
+
+  const DataLayout data_layout =
+      paddle::framework::StringToDataLayout(data_layout_str);
+
+  const DenseTensor *running_mean = nullptr;
+  const DenseTensor *running_variance = nullptr;
+  if (use_global_stats) {
+    running_mean = mean.get_ptr();
+    running_variance = variance.get_ptr();
+  }
+  paddle::operators::NormDoubleGradFunctor<Context, T>(ctx,
+                                                       data_layout,
+                                                       &x,
+                                                       &scale,
+                                                       &y_grad,
+                                                       &saved_mean,
+                                                       &saved_variance,
+                                                       running_mean,
+                                                       running_variance,
+                                                       epsilon,
+                                                       use_global_stats,
+                                                       &x_grad_grad,
+                                                       &scale_grad_grad,
+                                                       &bias_grad_grad,
+                                                       x_grad,
+                                                       scale_grad,
+                                                       y_grad_grad);
+}
+
+}  // namespace phi
+
+#ifdef PADDLE_WITH_HIP
+PD_REGISTER_KERNEL(batch_norm_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::BatchNormGradKernel,
+                   float,
+                   phi::dtype::float16) {}
+
+PD_REGISTER_KERNEL(batch_norm_grad_raw,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::BatchNormGradRawKernel,
+                   float,
+                   phi::dtype::float16) {}
+#else
+PD_REGISTER_KERNEL(batch_norm_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::BatchNormGradKernel,
+                   float,
+                   double,
+                   phi::dtype::float16) {
+  if (kernel_key.dtype() == phi::DataType::FLOAT16) {
+    kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32);
+    kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32);
+    kernel->OutputAt(3).SetDataType(phi::DataType::FLOAT32);
+    kernel->OutputAt(4).SetDataType(phi::DataType::FLOAT32);
+  }
+}
+
+PD_REGISTER_KERNEL(batch_norm_grad_raw,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::BatchNormGradRawKernel,
+                   float,
+                   double,
+                   phi::dtype::float16) {
+  if (kernel_key.dtype() == phi::DataType::FLOAT16) {
+    kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32);
+    kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32);
+    kernel->OutputAt(3).SetDataType(phi::DataType::FLOAT32);
+    kernel->OutputAt(4).SetDataType(phi::DataType::FLOAT32);
+  }
+}
+
+#endif
+
+#ifdef PADDLE_WITH_HIP
+PD_REGISTER_KERNEL(batch_norm_grad_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::BatchNormDoubleGradKernel,
+                   float,
+                   double) {}
+
+#else
+PD_REGISTER_KERNEL(batch_norm_grad_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::BatchNormDoubleGradKernel,
+                   float,
+                   double) {}
+#endif
diff --git a/paddle/phi/kernels/gpu/batch_norm_kernel.cu b/paddle/phi/kernels/gpu/batch_norm_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..49b550f51e60e1cf31658f0d50afebf929a54079
--- /dev/null
+++ b/paddle/phi/kernels/gpu/batch_norm_kernel.cu
@@ -0,0 +1,684 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifdef __NVCC__
+#include "cub/cub.cuh"
+#endif
+#ifdef __HIPCC__
+#include <hipcub/hipcub.hpp>
+namespace cub = hipcub;
+#endif
+
+#include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/batch_norm_kernel.h"
+#include "paddle/phi/kernels/funcs/eigen/common.h"
+
+#include "paddle/fluid/operators/norm_utils.cu.h"
+#include "paddle/fluid/operators/norm_utils.h"
+
+#include "paddle/fluid/framework/data_layout.h"
+#include "paddle/fluid/operators/layout_utils.h"
+#include "paddle/fluid/platform/enforce.h"
+
+#include "paddle/fluid/platform/flags.h"
+#include "paddle/phi/kernels/gpu/batch_norm_utils.h"
+
+#ifdef __HIPCC__
+#define LAUNCH_BOUNDS(BlockDim) __launch_bounds__(BlockDim)
+#else
+#define LAUNCH_BOUNDS(BlockDim)
+#endif
+
+DECLARE_bool(cudnn_batchnorm_spatial_persistent);
+
+namespace phi {
+
+template <typename T>
+using CudnnDataType = paddle::platform::CudnnDataType<T>;
+template <typename T>
+using BatchNormParamType = typename CudnnDataType<T>::BatchNormParamType;
+
+template <typename T, phi::DataLayout layout>
+static __global__ void BNForwardInference(const T *x,
+                                          const BatchNormParamType<T> *mean,
+                                          const BatchNormParamType<T> *variance,
+                                          const BatchNormParamType<T> *scale,
+                                          const BatchNormParamType<T> *bias,
+                                          const int C,
+                                          const int N,
+                                          const int HxW,
+                                          const double epsilon,
+                                          T *y) {
+  int gid = blockIdx.x * blockDim.x + threadIdx.x;
+  int stride = blockDim.x * gridDim.x;
+  int num = N * C * HxW;
+  for (int i = gid; i < num; i += stride) {
+    const int c = layout == phi::DataLayout::kNCHW ? i / HxW % C : i % C;
+    BatchNormParamType<T> x_sub_mean =
+        static_cast<BatchNormParamType<T>>(x[i]) - mean[c];
+    BatchNormParamType<T> inv_var = 1 / sqrt(variance[c] + epsilon);
+    y[i] = static_cast<T>(scale[c] * x_sub_mean * inv_var + bias[c]);
+  }
+}
+
+template <typename T, int BlockDim, phi::DataLayout layout>
+static __global__ LAUNCH_BOUNDS(BlockDim) void BNForwardTraining(
+    const T *x,
+    const BatchNormParamType<T> *scale,
+    const BatchNormParamType<T> *bias,
+    const int C,
+    const int N,
+    const int HxW,
+    const double epsilon,
+    double exponentialAverageFactor,
+    T *y,
+    BatchNormParamType<T> *mean,
+    BatchNormParamType<T> *variance,
+    BatchNormParamType<T> *save_mean,
+    BatchNormParamType<T> *save_inv_variance) {
+  int outer_size = C;
+  int inner_size = N * HxW;
+  typedef cub::BlockReduce<BatchNormParamType<T>, BlockDim> BlockReduce;
+  __shared__ typename BlockReduce::TempStorage mean_storage;
+  __shared__ typename BlockReduce::TempStorage variance_storeage;
+  __shared__ BatchNormParamType<T> mean_val;
+  __shared__ BatchNormParamType<T> variance_val;
+  __shared__ BatchNormParamType<T> inv_var_val;
+
+  for (int i = blockIdx.x; i < outer_size; i += gridDim.x) {
+    BatchNormParamType<T> x_sum = static_cast<BatchNormParamType<T>>(0);
+    BatchNormParamType<T> x_square_sum = static_cast<BatchNormParamType<T>>(0);
+
+    for (int j = threadIdx.x; j < inner_size; j += blockDim.x) {
+      const int index = layout == phi::DataLayout::kNCHW
+                            ? (j / HxW * C + i) * HxW + j % HxW
+                            : j * outer_size + i;
+      BatchNormParamType<T> x_i = static_cast<BatchNormParamType<T>>(x[index]);
+      x_sum += x_i;
+      x_square_sum += x_i * x_i;
+    }
+    x_sum = BlockReduce(mean_storage).Reduce(x_sum, cub::Sum());
+    x_square_sum =
+        BlockReduce(variance_storeage).Reduce(x_square_sum, cub::Sum());
+    if (threadIdx.x == 0) {
+      mean_val = x_sum / inner_size;
+      variance_val = x_square_sum / inner_size - mean_val * mean_val;
+      inv_var_val = 1 / sqrt(variance_val + epsilon);
+
+      if (save_mean && save_inv_variance) {
+        save_mean[i] = mean_val;
+        save_inv_variance[i] = inv_var_val;
+      }
+      mean[i] = (1 - exponentialAverageFactor) * mean_val +
+                exponentialAverageFactor * mean[i];
+      variance[i] = (1 - exponentialAverageFactor) * variance_val +
+                    exponentialAverageFactor * variance[i];
+    }
+    __syncthreads();
+
+    for (int j = threadIdx.x; j < inner_size; j += blockDim.x) {
+      const int index = layout == phi::DataLayout::kNCHW
+                            ? (j / HxW * C + i) * HxW + j % HxW
+                            : j * outer_size + i;
+      BatchNormParamType<T> x_sub_mean =
+          static_cast<BatchNormParamType<T>>(x[index]) - mean_val;
+      y[index] = scale[i] * x_sub_mean * inv_var_val + bias[i];
+    }
+  }
+}
+
+template <typename T, typename Context>
+void BatchNormKernel(const Context &ctx,
+                     const DenseTensor &x,
+                     const DenseTensor &scale,
+                     const DenseTensor &bias,
+                     const DenseTensor &mean,
+                     const DenseTensor &variance,
+                     float momentum,
+                     float epsilon_f,
+                     const std::string &data_layout_str,
+                     bool is_test,
+                     bool use_global_stats,
+                     bool trainable_statistics,
+                     bool fuse_with_relu,
+                     DenseTensor *y,
+                     DenseTensor *mean_out,
+                     DenseTensor *variance_out,
+                     DenseTensor *saved_mean,
+                     DenseTensor *saved_variance,
+                     DenseTensor *reserve_space) {
+  double epsilon = epsilon_f;
+  const bool trainable_stats = trainable_statistics;
+  const DataLayout data_layout =
+      paddle::framework::StringToDataLayout(data_layout_str);
+  bool test_mode = is_test && (!trainable_stats);
+
+  // Get the size for each dimension.
+  // NCHW [batch_size, in_channels, in_height, in_width]
+  const auto &x_dims = x.dims();
+  PADDLE_ENFORCE_EQ(
+      x_dims.size() >= 2 && x_dims.size() <= 5,
+      true,
+      phi::errors::InvalidArgument(
+          "The size of input's dimensions should be between 2 and 5"
+          "But received: the size of input's dimensions is [%d]",
+          x_dims.size()));
+
+  ctx.template Alloc<T>(y);
+  int N, C, H, W, D;
+  paddle::operators::ExtractNCWHD(x_dims, data_layout, &N, &C, &H, &W, &D);
+
+  auto dtype = paddle::platform::CudnnDataType<T>::type;
+
+#ifdef PADDLE_WITH_HIP
+  auto compute_format =
+      data_layout == DataLayout::kNHWC ? DataLayout::kNHWC : DataLayout::kNCHW;
+
+// TODO(wangran16): wait for MIOpen to improve the performance of BN
+// HIP do not support compute format of NHWC
+// auto compute_format = DataLayout::kNCHW;
+#else
+  const bool fast_nhwc_batch_norm =
+      test_mode ||
+      (dtype == CUDNN_DATA_HALF && FLAGS_cudnn_batchnorm_spatial_persistent);
+
+  auto compute_format = fast_nhwc_batch_norm && data_layout == DataLayout::kNHWC
+                            ? DataLayout::kNHWC
+                            : DataLayout::kNCHW;
+#endif
+
+  DenseTensor transformed_x(x.type());
+  DenseTensor transformed_y(y->type());
+
+  if (data_layout == DataLayout::kNHWC && compute_format == DataLayout::kNCHW &&
+      x_dims.size() > 2) {
+    VLOG(3) << "Transform input tensor from NHWC to NCHW.";
+    ResizeToChannelFirst<Context, T>(ctx, &x, &transformed_x);
+    TransToChannelFirst<Context, T>(ctx, &x, &transformed_x);
+    ResizeToChannelFirst<Context, T>(ctx, y, &transformed_y);
+  } else {
+    transformed_x.ShareDataWith(x);
+    transformed_y.ShareDataWith(*y);
+  }
+
+// ------------------- cudnn descriptors ---------------------
+#ifdef PADDLE_WITH_HIP
+// TODO(wangran16): wait for MIOpen to improve the performance of BN
+// miopenTensorDescriptor_t data_desc_;
+// miopenTensorDescriptor_t bn_param_desc_;
+// miopenBatchNormMode_t mode_;
+
+// PADDLE_ENFORCE_GPU_SUCCESS(
+//     platform::dynload::miopenCreateTensorDescriptor(&data_desc_));
+// PADDLE_ENFORCE_GPU_SUCCESS(
+//     platform::dynload::miopenCreateTensorDescriptor(&bn_param_desc_));
+#else
+  cudnnTensorDescriptor_t data_desc_;
+  cudnnTensorDescriptor_t bn_param_desc_;
+  cudnnBatchNormMode_t mode_;
+
+  PADDLE_ENFORCE_GPU_SUCCESS(
+      paddle::platform::dynload::cudnnCreateTensorDescriptor(&data_desc_));
+  PADDLE_ENFORCE_GPU_SUCCESS(
+      paddle::platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_));
+#endif
+
+  if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) {
+    LOG(ERROR) << "Provided epsilon is smaller than "
+               << "CUDNN_BN_MIN_EPSILON. Setting it to "
+               << "CUDNN_BN_MIN_EPSILON instead.";
+  }
+  epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON);
+
+#ifdef PADDLE_WITH_HIP
+// TODO(wangran16): wait for MIOpen to improve the performance of BN
+// mode_ = miopenBNSpatial;
+#elif CUDNN_VERSION_MIN(7, 0, 1)
+  if (FLAGS_cudnn_batchnorm_spatial_persistent) {
+    mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT;
+  } else if (H == 1 && W == 1) {
+    mode_ = CUDNN_BATCHNORM_PER_ACTIVATION;
+  } else {
+    mode_ = CUDNN_BATCHNORM_SPATIAL;
+  }
+#else
+  if (H == 1 && W == 1) {
+    mode_ = CUDNN_BATCHNORM_PER_ACTIVATION;
+  } else {
+    mode_ = CUDNN_BATCHNORM_SPATIAL;
+  }
+#endif  // CUDNN_VERSION_MIN(7, 0, 1)
+
+  VLOG(3) << "Setting descriptors.";
+  std::vector<int> dims;
+  std::vector<int> strides;
+  if (compute_format == DataLayout::kNCHW) {
+    dims = {N, C, H, W, D};
+    strides = {C * H * W * D, H * W * D, W * D, D, 1};
+  } else {
+    dims = {N, C, H, W, D};
+    strides = {H * W * D * C, 1, W * D * C, D * C, C};
+  }
+
+#ifdef PADDLE_WITH_HIP
+// TODO(wangran16): wait for MIOpen to improve the performance of BN
+// PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSetTensorDescriptor(
+//     data_desc_, CudnnDataType<T>::type,
+//     x_dims.size() > 3 ? x_dims.size() : 4, const_cast<int *>(dims.data()),
+//     const_cast<int *>(strides.data())));
+// Note: PERSISTENT not implemented for inference
+// PADDLE_ENFORCE_GPU_SUCCESS(
+//     platform::dynload::miopenDeriveBNTensorDescriptor(
+//         bn_param_desc_, data_desc_, test_mode ? miopenBNSpatial : mode_));
+#else
+  PADDLE_ENFORCE_GPU_SUCCESS(
+      paddle::platform::dynload::cudnnSetTensorNdDescriptor(
+          data_desc_,
+          CudnnDataType<T>::type,
+          x_dims.size() > 3 ? x_dims.size() : 4,
+          dims.data(),
+          strides.data()));
+  // Note: PERSISTENT not implemented for inference
+  PADDLE_ENFORCE_GPU_SUCCESS(
+      paddle::platform::dynload::cudnnDeriveBNTensorDescriptor(
+          bn_param_desc_,
+          data_desc_,
+          test_mode ? CUDNN_BATCHNORM_SPATIAL : mode_));
+#endif
+
+  auto handle = ctx.cudnn_handle();
+
+  // Now, depending on whether we are running test or not, we have two paths.
+  // It is training mode when it's not reference AND not using pre-trained
+  // model.
+  bool training = !test_mode && !use_global_stats;
+  if (!training) {
+    // only when test we use input to do computation.
+    const auto *est_mean = &mean;
+    const auto *est_var = &variance;
+    // Run inference mode.
+    PADDLE_ENFORCE_EQ(
+        est_mean->dims().size(),
+        1UL,
+        phi::errors::InvalidArgument(
+            "The size of mean's dimensions must equal to 1."
+            "But received: the size of mean's dimensions mean is [%d],"
+            "the dimensions of mean is [%s].",
+            est_mean->dims().size(),
+            est_mean->dims()));
+    PADDLE_ENFORCE_EQ(
+        est_var->dims().size(),
+        1UL,
+        phi::errors::InvalidArgument(
+            "The size of variance's dimensions must equal to 1."
+            "But received: the size of variance's dimensions is [%d],"
+            "the dimensions of variance is [%s].",
+            est_var->dims().size(),
+            est_var->dims()));
+    PADDLE_ENFORCE_EQ(
+        est_mean->dims()[0],
+        C,
+        phi::errors::InvalidArgument(
+            "The first dimension of mean must equal to the number of "
+            "Channels, which is [%d]. But received: the first dimension"
+            "of mean is [%d], the dimensions of mean is [%s].",
+            C,
+            est_mean->dims()[0],
+            est_mean->dims()));
+    PADDLE_ENFORCE_EQ(
+        est_var->dims()[0],
+        C,
+        phi::errors::InvalidArgument(
+            "The first dimension of variance must equal to the number"
+            "of Channels, which is [%d]. But received: the first dimension of"
+            "variance is [%d], the dimensions of variance is [%s].",
+            C,
+            est_var->dims()[0],
+            est_var->dims()));
+
+#ifdef PADDLE_WITH_HIP
+    const int block_size = 256;
+    const int grid_size = (N * C * H * W * D + block_size - 1) / block_size;
+    if (compute_format == DataLayout::kNCHW) {
+      BNForwardInference<
+          T,
+          DataLayout::kNCHW><<<grid_size, block_size, 0, ctx.stream()>>>(
+          transformed_x.template data<T>(),
+          est_mean->template data<BatchNormParamType<T>>(),
+          est_var->template data<BatchNormParamType<T>>(),
+          scale.template data<BatchNormParamType<T>>(),
+          bias.template data<BatchNormParamType<T>>(),
+          C,
+          N,
+          H * W * D,
+          epsilon,
+          transformed_y.template data<T>());
+    } else {
+      BNForwardInference<
+          T,
+          DataLayout::kNHWC><<<grid_size, block_size, 0, ctx.stream()>>>(
+          transformed_x.template data<T>(),
+          est_mean->template data<BatchNormParamType<T>>(),
+          est_var->template data<BatchNormParamType<T>>(),
+          scale.template data<BatchNormParamType<T>>(),
+          bias.template data<BatchNormParamType<T>>(),
+          C,
+          N,
+          H * W * D,
+          epsilon,
+          transformed_y.template data<T>());
+    }
+// TODO(wangran16): wait for MIOpen to improve the performance of BN
+// PADDLE_ENFORCE_GPU_SUCCESS(
+//     platform::dynload::miopenBatchNormalizationForwardInference(
+//         handle, miopenBNSpatial,
+//         const_cast<void *>(
+//             static_cast<const void *>(CudnnDataType<T>::kOne())),
+//         const_cast<void *>(
+//             static_cast<const void *>(CudnnDataType<T>::kZero())),
+//         data_desc_,
+//         static_cast<const void *>(transformed_x.template data<T>()),
+//         data_desc_,
+//         static_cast<void *>(
+//             transformed_y.template mutable_data<T>(ctx.GetPlace())),
+//         bn_param_desc_,
+//         const_cast<void *>(static_cast<const void *>(
+//             scale->template data<BatchNormParamType<T>>())),
+//         const_cast<void *>(static_cast<const void *>(
+//             bias->template data<BatchNormParamType<T>>())),
+//         const_cast<void *>(static_cast<const void *>(
+//             est_mean->template data<BatchNormParamType<T>>())),
+//         const_cast<void *>(static_cast<const void *>(
+//             est_var->template data<BatchNormParamType<T>>())),
+//         epsilon));
+#else
+    PADDLE_ENFORCE_GPU_SUCCESS(
+        paddle::platform::dynload::cudnnBatchNormalizationForwardInference(
+            handle,
+            // Note: PERSISTENT not implemented for inference
+            CUDNN_BATCHNORM_SPATIAL,
+            CudnnDataType<T>::kOne(),
+            CudnnDataType<T>::kZero(),
+            data_desc_,
+            transformed_x.template data<T>(),
+            data_desc_,
+            ctx.template Alloc<T>(&transformed_y),
+            bn_param_desc_,
+            scale.template data<BatchNormParamType<T>>(),
+            bias.template data<BatchNormParamType<T>>(),
+            est_mean->template data<BatchNormParamType<T>>(),
+            est_var->template data<BatchNormParamType<T>>(),
+            epsilon));
+#endif
+  } else {
+    // if MomentumTensor is set, use MomentumTensor value, momentum
+    // is only used in this training branch
+
+    // need to solve here
+    // if (ctx.HasInput("MomentumTensor")) {
+    //   const auto *mom_tensor = MomentumTensor;
+    //   DenseTensor mom_cpu;
+    //   paddle::framework::TensorCopySync(*mom_tensor, platform::CPUPlace(),
+    //                                     &mom_cpu);
+    //   momentum = mom_cpu.data<float>()[0];
+    // }
+
+    // Run training mode.
+    // obtain running mean and running inv var, and there is no need
+    // to initialize them.
+    mean_out->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
+    variance_out->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
+
+    saved_mean->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
+    saved_variance->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
+
+    if ((N * H * W * D) == 1) {
+      // Only 1 element in normalization dimension,
+      // skip the batch norm calculation, let y = x.
+      paddle::framework::TensorCopy(x, ctx.GetPlace(), y);
+    } else {
+      double this_factor = 1. - momentum;
+
+      bool called = false;
+#if CUDNN_VERSION_MIN(7, 4, 1)
+      called = true;
+      size_t workspace_size = 0;
+      size_t reserve_space_size = 0;
+      void *reserve_space_ptr = nullptr;
+      void *workspace_ptr = nullptr;
+      DenseTensor workspace_tensor;
+      DenseTensor reserve_space_tensor;
+      // Create reserve space and workspace for batch norm.
+      // Create tensor for each batchnorm op, it will be used in the
+      // backward. Thus this tensor shouldn't be temp.
+      // auto *reserve_space = ctx.Output<Tensor>("ReserveSpace");
+      if (reserve_space == nullptr) {
+        reserve_space = &reserve_space_tensor;
+      }
+      PADDLE_ENFORCE_NOT_NULL(
+          reserve_space,
+          phi::errors::NotFound(
+              "The argument ReserveSpace of batch_norm op is not found."));
+      // --------------- cudnn batchnorm workspace ---------------
+      PADDLE_ENFORCE_GPU_SUCCESS(
+          paddle::platform::dynload::
+              cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize(
+                  /*handle=*/handle,
+                  /*mode=*/mode_,
+                  /*bnIps=*/CUDNN_BATCHNORM_OPS_BN,
+                  /*xDesc=*/data_desc_,
+                  /*zDesc=*/nullptr,
+                  /*yDesc=*/data_desc_,
+                  /*bnScaleBiasMeanVarDesc=*/bn_param_desc_,
+                  /*activationDesc=*/nullptr,
+                  /*sizeInBytes=*/&workspace_size));
+
+      // -------------- cudnn batchnorm reserve space --------------
+      PADDLE_ENFORCE_GPU_SUCCESS(
+          paddle::platform::dynload::
+              cudnnGetBatchNormalizationTrainingExReserveSpaceSize(
+                  /*handle=*/handle,
+                  /*mode=*/mode_,
+                  /*bnOps=*/CUDNN_BATCHNORM_OPS_BN,
+                  /*activationDesc=*/nullptr,
+                  /*xDesc=*/data_desc_,
+                  /*sizeInBytes=*/&reserve_space_size));
+
+      reserve_space_ptr = reserve_space->mutable_data(
+          ctx.GetPlace(), transformed_x.type(), reserve_space_size);
+      workspace_ptr = workspace_tensor.mutable_data(
+          ctx.GetPlace(), transformed_x.type(), workspace_size);
+      PADDLE_ENFORCE_GPU_SUCCESS(
+          paddle::platform::dynload::cudnnBatchNormalizationForwardTrainingEx(
+              handle,
+              mode_,
+              CUDNN_BATCHNORM_OPS_BN,
+              CudnnDataType<T>::kOne(),
+              CudnnDataType<T>::kZero(),
+              data_desc_,
+              transformed_x.template data<T>(),
+              nullptr,
+              nullptr,
+              data_desc_,
+              transformed_y.template data<T>(),
+              bn_param_desc_,
+              scale.template data<BatchNormParamType<T>>(),
+              bias.template data<BatchNormParamType<T>>(),
+              this_factor,
+              mean_out->template mutable_data<BatchNormParamType<T>>(
+                  ctx.GetPlace()),
+              variance_out->template mutable_data<BatchNormParamType<T>>(
+                  ctx.GetPlace()),
+              epsilon,
+              saved_mean->template mutable_data<BatchNormParamType<T>>(
+                  ctx.GetPlace()),
+              saved_variance->template mutable_data<BatchNormParamType<T>>(
+                  ctx.GetPlace()),
+              nullptr,
+              workspace_ptr,
+              workspace_size,
+              reserve_space_ptr,
+              reserve_space_size));
+#endif  // CUDNN_VERSION_MIN(7, 4, 1)
+      if (!called) {
+#ifdef PADDLE_WITH_HIP
+        const int num = transformed_x.numel();
+        const int block = 256;
+        const int max_threads = ctx.GetMaxPhysicalThreadCount();
+        const int max_blocks = std::max(max_threads / block, 1);
+        const int grid = std::min(C, max_blocks);
+        if (compute_format == DataLayout::kNCHW) {
+          BNForwardTraining<
+              T,
+              block,
+              DataLayout::kNCHW><<<grid, block, 0, ctx.stream()>>>(
+              transformed_x.template data<T>(),
+              scale.template data<BatchNormParamType<T>>(),
+              bias.template data<BatchNormParamType<T>>(),
+              C,
+              N,
+              H * W * D,
+              epsilon,
+              this_factor,
+              transformed_y.template data<T>(),
+              mean_out->template data<BatchNormParamType<T>>(),
+              variance_out->template data<BatchNormParamType<T>>(),
+              saved_mean->template data<BatchNormParamType<T>>(),
+              saved_variance->template data<BatchNormParamType<T>>());
+        } else {
+          BNForwardTraining<
+              T,
+              block,
+              DataLayout::kNHWC><<<grid, block, 0, ctx.stream()>>>(
+              transformed_x.template data<T>(),
+              scale.template data<BatchNormParamType<T>>(),
+              bias.template data<BatchNormParamType<T>>(),
+              C,
+              N,
+              H * W * D,
+              epsilon,
+              this_factor,
+              transformed_y.template data<T>(),
+              mean_out->template data<BatchNormParamType<T>>(),
+              variance_out->template data<BatchNormParamType<T>>(),
+              saved_mean->template data<BatchNormParamType<T>>(),
+              saved_variance->template data<BatchNormParamType<T>>());
+        }
+// TODO(wangran16): wait for MIOpen to improve the performance of BN
+// PADDLE_ENFORCE_GPU_SUCCESS(
+//     platform::dynload::miopenBatchNormalizationForwardTraining(
+//         handle, mode_, const_cast<void *>(static_cast<const void *>(
+//                            CudnnDataType<T>::kOne())),
+//         const_cast<void *>(
+//             static_cast<const void *>(CudnnDataType<T>::kZero())),
+//         data_desc_,
+//         static_cast<const void *>(transformed_x.template data<T>()),
+//         data_desc_,
+//         static_cast<void *>(
+//             transformed_y.template mutable_data<T>(ctx.GetPlace())),
+//         bn_param_desc_,
+//         const_cast<void *>(static_cast<const void *>(
+//             scale->template data<BatchNormParamType<T>>())),
+//         const_cast<void *>(static_cast<const void *>(
+//             bias->template data<BatchNormParamType<T>>())),
+//         this_factor,
+//         static_cast<void *>(
+//             mean_out->template mutable_data<BatchNormParamType<T>>(
+//                 ctx.GetPlace())),
+//         static_cast<void *>(variance_out->template mutable_data<
+//                             BatchNormParamType<T>>(ctx.GetPlace())),
+//         epsilon,
+//         static_cast<void *>(
+//             saved_mean->template mutable_data<BatchNormParamType<T>>(
+//                 ctx.GetPlace())),
+//         static_cast<void *>(saved_variance->template mutable_data<
+//                             BatchNormParamType<T>>(ctx.GetPlace()))));
+#else
+        PADDLE_ENFORCE_GPU_SUCCESS(
+            paddle::platform::dynload::cudnnBatchNormalizationForwardTraining(
+                handle,
+                mode_,
+                CudnnDataType<T>::kOne(),
+                CudnnDataType<T>::kZero(),
+                data_desc_,
+                transformed_x.template data<T>(),
+                data_desc_,
+                ctx.template Alloc<T>(&transformed_y),
+                bn_param_desc_,
+                scale.template data<BatchNormParamType<T>>(),
+                bias.template data<BatchNormParamType<T>>(),
+                this_factor,
+                mean_out->template mutable_data<BatchNormParamType<T>>(
+                    ctx.GetPlace()),
+                variance_out->template mutable_data<BatchNormParamType<T>>(
+                    ctx.GetPlace()),
+                epsilon,
+                saved_mean->template mutable_data<BatchNormParamType<T>>(
+                    ctx.GetPlace()),
+                saved_variance->template mutable_data<BatchNormParamType<T>>(
+                    ctx.GetPlace())));
+#endif
+      }
+    }
+  }
+
+  if (data_layout == DataLayout::kNHWC && compute_format == DataLayout::kNCHW &&
+      x_dims.size() > 2) {
+    VLOG(3) << "Transform batchnorm output from NCHW to NHWC";
+    TransToChannelLast<Context, T>(ctx, &transformed_y, y);
+  }
+#ifdef PADDLE_WITH_HIP
+// TODO(wangran16): wait for MIOpen to improve the performance of BN
+// clean when exit.
+// PADDLE_ENFORCE_GPU_SUCCESS(
+//     platform::dynload::miopenDestroyTensorDescriptor(data_desc_));
+// PADDLE_ENFORCE_GPU_SUCCESS(
+//     platform::dynload::miopenDestroyTensorDescriptor(bn_param_desc_));
+#else
+  // clean when exit.
+  PADDLE_ENFORCE_GPU_SUCCESS(
+      paddle::platform::dynload::cudnnDestroyTensorDescriptor(data_desc_));
+  PADDLE_ENFORCE_GPU_SUCCESS(
+      paddle::platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_));
+#endif
+}
+
+}  // namespace phi
+
+#ifdef PADDLE_WITH_HIP
+PD_REGISTER_KERNEL(batch_norm,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::BatchNormKernel,
+                   float,
+                   phi::dtype::float16) {}
+#else
+PD_REGISTER_KERNEL(batch_norm,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::BatchNormKernel,
+                   float,
+                   double,
+                   phi::dtype::float16) {
+  if (kernel_key.dtype() == phi::DataType::FLOAT16) {
+    kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32);
+    kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32);
+    kernel->OutputAt(3).SetDataType(phi::DataType::FLOAT32);
+    kernel->OutputAt(4).SetDataType(phi::DataType::FLOAT32);
+  }
+}
+
+#endif
diff --git a/paddle/phi/kernels/gpu/batch_norm_utils.h b/paddle/phi/kernels/gpu/batch_norm_utils.h
new file mode 100644
index 0000000000000000000000000000000000000000..c9c62026edfa7ad4cd3124cc5a612f8220ab00f5
--- /dev/null
+++ b/paddle/phi/kernels/gpu/batch_norm_utils.h
@@ -0,0 +1,142 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/kernels/funcs/math_function.h"
+
+namespace phi {
+
+using Tensor = DenseTensor;
+
+template <typename DeviceContext, typename T>
+inline void ResizeToChannelFirst(const DeviceContext& context,
+                                 const Tensor* input,
+                                 Tensor* transformed_input) {
+  int dim = input->dims().size() - 2;
+  if (dim == 3) {
+    // input
+    transformed_input->Resize(input->dims());
+
+    auto in_dims_vec = phi::vectorize(input->dims());
+    in_dims_vec[1] = input->dims()[4];
+    in_dims_vec[2] = input->dims()[1];
+    in_dims_vec[3] = input->dims()[2];
+    in_dims_vec[4] = input->dims()[3];
+    transformed_input->Resize(phi::make_ddim(in_dims_vec));
+    context.template Alloc<T>(transformed_input);
+
+  } else if (dim == 2) {
+    // input
+    transformed_input->Resize(input->dims());
+
+    auto in_dims_vec = phi::vectorize(input->dims());
+    in_dims_vec[1] = input->dims()[3];
+    in_dims_vec[2] = input->dims()[1];
+    in_dims_vec[3] = input->dims()[2];
+    transformed_input->Resize(phi::make_ddim(in_dims_vec));
+    context.template Alloc<T>(transformed_input);
+  } else if (dim == 1) {
+    transformed_input->Resize(input->dims());
+
+    auto in_dims_vec = phi::vectorize(input->dims());
+    in_dims_vec[1] = input->dims()[2];
+    in_dims_vec[2] = input->dims()[1];
+    transformed_input->Resize(phi::make_ddim(in_dims_vec));
+    context.template Alloc<T>(transformed_input);
+  }
+}
+
+template <typename DeviceContext, typename T>
+inline void ResizeToChannelLast(const DeviceContext& context,
+                                const Tensor* input,
+                                Tensor* transformed_input) {
+  int dim = input->dims().size() - 2;
+  if (dim == 3) {
+    // input
+    transformed_input->Resize(input->dims());
+
+    auto in_dims_vec = phi::vectorize(input->dims());
+    in_dims_vec[1] = input->dims()[2];
+    in_dims_vec[2] = input->dims()[3];
+    in_dims_vec[3] = input->dims()[4];
+    in_dims_vec[4] = input->dims()[1];
+    transformed_input->Resize(phi::make_ddim(in_dims_vec));
+    context.template Alloc<T>(transformed_input);
+
+  } else if (dim == 2) {
+    // input
+    transformed_input->Resize(input->dims());
+
+    auto in_dims_vec = phi::vectorize(input->dims());
+    in_dims_vec[1] = input->dims()[2];
+    in_dims_vec[2] = input->dims()[3];
+    in_dims_vec[3] = input->dims()[1];
+    transformed_input->Resize(phi::make_ddim(in_dims_vec));
+    context.template Alloc<T>(transformed_input);
+  } else if (dim == 1) {
+    transformed_input->Resize(input->dims());
+
+    auto in_dims_vec = phi::vectorize(input->dims());
+    in_dims_vec[1] = input->dims()[2];
+    in_dims_vec[2] = input->dims()[1];
+    transformed_input->Resize(phi::make_ddim(in_dims_vec));
+    context.template Alloc<T>(transformed_input);
+  }
+}
+
+template <typename DeviceContext, typename T>
+inline void TransToChannelFirst(const DeviceContext& context,
+                                const Tensor* input,
+                                Tensor* transformed_input) {
+  VLOG(5) << "Why am I called?";
+  int dim = input->dims().size() - 2;
+  if (dim == 3) {
+    std::vector<int> axis{0, 4, 1, 2, 3};
+    funcs::Transpose<DeviceContext, T, 5> trans5;
+    trans5(context, *input, transformed_input, axis);
+
+  } else if (dim == 2) {
+    std::vector<int> axis{0, 3, 1, 2};
+    funcs::Transpose<DeviceContext, T, 4> trans4;
+    trans4(context, *input, transformed_input, axis);
+  } else if (dim == 1) {
+    std::vector<int> axis{0, 2, 1};
+    funcs::Transpose<DeviceContext, T, 3> trans3;
+    trans3(context, *input, transformed_input, axis);
+  }
+}
+
+template <typename DeviceContext, typename T>
+inline void TransToChannelLast(const DeviceContext& context,
+                               const Tensor* input,
+                               Tensor* transformed_input) {
+  int dim = input->dims().size() - 2;
+  if (dim == 3) {
+    std::vector<int> axis{0, 2, 3, 4, 1};
+    funcs::Transpose<DeviceContext, T, 5> trans5;
+    trans5(context, *input, transformed_input, axis);
+
+  } else if (dim == 2) {
+    std::vector<int> axis{0, 2, 3, 1};
+    funcs::Transpose<DeviceContext, T, 4> trans4;
+    trans4(context, *input, transformed_input, axis);
+  } else if (dim == 1) {
+    std::vector<int> axis{0, 2, 1};
+    funcs::Transpose<DeviceContext, T, 3> trans3;
+    trans3(context, *input, transformed_input, axis);
+  }
+}
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/gpu/bernoulli_kernel.cu b/paddle/phi/kernels/gpu/bernoulli_kernel.cu
index 2b6140d2fde0d3bcef3f15c4414444f1d2099b2e..79d8a7b0f3444b4272d1affd67bd5ac943f2c1cc 100644
--- a/paddle/phi/kernels/gpu/bernoulli_kernel.cu
+++ b/paddle/phi/kernels/gpu/bernoulli_kernel.cu
@@ -12,6 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include "paddle/phi/kernels/bernoulli_kernel.h"
+
 #include <thrust/random.h>
 #include <thrust/transform.h>
 #ifdef __NVCC__
@@ -28,7 +30,6 @@
 #include "paddle/phi/backends/gpu/gpu_launch_config.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/kernel_registry.h"
-#include "paddle/phi/kernels/bernoulli_kernel.h"
 #include "paddle/phi/kernels/funcs/distribution_helper.h"
 
 // See Note [ Why still include the fluid headers? ]
diff --git a/paddle/phi/kernels/gpu/bincount_kernel.cu b/paddle/phi/kernels/gpu/bincount_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..a4ec894790cd38d4c769021b786ab2e756d9bd30
--- /dev/null
+++ b/paddle/phi/kernels/gpu/bincount_kernel.cu
@@ -0,0 +1,164 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/bincount_kernel.h"
+
+#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/eigen/common.h"
+#include "paddle/phi/kernels/funcs/math_function.h"
+
+namespace phi {
+
+using paddle::platform::PADDLE_CUDA_NUM_THREADS;
+
+inline int GET_BLOCKS(const int N) {
+  return (N + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS;
+}
+
+template <typename T, typename InputT, typename OutT>
+__global__ void KernelBincount(const InputT* input,
+                               const int total_elements,
+                               const bool has_weights,
+                               const T* weights,
+                               OutT* output) {
+  if (!has_weights) {
+    for (int i = threadIdx.x; i < total_elements; i += blockDim.x) {
+      paddle::platform::CudaAtomicAdd(&output[input[i]], 1L);
+    }
+  } else {
+    for (int i = threadIdx.x; i < total_elements; i += blockDim.x) {
+      paddle::platform::CudaAtomicAdd(&output[input[i]],
+                                      static_cast<OutT>(weights[i]));
+    }
+  }
+}
+
+template <typename Context, typename T, typename InputT>
+void BincountCUDAInner(const Context& dev_ctx,
+                       const DenseTensor& x,
+                       const paddle::optional<const DenseTensor&> weights,
+                       int minlength,
+                       DenseTensor* out) {
+  const DenseTensor* input = &x;
+  DenseTensor* output = out;
+  const InputT* input_data = input->data<InputT>();
+
+  const int input_numel = input->numel();
+
+  if (input_data == nullptr) {
+    phi::DDim out_dim{0};
+    output->Resize(out_dim);
+    dev_ctx.template Alloc<T>(output);
+    return;
+  }
+  auto input_x = EigenVector<InputT>::Flatten(*input);
+  DenseTensor input_min_t, input_max_t;
+  input_max_t.Resize({1});
+  auto* input_max_data = dev_ctx.template Alloc<InputT>(&input_max_t);
+  input_min_t.Resize({1});
+  auto* input_min_data = dev_ctx.template Alloc<InputT>(&input_min_t);
+
+  auto input_max_scala = EigenScalar<InputT>::From(input_max_t);
+  auto input_min_scala = EigenScalar<InputT>::From(input_min_t);
+
+  auto* place = dev_ctx.eigen_device();
+  input_max_scala.device(*place) = input_x.maximum();
+  input_min_scala.device(*place) = input_x.minimum();
+
+  DenseTensor input_min_cpu, input_max_cpu;
+  paddle::framework::TensorCopySync(
+      input_max_t, phi::CPUPlace(), &input_max_cpu);
+  paddle::framework::TensorCopySync(
+      input_min_t, phi::CPUPlace(), &input_min_cpu);
+
+  InputT input_min = input_min_cpu.data<InputT>()[0];
+
+  PADDLE_ENFORCE_GE(
+      input_min,
+      static_cast<InputT>(0),
+      phi::errors::InvalidArgument(
+          "The elements in input tensor must be non-negative ints"));
+
+  int64_t output_size =
+      static_cast<int64_t>(input_max_cpu.data<InputT>()[0]) + 1L;
+
+  output_size = std::max(output_size, static_cast<int64_t>(minlength));
+  phi::DDim out_dim{output_size};
+  output->Resize(out_dim);
+
+  bool has_weights = weights.is_initialized();
+
+  const T* weights_data = has_weights ? weights->data<T>() : nullptr;
+  auto stream = dev_ctx.stream();
+
+  if (!has_weights) {
+    int64_t* output_data = dev_ctx.template Alloc<int64_t>(output);
+    phi::funcs::SetConstant<Context, int64_t>()(dev_ctx, output, 0L);
+
+    KernelBincount<T, InputT, int64_t><<<GET_BLOCKS(input_numel),
+                                         PADDLE_CUDA_NUM_THREADS,
+                                         0,
+                                         stream>>>(
+        input_data, input_numel, has_weights, weights_data, output_data);
+  } else {
+    const auto& weights_type =
+        paddle::framework::TransToProtoVarType(weights->dtype());
+
+    if (weights->dtype() == DataType::FLOAT32) {
+      float* output_data = dev_ctx.template Alloc<float>(output);
+      phi::funcs::SetConstant<Context, float>()(
+          dev_ctx, output, static_cast<float>(0));
+
+      KernelBincount<T, InputT, float><<<GET_BLOCKS(input_numel),
+                                         PADDLE_CUDA_NUM_THREADS,
+                                         0,
+                                         stream>>>(
+          input_data, input_numel, has_weights, weights_data, output_data);
+    } else {
+      double* output_data = dev_ctx.template Alloc<double>(output);
+      phi::funcs::SetConstant<Context, double>()(
+          dev_ctx, output, static_cast<double>(0));
+      KernelBincount<T, InputT, double><<<GET_BLOCKS(input_numel),
+                                          PADDLE_CUDA_NUM_THREADS,
+                                          0,
+                                          stream>>>(
+          input_data, input_numel, has_weights, weights_data, output_data);
+    }
+  }
+}
+
+template <typename T, typename Context>
+void BincountKernel(const Context& dev_ctx,
+                    const DenseTensor& x,
+                    const paddle::optional<const DenseTensor&> weights,
+                    int minlength,
+                    DenseTensor* out) {
+  if (x.dtype() == DataType::INT32) {
+    BincountCUDAInner<Context, T, int>(dev_ctx, x, weights, minlength, out);
+  } else if (x.dtype() == DataType::INT64) {
+    BincountCUDAInner<Context, T, int64_t>(dev_ctx, x, weights, minlength, out);
+  }
+}
+}  // namespace phi
+
+PD_REGISTER_KERNEL(bincount,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::BincountKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t) {}
diff --git a/paddle/phi/kernels/gpu/bitwise_kernel.cu b/paddle/phi/kernels/gpu/bitwise_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..e88ecef318a874d338094dfc9575b732cdd7680a
--- /dev/null
+++ b/paddle/phi/kernels/gpu/bitwise_kernel.cu
@@ -0,0 +1,98 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/phi/kernels/bitwise_kernel.h"
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/bitwise_functors.h"
+#include "paddle/phi/kernels/funcs/broadcast_function.h"
+namespace phi {
+
+#define DEFINE_BITWISE_KERNEL(op_type)                      \
+  template <typename T, typename Context>                   \
+  void Bitwise##op_type##Kernel(const Context& dev_ctx,     \
+                                const DenseTensor& x,       \
+                                const DenseTensor& y,       \
+                                DenseTensor* out) {         \
+    dev_ctx.template Alloc<T>(out);                         \
+    funcs::Bitwise##op_type##Functor<T> func;               \
+    std::vector<const DenseTensor*> ins = {&x, &y};         \
+    std::vector<DenseTensor*> outs = {out};                 \
+    funcs::BroadcastKernel<ElementwiseType::kBinary, T, T>( \
+        dev_ctx, ins, &outs, -1, func);                     \
+  }
+
+DEFINE_BITWISE_KERNEL(And)
+DEFINE_BITWISE_KERNEL(Or)
+DEFINE_BITWISE_KERNEL(Xor)
+#undef DEFINE_BITWISE_KERNEL
+
+template <typename T, typename Context>
+void BitwiseNotKernel(const Context& dev_ctx,
+                      const DenseTensor& x,
+                      DenseTensor* out) {
+  dev_ctx.template Alloc<T>(out);
+  std::vector<const DenseTensor*> ins = {&x};
+  std::vector<DenseTensor*> outs = {out};
+  funcs::BitwiseNotFunctor<T> func;
+  funcs::BroadcastKernel<ElementwiseType::kUnary, T, T>(
+      dev_ctx, ins, &outs, -1, func);
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(bitwise_and,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::BitwiseAndKernel,
+                   bool,
+                   uint8_t,
+                   int8_t,
+                   int16_t,
+                   int,
+                   int64_t) {}
+
+PD_REGISTER_KERNEL(bitwise_or,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::BitwiseOrKernel,
+                   bool,
+                   uint8_t,
+                   int8_t,
+                   int16_t,
+                   int,
+                   int64_t) {}
+
+PD_REGISTER_KERNEL(bitwise_xor,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::BitwiseXorKernel,
+                   bool,
+                   uint8_t,
+                   int8_t,
+                   int16_t,
+                   int,
+                   int64_t) {}
+
+PD_REGISTER_KERNEL(bitwise_not,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::BitwiseNotKernel,
+                   bool,
+                   uint8_t,
+                   int8_t,
+                   int16_t,
+                   int,
+                   int64_t) {}
diff --git a/paddle/phi/kernels/gpu/broadcast_tensors_grad_kernel.cu b/paddle/phi/kernels/gpu/broadcast_tensors_grad_kernel.cu
index 6fb24d72145c67be2ad1d25620e7886326e8cd6f..d4850b74477d29e868698e000fdc01e708d172b5 100644
--- a/paddle/phi/kernels/gpu/broadcast_tensors_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/broadcast_tensors_grad_kernel.cu
@@ -20,14 +20,14 @@
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/enforce.h"
 #include "paddle/phi/core/kernel_registry.h"
-#include "paddle/phi/kernels/gpu/reduce.h"
+#include "paddle/phi/kernels/funcs/reduce_function.h"
 #include "paddle/phi/kernels/primitive/functor_primitives.h"
 
 namespace phi {
 
 template <typename T, typename Context>
 void BroadcastTensorsGradKernel(const Context& ctx,
-                                const std::vector<DenseTensor>& dout,
+                                const std::vector<const DenseTensor*>& dout,
                                 std::vector<DenseTensor*> dx) {
   // Find reduce dimensions
   const auto& in_tensors = dout;
@@ -54,7 +54,7 @@ void BroadcastTensorsGradKernel(const Context& ctx,
   // For each In-Out tensor pair,
   // Prepare and apply broadcast dims array
   for (size_t i = 0; i < num_ins; i++) {
-    auto* input_tensor = &in_tensors[i];
+    auto* input_tensor = in_tensors[i];
     auto* output_tensor = out_tensors[i];
 
     const DDim& input_dims = input_tensor->dims();
@@ -87,13 +87,12 @@ void BroadcastTensorsGradKernel(const Context& ctx,
           *input_tensor, ctx.GetPlace(), ctx, output_tensor);
     } else {
       // reduce_sum implementation on CUDA
-      kernels::TensorReduceImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
+      funcs::ReduceKernel<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
           ctx,
           *input_tensor,
           output_tensor,
           kps::IdentityFunctor<T>(),
-          reduce_dims_vec,
-          ctx.stream());
+          reduce_dims_vec);
     }
   }
 }
diff --git a/paddle/phi/kernels/gpu/cast_kernel.cu b/paddle/phi/kernels/gpu/cast_kernel.cu
index 569a46f56d5638584262c0d1c8002459fa8ffd70..542234c80b5a1e945aec7c8342d31ef9b676cce8 100644
--- a/paddle/phi/kernels/gpu/cast_kernel.cu
+++ b/paddle/phi/kernels/gpu/cast_kernel.cu
@@ -20,11 +20,11 @@
 #include "paddle/phi/kernels/funcs/elementwise_base.h"
 
 // See Note [ Why still include the fluid headers? ]
-#include "paddle/fluid/platform/aligned_vector.h"
 #include "paddle/fluid/platform/device/gpu/gpu_helper.h"
 #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
 #include "paddle/phi/common/bfloat16.h"
 #include "paddle/phi/common/float16.h"
+#include "paddle/phi/kernels/funcs/aligned_vector.h"
 
 namespace phi {
 
diff --git a/paddle/phi/kernels/gpu/cholesky_solve_grad_kernel.cu b/paddle/phi/kernels/gpu/cholesky_solve_grad_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..82b1282cc36dc01a6d8b5cad31f6c97c570858f3
--- /dev/null
+++ b/paddle/phi/kernels/gpu/cholesky_solve_grad_kernel.cu
@@ -0,0 +1,30 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef PADDLE_WITH_HIP
+// backward reuse forward, HIP not support forward
+
+#include "paddle/phi/kernels/impl/cholesky_solve_grad_kernel_impl.h"
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+PD_REGISTER_KERNEL(cholesky_solve_grad,  // cuda_only
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::CholeskySolveGradKernel,
+                   float,
+                   double) {}
+
+#endif  // not PADDLE_WITH_HIP
diff --git a/paddle/phi/kernels/gpu/cholesky_solve_kernel.cu b/paddle/phi/kernels/gpu/cholesky_solve_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..f1c91f3824780060852aef6e96d97b0e635f37a5
--- /dev/null
+++ b/paddle/phi/kernels/gpu/cholesky_solve_kernel.cu
@@ -0,0 +1,141 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef PADDLE_WITH_HIP
+// HIP not support cusolver
+
+#include "paddle/phi/kernels/impl/cholesky_solve_kernel_impl.h"
+
+#include "paddle/fluid/platform/enforce.h"
+#include "paddle/phi/backends/dynload/cusolver.h"
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/common/complex.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/lapack/lapack_function.h"
+
+namespace phi {
+
+template <typename T>
+void cusolver_potrs(const solverHandle_t &handle,
+                    cublasFillMode_t uplo,
+                    int M,
+                    int N,
+                    T *Adata,
+                    int lda,
+                    T *Bdata,
+                    int ldb,
+                    int *devInfo);
+
+template <>
+void cusolver_potrs<float>(const solverHandle_t &handle,
+                           cublasFillMode_t uplo,
+                           int M,
+                           int N,
+                           float *Adata,
+                           int lda,
+                           float *Bdata,
+                           int ldb,
+                           int *devInfo) {
+  PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDnSpotrs(
+      handle, uplo, M, N, Adata, lda, Bdata, ldb, devInfo));
+}
+
+template <>
+void cusolver_potrs<double>(const solverHandle_t &handle,
+                            cublasFillMode_t uplo,
+                            int M,
+                            int N,
+                            double *Adata,
+                            int lda,
+                            double *Bdata,
+                            int ldb,
+                            int *devInfo) {
+  PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDnDpotrs(
+      handle, uplo, M, N, Adata, lda, Bdata, ldb, devInfo));
+}
+
+template <>
+void cusolver_potrs<phi::dtype::complex<float>>(
+    const solverHandle_t &handle,
+    cublasFillMode_t uplo,
+    int M,
+    int N,
+    phi::dtype::complex<float> *Adata,
+    int lda,
+    phi::dtype::complex<float> *Bdata,
+    int ldb,
+    int *devInfo) {
+  PADDLE_ENFORCE_GPU_SUCCESS(
+      dynload::cusolverDnCpotrs(handle,
+                                uplo,
+                                M,
+                                N,
+                                reinterpret_cast<const cuComplex *>(Adata),
+                                lda,
+                                reinterpret_cast<cuComplex *>(Bdata),
+                                ldb,
+                                devInfo));
+}
+
+template <>
+void cusolver_potrs<phi::dtype::complex<double>>(
+    const cusolverDnHandle_t &handle,
+    cublasFillMode_t uplo,
+    int M,
+    int N,
+    phi::dtype::complex<double> *Adata,
+    int lda,
+    phi::dtype::complex<double> *Bdata,
+    int ldb,
+    int *devInfo) {
+  PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDnZpotrs(
+      handle,
+      uplo,
+      M,
+      N,
+      reinterpret_cast<const cuDoubleComplex *>(Adata),
+      lda,
+      reinterpret_cast<cuDoubleComplex *>(Bdata),
+      ldb,
+      devInfo));
+}
+
+template <typename T>
+class CholeskySolveFunctor<T, GPUContext> {
+ public:
+  void operator()(const GPUContext &dev_ctx,
+                  bool upper,
+                  int M,
+                  int N,
+                  T *Adata,
+                  int lda,
+                  T *Bdata,
+                  int *devInfo) {
+    cublasFillMode_t uplo =
+        upper ? CUBLAS_FILL_MODE_UPPER : CUBLAS_FILL_MODE_LOWER;
+    auto handle = dev_ctx.cusolver_dn_handle();
+    cusolver_potrs<T>(handle, uplo, M, N, Adata, lda, Bdata, lda, devInfo);
+  }
+};
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(cholesky_solve,  // cuda_only
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::CholeskySolveKernel,
+                   float,
+                   double) {}
+
+#endif  // not PADDLE_WITH_HIP
diff --git a/paddle/phi/kernels/gpu/compare_kernel.cu b/paddle/phi/kernels/gpu/compare_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..225164687b75ca88f0c0783d6bdabea227c076ae
--- /dev/null
+++ b/paddle/phi/kernels/gpu/compare_kernel.cu
@@ -0,0 +1,158 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/compare_kernel.h"
+#include "paddle/phi/kernels/impl/compare_kernel_impl.h"
+
+#include <thrust/fill.h>
+#include <vector>
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/broadcast_function.h"
+#include "paddle/phi/kernels/funcs/elementwise_base.h"
+#include "paddle/phi/kernels/gpu/reduce.h"
+#include "paddle/phi/kernels/primitive/functor_primitives.h"
+
+namespace phi {
+
+template <typename T>
+struct BitwiseAdd {
+  // Bitwise add operator, returns <tt>a + b</tt>
+  inline T initial() { return static_cast<T>(true); }
+
+  __host__ __device__ __forceinline__ T operator()(const T& a,
+                                                   const T& b) const {
+    return a & b;
+  }
+};
+
+template <typename T,
+          typename Context,
+          typename Functor,
+          typename InverseFunctor>
+inline void CompareKernelImpl(const Context& ctx,
+                              const DenseTensor& x,
+                              const DenseTensor& y,
+                              int axis,
+                              DenseTensor* out) {
+  ctx.template Alloc<bool>(out);
+  std::vector<const DenseTensor*> ins{&x, &y};
+  std::vector<DenseTensor*> outs{out};
+  funcs::BroadcastKernel<ElementwiseType::kBinary, T, bool>(
+      ctx, ins, &outs, axis, Functor());
+}
+
+template <typename T, typename Context, typename Functor>
+inline void CompareAllKernelImpl(const Context& ctx,
+                                 const DenseTensor& x,
+                                 const DenseTensor& y,
+                                 DenseTensor* out) {
+  bool* out_data = ctx.template Alloc<bool>(out);
+
+  if (x.dims() != y.dims()) {
+    thrust::device_ptr<bool> out_dev_ptr(out_data);
+    thrust::fill(out_dev_ptr, out_dev_ptr + 1, false);
+    return;
+  }
+
+  DenseTensor tmp;
+  tmp.Resize(x.dims());
+  ctx.template Alloc<bool>(&tmp);
+
+  std::vector<const DenseTensor*> ins{&x, &y};
+  std::vector<DenseTensor*> outs{&tmp};
+  funcs::ElementwiseKernel<bool>(ctx, ins, &outs, Functor());
+
+  // Reduce by 'bitwise and' operator
+  std::vector<int> reduce_dims;
+  reduce_dims.resize(tmp.dims().size());
+  for (int i = 0; i < reduce_dims.size(); ++i) {
+    reduce_dims[i] = i;
+  }
+  funcs::ReduceKernel<bool, bool, BitwiseAdd, kps::IdentityFunctor<bool>>(
+      ctx, tmp, out, kps::IdentityFunctor<bool>(), reduce_dims);
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(less_than,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::LessThanKernel,
+                   bool,
+                   int16_t,
+                   int,
+                   int64_t,
+                   float,
+                   double) {}
+PD_REGISTER_KERNEL(less_equal,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::LessEqualKernel,
+                   bool,
+                   int16_t,
+                   int,
+                   int64_t,
+                   float,
+                   double) {}
+PD_REGISTER_KERNEL(greater_than,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::GreaterThanKernel,
+                   bool,
+                   int16_t,
+                   int,
+                   int64_t,
+                   float,
+                   double) {}
+PD_REGISTER_KERNEL(greater_equal,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::GreaterEqualKernel,
+                   bool,
+                   int16_t,
+                   int,
+                   int64_t,
+                   float,
+                   double) {}
+PD_REGISTER_KERNEL(equal,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::EqualKernel,
+                   bool,
+                   int16_t,
+                   int,
+                   int64_t,
+                   float,
+                   double) {}
+PD_REGISTER_KERNEL(not_equal,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::NotEqualKernel,
+                   bool,
+                   int16_t,
+                   int,
+                   int64_t,
+                   float,
+                   double) {}
+
+PD_REGISTER_KERNEL(equal_all,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::EqualAllKernel,
+                   bool,
+                   int,
+                   int64_t,
+                   float,
+                   double) {}
diff --git a/paddle/phi/kernels/gpu/concat_kernel.cu b/paddle/phi/kernels/gpu/concat_kernel.cu
index 2b04b979c20aa71cc723610d013cd12fb5537a29..accb1cc3d77e3ccd14b4d7808b781cf255eddd06 100644
--- a/paddle/phi/kernels/gpu/concat_kernel.cu
+++ b/paddle/phi/kernels/gpu/concat_kernel.cu
@@ -29,16 +29,16 @@ namespace phi {
 
 template <typename T, typename Context>
 void ConcatKernel(const Context& dev_ctx,
-                  const std::vector<DenseTensor>& x,
+                  const std::vector<const DenseTensor*>& x,
                   const Scalar& axis_scalar,
                   DenseTensor* out) {
   int64_t axis = axis_scalar.to<int64_t>();
 
-  axis = phi::funcs::ComputeAxis(axis, x[0].dims().size());
+  axis = phi::funcs::ComputeAxis(axis, x[0]->dims().size());
 
   std::vector<phi::DDim> x_dims;
   for (size_t i = 0; i < x.size(); ++i) {
-    x_dims.push_back(x[i].dims());
+    x_dims.push_back(x[i]->dims());
   }
 
   phi::DDim out_dims = phi::funcs::ComputeAndCheckShape(true, x_dims, axis);
@@ -46,13 +46,13 @@ void ConcatKernel(const Context& dev_ctx,
   out->mutable_data<T>(dev_ctx.GetPlace());
 
   // If axis is 0, the lod of the output is not the same as inputs.
-  if (axis == 0 && x[0].lod().size() > 0) {
-    size_t lod_size_0 = x[0].lod().size();
+  if (axis == 0 && x[0]->lod().size() > 0) {
+    size_t lod_size_0 = x[0]->lod().size();
     size_t lod_size = lod_size_0;
     for (size_t i = 1; i < x.size(); ++i) {
-      if (x[i].lod().size() > 0) {
+      if (x[i]->lod().size() > 0) {
         PADDLE_ENFORCE_EQ(
-            x[i].lod().size(),
+            x[i]->lod().size(),
             lod_size_0,
             phi::errors::Unimplemented(
                 "The lod level of all input LoDTensors should be same. "
@@ -60,7 +60,7 @@ void ConcatKernel(const Context& dev_ctx,
                 "it is not supported currently. The lod level of %dth input "
                 "is %d and first input is %d.",
                 i,
-                x[i].lod().size(),
+                x[i]->lod().size(),
                 lod_size_0));
       } else {
         lod_size = 0;
@@ -70,7 +70,7 @@ void ConcatKernel(const Context& dev_ctx,
     if (lod_size) {
       auto* out_lod = out->mutable_lod();
       for (size_t i = 1; i < x.size(); ++i) {
-        auto in_lod = phi::ConvertToLengthBasedLoD(x[i].lod());
+        auto in_lod = phi::ConvertToLengthBasedLoD(x[i]->lod());
         phi::AppendLoD(out_lod, in_lod);
       }
     }
@@ -79,18 +79,18 @@ void ConcatKernel(const Context& dev_ctx,
   // Sometimes direct copies will be faster, this maybe need deeply analysis.
   if (axis == 0 && x.size() < 10) {
     size_t output_offset = 0;
-    for (auto& in : x) {
-      if (in.numel() == 0UL) {
+    for (auto* in : x) {
+      if (in->numel() == 0UL) {
         continue;
       }
-      auto in_stride = phi::stride_numel(in.dims());
+      auto in_stride = phi::stride_numel(in->dims());
       auto out_stride = phi::stride_numel(out->dims());
       paddle::operators::StridedNumelCopyWithAxis<T>(
           dev_ctx,
           axis,
           out->data<T>() + output_offset,
           out_stride,
-          in.data<T>(),
+          in->data<T>(),
           in_stride,
           in_stride[axis]);
       output_offset += in_stride[axis];
@@ -98,8 +98,8 @@ void ConcatKernel(const Context& dev_ctx,
   } else {
     std::vector<phi::DenseTensor> inputs;
     for (size_t j = 0; j < x.size(); ++j) {
-      if (x[j].numel() > 0) {
-        inputs.push_back(x[j]);
+      if (x[j]->numel() > 0) {
+        inputs.push_back(*x[j]);
       } else {
         continue;
       }
diff --git a/paddle/phi/kernels/gpu/conv_grad_grad_kernel.cu b/paddle/phi/kernels/gpu/conv_grad_grad_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..6449a193a082e5d17926de9252a79e4c069be224
--- /dev/null
+++ b/paddle/phi/kernels/gpu/conv_grad_grad_kernel.cu
@@ -0,0 +1,23 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/conv_grad_grad_kernel.h"
+#include "paddle/phi/kernels/impl/conv_grad_grad_kernel_impl.h"
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+PD_REGISTER_KERNEL(
+    conv2d_grad_grad, GPU, ALL_LAYOUT, phi::ConvGradGradKernel, float, double) {
+}
diff --git a/paddle/phi/kernels/gpu/conv_grad_kernel.cu b/paddle/phi/kernels/gpu/conv_grad_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..4df7bb26adf845b4a8f52f1c92beb8621002b3da
--- /dev/null
+++ b/paddle/phi/kernels/gpu/conv_grad_kernel.cu
@@ -0,0 +1,62 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/conv_grad_kernel.h"
+#include "paddle/phi/kernels/impl/conv_grad_kernel_impl.h"
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void Conv3DGradKernel(const Context& dev_ctx,
+                      const DenseTensor& out_grad,
+                      const DenseTensor& input,
+                      const DenseTensor& filter,
+                      const std::vector<int>& strides,
+                      const std::vector<int>& paddings,
+                      const std::string& paddding_algorithm,
+                      int groups,
+                      const std::vector<int>& dilations,
+                      const std::string& data_format,
+                      bool use_addto,
+                      int workspace_size_MB,
+                      bool exhaustive_search,
+                      DenseTensor* input_grad,
+                      DenseTensor* filter_grad) {
+  ConvGradKernel<T>(dev_ctx,
+                    out_grad,
+                    input,
+                    filter,
+                    strides,
+                    paddings,
+                    paddding_algorithm,
+                    groups,
+                    dilations,
+                    data_format,
+                    use_addto,
+                    workspace_size_MB,
+                    exhaustive_search,
+                    input_grad,
+                    filter_grad);
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(
+    conv2d_grad, GPU, ALL_LAYOUT, phi::ConvGradKernel, float, double) {}
+
+PD_REGISTER_KERNEL(
+    conv3d_grad, GPU, ALL_LAYOUT, phi::Conv3DGradKernel, float, double) {}
diff --git a/paddle/phi/kernels/gpu/conv_kernel.cu b/paddle/phi/kernels/gpu/conv_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..680ee4426af0661d39d4c7bd0abf9e52c4594995
--- /dev/null
+++ b/paddle/phi/kernels/gpu/conv_kernel.cu
@@ -0,0 +1,56 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/conv_kernel.h"
+#include "paddle/phi/kernels/impl/conv_kernel_impl.h"
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void Conv3DKernel(const Context& dev_ctx,
+                  const DenseTensor& input,
+                  const DenseTensor& filter,
+                  const std::vector<int>& strides,
+                  const std::vector<int>& paddings,
+                  const std::string& padding_algorithm,
+                  int groups,
+                  const std::vector<int>& dilations,
+                  const std::string& data_format,
+                  bool use_addto,
+                  int workspace_size_MB,
+                  bool exhaustive_search,
+                  DenseTensor* out) {
+  ConvKernel<T>(dev_ctx,
+                input,
+                filter,
+                strides,
+                paddings,
+                padding_algorithm,
+                groups,
+                dilations,
+                data_format,
+                use_addto,
+                workspace_size_MB,
+                exhaustive_search,
+                out);
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(conv2d, GPU, ALL_LAYOUT, phi::ConvKernel, float, double) {}
+
+PD_REGISTER_KERNEL(conv3d, GPU, ALL_LAYOUT, phi::Conv3DKernel, float, double) {}
diff --git a/paddle/infrt/dialect/pd_types.cc b/paddle/phi/kernels/gpu/conv_test_kernel.cu
similarity index 83%
rename from paddle/infrt/dialect/pd_types.cc
rename to paddle/phi/kernels/gpu/conv_test_kernel.cu
index 94856e362d301978970279846907f41dfbc00b56..0544a1e298b8e7dc871d13f546398a5c28308b0e 100644
--- a/paddle/infrt/dialect/pd_types.cc
+++ b/paddle/phi/kernels/gpu/conv_test_kernel.cu
@@ -1,4 +1,4 @@
-// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -11,5 +11,3 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
-
-#include "paddle/infrt/dialect/pd_types.h"
diff --git a/paddle/phi/kernels/gpu/cumprod_grad_kernel.cu b/paddle/phi/kernels/gpu/cumprod_grad_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..6e8712462928d43077766a2fd03aee51d9a4cd8c
--- /dev/null
+++ b/paddle/phi/kernels/gpu/cumprod_grad_kernel.cu
@@ -0,0 +1,320 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/cumprod_grad_kernel.h"
+
+#include <thrust/transform.h>
+#include "paddle/fluid/operators/math/inclusive_scan.h"
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/complex_functors.h"
+#include "paddle/phi/kernels/funcs/cumprod.h"
+#include "paddle/phi/kernels/funcs/elementwise_functor.h"
+#include "paddle/phi/kernels/funcs/for_range.h"
+// NOTE(@xiongkun): use of IsComplex<>
+#include "paddle/fluid/framework/data_type.h"
+
+namespace phi {
+
+template <typename T>
+struct CumprodGradFunctorExceptFirstZero {
+  HOSTDEVICE CumprodGradFunctorExceptFirstZero(
+      const T *x,
+      const T *y,
+      const T *dy_mul_y_reversed_cumsum,
+      const uint8_t *zero_mask,
+      size_t mid_dim,
+      size_t inner_dim,
+      T *dx,
+      int64_t *first_zero_idx,
+      T *x_filled_one)
+      : x_(x),
+        y_(y),
+        dy_mul_y_reversed_cumsum_(dy_mul_y_reversed_cumsum),
+        zero_mask_(zero_mask),
+        mid_dim_(mid_dim),
+        inner_dim_(inner_dim),
+        dx_(dx),
+        first_zero_idx_(first_zero_idx),
+        x_filled_one_(x_filled_one) {}
+
+  HOSTDEVICE void operator()(size_t idx) const {
+    auto inner_idx = idx % inner_dim_;
+    auto outer_idx = idx / (mid_dim_ * inner_dim_);
+    auto mid_idx = (idx - inner_idx) / inner_dim_ % mid_dim_;
+    auto mask = zero_mask_[idx];
+    bool should_fill_one = true;
+
+    if (mask == 0) {
+      dx_[idx] = dy_mul_y_reversed_cumsum_[idx] / x_[idx];
+      if (mid_idx == mid_dim_ - 1) {
+        // record first zero position as -1, i.e., no zero
+        first_zero_idx_[outer_idx * inner_dim_ + inner_idx] = -1;
+      }
+    } else if (mid_idx > 0) {                  // mask > 0
+      if (zero_mask_[idx - inner_dim_] > 0) {  // not first zero
+        dx_[idx] = 0;
+        should_fill_one = false;
+      } else {
+        // idx is the first zero position, it should be recorded
+        dx_[idx] = y_[idx - inner_dim_];
+        first_zero_idx_[outer_idx * inner_dim_ + inner_idx] = mid_idx;
+      }
+    } else {  // the first zero position is index 0
+      dx_[idx] = 1;
+      first_zero_idx_[outer_idx * inner_dim_ + inner_idx] = 0;
+    }
+
+    x_filled_one_[idx] = should_fill_one ? 1 : x_[idx];
+  }
+
+ private:
+  const T *x_;
+  const T *y_;
+  const T *dy_mul_y_reversed_cumsum_;
+  const uint8_t *zero_mask_;
+  size_t mid_dim_;
+  size_t inner_dim_;
+  T *dx_;
+  int64_t *first_zero_idx_;
+  T *x_filled_one_;
+};
+
+template <typename T>
+struct FillFirstZeroPositionGradFunctor {
+  HOSTDEVICE FillFirstZeroPositionGradFunctor(const int64_t *first_zero_idx,
+                                              const T *grad_value,
+                                              size_t mid_dim,
+                                              size_t inner_dim,
+                                              T *dx)
+      : first_zero_idx_(first_zero_idx),
+        grad_value_(grad_value),
+        mid_dim_(mid_dim),
+        inner_dim_(inner_dim),
+        dx_(dx) {}
+
+  HOSTDEVICE void operator()(size_t idx) const {
+    auto outer_idx = idx / inner_dim_;
+    auto inner_idx = idx % inner_dim_;
+    auto mid_idx = first_zero_idx_[idx];
+    if (mid_idx >= 0) {
+      auto full_idx =
+          outer_idx * mid_dim_ * inner_dim_ + mid_idx * inner_dim_ + inner_idx;
+      dx_[full_idx] *= grad_value_[full_idx];
+    }
+  }
+
+ private:
+  const int64_t *first_zero_idx_;
+  const T *grad_value_;
+  size_t mid_dim_;
+  size_t inner_dim_;
+  T *dx_;
+};
+
+template <typename T, typename Context>
+void CumprodGradKernel(const Context &dev_ctx,
+                       const DenseTensor &x,
+                       const DenseTensor &out,
+                       const DenseTensor &dout,
+                       int dim,
+                       DenseTensor *dx) {
+  const auto *y = &out;
+  const auto *dy = &dout;
+
+  size_t outer_dim, mid_dim, inner_dim;
+  GetCumprodDimInfo(x.dims(), dim, &outer_dim, &mid_dim, &inner_dim);
+  if (outer_dim == 0 || mid_dim == 0 || inner_dim == 0) return;
+
+  size_t numel = outer_dim * mid_dim * inner_dim;
+
+  const auto *x_data = x.data<T>();
+  const auto *y_data = y->data<T>();
+  const auto *dy_data = dy->data<T>();
+
+  auto place = dev_ctx.GetPlace();
+  auto *dx_data = dev_ctx.template Alloc<T>(dx);
+
+  // deal with complex
+  const T *x_data_deal;
+  const T *y_data_deal;
+  Allocator::AllocationPtr x_conj;
+  Allocator::AllocationPtr y_conj;
+  if (paddle::framework::IsComplex<T>::value) {
+    x_conj = const_cast<Allocator &>(dev_ctx.GetAllocator())
+                 .Allocate(numel * sizeof(T));
+    auto *x_data_conj = reinterpret_cast<T *>(x_conj->ptr());
+    y_conj = const_cast<Allocator &>(dev_ctx.GetAllocator())
+                 .Allocate(numel * sizeof(T));
+    auto *y_data_conj = reinterpret_cast<T *>(y_conj->ptr());
+
+    phi::funcs::ForRange<Context> for_range_x(dev_ctx, numel);
+    phi::funcs::ConjFunctor<T> functor_x(x_data, numel, x_data_conj);
+    for_range_x(functor_x);
+
+    phi::funcs::ForRange<Context> for_range_y(dev_ctx, numel);
+    phi::funcs::ConjFunctor<T> functor_y(y_data, numel, y_data_conj);
+    for_range_y(functor_y);
+    x_data_deal = x_data_conj;
+    y_data_deal = y_data_conj;
+  } else {
+    x_data_deal = x_data;
+    y_data_deal = y_data;
+  }
+
+// Step 1: find cummax-ed zero mask of x
+#ifdef PADDLE_WITH_CUDA
+  const auto &exec_policy = thrust::cuda::par.on(dev_ctx.stream());
+#else
+  const auto &exec_policy = thrust::hip::par.on(dev_ctx.stream());
+#endif
+  auto zero_mask_without_cummax =
+      const_cast<Allocator &>(dev_ctx.GetAllocator())
+          .Allocate(numel * sizeof(uint8_t));
+  auto *zero_mask_without_cummax_data =
+      reinterpret_cast<uint8_t *>(zero_mask_without_cummax->ptr());
+  thrust::transform(exec_policy,
+                    thrust::device_pointer_cast(x_data_deal),
+                    thrust::device_pointer_cast(x_data_deal) + numel,
+                    thrust::device_pointer_cast(zero_mask_without_cummax_data),
+                    funcs::IsZeroFunctor<T>());
+
+  auto zero_mask = const_cast<Allocator &>(dev_ctx.GetAllocator())
+                       .Allocate(numel * sizeof(uint8_t));
+  auto *zero_mask_data = reinterpret_cast<uint8_t *>(zero_mask->ptr());
+  paddle::operators::math::InclusiveScan<uint8_t, cub::Max>(
+      zero_mask_without_cummax_data,
+      zero_mask_data,
+      outer_dim,
+      mid_dim,
+      inner_dim,
+      static_cast<uint8_t>(0),
+      cub::Max(),
+      /*reverse=*/false,
+      dev_ctx);
+  zero_mask_without_cummax = nullptr;
+
+  // Step 2: calculate reversed cumsum(dy * y)
+  auto dy_mul_y = const_cast<Allocator &>(dev_ctx.GetAllocator())
+                      .Allocate(numel * sizeof(T));
+  auto *dy_mul_y_data = reinterpret_cast<T *>(dy_mul_y->ptr());
+  thrust::transform(exec_policy,
+                    thrust::device_pointer_cast(dy_data),
+                    thrust::device_pointer_cast(dy_data) + numel,
+                    thrust::device_pointer_cast(y_data_deal),
+                    thrust::device_pointer_cast(dy_mul_y_data),
+                    funcs::MultiplyFunctor<T>());
+
+  auto dy_mul_y_reversed_cumsum =
+      const_cast<Allocator &>(dev_ctx.GetAllocator())
+          .Allocate(numel * sizeof(T));
+  auto *dy_mul_y_reversed_cumsum_data =
+      reinterpret_cast<T *>(dy_mul_y_reversed_cumsum->ptr());
+  paddle::operators::math::InclusiveScan<T, cub::Sum>(
+      dy_mul_y_data,
+      dy_mul_y_reversed_cumsum_data,
+      outer_dim,
+      mid_dim,
+      inner_dim,
+      static_cast<T>(0),
+      cub::Sum(),
+      /*reverse=*/true,
+      dev_ctx);
+
+  // Step 3: calculate the gradient value except the first zero position.
+  // The gradient value of the first zero position is filled with out[idx-1],
+  // while the gradient value of the other positions are calculated out
+  // completely. This functor also:
+  //  (1) find the first zero index, i.e., first_zero_idx_data.
+  //  (2) fill x_filled_one, which satifies
+  //      x_filled_one[i] = x[i], i > pos
+  //      x_filled_one[i] = 1, i <= pos
+  auto first_zero_idx = const_cast<Allocator &>(dev_ctx.GetAllocator())
+                            .Allocate(numel * sizeof(int64_t));
+  auto *first_zero_idx_data =
+      reinterpret_cast<int64_t *>(first_zero_idx->ptr());
+  auto *x_filled_one_data = dy_mul_y_data;  // reuse former allocated memory
+  phi::funcs::ForRange<Context> for_range(dev_ctx, numel);
+  CumprodGradFunctorExceptFirstZero<T> functor_except_first_zero(
+      x_data_deal,
+      y_data_deal,
+      dy_mul_y_reversed_cumsum_data,
+      zero_mask_data,
+      mid_dim,
+      inner_dim,
+      dx_data,
+      first_zero_idx_data,
+      x_filled_one_data);
+  for_range(functor_except_first_zero);
+
+  // Step 4: calculate cumprod of x_filled_one
+  auto *x_filled_one_cumprod_data =
+      dy_mul_y_reversed_cumsum_data;  // reuse former allocated memory
+  paddle::operators::math::InclusiveScan<T, funcs::MultiplyFunctor<T>>(
+      x_filled_one_data,
+      x_filled_one_cumprod_data,
+      outer_dim,
+      mid_dim,
+      inner_dim,
+      static_cast<T>(1),
+      funcs::MultiplyFunctor<T>(),
+      /*reverse=*/false,
+      dev_ctx);
+
+  // Step 5: calculate reversed cumsum(dy * x_filled_one_cumprod)
+  auto *dy_mul_x_filled_one_cumprod =
+      dy_mul_y_data;  // reuse former allocated memory
+  thrust::transform(exec_policy,
+                    thrust::device_pointer_cast(dy_data),
+                    thrust::device_pointer_cast(dy_data) + numel,
+                    thrust::device_pointer_cast(x_filled_one_cumprod_data),
+                    thrust::device_pointer_cast(dy_mul_x_filled_one_cumprod),
+                    funcs::MultiplyFunctor<T>());
+  auto *dy_mul_x_filled_one_cumprod_reversed_cumsum =
+      dy_mul_y_reversed_cumsum_data;  // reuse former allocated memory
+  paddle::operators::math::InclusiveScan<T, cub::Sum>(
+      dy_mul_x_filled_one_cumprod,
+      dy_mul_x_filled_one_cumprod_reversed_cumsum,
+      outer_dim,
+      mid_dim,
+      inner_dim,
+      static_cast<T>(0),
+      cub::Sum(),
+      /*reverse=*/true,
+      dev_ctx);
+
+  // Step 6: fill zero pos gradient value
+  phi::funcs::ForRange<Context> for_range_fill_zero_pos_grad(
+      dev_ctx, outer_dim * inner_dim);
+  FillFirstZeroPositionGradFunctor<T> fill_first_zero_pos_grad_functor(
+      first_zero_idx_data,
+      dy_mul_x_filled_one_cumprod_reversed_cumsum,
+      mid_dim,
+      inner_dim,
+      dx_data);
+  for_range_fill_zero_pos_grad(fill_first_zero_pos_grad_functor);
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(cumprod_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::CumprodGradKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t,
+                   phi::dtype::complex<float>,
+                   phi::dtype::complex<double>) {}
diff --git a/paddle/phi/kernels/gpu/cumprod_kernel.cu b/paddle/phi/kernels/gpu/cumprod_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..1bbf8972a24798e5ddebf4dac2b3745eb1a2aee0
--- /dev/null
+++ b/paddle/phi/kernels/gpu/cumprod_kernel.cu
@@ -0,0 +1,60 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/cumprod_kernel.h"
+
+#include "paddle/fluid/operators/math/inclusive_scan.h"
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/complex_functors.h"
+#include "paddle/phi/kernels/funcs/cumprod.h"
+#include "paddle/phi/kernels/funcs/elementwise_functor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void CumprodKernel(const Context &dev_ctx,
+                   const DenseTensor &input,
+                   int dim,
+                   DenseTensor *out) {
+  const auto *x = &input;
+  auto *y = out;
+  size_t outer_dim, mid_dim, inner_dim;
+  GetCumprodDimInfo(x->dims(), dim, &outer_dim, &mid_dim, &inner_dim);
+
+  const auto *x_data = x->data<T>();
+  auto *y_data = dev_ctx.template Alloc<T>(y);
+  paddle::operators::math::InclusiveScan(x_data,
+                                         y_data,
+                                         outer_dim,
+                                         mid_dim,
+                                         inner_dim,
+                                         static_cast<T>(1),
+                                         funcs::MultiplyFunctor<T>(),
+                                         /*reverse=*/false,
+                                         dev_ctx);
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(cumprod,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::CumprodKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t,
+                   phi::dtype::complex<float>,
+                   phi::dtype::complex<double>) {}
diff --git a/paddle/phi/kernels/gpu/cumsum_kernel.cu b/paddle/phi/kernels/gpu/cumsum_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..a253e6f4ad290bc45dee8f57afee06363042a8c5
--- /dev/null
+++ b/paddle/phi/kernels/gpu/cumsum_kernel.cu
@@ -0,0 +1,336 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/cumsum_kernel.h"
+
+#include <thrust/device_ptr.h>
+#include <thrust/device_vector.h>
+#include <thrust/reverse.h>
+#include <thrust/scan.h>
+#ifdef __NVCC__
+#include <cub/cub.cuh>
+#endif
+#ifdef __HIPCC__
+#include <hipcub/hipcub.hpp>
+namespace cub = hipcub;
+#endif
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/hostdevice.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+namespace phi {
+
+template <typename T, int BLOCK_SIZE>
+__device__ void BlockReverse(
+    const T* idata, T* odata, int src_base, int dst_base, int valid_item) {
+  __shared__ T sh_mem[BLOCK_SIZE];
+  int tx = threadIdx.x;
+
+  int offset = tx;
+  int in_index = src_base + offset;
+  if (offset >= valid_item) {
+    sh_mem[offset] = 0;
+  } else {
+    int sh_mem_index = BLOCK_SIZE - offset - 1;
+    T data = idata[in_index];
+    sh_mem[sh_mem_index] = data;
+  }
+
+  __syncthreads();
+  int out_index = dst_base - offset;
+  if (offset < valid_item) {
+    int sh_mem_index = BLOCK_SIZE - offset - 1;
+    odata[out_index] = sh_mem[sh_mem_index];
+  }
+}
+
+template <typename T>
+__global__ void MatrixRowReverse(const T* matrix_data,
+                                 T* reverse_data,
+                                 int reverse_size,
+                                 int outer_size,
+                                 int inner_size) {
+  int bx = blockIdx.x;
+  int by = blockIdx.y;
+  int item_per_block = 1024;
+
+  for (int block_offset = 0; block_offset < reverse_size;
+       block_offset += item_per_block) {
+    int valid_item = (reverse_size - block_offset > item_per_block)
+                         ? item_per_block
+                         : reverse_size - block_offset;
+    int src_offset =
+        bx * reverse_size + block_offset + by * (inner_size * reverse_size);
+    int dst_offset = bx * reverse_size + by * (inner_size * reverse_size) +
+                     reverse_size - 1 - block_offset;
+    if (reverse_size < item_per_block) {
+      valid_item = reverse_size;
+    }
+
+    BlockReverse<T, 1024>(
+        matrix_data, reverse_data, src_offset, dst_offset, valid_item);
+  }
+}
+
+template <typename T>
+struct BlockPrefixCallbackOp {
+  // Running prefix
+  T running_total;
+  // Constructor
+  __device__ BlockPrefixCallbackOp(T running_total)
+      : running_total(running_total) {}
+  // Callback operator to be entered by the first warp of threads in the block.
+  // Thread-0 is responsible for returning a value for seeding the block-wide
+  // scan.
+  __device__ T operator()(T block_aggregate) {
+    T old_prefix = running_total;
+    running_total = old_prefix + block_aggregate;
+    return old_prefix;
+  }
+};
+
+// No bank-conflict transpose
+template <typename T, int TILE_DIM, int BLOCK_ROWS>
+__global__ void MatrixTranspose(T* odata,
+                                const T* idata,
+                                size_t height,
+                                size_t width) {
+  __shared__ T tile[TILE_DIM][TILE_DIM + 1];
+
+  int x = blockIdx.x * TILE_DIM + threadIdx.x;
+  int y = blockIdx.y * TILE_DIM + threadIdx.y;
+  for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS) {
+    if (x < width && (y + j) < height) {
+      tile[threadIdx.y + j][threadIdx.x] = idata[(y + j) * width + x];
+    } else {
+      tile[threadIdx.y + j][threadIdx.x] = 0;
+    }
+  }
+
+  __syncthreads();
+
+  x = blockIdx.y * TILE_DIM + threadIdx.x;  // transpose block offset
+  y = blockIdx.x * TILE_DIM + threadIdx.y;
+
+  for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS) {
+    if (x < height && (y + j) < width) {
+      odata[(y + j) * height + x] = tile[threadIdx.x][threadIdx.y + j];
+    }
+  }
+}
+
+template <typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void BlockScanKernel(T* d_out,
+                                const T* d_in,
+                                int inner_size,
+                                int outer_size,
+                                int scan_size,
+                                bool exclusive) {
+  // Specialize BlockLoad, BlockStore, and BlockRadixSort collective types
+  typedef cub::
+      BlockLoad<T, BLOCK_THREADS, ITEMS_PER_THREAD, cub::BLOCK_LOAD_TRANSPOSE>
+          BlockLoadT;
+  typedef cub::
+      BlockStore<T, BLOCK_THREADS, ITEMS_PER_THREAD, cub::BLOCK_STORE_TRANSPOSE>
+          BlockStoreT;
+  typedef cub::BlockScan<T, BLOCK_THREADS> BlockScanT;
+  // Allocate type-safe, repurposable shared memory for collectives
+  __shared__ union {
+    typename BlockLoadT::TempStorage load;
+    typename BlockStoreT::TempStorage store;
+    typename BlockScanT::TempStorage scan;
+  } temp_storage;
+
+  int bx = blockIdx.x;
+  int by = blockIdx.y;
+
+  BlockPrefixCallbackOp<T> prefix_op(0);
+  T block_aggregate = static_cast<T>(0);
+
+  // Obtain this block's segment of consecutive keys (blocked across threads)
+  int item_per_block = BLOCK_THREADS * ITEMS_PER_THREAD;
+  for (int block_offset = 0; block_offset < scan_size;
+       block_offset += BLOCK_THREADS * ITEMS_PER_THREAD) {
+    int valid_item = (scan_size - block_offset > item_per_block)
+                         ? item_per_block
+                         : (scan_size - block_offset);
+    if (scan_size < item_per_block) {
+      valid_item = scan_size;
+    }
+
+    int offset = bx * scan_size + block_offset + by * (inner_size * scan_size);
+
+    T thread_keys[ITEMS_PER_THREAD];
+    BlockLoadT(temp_storage.load)
+        .Load(d_in + offset, thread_keys, valid_item, 0);
+
+    __syncthreads();
+    if (exclusive) {
+      T init_value = static_cast<T>(0);
+      BlockScanT(temp_storage.scan)
+          .ExclusiveScan(thread_keys, thread_keys, cub::Sum(), prefix_op);
+    } else {
+      BlockScanT(temp_storage.scan)
+          .InclusiveScan(thread_keys, thread_keys, cub::Sum(), prefix_op);
+    }
+    __syncthreads();
+
+    BlockStoreT(temp_storage.store)
+        .Store(d_out + offset, thread_keys, valid_item);
+  }
+}
+
+template <typename T, typename Context>
+void CumsumKernel(const Context& dev_ctx,
+                  const DenseTensor& x,
+                  int axis,
+                  bool flatten,
+                  bool exclusive,
+                  bool reverse,
+                  DenseTensor* out) {
+  auto out_dims = out->dims();
+  auto size = x.numel();
+
+  PADDLE_ENFORCE_EQ(
+      axis < out_dims.size() && axis >= (0 - out_dims.size()),
+      true,
+      phi::errors::OutOfRange(
+          "Attr(axis) is out of range, It's expected "
+          "to be in range of [-%d, %d]. But received Attr(axis) = %d.",
+          out_dims.size(),
+          out_dims.size() - 1,
+          axis));
+  if (axis < 0) {
+    axis += out_dims.size();
+  }
+
+  T* out_data = dev_ctx.template Alloc<T>(out);
+  const T* in_data = x.data<T>();
+
+  // Use thrust for parallel acceleration when the input size is equal to the
+  // length of the ‘axis’ dimension.
+  if (size == out_dims[axis]) {
+    if (reverse) {
+      thrust::device_ptr<const T> dev_ptr =
+          thrust::device_pointer_cast(in_data);
+      thrust::device_vector<T> vec(dev_ptr, dev_ptr + size);
+      if (exclusive) {
+        thrust::exclusive_scan(
+            thrust::device, vec.rbegin(), vec.rend(), out_data);
+      } else {
+        thrust::inclusive_scan(
+            thrust::device, vec.rbegin(), vec.rend(), out_data);
+      }
+      thrust::reverse(thrust::device, out_data, out_data + size);
+    } else {
+      if (exclusive) {
+        thrust::exclusive_scan(
+            thrust::device, in_data, in_data + size, out_data);
+      } else {
+        thrust::inclusive_scan(
+            thrust::device, in_data, in_data + size, out_data);
+      }
+    }
+    return;
+  }
+
+  size_t height = 1;
+  size_t width = 1;
+  for (size_t i = 0; i <= axis; i++) {
+    height *= out_dims[i];
+  }
+
+  for (size_t i = axis + 1; i < out_dims.size(); i++) {
+    width *= out_dims[i];
+  }
+  int scan_size = out_dims[axis];
+  bool transpose = (axis != out_dims.size() - 1);
+
+  int tile_size = 32;
+  dim3 blocks(32, 8);
+  dim3 transpose_grids((width + tile_size - 1) / tile_size,
+                       (height + tile_size - 1) / tile_size);
+  out->Resize(out_dims);
+  auto* tmp_data = out->data<T>();
+
+  T* next_in_data = out_data;
+  T* next_out_data = tmp_data;
+  if (transpose) {
+    MatrixTranspose<T, 32, 8><<<transpose_grids, blocks, 0, dev_ctx.stream()>>>(
+        out_data, in_data, height, width);
+    next_in_data = out_data;
+    next_out_data = tmp_data;
+  }
+  auto swap_ptr = [](T*& ptr1, T*& ptr2) {
+    T* tmp = ptr2;
+    ptr2 = ptr1;
+    ptr1 = tmp;
+  };
+  int outer_size = height / scan_size;
+  int inner_size = width;
+  // Consider the size of shared memory, here block size is 128
+  dim3 scan_grid(outer_size, inner_size);
+  dim3 reverse_grid = scan_grid;
+  if (reverse) {
+    if (transpose) {
+      reverse_grid.x = scan_grid.y;
+      reverse_grid.y = scan_grid.x;
+      MatrixRowReverse<T><<<reverse_grid, 1024, 0, dev_ctx.stream()>>>(
+          next_in_data, next_out_data, scan_size, outer_size, inner_size);
+      if (!transpose) next_in_data = tmp_data;
+      swap_ptr(next_in_data, next_out_data);
+    } else {
+      MatrixRowReverse<T><<<reverse_grid, 1024, 0, dev_ctx.stream()>>>(
+          in_data, out_data, scan_size, outer_size, inner_size);
+    }
+  }
+  if (!transpose && !reverse) {
+    BlockScanKernel<T, 128, 4><<<scan_grid, 128, 0, dev_ctx.stream()>>>(
+        out_data, in_data, outer_size, inner_size, scan_size, exclusive);
+
+  } else {
+    BlockScanKernel<T, 128, 4><<<scan_grid, 128, 0, dev_ctx.stream()>>>(
+        next_out_data,
+        next_in_data,
+        outer_size,
+        inner_size,
+        scan_size,
+        exclusive);
+  }
+  swap_ptr(next_in_data, next_out_data);
+  if (reverse) {
+    MatrixRowReverse<T><<<reverse_grid, 1024, 0, dev_ctx.stream()>>>(
+        next_in_data, next_out_data, scan_size, outer_size, inner_size);
+    swap_ptr(next_in_data, next_out_data);
+  }
+  if (transpose) {
+    transpose_grids.x = (height + tile_size - 1) / tile_size;
+    transpose_grids.y = (width + tile_size - 1) / tile_size;
+    MatrixTranspose<T, 32, 8><<<transpose_grids, blocks, 0, dev_ctx.stream()>>>(
+        next_out_data, next_in_data, width, height);
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(cumsum,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::CumsumKernel,
+                   float,
+                   double,
+                   int16_t,
+                   int,
+                   int64_t) {}
diff --git a/paddle/fluid/operators/math/depthwise_conv.cu b/paddle/phi/kernels/gpu/depthwise_conv.h
similarity index 62%
rename from paddle/fluid/operators/math/depthwise_conv.cu
rename to paddle/phi/kernels/gpu/depthwise_conv.h
index a4665a8f9a62dde6bfdbad3b05d7065e05f0a92f..5270a4b2fdb8d77aa1dfb20a166a9676b007c93f 100644
--- a/paddle/fluid/operators/math/depthwise_conv.cu
+++ b/paddle/phi/kernels/gpu/depthwise_conv.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 paddlepaddle Authors. All Rights Reserved.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -12,8 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <algorithm>
+#pragma once
 #include <vector>
+#include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/platform/device_context.h"
+#include "paddle/phi/core/hostdevice.h"
+
 #ifdef __NVCC__
 #include <cub/cub.cuh>
 #endif
@@ -21,7 +25,7 @@ limitations under the License. */
 #include <hipcub/hipcub.hpp>
 namespace cub = hipcub;
 #endif
-#include "paddle/fluid/operators/math/depthwise_conv.h"
+
 #include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
@@ -30,6 +34,58 @@ namespace paddle {
 namespace operators {
 namespace math {
 
+using DataLayout = framework::DataLayout;
+
+/*
+ * \brief Compute the depthwise convolution which include
+ * forward process and backpropagation process
+ */
+template <typename DeviceContext,
+          typename T,
+          bool fuse_relu_before_conv = false>
+class DepthwiseConvFunctor {
+ public:
+  void operator()(const DeviceContext& context,
+                  const framework::Tensor& input,
+                  const framework::Tensor& filter,
+                  const std::vector<int>& strides,
+                  const std::vector<int>& paddings,
+                  const std::vector<int>& dilations,
+                  framework::Tensor* output,
+                  const DataLayout data_layout = DataLayout::kNCHW);
+};
+
+template <typename DeviceContext,
+          typename T,
+          bool fuse_relu_before_conv = false>
+class DepthwiseConvInputGradFunctor {
+ public:
+  void operator()(const DeviceContext& context,
+                  const framework::Tensor& input,
+                  const framework::Tensor& filter,
+                  const framework::Tensor& output_grad,
+                  const std::vector<int>& strides,
+                  const std::vector<int>& paddings,
+                  const std::vector<int>& dilations,
+                  framework::Tensor* input_grad,
+                  const DataLayout data_layout = DataLayout::kNCHW);
+};
+
+template <typename DeviceContext,
+          typename T,
+          bool fuse_relu_before_conv = false>
+class DepthwiseConvFilterGradFunctor {
+ public:
+  void operator()(const DeviceContext& context,
+                  const framework::Tensor& input,
+                  const framework::Tensor& output_grad,
+                  const std::vector<int>& strides,
+                  const std::vector<int>& paddings,
+                  const std::vector<int>& dilations,
+                  framework::Tensor* filter_grad,
+                  const DataLayout data_layout = DataLayout::kNCHW);
+};
+
 template <typename T>
 static __forceinline__ __device__ T WarpReduceSum(T val, int warp_size) {
   typedef cub::WarpReduce<T> WarpReduce;
@@ -293,8 +349,12 @@ __device__ __inline__ void KernelDepthwiseConvCFilterNHWC(
   }
 }
 
-template <typename T, int c_filter_multiplier, int c_stride, int c_filter,
-          DataLayout data_layout, bool fuse_relu_before_conv>
+template <typename T,
+          int c_filter_multiplier,
+          int c_stride,
+          int c_filter,
+          DataLayout data_layout,
+          bool fuse_relu_before_conv>
 __global__ void KernelDepthwiseConvSp(ARG_DEFINE_KernelDepthwiseConv) {
   int final_filter_multiplier = filter_multiplier;
   int h_stride = stride_height;
@@ -306,34 +366,88 @@ __global__ void KernelDepthwiseConvSp(ARG_DEFINE_KernelDepthwiseConv) {
   }
   if (c_filter == -1) {
     if (data_layout != DataLayout::kNHWC) {
-      KernelDepthwiseConvNCHW<T, fuse_relu_before_conv>(
-          input_data, filter_data, batch_size, output_channels, output_height,
-          output_width, input_channels, input_height, input_width,
-          final_filter_multiplier, filter_height, filter_width, h_stride,
-          w_stride, padding_height, padding_width, dilate_height, dilate_width,
-          output_data);
+      KernelDepthwiseConvNCHW<T, fuse_relu_before_conv>(input_data,
+                                                        filter_data,
+                                                        batch_size,
+                                                        output_channels,
+                                                        output_height,
+                                                        output_width,
+                                                        input_channels,
+                                                        input_height,
+                                                        input_width,
+                                                        final_filter_multiplier,
+                                                        filter_height,
+                                                        filter_width,
+                                                        h_stride,
+                                                        w_stride,
+                                                        padding_height,
+                                                        padding_width,
+                                                        dilate_height,
+                                                        dilate_width,
+                                                        output_data);
     } else {
-      KernelDepthwiseConvNHWC<T, fuse_relu_before_conv>(
-          input_data, filter_data, batch_size, output_channels, output_height,
-          output_width, input_channels, input_height, input_width,
-          final_filter_multiplier, filter_height, filter_width, h_stride,
-          w_stride, padding_height, padding_width, dilate_height, dilate_width,
-          output_data);
+      KernelDepthwiseConvNHWC<T, fuse_relu_before_conv>(input_data,
+                                                        filter_data,
+                                                        batch_size,
+                                                        output_channels,
+                                                        output_height,
+                                                        output_width,
+                                                        input_channels,
+                                                        input_height,
+                                                        input_width,
+                                                        final_filter_multiplier,
+                                                        filter_height,
+                                                        filter_width,
+                                                        h_stride,
+                                                        w_stride,
+                                                        padding_height,
+                                                        padding_width,
+                                                        dilate_height,
+                                                        dilate_width,
+                                                        output_data);
     }
   } else {
     if (data_layout != DataLayout::kNHWC) {
       KernelDepthwiseConvCFilterNCHW<T, c_filter, fuse_relu_before_conv>(
-          input_data, filter_data, batch_size, output_channels, output_height,
-          output_width, input_channels, input_height, input_width,
-          final_filter_multiplier, filter_height, filter_width, h_stride,
-          w_stride, padding_height, padding_width, dilate_height, dilate_width,
+          input_data,
+          filter_data,
+          batch_size,
+          output_channels,
+          output_height,
+          output_width,
+          input_channels,
+          input_height,
+          input_width,
+          final_filter_multiplier,
+          filter_height,
+          filter_width,
+          h_stride,
+          w_stride,
+          padding_height,
+          padding_width,
+          dilate_height,
+          dilate_width,
           output_data);
     } else {
       KernelDepthwiseConvCFilterNHWC<T, c_filter, fuse_relu_before_conv>(
-          input_data, filter_data, batch_size, output_channels, output_height,
-          output_width, input_channels, input_height, input_width,
-          final_filter_multiplier, filter_height, filter_width, h_stride,
-          w_stride, padding_height, padding_width, dilate_height, dilate_width,
+          input_data,
+          filter_data,
+          batch_size,
+          output_channels,
+          output_height,
+          output_width,
+          input_channels,
+          input_height,
+          input_width,
+          final_filter_multiplier,
+          filter_height,
+          filter_width,
+          h_stride,
+          w_stride,
+          padding_height,
+          padding_width,
+          dilate_height,
+          dilate_width,
           output_data);
     }
   }
@@ -464,7 +578,9 @@ __device__ __inline__ void KernelDepthwiseConvInputGradNHWC(
   }
 }
 
-template <typename T, int c_filter, int c_filter_multiplier,
+template <typename T,
+          int c_filter,
+          int c_filter_multiplier,
           bool fuse_relu_before_conv>
 __device__ __inline__ void KernelDepthwiseConvInputGradCFilterNCHW(
     ARG_DEFINE_KernelDepthwiseConvInputGrad) {
@@ -525,7 +641,9 @@ __device__ __inline__ void KernelDepthwiseConvInputGradCFilterNCHW(
   }
 }
 
-template <typename T, int c_filter, int c_filter_multiplier,
+template <typename T,
+          int c_filter,
+          int c_filter_multiplier,
           bool fuse_relu_before_conv>
 __device__ __inline__ void KernelDepthwiseConvInputGradCFilterNHWC(
     ARG_DEFINE_KernelDepthwiseConvInputGrad) {
@@ -595,8 +713,12 @@ __device__ __inline__ void KernelDepthwiseConvInputGradCFilterNHWC(
   }
 }
 
-template <typename T, int c_filter_multiplier, int c_stride, int c_filter,
-          DataLayout data_layout, bool fuse_relu_before_conv>
+template <typename T,
+          int c_filter_multiplier,
+          int c_stride,
+          int c_filter,
+          DataLayout data_layout,
+          bool fuse_relu_before_conv>
 __global__ void KernelDepthwiseConvInputGradSp(
     ARG_DEFINE_KernelDepthwiseConvInputGrad) {
   int final_filter_multiplier = filter_multiplier;
@@ -611,36 +733,100 @@ __global__ void KernelDepthwiseConvInputGradSp(
   if (c_filter_multiplier == 0 || c_filter == -1) {
     if (data_layout != DataLayout::kNHWC) {
       KernelDepthwiseConvInputGradNCHW<T, fuse_relu_before_conv>(
-          input_data, output_grad_data, filter_data, batch_size,
-          output_channels, output_height, output_width, input_channels,
-          input_height, input_width, final_filter_multiplier, filter_height,
-          filter_width, h_stride, w_stride, padding_height, padding_width,
-          dilate_height, dilate_width, input_grad_data);
+          input_data,
+          output_grad_data,
+          filter_data,
+          batch_size,
+          output_channels,
+          output_height,
+          output_width,
+          input_channels,
+          input_height,
+          input_width,
+          final_filter_multiplier,
+          filter_height,
+          filter_width,
+          h_stride,
+          w_stride,
+          padding_height,
+          padding_width,
+          dilate_height,
+          dilate_width,
+          input_grad_data);
     } else {
       KernelDepthwiseConvInputGradNHWC<T, fuse_relu_before_conv>(
-          input_data, output_grad_data, filter_data, batch_size,
-          output_channels, output_height, output_width, input_channels,
-          input_height, input_width, final_filter_multiplier, filter_height,
-          filter_width, h_stride, w_stride, padding_height, padding_width,
-          dilate_height, dilate_width, input_grad_data);
+          input_data,
+          output_grad_data,
+          filter_data,
+          batch_size,
+          output_channels,
+          output_height,
+          output_width,
+          input_channels,
+          input_height,
+          input_width,
+          final_filter_multiplier,
+          filter_height,
+          filter_width,
+          h_stride,
+          w_stride,
+          padding_height,
+          padding_width,
+          dilate_height,
+          dilate_width,
+          input_grad_data);
     }
   } else {
     if (data_layout != DataLayout::kNHWC) {
-      KernelDepthwiseConvInputGradCFilterNCHW<T, c_filter, c_filter_multiplier,
+      KernelDepthwiseConvInputGradCFilterNCHW<T,
+                                              c_filter,
+                                              c_filter_multiplier,
                                               fuse_relu_before_conv>(
-          input_data, output_grad_data, filter_data, batch_size,
-          output_channels, output_height, output_width, input_channels,
-          input_height, input_width, c_filter_multiplier, filter_height,
-          filter_width, c_stride, c_stride, padding_height, padding_width,
-          dilate_height, dilate_width, input_grad_data);
+          input_data,
+          output_grad_data,
+          filter_data,
+          batch_size,
+          output_channels,
+          output_height,
+          output_width,
+          input_channels,
+          input_height,
+          input_width,
+          c_filter_multiplier,
+          filter_height,
+          filter_width,
+          c_stride,
+          c_stride,
+          padding_height,
+          padding_width,
+          dilate_height,
+          dilate_width,
+          input_grad_data);
     } else {
-      KernelDepthwiseConvInputGradCFilterNHWC<T, c_filter, c_filter_multiplier,
+      KernelDepthwiseConvInputGradCFilterNHWC<T,
+                                              c_filter,
+                                              c_filter_multiplier,
                                               fuse_relu_before_conv>(
-          input_data, output_grad_data, filter_data, batch_size,
-          output_channels, output_height, output_width, input_channels,
-          input_height, input_width, c_filter_multiplier, filter_height,
-          filter_width, c_stride, c_stride, padding_height, padding_width,
-          dilate_height, dilate_width, input_grad_data);
+          input_data,
+          output_grad_data,
+          filter_data,
+          batch_size,
+          output_channels,
+          output_height,
+          output_width,
+          input_channels,
+          input_height,
+          input_width,
+          c_filter_multiplier,
+          filter_height,
+          filter_width,
+          c_stride,
+          c_stride,
+          padding_height,
+          padding_width,
+          dilate_height,
+          dilate_width,
+          input_grad_data);
     }
   }
 }
@@ -648,13 +834,25 @@ __global__ void KernelDepthwiseConvInputGradSp(
 // Cuda kernel to compute the depthwise convolution backprop w.r.t. filter.
 template <typename T, bool fuse_relu_before_conv>
 __device__ __inline__ void KernelDepthwiseConvFilterGradNCHW(
-    const T* output_grad_data, const T* input_data, const int num,
-    const int output_channels, const int output_height, const int output_width,
-    const int input_channels, const int input_height, const int input_width,
-    const int filter_multiplier, const int filter_height,
-    const int filter_width, const int stride_height, const int stride_width,
-    const int padding_height, const int padding_width, const int dilate_height,
-    const int dilate_width, T* filter_grad_data) {
+    const T* output_grad_data,
+    const T* input_data,
+    const int num,
+    const int output_channels,
+    const int output_height,
+    const int output_width,
+    const int input_channels,
+    const int input_height,
+    const int input_width,
+    const int filter_multiplier,
+    const int filter_height,
+    const int filter_width,
+    const int stride_height,
+    const int stride_width,
+    const int padding_height,
+    const int padding_width,
+    const int dilate_height,
+    const int dilate_width,
+    T* filter_grad_data) {
   T s = 0;
   int gbid = ((blockIdx.z * gridDim.y) + blockIdx.y) * gridDim.x + blockIdx.x;
 
@@ -697,13 +895,25 @@ __device__ __inline__ void KernelDepthwiseConvFilterGradNCHW(
 
 template <typename T, bool fuse_relu_before_conv>
 __device__ __inline__ void KernelDepthwiseConvFilterGradNHWC(
-    const T* output_grad_data, const T* input_data, const int num,
-    const int output_channels, const int output_height, const int output_width,
-    const int input_channels, const int input_height, const int input_width,
-    const int filter_multiplier, const int filter_height,
-    const int filter_width, const int stride_height, const int stride_width,
-    const int padding_height, const int padding_width, const int dilate_height,
-    const int dilate_width, T* filter_grad_data) {
+    const T* output_grad_data,
+    const T* input_data,
+    const int num,
+    const int output_channels,
+    const int output_height,
+    const int output_width,
+    const int input_channels,
+    const int input_height,
+    const int input_width,
+    const int filter_multiplier,
+    const int filter_height,
+    const int filter_width,
+    const int stride_height,
+    const int stride_width,
+    const int padding_height,
+    const int padding_width,
+    const int dilate_height,
+    const int dilate_width,
+    T* filter_grad_data) {
   int bid = blockIdx.z;
   int image_h = blockIdx.y;
   int kernel_iw = blockIdx.x % filter_width;
@@ -743,13 +953,25 @@ __device__ __inline__ void KernelDepthwiseConvFilterGradNHWC(
 
 template <typename T, int c_filter, bool fuse_relu_before_conv>
 __device__ __inline__ void KernelDepthwiseConvFilterGradCFilterNHWC(
-    const T* output_grad_data, const T* input_data, const int num,
-    const int output_channels, const int output_height, const int output_width,
-    const int input_channels, const int input_height, const int input_width,
-    const int filter_multiplier, const int filter_height,
-    const int filter_width, const int stride_height, const int stride_width,
-    const int padding_height, const int padding_width, const int dilate_height,
-    const int dilate_width, T* filter_grad_data) {
+    const T* output_grad_data,
+    const T* input_data,
+    const int num,
+    const int output_channels,
+    const int output_height,
+    const int output_width,
+    const int input_channels,
+    const int input_height,
+    const int input_width,
+    const int filter_multiplier,
+    const int filter_height,
+    const int filter_width,
+    const int stride_height,
+    const int stride_width,
+    const int padding_height,
+    const int padding_width,
+    const int dilate_height,
+    const int dilate_width,
+    T* filter_grad_data) {
   const int bid = blockIdx.z;
   int image_h = blockIdx.x * dilate_height + blockIdx.y;
   if (image_h >= output_height) {
@@ -804,16 +1026,31 @@ __device__ __inline__ void KernelDepthwiseConvFilterGradCFilterNHWC(
   }
 }
 
-template <typename T, int c_filter_multiplier, int c_stride, int c_filter,
-          DataLayout data_layout, bool fuse_relu_before_conv>
-__global__ void KernelDepthwiseConvFilterGradSp(
-    const T* output_grad_data, const T* input_data, const int num,
-    const int output_channels, const int output_height, const int output_width,
-    const int input_channels, const int input_height, const int input_width,
-    const int filter_multiplier, const int filter_height,
-    const int filter_width, const int stride_height, const int stride_width,
-    const int padding_height, const int padding_width, const int dilate_height,
-    const int dilate_width, T* filter_grad_data) {
+template <typename T,
+          int c_filter_multiplier,
+          int c_stride,
+          int c_filter,
+          DataLayout data_layout,
+          bool fuse_relu_before_conv>
+__global__ void KernelDepthwiseConvFilterGradSp(const T* output_grad_data,
+                                                const T* input_data,
+                                                const int num,
+                                                const int output_channels,
+                                                const int output_height,
+                                                const int output_width,
+                                                const int input_channels,
+                                                const int input_height,
+                                                const int input_width,
+                                                const int filter_multiplier,
+                                                const int filter_height,
+                                                const int filter_width,
+                                                const int stride_height,
+                                                const int stride_width,
+                                                const int padding_height,
+                                                const int padding_width,
+                                                const int dilate_height,
+                                                const int dilate_width,
+                                                T* filter_grad_data) {
   int final_filter_multiplier = filter_multiplier;
   int h_stride = stride_height;
   int w_stride = stride_width;
@@ -825,34 +1062,91 @@ __global__ void KernelDepthwiseConvFilterGradSp(
   if (c_filter_multiplier == 0 || c_filter == -1) {
     if (data_layout != DataLayout::kNHWC) {
       KernelDepthwiseConvFilterGradNCHW<T, fuse_relu_before_conv>(
-          output_grad_data, input_data, num, output_channels, output_height,
-          output_width, input_channels, input_height, input_width,
-          final_filter_multiplier, filter_height, filter_width, h_stride,
-          w_stride, padding_height, padding_width, dilate_height, dilate_width,
+          output_grad_data,
+          input_data,
+          num,
+          output_channels,
+          output_height,
+          output_width,
+          input_channels,
+          input_height,
+          input_width,
+          final_filter_multiplier,
+          filter_height,
+          filter_width,
+          h_stride,
+          w_stride,
+          padding_height,
+          padding_width,
+          dilate_height,
+          dilate_width,
           filter_grad_data);
     } else {
       KernelDepthwiseConvFilterGradNHWC<T, fuse_relu_before_conv>(
-          output_grad_data, input_data, num, output_channels, output_height,
-          output_width, input_channels, input_height, input_width,
-          final_filter_multiplier, filter_height, filter_width, h_stride,
-          w_stride, padding_height, padding_width, dilate_height, dilate_width,
+          output_grad_data,
+          input_data,
+          num,
+          output_channels,
+          output_height,
+          output_width,
+          input_channels,
+          input_height,
+          input_width,
+          final_filter_multiplier,
+          filter_height,
+          filter_width,
+          h_stride,
+          w_stride,
+          padding_height,
+          padding_width,
+          dilate_height,
+          dilate_width,
           filter_grad_data);
     }
   } else {
     if (data_layout != DataLayout::kNHWC) {
       KernelDepthwiseConvFilterGradNCHW<T, fuse_relu_before_conv>(
-          output_grad_data, input_data, num, output_channels, output_height,
-          output_width, input_channels, input_height, input_width,
-          final_filter_multiplier, filter_height, filter_width, h_stride,
-          w_stride, padding_height, padding_width, dilate_height, dilate_width,
+          output_grad_data,
+          input_data,
+          num,
+          output_channels,
+          output_height,
+          output_width,
+          input_channels,
+          input_height,
+          input_width,
+          final_filter_multiplier,
+          filter_height,
+          filter_width,
+          h_stride,
+          w_stride,
+          padding_height,
+          padding_width,
+          dilate_height,
+          dilate_width,
           filter_grad_data);
     } else {
-      KernelDepthwiseConvFilterGradCFilterNHWC<T, c_filter,
+      KernelDepthwiseConvFilterGradCFilterNHWC<T,
+                                               c_filter,
                                                fuse_relu_before_conv>(
-          output_grad_data, input_data, num, output_channels, output_height,
-          output_width, input_channels, input_height, input_width,
-          final_filter_multiplier, filter_height, filter_width, h_stride,
-          w_stride, padding_height, padding_width, dilate_height, dilate_width,
+          output_grad_data,
+          input_data,
+          num,
+          output_channels,
+          output_height,
+          output_width,
+          input_channels,
+          input_height,
+          input_width,
+          final_filter_multiplier,
+          filter_height,
+          filter_width,
+          h_stride,
+          w_stride,
+          padding_height,
+          padding_width,
+          dilate_height,
+          dilate_width,
           filter_grad_data);
     }
   }
@@ -864,15 +1158,15 @@ __global__ void KernelDepthwiseConvFilterGradSp(
  * height and width, respectively.
  */
 template <class T, bool fuse_relu_before_conv>
-class DepthwiseConvFunctor<platform::CUDADeviceContext, T,
-                           fuse_relu_before_conv> {
+class DepthwiseConvFunctor<phi::GPUContext, T, fuse_relu_before_conv> {
  public:
-  void operator()(const platform::CUDADeviceContext& context,
+  void operator()(const phi::GPUContext& context,
                   const framework::Tensor& input,
                   const framework::Tensor& filter,
                   const std::vector<int>& strides,
                   const std::vector<int>& paddings,
-                  const std::vector<int>& dilations, framework::Tensor* output,
+                  const std::vector<int>& dilations,
+                  framework::Tensor* output,
                   const DataLayout data_layout = DataLayout::kNCHW) {
     const int batch_size = input.dims()[0];
     const int input_channels =
@@ -905,12 +1199,14 @@ class DepthwiseConvFunctor<platform::CUDADeviceContext, T,
 
     framework::Tensor filter_hwc;
     if (data_layout == DataLayout::kNHWC) {
-      framework::DDim filter_hwc_dims({filter.dims()[2], filter.dims()[3],
-                                       filter.dims()[0], filter.dims()[1]});
+      framework::DDim filter_hwc_dims({filter.dims()[2],
+                                       filter.dims()[3],
+                                       filter.dims()[0],
+                                       filter.dims()[1]});
       filter_hwc.Resize(filter_hwc_dims);
       filter_hwc.mutable_data<T>(context.GetPlace());
       std::vector<int> perm_axis({2, 3, 0, 1});
-      phi::funcs::TransposeNormal<platform::CUDADeviceContext, T> trans;
+      phi::funcs::TransposeNormal<phi::GPUContext, T> trans;
       trans(context, filter, &filter_hwc, perm_axis);
       filter_data = filter_hwc.data<T>();
     }
@@ -940,7 +1236,8 @@ class DepthwiseConvFunctor<platform::CUDADeviceContext, T,
           ((output_width + dilate_width - 1) / dilate_width) * dilate_width);
       threads = dim3(std::min(output_channels, thread), blocks, 1);
       grid = dim3((output_height + dilate_height - 1) / dilate_height,
-                  dilate_height, batch_size);
+                  dilate_height,
+                  batch_size);
     }
     int filter_multiplier = output_channels / input_channels;
     int nums_output =
@@ -952,37 +1249,73 @@ class DepthwiseConvFunctor<platform::CUDADeviceContext, T,
 #endif
     int grid_size = (nums_output + block_size - 1) / block_size;
 
-#define check_case(c_filter_multiplier, c_stride, c_filter)                    \
-  if (c_filter_multiplier == 0 ||                                              \
-      filter_multiplier == c_filter_multiplier &&                              \
-          stride_height == stride_width && stride_height == c_stride &&        \
-          (ksize_height == ksize_width && ksize_height == c_filter ||          \
-           c_filter == -1)) {                                                  \
-    if (c_filter == -1) {                                                      \
-      threads.x = block_size;                                                  \
-      grid.x = grid_size;                                                      \
-      threads.y = threads.z = grid.y = grid.z = 1;                             \
-    }                                                                          \
-    if (data_layout != DataLayout::kNHWC) {                                    \
-      KernelDepthwiseConvSp<                                                   \
-          T, c_filter_multiplier, c_stride, c_filter, DataLayout::kNCHW,       \
-          fuse_relu_before_conv><<<grid, threads, 0, context.stream()>>>(      \
-          input_data, filter_data, batch_size, output_channels, output_height, \
-          output_width, input_channels, input_height, input_width,             \
-          filter_multiplier, ksize_height, ksize_width, stride_height,         \
-          stride_width, padding_height, padding_width, dilate_height,          \
-          dilate_width, output_data);                                          \
-    } else {                                                                   \
-      KernelDepthwiseConvSp<                                                   \
-          T, c_filter_multiplier, c_stride, c_filter, DataLayout::kNHWC,       \
-          fuse_relu_before_conv><<<grid, threads, 0, context.stream()>>>(      \
-          input_data, filter_data, batch_size, output_channels, output_height, \
-          output_width, input_channels, input_height, input_width,             \
-          filter_multiplier, ksize_height, ksize_width, stride_height,         \
-          stride_width, padding_height, padding_width, dilate_height,          \
-          dilate_width, output_data);                                          \
-    }                                                                          \
-    return;                                                                    \
+#define check_case(c_filter_multiplier, c_stride, c_filter)               \
+  if (c_filter_multiplier == 0 ||                                         \
+      filter_multiplier == c_filter_multiplier &&                         \
+          stride_height == stride_width && stride_height == c_stride &&   \
+          (ksize_height == ksize_width && ksize_height == c_filter ||     \
+           c_filter == -1)) {                                             \
+    if (c_filter == -1) {                                                 \
+      threads.x = block_size;                                             \
+      grid.x = grid_size;                                                 \
+      threads.y = threads.z = grid.y = grid.z = 1;                        \
+    }                                                                     \
+    if (data_layout != DataLayout::kNHWC) {                               \
+      KernelDepthwiseConvSp<                                              \
+          T,                                                              \
+          c_filter_multiplier,                                            \
+          c_stride,                                                       \
+          c_filter,                                                       \
+          DataLayout::kNCHW,                                              \
+          fuse_relu_before_conv><<<grid, threads, 0, context.stream()>>>( \
+          input_data,                                                     \
+          filter_data,                                                    \
+          batch_size,                                                     \
+          output_channels,                                                \
+          output_height,                                                  \
+          output_width,                                                   \
+          input_channels,                                                 \
+          input_height,                                                   \
+          input_width,                                                    \
+          filter_multiplier,                                              \
+          ksize_height,                                                   \
+          ksize_width,                                                    \
+          stride_height,                                                  \
+          stride_width,                                                   \
+          padding_height,                                                 \
+          padding_width,                                                  \
+          dilate_height,                                                  \
+          dilate_width,                                                   \
+          output_data);                                                   \
+    } else {                                                              \
+      KernelDepthwiseConvSp<                                              \
+          T,                                                              \
+          c_filter_multiplier,                                            \
+          c_stride,                                                       \
+          c_filter,                                                       \
+          DataLayout::kNHWC,                                              \
+          fuse_relu_before_conv><<<grid, threads, 0, context.stream()>>>( \
+          input_data,                                                     \
+          filter_data,                                                    \
+          batch_size,                                                     \
+          output_channels,                                                \
+          output_height,                                                  \
+          output_width,                                                   \
+          input_channels,                                                 \
+          input_height,                                                   \
+          input_width,                                                    \
+          filter_multiplier,                                              \
+          ksize_height,                                                   \
+          ksize_width,                                                    \
+          stride_height,                                                  \
+          stride_width,                                                   \
+          padding_height,                                                 \
+          padding_width,                                                  \
+          dilate_height,                                                  \
+          dilate_width,                                                   \
+          output_data);                                                   \
+    }                                                                     \
+    return;                                                               \
   }
     check_case(1, 1, 3);
     check_case(1, 1, 5);
@@ -1004,10 +1337,9 @@ class DepthwiseConvFunctor<platform::CUDADeviceContext, T,
 };
 
 template <typename T, bool fuse_relu_before_conv>
-class DepthwiseConvInputGradFunctor<platform::CUDADeviceContext, T,
-                                    fuse_relu_before_conv> {
+class DepthwiseConvInputGradFunctor<phi::GPUContext, T, fuse_relu_before_conv> {
  public:
-  void operator()(const platform::CUDADeviceContext& context,
+  void operator()(const phi::GPUContext& context,
                   const framework::Tensor& input,
                   const framework::Tensor& filter,
                   const framework::Tensor& output_grad,
@@ -1048,12 +1380,14 @@ class DepthwiseConvInputGradFunctor<platform::CUDADeviceContext, T,
 
     framework::Tensor filter_hwc;
     if (data_layout == DataLayout::kNHWC) {
-      framework::DDim filter_hwc_dims({filter.dims()[2], filter.dims()[3],
-                                       filter.dims()[0], filter.dims()[1]});
+      framework::DDim filter_hwc_dims({filter.dims()[2],
+                                       filter.dims()[3],
+                                       filter.dims()[0],
+                                       filter.dims()[1]});
       filter_hwc.Resize(filter_hwc_dims);
       filter_hwc.mutable_data<T>(context.GetPlace());
       std::vector<int> perm_axis({2, 3, 0, 1});
-      phi::funcs::TransposeNormal<platform::CUDADeviceContext, T> trans;
+      phi::funcs::TransposeNormal<phi::GPUContext, T> trans;
       trans(context, filter, &filter_hwc, perm_axis);
       filter_data = filter_hwc.data<T>();
     }
@@ -1078,7 +1412,8 @@ class DepthwiseConvInputGradFunctor<platform::CUDADeviceContext, T,
           ((input_width + dilate_width - 1) / dilate_width) * dilate_width);
       threads = dim3(std::min(input_channels, thread), blocks, 1);
       grid = dim3((input_height + dilate_height - 1) / dilate_height,
-                  dilate_height, batch_size);
+                  dilate_height,
+                  batch_size);
     }
     int filter_multiplier = output_channels / input_channels;
 
@@ -1090,22 +1425,60 @@ class DepthwiseConvInputGradFunctor<platform::CUDADeviceContext, T,
            c_filter == -1)) {                                             \
     if (data_layout != DataLayout::kNHWC) {                               \
       KernelDepthwiseConvInputGradSp<                                     \
-          T, c_filter_multiplier, c_stride, c_filter, DataLayout::kNCHW,  \
+          T,                                                              \
+          c_filter_multiplier,                                            \
+          c_stride,                                                       \
+          c_filter,                                                       \
+          DataLayout::kNCHW,                                              \
           fuse_relu_before_conv><<<grid, threads, 0, context.stream()>>>( \
-          input_data, output_grad_data, filter_data, batch_size,          \
-          output_channels, output_height, output_width, input_channels,   \
-          input_height, input_width, filter_multiplier, ksize_height,     \
-          ksize_width, stride_height, stride_width, padding_height,       \
-          padding_width, dilate_height, dilate_width, input_grad_data);   \
+          input_data,                                                     \
+          output_grad_data,                                               \
+          filter_data,                                                    \
+          batch_size,                                                     \
+          output_channels,                                                \
+          output_height,                                                  \
+          output_width,                                                   \
+          input_channels,                                                 \
+          input_height,                                                   \
+          input_width,                                                    \
+          filter_multiplier,                                              \
+          ksize_height,                                                   \
+          ksize_width,                                                    \
+          stride_height,                                                  \
+          stride_width,                                                   \
+          padding_height,                                                 \
+          padding_width,                                                  \
+          dilate_height,                                                  \
+          dilate_width,                                                   \
+          input_grad_data);                                               \
     } else {                                                              \
       KernelDepthwiseConvInputGradSp<                                     \
-          T, c_filter_multiplier, c_stride, c_filter, DataLayout::kNHWC,  \
+          T,                                                              \
+          c_filter_multiplier,                                            \
+          c_stride,                                                       \
+          c_filter,                                                       \
+          DataLayout::kNHWC,                                              \
           fuse_relu_before_conv><<<grid, threads, 0, context.stream()>>>( \
-          input_data, output_grad_data, filter_data, batch_size,          \
-          output_channels, output_height, output_width, input_channels,   \
-          input_height, input_width, filter_multiplier, ksize_height,     \
-          ksize_width, stride_height, stride_width, padding_height,       \
-          padding_width, dilate_height, dilate_width, input_grad_data);   \
+          input_data,                                                     \
+          output_grad_data,                                               \
+          filter_data,                                                    \
+          batch_size,                                                     \
+          output_channels,                                                \
+          output_height,                                                  \
+          output_width,                                                   \
+          input_channels,                                                 \
+          input_height,                                                   \
+          input_width,                                                    \
+          filter_multiplier,                                              \
+          ksize_height,                                                   \
+          ksize_width,                                                    \
+          stride_height,                                                  \
+          stride_width,                                                   \
+          padding_height,                                                 \
+          padding_width,                                                  \
+          dilate_height,                                                  \
+          dilate_width,                                                   \
+          input_grad_data);                                               \
     }                                                                     \
     return;                                                               \
   }
@@ -1129,10 +1502,11 @@ class DepthwiseConvInputGradFunctor<platform::CUDADeviceContext, T,
 };
 
 template <typename T, bool fuse_relu_before_conv>
-class DepthwiseConvFilterGradFunctor<platform::CUDADeviceContext, T,
+class DepthwiseConvFilterGradFunctor<phi::GPUContext,
+                                     T,
                                      fuse_relu_before_conv> {
  public:
-  void operator()(const platform::CUDADeviceContext& context,
+  void operator()(const phi::GPUContext& context,
                   const framework::Tensor& input,
                   const framework::Tensor& output_grad,
                   const std::vector<int>& strides,
@@ -1187,7 +1561,8 @@ class DepthwiseConvFilterGradFunctor<platform::CUDADeviceContext, T,
           std::max(block_size / output_channels, 1),
           ((output_width + dilate_width - 1) / dilate_width) * dilate_width);
       grid = dim3((output_height + dilate_height - 1) / dilate_height,
-                  dilate_height, batch_size);
+                  dilate_height,
+                  batch_size);
       threads = dim3(std::min(output_channels, block_size), blocks, 1);
     }
     int filter_multiplier = output_channels / input_channels;
@@ -1200,22 +1575,41 @@ class DepthwiseConvFilterGradFunctor<platform::CUDADeviceContext, T,
            c_filter == -1)) {                                                  \
     if (data_layout != DataLayout::kNHWC) {                                    \
       KernelDepthwiseConvFilterGradSp<                                         \
-          T, c_filter_multiplier, c_stride, c_filter, DataLayout::kNCHW,       \
+          T,                                                                   \
+          c_filter_multiplier,                                                 \
+          c_stride,                                                            \
+          c_filter,                                                            \
+          DataLayout::kNCHW,                                                   \
           fuse_relu_before_conv><<<grid, threads, 0, context.stream()>>>(      \
-          output_grad_data, input_data, batch_size, output_channels,           \
-          output_height, output_width, input_channels, input_height,           \
-          input_width, filter_multiplier, ksize_height, ksize_width,           \
-          stride_height, stride_width, padding_height, padding_width,          \
-          dilate_height, dilate_width, filter_grad_data);                      \
+          output_grad_data,                                                    \
+          input_data,                                                          \
+          batch_size,                                                          \
+          output_channels,                                                     \
+          output_height,                                                       \
+          output_width,                                                        \
+          input_channels,                                                      \
+          input_height,                                                        \
+          input_width,                                                         \
+          filter_multiplier,                                                   \
+          ksize_height,                                                        \
+          ksize_width,                                                         \
+          stride_height,                                                       \
+          stride_width,                                                        \
+          padding_height,                                                      \
+          padding_width,                                                       \
+          dilate_height,                                                       \
+          dilate_width,                                                        \
+          filter_grad_data);                                                   \
     } else {                                                                   \
       framework::Tensor filter_grad_hwc;                                       \
       if (c_filter != -1) {                                                    \
-        framework::DDim filter_grad_hwc_dims(                                  \
-            {filter_grad->dims()[2], filter_grad->dims()[3],                   \
-             filter_grad->dims()[0], filter_grad->dims()[1]});                 \
+        framework::DDim filter_grad_hwc_dims({filter_grad->dims()[2],          \
+                                              filter_grad->dims()[3],          \
+                                              filter_grad->dims()[0],          \
+                                              filter_grad->dims()[1]});        \
         filter_grad_hwc.Resize(filter_grad_hwc_dims);                          \
         filter_grad_hwc.mutable_data<T>(context.GetPlace());                   \
-        phi::funcs::SetConstant<platform::CUDADeviceContext, T> set_zero;      \
+        phi::funcs::SetConstant<phi::GPUContext, T> set_zero;                  \
         set_zero(context, &filter_grad_hwc, static_cast<T>(0));                \
         filter_grad_data = filter_grad_hwc.data<T>();                          \
       } else {                                                                 \
@@ -1231,16 +1625,34 @@ class DepthwiseConvFilterGradFunctor<platform::CUDADeviceContext, T,
         threads = dim3(std::min(output_channels, block_size), blocks, 1);      \
       }                                                                        \
       KernelDepthwiseConvFilterGradSp<                                         \
-          T, c_filter_multiplier, c_stride, c_filter, DataLayout::kNHWC,       \
+          T,                                                                   \
+          c_filter_multiplier,                                                 \
+          c_stride,                                                            \
+          c_filter,                                                            \
+          DataLayout::kNHWC,                                                   \
           fuse_relu_before_conv><<<grid, threads, 0, context.stream()>>>(      \
-          output_grad_data, input_data, batch_size, output_channels,           \
-          output_height, output_width, input_channels, input_height,           \
-          input_width, filter_multiplier, ksize_height, ksize_width,           \
-          stride_height, stride_width, padding_height, padding_width,          \
-          dilate_height, dilate_width, filter_grad_data);                      \
+          output_grad_data,                                                    \
+          input_data,                                                          \
+          batch_size,                                                          \
+          output_channels,                                                     \
+          output_height,                                                       \
+          output_width,                                                        \
+          input_channels,                                                      \
+          input_height,                                                        \
+          input_width,                                                         \
+          filter_multiplier,                                                   \
+          ksize_height,                                                        \
+          ksize_width,                                                         \
+          stride_height,                                                       \
+          stride_width,                                                        \
+          padding_height,                                                      \
+          padding_width,                                                       \
+          dilate_height,                                                       \
+          dilate_width,                                                        \
+          filter_grad_data);                                                   \
       if (c_filter != -1) {                                                    \
         std::vector<int> perm_axis({2, 3, 0, 1});                              \
-        phi::funcs::TransposeNormal<platform::CUDADeviceContext, T> trans;     \
+        phi::funcs::TransposeNormal<phi::GPUContext, T> trans;                 \
         trans(context, filter_grad_hwc, filter_grad, perm_axis);               \
       }                                                                        \
     }                                                                          \
@@ -1263,31 +1675,23 @@ class DepthwiseConvFilterGradFunctor<platform::CUDADeviceContext, T,
   }
 };
 
-template class DepthwiseConvFunctor<platform::CUDADeviceContext, float, false>;
-template class DepthwiseConvFunctor<platform::CUDADeviceContext, double, false>;
+template class DepthwiseConvFunctor<phi::GPUContext, float, false>;
+template class DepthwiseConvFunctor<phi::GPUContext, double, false>;
 
-template class DepthwiseConvInputGradFunctor<platform::CUDADeviceContext, float,
-                                             false>;
-template class DepthwiseConvInputGradFunctor<platform::CUDADeviceContext,
-                                             double, false>;
+template class DepthwiseConvInputGradFunctor<phi::GPUContext, float, false>;
+template class DepthwiseConvInputGradFunctor<phi::GPUContext, double, false>;
 
-template class DepthwiseConvFilterGradFunctor<platform::CUDADeviceContext,
-                                              float, false>;
-template class DepthwiseConvFilterGradFunctor<platform::CUDADeviceContext,
-                                              double, false>;
+template class DepthwiseConvFilterGradFunctor<phi::GPUContext, float, false>;
+template class DepthwiseConvFilterGradFunctor<phi::GPUContext, double, false>;
 
-template class DepthwiseConvFunctor<platform::CUDADeviceContext, float, true>;
-template class DepthwiseConvFunctor<platform::CUDADeviceContext, double, true>;
+template class DepthwiseConvFunctor<phi::GPUContext, float, true>;
+template class DepthwiseConvFunctor<phi::GPUContext, double, true>;
 
-template class DepthwiseConvInputGradFunctor<platform::CUDADeviceContext, float,
-                                             true>;
-template class DepthwiseConvInputGradFunctor<platform::CUDADeviceContext,
-                                             double, true>;
+template class DepthwiseConvInputGradFunctor<phi::GPUContext, float, true>;
+template class DepthwiseConvInputGradFunctor<phi::GPUContext, double, true>;
 
-template class DepthwiseConvFilterGradFunctor<platform::CUDADeviceContext,
-                                              float, true>;
-template class DepthwiseConvFilterGradFunctor<platform::CUDADeviceContext,
-                                              double, true>;
+template class DepthwiseConvFilterGradFunctor<phi::GPUContext, float, true>;
+template class DepthwiseConvFilterGradFunctor<phi::GPUContext, double, true>;
 
 }  // namespace math
 }  // namespace operators
diff --git a/paddle/phi/kernels/gpu/depthwise_conv_grad_kernel.cu b/paddle/phi/kernels/gpu/depthwise_conv_grad_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..4f27b6fde99ffa652c734d363ad7731f75b495f4
--- /dev/null
+++ b/paddle/phi/kernels/gpu/depthwise_conv_grad_kernel.cu
@@ -0,0 +1,142 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/common/layout.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/cpu/conv_util.h"
+#include "paddle/phi/kernels/funcs/batch_norm_utils.h"
+#include "paddle/phi/kernels/funcs/math_function.h"
+#include "paddle/phi/kernels/gpu/depthwise_conv.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void DepthwiseConvGradKernel(const Context& dev_ctx,
+                             const DenseTensor& out_grad,
+                             const DenseTensor& input,
+                             const DenseTensor& filter,
+                             const std::vector<int>& strides_t,
+                             const std::vector<int>& paddings_t,
+                             const std::string& padding_algorithm,
+                             int groups,
+                             const std::vector<int>& dilations_t,
+                             const std::string& data_format,
+                             bool use_addto,
+                             int workspace_size_MB,
+                             bool exhaustive_search,
+                             bool fuse_relu,
+                             DenseTensor* input_grad,
+                             DenseTensor* filter_grad) {
+  const DenseTensor* output_grad = &out_grad;
+
+  if (!input_grad && !filter_grad) return;
+
+  std::vector<int> strides = strides_t;
+  std::vector<int> paddings = paddings_t;
+  std::vector<int> dilations = dilations_t;
+
+  // update padding and dilation
+  auto in_dims = input.dims();
+  auto filter_dims = filter.dims();
+
+  DDim in_data_dims;
+  const paddle::framework::DataLayout data_layout =
+      paddle::framework::StringToDataLayout(data_format);
+  if (data_layout != paddle::framework::DataLayout::kNHWC) {
+    in_data_dims = slice_ddim(in_dims, 2, in_dims.size());
+  } else {
+    in_data_dims = slice_ddim(in_dims, 1, in_dims.size() - 1);
+  }
+  DDim filter_data_dims = slice_ddim(filter_dims, 2, filter_dims.size());
+  std::vector<int> ksize = vectorize<int>(filter_data_dims);
+  UpdatePaddingAndDilation(
+      &paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize);
+
+  bool is_sys_pad = strides.size() * 2 == paddings.size() ? false : true;
+  if (!is_sys_pad) {
+    for (size_t i = 0; i < strides.size(); ++i) {
+      paddings.erase(paddings.begin() + i + 1);
+    }
+  }
+  phi::funcs::SetConstant<Context, T> set_zero;
+
+  if (input_grad) {
+    input_grad->mutable_data<T>(dev_ctx.GetPlace());
+    set_zero(dev_ctx, input_grad, static_cast<T>(0));
+
+    if (fuse_relu) {
+      paddle::operators::math::DepthwiseConvInputGradFunctor<Context, T, true>
+          depthwiseConvInputGrad;
+      depthwiseConvInputGrad(dev_ctx,
+                             input,
+                             filter,
+                             *output_grad,
+                             strides,
+                             paddings,
+                             dilations,
+                             input_grad,
+                             data_layout);
+    } else {
+      paddle::operators::math::DepthwiseConvInputGradFunctor<Context, T, false>
+          depthwiseConvInputGrad;
+      depthwiseConvInputGrad(dev_ctx,
+                             input,
+                             filter,
+                             *output_grad,
+                             strides,
+                             paddings,
+                             dilations,
+                             input_grad,
+                             data_layout);
+    }
+  }
+
+  if (filter_grad) {
+    filter_grad->mutable_data<T>(dev_ctx.GetPlace());
+    set_zero(dev_ctx, filter_grad, static_cast<T>(0));
+    if (fuse_relu) {
+      paddle::operators::math::DepthwiseConvFilterGradFunctor<Context, T, true>
+          depthwiseConvFilterGrad;
+      depthwiseConvFilterGrad(dev_ctx,
+                              input,
+                              *output_grad,
+                              strides,
+                              paddings,
+                              dilations,
+                              filter_grad,
+                              data_layout);
+    } else {
+      paddle::operators::math::DepthwiseConvFilterGradFunctor<Context, T, false>
+          depthwiseConvFilterGrad;
+      depthwiseConvFilterGrad(dev_ctx,
+                              input,
+                              *output_grad,
+                              strides,
+                              paddings,
+                              dilations,
+                              filter_grad,
+                              data_layout);
+    }
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(depthwise_conv2d_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::DepthwiseConvGradKernel,
+                   float,
+                   double) {}
diff --git a/paddle/phi/kernels/gpu/depthwise_conv_kernel.cu b/paddle/phi/kernels/gpu/depthwise_conv_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..c50ceae33fc790a763e02bed9b6bed4879b2c547
--- /dev/null
+++ b/paddle/phi/kernels/gpu/depthwise_conv_kernel.cu
@@ -0,0 +1,130 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+#include "paddle/fluid/operators/conv_op.h"
+
+#include "paddle/phi/kernels/gpu/depthwise_conv.h"
+
+#include "paddle/phi/kernels/cpu/conv_util.h"
+#include "paddle/phi/kernels/funcs/batch_norm_utils.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void DepthwiseConvKernel(const Context& dev_ctx,
+                         const DenseTensor& input,
+                         const DenseTensor& filter,
+                         const std::vector<int>& strides_t,
+                         const std::vector<int>& paddings_t,
+                         const std::string& padding_algorithm,
+                         int groups,
+                         const std::vector<int>& dilations_t,
+                         const std::string& data_format,
+                         bool use_addto,
+                         int workspace_size_MB,
+                         bool exhaustive_search,
+                         bool fuse_relu,
+                         DenseTensor* out) {
+  DenseTensor* output = out;
+  output->mutable_data<T>(dev_ctx.GetPlace());
+
+  const std::vector<int> strides = strides_t;
+  std::vector<int> dilations = dilations_t;
+  std::vector<int> paddings = paddings_t;
+
+  const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC");
+  if (channel_last) {
+    PADDLE_ENFORCE_EQ(
+        output->dims()[output->dims().size() - 1] %
+            input.dims()[input.dims().size() - 1],
+        0,
+        phi::errors::InvalidArgument(
+            "ShapeError: The output channels must be a multiple of the "
+            "input channels. But receivced output channel number is %d "
+            "and input channel number is %d",
+            output->dims()[output->dims().size() - 1],
+            input.dims()[input.dims().size() - 1]));
+  } else {
+    PADDLE_ENFORCE_EQ(
+        output->dims()[1] % input.dims()[1],
+        0,
+        phi::errors::InvalidArgument(
+            "ShapeError: The output channels must be a multiple of the "
+            "input channels. But receivced output channel number is %d "
+            "and input channel number is %d",
+            output->dims()[1],
+            input.dims()[1]));
+  }
+
+  // update padding and dilation
+  auto in_dims = input.dims();
+  auto filter_dims = filter.dims();
+
+  DDim in_data_dims;
+  const paddle::framework::DataLayout data_layout =
+      paddle::framework::StringToDataLayout(data_format);
+  if (data_layout != paddle::framework::DataLayout::kNHWC) {
+    in_data_dims = slice_ddim(in_dims, 2, in_dims.size());
+  } else {
+    in_data_dims = slice_ddim(in_dims, 1, in_dims.size() - 1);
+  }
+
+  DDim filter_data_dims = slice_ddim(filter_dims, 2, filter_dims.size());
+  std::vector<int> ksize = vectorize<int>(filter_data_dims);
+  UpdatePaddingAndDilation(
+      &paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize);
+
+  bool is_sys_pad = strides.size() * 2 == paddings.size() ? false : true;
+  if (!is_sys_pad) {
+    for (size_t i = 0; i < strides.size(); ++i) {
+      paddings.erase(paddings.begin() + i + 1);
+    }
+  }
+
+  if (fuse_relu) {
+    paddle::operators::math::DepthwiseConvFunctor<Context, T, true>
+        depthwiseConv;
+    depthwiseConv(dev_ctx,
+                  input,
+                  filter,
+                  strides,
+                  paddings,
+                  dilations,
+                  output,
+                  data_layout);
+  } else {
+    paddle::operators::math::DepthwiseConvFunctor<Context, T, false>
+        depthwiseConv;
+    depthwiseConv(dev_ctx,
+                  input,
+                  filter,
+                  strides,
+                  paddings,
+                  dilations,
+                  output,
+                  data_layout);
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(depthwise_conv2d,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::DepthwiseConvKernel,
+                   float,
+                   double) {}
diff --git a/paddle/phi/kernels/gpu/diag_grad_kernel.cu b/paddle/phi/kernels/gpu/diag_grad_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..65bf837e6cf8a330fbb744c994311d17a7cc6299
--- /dev/null
+++ b/paddle/phi/kernels/gpu/diag_grad_kernel.cu
@@ -0,0 +1,139 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/diag_kernel.h"
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/diag_functor.h"
+#include "paddle/phi/kernels/funcs/math_function.h"
+
+namespace phi {
+
+// Extract the diagonal of a matrix 'dout' to a matrix 'dx'
+template <typename T>
+__global__ void ExtractDiagonalKernel(const T* dout,
+                                      T* dx,
+                                      std::ptrdiff_t start,
+                                      std::ptrdiff_t dx_length,
+                                      const std::ptrdiff_t sumStride,
+                                      const std::ptrdiff_t xStride) {
+  for (std::ptrdiff_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+       idx < dx_length;
+       idx += gridDim.x * blockDim.x) {
+    const std::ptrdiff_t outOffset = start + sumStride * idx;
+    dx[xStride * idx] = dout[outOffset];
+  }
+}
+
+// Paste a vector 'dout' to the diagonal of a matrix 'dx'
+template <typename T>
+__global__ void PasteDiagonalKernel(const T* dout,
+                                    T* dx,
+                                    std::ptrdiff_t start,
+                                    std::ptrdiff_t size,
+                                    const std::ptrdiff_t sumStride,
+                                    const std::ptrdiff_t outStride) {
+  for (std::ptrdiff_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size;
+       idx += gridDim.x * blockDim.x) {
+    std::ptrdiff_t xOffset = start + sumStride * idx;
+    dx[xOffset] = dout[outStride * idx];
+  }
+}
+
+template <typename T, typename Context>
+void DiagGradKernel(const Context& dev_ctx,
+                    const DenseTensor& x,
+                    const DenseTensor& out_grad,
+                    int offset,
+                    DenseTensor* x_grad) {
+  T* dx_data = dev_ctx.template Alloc<T>(x_grad);
+  auto* dout_data = out_grad.data<T>();
+  auto dx_dims = x_grad->dims();
+  auto dout_dims = out_grad.dims();
+
+  auto GetBlockGridSize = [&dev_ctx](int64_t size) {
+    const int64_t block_size =
+        std::min(size, static_cast<int64_t>(dev_ctx.GetMaxThreadsPerBlock()));
+    int64_t max_threads = dev_ctx.GetMaxPhysicalThreadCount();
+    const int64_t max_blocks =
+        std::max(((max_threads - 1) / block_size + 1), static_cast<int64_t>(1));
+    const int64_t grid_size =
+        std::min(max_blocks, (size + block_size - 1) / block_size);
+    return std::tuple<int64_t, int64_t>{block_size, grid_size};
+  };
+
+  if (dx_dims.size() == 1) {
+    auto dx_length = dx_dims[0];
+    auto size = (offset > 0) ? dx_length + offset : dx_length - offset;
+    int dx_stride = phi::funcs::ComputeStride(0, dx_dims);
+    if (size > 0) {
+      auto dout_stride_0 = phi::funcs::ComputeStride(0, dout_dims);
+      auto dout_stride_1 = phi::funcs::ComputeStride(1, dout_dims);
+      auto start =
+          (offset >= 0 ? offset * dout_stride_1 : -offset * dout_stride_0);
+
+      std::tuple<int64_t, int64_t> block_grid_size = GetBlockGridSize(size);
+      ExtractDiagonalKernel<T><<<std::get<1>(block_grid_size),
+                                 std::get<0>(block_grid_size),
+                                 0,
+                                 dev_ctx.stream()>>>(
+          dout_data,
+          dx_data,
+          start,
+          dx_length,
+          dout_stride_0 + dout_stride_1,
+          dx_stride);
+    }
+  } else {
+    phi::funcs::SetConstant<Context, T> set_padding_value;
+    set_padding_value(dev_ctx, x_grad, static_cast<T>(0));
+
+    int dx_stride_0 = phi::funcs::ComputeStride(0, dx_dims);
+    int dx_stride_1 = phi::funcs::ComputeStride(1, dx_dims);
+    int64_t size;
+    if (offset > 0) {
+      size = std::min(dx_dims[0], dx_dims[1] - offset);
+    } else {
+      size = std::min(dx_dims[0] + offset, dx_dims[1]);
+    }
+
+    if (size > 0) {
+      auto start = (offset >= 0 ? offset * dx_stride_1 : -offset * dx_stride_0);
+      auto dout_stride_0 = phi::funcs::ComputeStride(0, dout_dims);
+      std::tuple<int64_t, int64_t> block_grid_size = GetBlockGridSize(size);
+      PasteDiagonalKernel<T><<<std::get<1>(block_grid_size),
+                               std::get<0>(block_grid_size),
+                               0,
+                               dev_ctx.stream()>>>(dout_data,
+                                                   dx_data,
+                                                   start,
+                                                   size,
+                                                   dx_stride_0 + dx_stride_1,
+                                                   dout_stride_0);
+    }
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(diag_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::DiagGradKernel,
+                   phi::dtype::float16,
+                   int,
+                   int64_t,
+                   float,
+                   double) {}
diff --git a/paddle/phi/kernels/gpu/diag_kernel.cu b/paddle/phi/kernels/gpu/diag_kernel.cu
index fc70639787173d84b69262245dbb0500aa179a90..95d3d3365d91be61013e2016d06334f0498d866a 100644
--- a/paddle/phi/kernels/gpu/diag_kernel.cu
+++ b/paddle/phi/kernels/gpu/diag_kernel.cu
@@ -130,5 +130,12 @@ void DiagKernel(const Context& dev_ctx,
 
 }  // namespace phi
 
-PD_REGISTER_KERNEL(
-    diag, GPU, ALL_LAYOUT, phi::DiagKernel, int, int64_t, float, double) {}
+PD_REGISTER_KERNEL(diag,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::DiagKernel,
+                   phi::dtype::float16,
+                   int,
+                   int64_t,
+                   float,
+                   double) {}
diff --git a/paddle/phi/kernels/gpu/dist_grad_kernel.cu b/paddle/phi/kernels/gpu/dist_grad_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..c458f8cce3e0a1f9e74f018fe112555d6c0689e0
--- /dev/null
+++ b/paddle/phi/kernels/gpu/dist_grad_kernel.cu
@@ -0,0 +1,26 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/phi/kernels/dist_grad_kernel.h"
+#include "paddle/phi/kernels/impl/dist_grad_kernel_impl.h"
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+#ifdef PADDLE_WITH_HIP
+PD_REGISTER_KERNEL(dist_grad, GPU, ALL_LAYOUT, phi::DistGradKernel, float) {}
+#else
+PD_REGISTER_KERNEL(
+    dist_grad, GPU, ALL_LAYOUT, phi::DistGradKernel, float, double) {}
+#endif
diff --git a/paddle/fluid/operators/dist_op.cu b/paddle/phi/kernels/gpu/dist_kernel.cu
similarity index 51%
rename from paddle/fluid/operators/dist_op.cu
rename to paddle/phi/kernels/gpu/dist_kernel.cu
index 90674969e283f1cba816ad46802cdbf971bcc555..87e75e02754a8a39b617a0fdb28ffd8aa8c97ce3 100644
--- a/paddle/fluid/operators/dist_op.cu
+++ b/paddle/phi/kernels/gpu/dist_kernel.cu
@@ -1,4 +1,4 @@
-// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -12,21 +12,16 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/operators/dist_op.h"
+#include "paddle/phi/kernels/dist_kernel.h"
+#include "paddle/phi/kernels/impl/dist_kernel_impl.h"
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
 
-namespace ops = paddle::operators;
 #ifdef PADDLE_WITH_HIP
 // Eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h:922
 // do not support double in HIPCC platform (Eigen3 to be fixed)
-REGISTER_OP_CUDA_KERNEL(
-    dist, ops::DistKernel<paddle::platform::CUDADeviceContext, float>);
-REGISTER_OP_CUDA_KERNEL(
-    dist_grad, ops::DistGradKernel<paddle::platform::CUDADeviceContext, float>);
+PD_REGISTER_KERNEL(dist, GPU, ALL_LAYOUT, phi::DistKernel, float) {}
 #else
-REGISTER_OP_CUDA_KERNEL(
-    dist, ops::DistKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::DistKernel<paddle::platform::CUDADeviceContext, double>);
-REGISTER_OP_CUDA_KERNEL(
-    dist_grad, ops::DistGradKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::DistGradKernel<paddle::platform::CUDADeviceContext, double>);
+PD_REGISTER_KERNEL(dist, GPU, ALL_LAYOUT, phi::DistKernel, float, double) {}
 #endif
diff --git a/paddle/phi/kernels/gpu/dropout_grad_kernel.cu b/paddle/phi/kernels/gpu/dropout_grad_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..94d4942a41878f06fbe7a0ffa6e8cc2c3f42f159
--- /dev/null
+++ b/paddle/phi/kernels/gpu/dropout_grad_kernel.cu
@@ -0,0 +1,46 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/dropout_impl.cu.h"
+#include "paddle/phi/kernels/dropout_grad_kernel.h"
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void DropoutGradRawKernel(const Context& dev_ctx,
+                          const DenseTensor& mask,
+                          const DenseTensor& out_grad,
+                          float p,
+                          bool is_test,
+                          const std::string& mode,
+                          DenseTensor* x_grad) {
+  x_grad->mutable_data<T>(dev_ctx.GetPlace());
+  auto size = x_grad->numel();
+  paddle::operators::DropoutGradGPUKernelDriver<T>(
+      dev_ctx, mode, p, out_grad, mask, size, x_grad, is_test);
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(dropout_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::DropoutGradRawKernel,
+                   float,
+                   double,
+                   phi::dtype::bfloat16,
+                   phi::dtype::float16) {}
diff --git a/paddle/phi/kernels/gpu/dropout_kernel.cu b/paddle/phi/kernels/gpu/dropout_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..bd1683ad0c7d8c9a91721a15ae0c24046aab414e
--- /dev/null
+++ b/paddle/phi/kernels/gpu/dropout_kernel.cu
@@ -0,0 +1,61 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/dropout_impl.cu.h"
+#include "paddle/phi/kernels/dropout_kernel.h"
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void DropoutRawKernel(const Context& dev_ctx,
+                      const DenseTensor& x,
+                      paddle::optional<const DenseTensor&> seed_tensor,
+                      float p,
+                      bool is_test,
+                      const std::string& mode,
+                      int seed,
+                      bool fix_seed,
+                      DenseTensor* out,
+                      DenseTensor* mask) {
+  out->mutable_data<T>(dev_ctx.GetPlace());
+  float dropout_prob = p;
+  bool upscale_in_train = (mode == "upscale_in_train");
+  mask->mutable_data<uint8_t>(dev_ctx.GetPlace());
+
+  paddle::operators::DropoutFwGPUKernelDriver<T>(dev_ctx,
+                                                 is_test,
+                                                 mode,
+                                                 dropout_prob,
+                                                 upscale_in_train,
+                                                 fix_seed,
+                                                 seed,
+                                                 x,
+                                                 seed_tensor.get_ptr(),
+                                                 mask,
+                                                 out);
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(dropout,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::DropoutRawKernel,
+                   float,
+                   double,
+                   phi::dtype::bfloat16,
+                   phi::dtype::float16) {}
diff --git a/paddle/phi/kernels/gpu/eigh_grad_kernel.cu b/paddle/phi/kernels/gpu/eigh_grad_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..5e33966055ea07d9b70227a5ed4760ad3b21e1a8
--- /dev/null
+++ b/paddle/phi/kernels/gpu/eigh_grad_kernel.cu
@@ -0,0 +1,28 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/eigh_grad_kernel.h"
+#include "paddle/phi/kernels/impl/eigh_grad_kernel_impl.h"
+
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/complex_functors.h"
+
+PD_REGISTER_KERNEL(eigh_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::EighGradKernel,
+                   float,
+                   double,
+                   phi::dtype::complex<float>,
+                   phi::dtype::complex<double>) {}
diff --git a/paddle/phi/kernels/gpu/eigh_kernel.cu b/paddle/phi/kernels/gpu/eigh_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..4ff3b371b6a01a5f311f13d839a0d9abef9f5b68
--- /dev/null
+++ b/paddle/phi/kernels/gpu/eigh_kernel.cu
@@ -0,0 +1,48 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef PADDLE_WITH_HIP
+// HIP not support cusolver
+
+#include "paddle/phi/kernels/eigh_kernel.h"
+#include "paddle/phi/kernels/funcs/values_vectors_functor.h"
+
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/complex_functors.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void EighKernel(const Context& dev_ctx,
+                const DenseTensor& x,
+                const std::string& uplo,
+                DenseTensor* out_w,
+                DenseTensor* out_v) {
+  bool is_lower = (uplo == "L");
+  phi::funcs::MatrixEighFunctor<Context, T> functor;
+  functor(dev_ctx, x, out_w, out_v, is_lower, true);
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(eigh,  // cuda_only
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::EighKernel,
+                   float,
+                   double,
+                   phi::dtype::complex<float>,
+                   phi::dtype::complex<double>) {}
+
+#endif  // not PADDLE_WITH_HIP
diff --git a/paddle/phi/kernels/gpu/elementwise_grad.h b/paddle/phi/kernels/gpu/elementwise_grad.h
index b17196b6b11566927a02b81aa7447ba77c1884ff..e5432b5f9187ca25fb6ee48ee44101eb244d4ec6 100644
--- a/paddle/phi/kernels/gpu/elementwise_grad.h
+++ b/paddle/phi/kernels/gpu/elementwise_grad.h
@@ -14,12 +14,96 @@ limitations under the License. */
 
 #pragma once
 
+#include "paddle/phi/common/place.h"
 #include "paddle/phi/kernels/copy_kernel.h"
+#include "paddle/phi/kernels/funcs/broadcast_function.h"
 #include "paddle/phi/kernels/funcs/elementwise_grad_base.h"
-#include "paddle/phi/kernels/gpu/reduce.h"
+#include "paddle/phi/kernels/funcs/reduce_function.h"
 
 namespace phi {
 
+template <typename T>
+void ReduceWrapper(const GPUContext &dev_ctx,
+                   int axis,
+                   DenseTensor *src,
+                   DenseTensor *dst) {
+  std::vector<int> reduce_dims =
+      funcs::GetReduceDim(dst->dims(), src->dims(), axis);
+  funcs::ReduceKernel<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
+      dev_ctx, *src, dst, kps::IdentityFunctor<T>(), reduce_dims);
+}
+
+template <ElementwiseType ET, typename T, typename Functor>
+void GetGradXAndYOut(const GPUContext &dev_ctx,
+                     const Place &place,
+                     int axis,
+                     std::vector<const DenseTensor *> ins,
+                     const DenseTensor &dout,
+                     DenseTensor *dx,
+                     DenseTensor *dy,
+                     Functor func) {
+  DenseTensor tmp_dx;
+  DenseTensor tmp_dy;
+  dev_ctx.Alloc<T>(dx);
+  dev_ctx.Alloc<T>(dy);
+  std::vector<DenseTensor *> outs;
+  if (dx->dims() == dout.dims() && dy->dims() == dout.dims()) {
+    outs = {dx, dy};
+  } else if (dx->dims() != dout.dims() && dy->dims() == dout.dims()) {
+    tmp_dx.Resize(dout.dims());
+    dev_ctx.Alloc<T>(&tmp_dx);
+    outs = {&tmp_dx, dy};
+  } else if (dx->dims() == dout.dims() && dy->dims() != dout.dims()) {
+    tmp_dy.Resize(dout.dims());
+    dev_ctx.Alloc<T>(&tmp_dy);
+    outs = {dx, &tmp_dy};
+  } else if (dx->dims() != dout.dims() && dy->dims() != dout.dims()) {
+    tmp_dy.Resize(dout.dims());
+    dev_ctx.Alloc<T>(&tmp_dy);
+    tmp_dx.Resize(dout.dims());
+    dev_ctx.Alloc<T>(&tmp_dx);
+    outs = {&tmp_dx, &tmp_dy};
+  }
+
+  funcs::BroadcastKernel<ET, T, T, decltype(func), 2>(
+      dev_ctx, ins, &outs, axis, func);
+
+  if (dx->dims() != dout.dims() && dy->dims() == dout.dims()) {
+    ReduceWrapper<T>(dev_ctx, axis, &tmp_dx, dx);
+  } else if (dx->dims() == dout.dims() && dy->dims() != dout.dims()) {
+    ReduceWrapper<T>(dev_ctx, axis, &tmp_dy, dy);
+  } else if (dx->dims() != dout.dims() && dy->dims() != dout.dims()) {
+    ReduceWrapper<T>(dev_ctx, axis, &tmp_dx, dx);
+    ReduceWrapper<T>(dev_ctx, axis, &tmp_dy, dy);
+  }
+}
+
+template <ElementwiseType ET, typename T, typename Functor>
+void GetGradXOrYOut(const GPUContext &dev_ctx,
+                    const Place &place,
+                    int axis,
+                    std::vector<const DenseTensor *> ins,
+                    const DenseTensor &dout,
+                    DenseTensor *dxy,
+                    Functor func) {
+  DenseTensor tmp_dxy;
+  dev_ctx.Alloc<T>(dxy);
+
+  std::vector<DenseTensor *> outs;
+  if (dxy->dims() != dout.dims()) {
+    tmp_dxy.Resize(dout.dims());
+    dev_ctx.Alloc<T>(&tmp_dxy);
+    outs = {&tmp_dxy};
+  } else {
+    outs = {dxy};
+  }
+
+  funcs::BroadcastKernel<ET, T, T>(dev_ctx, ins, &outs, axis, func);
+  if (dxy->dims() != dout.dims()) {
+    ReduceWrapper<T>(dev_ctx, axis, &tmp_dxy, dxy);
+  }
+}
+
 /*
 ******************************
     Add Grad
@@ -83,9 +167,8 @@ void DefaultElementwiseAddGrad(const GPUContext &ctx,
       }
       std::vector<int> reduce_dims =
           funcs::GetReduceDim(x.dims(), out.dims(), axis);
-      gpuStream_t stream = ctx.stream();
-      kernels::TensorReduceImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
-          ctx, dout, dx, kps::IdentityFunctor<T>(), reduce_dims, stream);
+      funcs::ReduceKernel<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
+          ctx, dout, dx, kps::IdentityFunctor<T>(), reduce_dims);
     }
   }
   // dy
@@ -98,9 +181,8 @@ void DefaultElementwiseAddGrad(const GPUContext &ctx,
     } else {
       std::vector<int> reduce_dims =
           funcs::GetReduceDim(y.dims(), out.dims(), axis);
-      gpuStream_t stream = ctx.stream();
-      kernels::TensorReduceImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
-          ctx, dout, dy, kps::IdentityFunctor<T>(), reduce_dims, stream);
+      funcs::ReduceKernel<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
+          ctx, dout, dy, kps::IdentityFunctor<T>(), reduce_dims);
     }
   }
 }
@@ -196,9 +278,8 @@ void default_elementwise_sub_grad(const GPUContext &ctx,
       }
       std::vector<int> reduce_dims =
           funcs::GetReduceDim(x.dims(), out.dims(), axis);
-      gpuStream_t stream = ctx.stream();
-      kernels::TensorReduceImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
-          ctx, dout, dx, kps::IdentityFunctor<T>(), reduce_dims, stream);
+      funcs::ReduceKernel<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
+          ctx, dout, dx, kps::IdentityFunctor<T>(), reduce_dims);
     }
   }
   // dy
@@ -217,9 +298,8 @@ void default_elementwise_sub_grad(const GPUContext &ctx,
     } else {
       std::vector<int> reduce_dims =
           funcs::GetReduceDim(y.dims(), out.dims(), axis);
-      gpuStream_t stream = ctx.stream();
-      kernels::TensorReduceImpl<T, T, kps::AddFunctor, kps::InverseFunctor<T>>(
-          ctx, dout, dy, kps::InverseFunctor<T>(), reduce_dims, stream);
+      funcs::ReduceKernel<T, T, kps::AddFunctor, kps::InverseFunctor<T>>(
+          ctx, dout, dy, kps::InverseFunctor<T>(), reduce_dims);
     }
   }
 }
@@ -243,4 +323,78 @@ void elementwise_sub_grad(const GPUContext &ctx,
       dx->mutable_data<T>(ctx.GetPlace()),
       dy->mutable_data<T>(ctx.GetPlace()));
 }
+/*
+******************************
+    Div Grad
+******************************
+*/
+template <typename T>
+void ElementwiseDivGrad(const GPUContext &dev_ctx,
+                        const DenseTensor &x,
+                        const DenseTensor &y,
+                        const DenseTensor &out,
+                        const DenseTensor &dout,
+                        DenseTensor *dx,
+                        DenseTensor *dy,
+                        int axis = -1) {
+  const auto place = dev_ctx.GetPlace();
+  if (dx != nullptr && dy != nullptr) {
+    std::vector<const DenseTensor *> ins = {&dout, &out, &y};
+    GetGradXAndYOut<ElementwiseType::kTernary, T>(
+        dev_ctx,
+        place,
+        axis,
+        ins,
+        dout,
+        dx,
+        dy,
+        funcs::DivGradXYFunctor<T, T>());
+  } else if (dx != nullptr && dy == nullptr) {
+    std::vector<const DenseTensor *> ins = {&dout, &y};
+    GetGradXOrYOut<ElementwiseType::kBinary, T>(
+        dev_ctx, place, axis, ins, dout, dx, funcs::DivGradXFunctor<T>());
+  } else if (dy != nullptr && dx == nullptr) {
+    std::vector<const DenseTensor *> ins = {&dout, &out, &y};
+    GetGradXOrYOut<ElementwiseType::kTernary, T>(
+        dev_ctx, place, axis, ins, dout, dy, funcs::DivGradYFunctor<T>());
+  }
+}
+
+/*
+******************************
+    Mul Grad
+******************************
+*/
+
+template <typename T>
+void ElementwiseMulGrad(const GPUContext &dev_ctx,
+                        const DenseTensor &x,
+                        const DenseTensor &y,
+                        const DenseTensor &dout,
+                        DenseTensor *dx,
+                        DenseTensor *dy,
+                        int axis) {
+  const auto place = dev_ctx.GetPlace();
+
+  if (dx != nullptr && dy != nullptr) {
+    std::vector<const DenseTensor *> ins = {&dout, &y, &x};
+    GetGradXAndYOut<ElementwiseType::kTernary, T>(
+        dev_ctx,
+        place,
+        axis,
+        ins,
+        dout,
+        dx,
+        dy,
+        funcs::MultiplyGradXYFunctor<T, T>());
+  } else if (dx != nullptr && dy == nullptr) {
+    std::vector<const DenseTensor *> ins = {&dout, &y};
+    GetGradXOrYOut<ElementwiseType::kBinary, T>(
+        dev_ctx, place, axis, ins, dout, dx, funcs::MultiplyGradFunctor<T>());
+  } else if (dx == nullptr && dy != nullptr) {
+    std::vector<const DenseTensor *> ins = {&dout, &x};
+    GetGradXOrYOut<ElementwiseType::kBinary, T>(
+        dev_ctx, place, axis, ins, dout, dy, funcs::MultiplyGradFunctor<T>());
+  }
+}
 }  // namespace phi
diff --git a/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu b/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu
index d00888aee67019befa99d49cd093a4e171f941c2..c4481bf6ce3c33ea260d774d0ac240a166856388 100644
--- a/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu
@@ -15,9 +15,11 @@
 #include "paddle/phi/kernels/elementwise_grad_kernel.h"
 
 #include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/common/bfloat16.h"
+#include "paddle/phi/common/complex.h"
+#include "paddle/phi/common/float16.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/copy_kernel.h"
-#include "paddle/phi/kernels/funcs/elementwise_base.h"
 #include "paddle/phi/kernels/funcs/elementwise_functor.h"
 #include "paddle/phi/kernels/gpu/elementwise_grad.h"
 #include "paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h"
@@ -102,6 +104,50 @@ void SubtractDoubleGradKernel(const Context& dev_ctx,
   phi::SubtractDoubleGradImpl<T>(dev_ctx, y, ddx, ddy, dout, axis, ddout);
 }
 
+template <typename T, typename Context>
+void DivideGradKernel(const Context& dev_ctx,
+                      const DenseTensor& x,
+                      const DenseTensor& y,
+                      const DenseTensor& out,
+                      const DenseTensor& dout,
+                      int axis,
+                      DenseTensor* dx,
+                      DenseTensor* dy) {
+  const auto place = dev_ctx.GetPlace();
+  if (dx != nullptr && dy != nullptr) {
+    std::vector<const DenseTensor*> ins = {&dout, &out, &y};
+    GetGradXAndYOut<ElementwiseType::kTernary, T>(
+        dev_ctx,
+        place,
+        axis,
+        ins,
+        dout,
+        dx,
+        dy,
+        funcs::DivGradXYFunctor<T, T>());
+  } else if (dx != nullptr && dy == nullptr) {
+    std::vector<const DenseTensor*> ins = {&dout, &y};
+    GetGradXOrYOut<ElementwiseType::kBinary, T>(
+        dev_ctx, place, axis, ins, dout, dx, funcs::DivGradXFunctor<T>());
+  } else if (dy != nullptr && dx == nullptr) {
+    std::vector<const DenseTensor*> ins = {&dout, &out, &y};
+    GetGradXOrYOut<ElementwiseType::kTernary, T>(
+        dev_ctx, place, axis, ins, dout, dy, funcs::DivGradYFunctor<T>());
+  }
+}
+
+template <typename T, typename Context>
+void MultiplyGradKernel(const Context& dev_ctx,
+                        const DenseTensor& x,
+                        const DenseTensor& y,
+                        const DenseTensor& dout,
+                        int axis,
+                        DenseTensor* dx,
+                        DenseTensor* dy) {
+  funcs::ElementwiseGradPreProcess(dout, dx);
+  ElementwiseMulGrad<T>(dev_ctx, x, y, dout, dx, dy, axis);
+}
+
 }  // namespace phi
 
 PD_REGISTER_KERNEL(add_grad,
@@ -168,3 +214,88 @@ PD_REGISTER_KERNEL(subtract_double_grad,
                    phi::dtype::bfloat16,
                    phi::dtype::complex<float>,
                    phi::dtype::complex<double>) {}
+
+PD_REGISTER_KERNEL(divide_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::DivideGradKernel,
+                   float,
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16,
+                   double,
+                   int,
+                   int64_t,
+                   phi::dtype::complex<float>,
+                   phi::dtype::complex<double>) {}
+
+PD_REGISTER_KERNEL(divide_double_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::DivideDoubleGradKernel,
+                   float,
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16,
+                   double,
+                   int,
+                   int64_t,
+                   phi::dtype::complex<float>,
+                   phi::dtype::complex<double>) {}
+
+PD_REGISTER_KERNEL(multiply_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::MultiplyGradKernel,
+                   float,
+                   phi::dtype::float16,
+                   double,
+                   int,
+                   int64_t,
+                   bool,
+                   phi::dtype::bfloat16,
+                   phi::dtype::complex<float>,
+                   phi::dtype::complex<double>) {}
+
+PD_REGISTER_KERNEL(multiply_double_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::MultiplyDoubleGradKernel,
+                   float,
+                   phi::dtype::float16,
+                   double,
+                   int,
+                   int64_t,
+                   bool,
+                   phi::dtype::bfloat16,
+                   phi::dtype::complex<float>,
+                   phi::dtype::complex<double>) {}
+
+PD_REGISTER_KERNEL(multiply_triple_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::MultiplyTripleGradKernel,
+                   float,
+                   phi::dtype::float16,
+                   double,
+                   int,
+                   int64_t,
+                   bool,
+                   phi::dtype::bfloat16,
+                   phi::dtype::complex<float>,
+                   phi::dtype::complex<double>) {}
+PD_REGISTER_KERNEL(elementwise_fmax_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::ElementwiseFMaxGradKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t) {}
+
+PD_REGISTER_KERNEL(elementwise_fmin_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::ElementwiseFMinGradKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t) {}
diff --git a/paddle/phi/kernels/gpu/elementwise_kernel.cu b/paddle/phi/kernels/gpu/elementwise_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..2cffc68fa0648937b96095f5bd58210adaf865b3
--- /dev/null
+++ b/paddle/phi/kernels/gpu/elementwise_kernel.cu
@@ -0,0 +1,35 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/elementwise_kernel_impl.h"
+
+PD_REGISTER_KERNEL(elementwise_fmax,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::ElementwiseFMaxKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t) {}
+
+PD_REGISTER_KERNEL(elementwise_fmin,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::ElementwiseFMinKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t) {}
diff --git a/paddle/fluid/operators/maxout_op.cu.cc b/paddle/phi/kernels/gpu/erf_grad_kernel.cu
similarity index 50%
rename from paddle/fluid/operators/maxout_op.cu.cc
rename to paddle/phi/kernels/gpu/erf_grad_kernel.cu
index be1e81bb869a3a5144b72ef54af22f75b2146bc5..a06863b0a877714bc241d714429a8114f2b0d6f7 100644
--- a/paddle/fluid/operators/maxout_op.cu.cc
+++ b/paddle/phi/kernels/gpu/erf_grad_kernel.cu
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -12,13 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/maxout_op.h"
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/common/float16.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/erf_grad_kernel.h"
+#include "paddle/phi/kernels/impl/erf_grad_kernel_impl.h"
 
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    maxout, ops::MaxOutKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::MaxOutKernel<paddle::platform::CUDADeviceContext, double>);
-REGISTER_OP_CUDA_KERNEL(
-    maxout_grad,
-    ops::MaxOutGradKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::MaxOutGradKernel<paddle::platform::CUDADeviceContext, double>);
+PD_REGISTER_KERNEL(erf_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::ErfGradKernel,
+                   float,
+                   double,
+                   phi::dtype::float16) {}
diff --git a/paddle/phi/kernels/gpu/erf_kernel.cu b/paddle/phi/kernels/gpu/erf_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..8e741be3345fc1d487e1a4148d96f0430f5978f0
--- /dev/null
+++ b/paddle/phi/kernels/gpu/erf_kernel.cu
@@ -0,0 +1,22 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/common/float16.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/erf_kernel.h"
+#include "paddle/phi/kernels/impl/erf_kernel_impl.h"
+
+PD_REGISTER_KERNEL(
+    erf, GPU, ALL_LAYOUT, phi::ErfKernel, float, double, phi::dtype::float16) {}
diff --git a/paddle/phi/kernels/gpu/expand_as_grad_kernel.cu b/paddle/phi/kernels/gpu/expand_as_grad_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..273851cfd8b34917d8bea6eeafa8a70fe2ae5ba2
--- /dev/null
+++ b/paddle/phi/kernels/gpu/expand_as_grad_kernel.cu
@@ -0,0 +1,28 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/expand_as_grad_kernel.h"
+#include "paddle/phi/kernels/impl/expand_as_grad_kernel_impl.h"
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+PD_REGISTER_KERNEL(expand_as_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::ExpandAsGradKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t) {}
diff --git a/paddle/phi/kernels/gpu/expand_as_kernel.cu b/paddle/phi/kernels/gpu/expand_as_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..0972eebeabf1832d061160d5910d10292f2638ec
--- /dev/null
+++ b/paddle/phi/kernels/gpu/expand_as_kernel.cu
@@ -0,0 +1,29 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/expand_as_kernel.h"
+#include "paddle/phi/kernels/impl/expand_as_kernel_impl.h"
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+PD_REGISTER_KERNEL(expand_as,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::ExpandAsKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t,
+                   bool) {}
diff --git a/paddle/phi/kernels/gpu/full_kernel.cu b/paddle/phi/kernels/gpu/full_kernel.cu
index 1f756bfdbed30a18aebfabfb0810436406c87204..852d209ee018598285b3dff35ffd51e289f07976 100644
--- a/paddle/phi/kernels/gpu/full_kernel.cu
+++ b/paddle/phi/kernels/gpu/full_kernel.cu
@@ -63,18 +63,30 @@ void FullLikeKernel(const Context& dev_ctx,
   auto value = val.to<float>();
   using CommonType = typename std::common_type<
       float,
-      typename std::conditional<std::is_same<T, phi::dtype::float16>::value,
-                                float,
-                                T>::type>::type;
+      typename std::conditional<
+          std::is_same<T, phi::dtype::float16>::value ||
+              std::is_same<T, phi::dtype::bfloat16>::value,
+          float,
+          T>::type>::type;
 
   auto common_type_value = static_cast<CommonType>(value);
 
-  PADDLE_ENFORCE_EQ(
-      (common_type_value >=
+  // Check whether the filled value is valid
+  bool is_out_range = true;
+  if (std::isinf(value) || std::isnan(value)) {
+    is_out_range = false;
+  }
+
+  if ((common_type_value >=
        static_cast<CommonType>(std::numeric_limits<T>::lowest())) &&
-          (common_type_value <=
-           static_cast<CommonType>(std::numeric_limits<T>::max())),
-      true,
+      (common_type_value <=
+       static_cast<CommonType>(std::numeric_limits<T>::max()))) {
+    is_out_range = false;
+  }
+
+  PADDLE_ENFORCE_EQ(
+      is_out_range,
+      false,
       phi::errors::InvalidArgument(
           "The filled value is out of range for target type, "
           "current kernel type is %s, the range should between %f "
@@ -110,6 +122,7 @@ PD_REGISTER_KERNEL(full,
                    int64_t,
                    bool,
                    phi::dtype::float16,
+                   phi::dtype::bfloat16,
                    phi::dtype::complex<float>,
                    phi::dtype::complex<double>) {}
 
@@ -123,6 +136,7 @@ PD_REGISTER_KERNEL(full_like,
                    int,
                    int64_t,
                    bool,
+                   phi::dtype::bfloat16,
                    phi::dtype::float16) {
   kernel->InputAt(0).SetBackend(phi::Backend::ALL_BACKEND);
 }
diff --git a/paddle/phi/kernels/gpu/gather_nd_grad_kernel.cu b/paddle/phi/kernels/gpu/gather_nd_grad_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..5273902804a200bdf36e8e36748639758145e742
--- /dev/null
+++ b/paddle/phi/kernels/gpu/gather_nd_grad_kernel.cu
@@ -0,0 +1,65 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/eigen/common.h"
+#include "paddle/phi/kernels/funcs/scatter.cu.h"
+#include "paddle/phi/kernels/gather_nd_grad_kernel.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void GatherNdGradKernel(const Context &ctx,
+                        const DenseTensor &x,
+                        const DenseTensor &index,
+                        const DenseTensor &out_grad,
+                        DenseTensor *x_grad) {
+  ctx.template Alloc<T>(x_grad);
+  auto dxt = phi::EigenVector<T>::Flatten(*x_grad);
+  auto &place = *ctx.eigen_device();
+  dxt.device(place) = dxt.constant(static_cast<T>(0));
+  if (out_grad.numel() == 0) return;
+
+  const auto &index_type = index.dtype();
+  bool index_type_match =
+      index_type == phi::DataType::INT32 || index_type == phi::DataType::INT64;
+
+  PADDLE_ENFORCE_EQ(
+      index_type_match,
+      true,
+      phi::errors::InvalidArgument("Index holds the wrong type, it holds [%s],"
+                                   "but desires to be [%s] or [%s].",
+                                   index_type,
+                                   phi::DataType::INT32,
+                                   phi::DataType::INT64));
+
+  if (index_type == phi::DataType::INT32) {
+    phi::funcs::GPUScatterNdAdd<T, int>(ctx, out_grad, index, x_grad);
+  } else if (index_type == phi::DataType::INT64) {
+    phi::funcs::GPUScatterNdAdd<T, int64_t>(ctx, out_grad, index, x_grad);
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(gather_nd_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::GatherNdGradKernel,
+                   float,
+                   double,
+                   int64_t,
+                   int,
+                   phi::dtype::float16) {}
diff --git a/paddle/phi/kernels/gpu/gather_nd_kernel.cu b/paddle/phi/kernels/gpu/gather_nd_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..33745ef5f07e82387edfabf89d527b5b698fbabe
--- /dev/null
+++ b/paddle/phi/kernels/gpu/gather_nd_kernel.cu
@@ -0,0 +1,60 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/gather.cu.h"
+#include "paddle/phi/kernels/funcs/scatter.cu.h"
+#include "paddle/phi/kernels/gather_nd_kernel.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void GatherNdKernel(const Context &ctx,
+                    const DenseTensor &x,
+                    const DenseTensor &index,
+                    DenseTensor *out) {
+  ctx.template Alloc<T>(out);
+  if (x.numel() == 0) return;
+  const auto &index_type = index.dtype();
+  bool index_type_match =
+      index_type == phi::DataType::INT32 || index_type == phi::DataType::INT64;
+  PADDLE_ENFORCE_EQ(index_type_match,
+                    true,
+                    phi::errors::InvalidArgument(
+                        "Index holds the wrong type, it holds [%s], but "
+                        "desires to be [%s] or [%s].",
+                        index_type,
+                        phi::DataType::INT32,
+                        phi::DataType::INT64));
+  if (index_type == phi::DataType::INT32) {
+    phi::funcs::GPUGatherNd<T, int>(ctx, x, index, out);
+  } else if (index_type == phi::DataType::INT64) {
+    phi::funcs::GPUGatherNd<T, int64_t>(ctx, x, index, out);
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(gather_nd,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::GatherNdKernel,
+                   float,
+                   double,
+                   int64_t,
+                   int,
+                   int16_t,
+                   bool,
+                   phi::dtype::float16) {}
diff --git a/paddle/phi/kernels/gpu/gather_tree_kernel.cu b/paddle/phi/kernels/gpu/gather_tree_kernel.cu
index a9e73ec37c8ed5f064144e27b06ac6304f5694b3..2906b81cb40096855fc990040f8d23b832f4da2e 100644
--- a/paddle/phi/kernels/gpu/gather_tree_kernel.cu
+++ b/paddle/phi/kernels/gpu/gather_tree_kernel.cu
@@ -12,10 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include "paddle/phi/kernels/gather_tree_kernel.h"
+
 #include <algorithm>
-#include "paddle/phi/core/device_context.h"
 #include "paddle/phi/core/kernel_registry.h"
-#include "paddle/phi/kernels/gather_tree_kernel.h"
 
 namespace phi {
 
diff --git a/paddle/phi/kernels/gpu/gaussian_random_kernel.cu b/paddle/phi/kernels/gpu/gaussian_random_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..e2fe2190c1ce0cebea1326e70ce8572ff9bc4d88
--- /dev/null
+++ b/paddle/phi/kernels/gpu/gaussian_random_kernel.cu
@@ -0,0 +1,117 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/gaussian_random_kernel.h"
+
+#include <thrust/device_vector.h>
+#include <thrust/host_vector.h>
+#include <thrust/random.h>
+#include <thrust/transform.h>
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/common/amp_type_traits.h"
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/distribution_helper.h"
+#include "paddle/phi/kernels/funcs/index_impl.cu.h"
+
+#include "paddle/fluid/framework/generator.h"
+
+DECLARE_bool(use_curand);
+
+namespace phi {
+
+template <typename T>
+struct GaussianGenerator {
+  T mean_, std_;
+  unsigned int seed_;
+  unsigned int offset_ = 0;
+
+  __host__ __device__ GaussianGenerator(T mean, T std, int seed)
+      : mean_(mean), std_(std), seed_(seed) {}
+
+  __host__ __device__ GaussianGenerator(T mean, T std, int seed, int offset)
+      : mean_(mean), std_(std), seed_(seed), offset_(offset) {}
+
+  __host__ __device__ T operator()(const unsigned int n) const {
+    thrust::minstd_rand rng;
+    rng.seed(seed_);
+    using MT = typename phi::dtype::MPTypeTrait<T>::Type;
+    thrust::normal_distribution<MT> dist(static_cast<MT>(mean_),
+                                         static_cast<MT>(std_));
+    unsigned int new_n = n + offset_;
+    rng.discard(new_n);
+    MT out = dist(rng);
+    return static_cast<T>(out);
+  }
+};
+
+template <typename T, typename Context>
+void GaussianRandomKernel(const Context& dev_ctx,
+                          const ScalarArray& shape,
+                          float mean,
+                          float std,
+                          int seed,
+                          DataType dtype,
+                          DenseTensor* out) {
+  auto tensor = out;
+
+  bool seed_flag = false;
+  if (seed == 0) {
+    std::random_device rd;
+    seed = rd();
+    seed_flag = true;
+  }
+
+  tensor->Resize(phi::make_ddim(shape.GetData()));
+
+  T* data = dev_ctx.template Alloc<T>(tensor);
+
+  int64_t size = tensor->numel();
+
+  int device_id = dev_ctx.GetPlace().GetDeviceId();
+  auto gen_cuda = paddle::framework::GetDefaultCUDAGenerator(device_id);
+
+  if (gen_cuda->GetIsInitPy() && seed_flag) {
+    if (FLAGS_use_curand) {
+      using MT = typename phi::dtype::MPTypeTrait<T>::Type;
+      funcs::normal_distribution<MT> dist;
+      funcs::normal_transform<MT> trans(static_cast<MT>(mean),
+                                        static_cast<MT>(std));
+      funcs::distribution_and_transform<T>(dev_ctx, tensor, dist, trans);
+    } else {
+      auto seed_offset = gen_cuda->IncrementOffset(1);
+      int64_t gen_offset = size * seed_offset.second;
+      auto func = GaussianGenerator<T>(static_cast<T>(mean),
+                                       static_cast<T>(std),
+                                       seed_offset.first,
+                                       gen_offset);
+      IndexKernel<T, GaussianGenerator<T>>(dev_ctx, tensor, func);
+    }
+  } else {
+    auto func =
+        GaussianGenerator<T>(static_cast<T>(mean), static_cast<T>(std), seed);
+    IndexKernel<T, GaussianGenerator<T>>(dev_ctx, tensor, func);
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(gaussian_random,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::GaussianRandomKernel,
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16,
+                   float,
+                   double) {}
diff --git a/paddle/phi/kernels/gpu/graph_send_recv_funcs.h b/paddle/phi/kernels/gpu/graph_send_recv_funcs.h
new file mode 100644
index 0000000000000000000000000000000000000000..1eab521170bc5d8dde3c89102a45e84bd2f73974
--- /dev/null
+++ b/paddle/phi/kernels/gpu/graph_send_recv_funcs.h
@@ -0,0 +1,171 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include "paddle/phi/kernels/graph_send_recv_kernel.h"
+
+#include <thrust/device_vector.h>
+#include <thrust/fill.h>
+#include <algorithm>
+#include <vector>
+
+#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/hostdevice.h"
+
+namespace phi {
+
+template <typename T, typename IndexT>
+struct GraphSendRecvSumCUDAFunctor {
+  DEVICE inline void operator()(const T* params,
+                                T* output,
+                                const IndexT& in_i,
+                                const IndexT& out_i) {
+    paddle::platform::CudaAtomicAdd(output + out_i, *(params + in_i));
+  }
+};
+
+template <typename T, typename IndexT>
+struct GraphSendRecvMaxCUDAFunctor {
+  DEVICE inline void operator()(const T* params,
+                                T* output,
+                                const IndexT& in_i,
+                                const IndexT& out_i) {
+    paddle::platform::CudaAtomicMax(output + out_i, *(params + in_i));
+  }
+};
+
+template <typename T, typename IndexT>
+struct GraphSendRecvMinCUDAFunctor {
+  DEVICE inline void operator()(const T* params,
+                                T* output,
+                                const IndexT& in_i,
+                                const IndexT& out_i) {
+    paddle::platform::CudaAtomicMin(output + out_i, *(params + in_i));
+  }
+};
+
+template <typename T, typename IndexT, typename Functor>
+__global__ void GraphSendRecvCUDAKernel(const T* params,
+                                        const IndexT* src_indices,
+                                        const IndexT* dst_indices,
+                                        T* output,
+                                        size_t index_size,
+                                        size_t slice_size,
+                                        Functor functor) {
+  CUDA_KERNEL_LOOP_TYPE(i, index_size * slice_size, int64_t) {
+    int64_t indices_i = i / slice_size;
+    int64_t slice_i = i - indices_i * slice_size;
+    IndexT src_i = src_indices[indices_i];
+    IndexT dst_i = dst_indices[indices_i];
+    int64_t in_i = src_i * slice_size + slice_i;
+    int64_t out_i = dst_i * slice_size + slice_i;
+    functor(params, output, in_i, out_i);
+  }
+}
+
+// For max
+template <typename T>
+__global__ void InputResetMaxCUDAKernel(T* output,
+                                        size_t input_size,
+                                        size_t slice_size) {
+  CUDA_KERNEL_LOOP_TYPE(i, input_size * slice_size, int64_t) {
+    if (*(output + i) == std::numeric_limits<T>::min()) {
+      *(output + i) = 0;
+    }
+  }
+}
+
+// For min
+template <typename T>
+__global__ void InputResetMinCUDAKernel(T* output,
+                                        size_t input_size,
+                                        size_t slice_size) {
+  CUDA_KERNEL_LOOP_TYPE(i, input_size * slice_size, int64_t) {
+    if (*(output + i) == std::numeric_limits<T>::max()) {
+      *(output + i) = 0;
+    }
+  }
+}
+
+// Get dst_count
+template <typename T, typename IndexT>
+__global__ void ComputeCountCUDAKernel(int32_t* count,
+                                       const IndexT* dst_indices,
+                                       size_t index_size) {
+  CUDA_KERNEL_LOOP_TYPE(i, index_size, int64_t) {
+    IndexT dst_i = dst_indices[i];
+    paddle::platform::CudaAtomicAdd(count + dst_i, 1);
+  }
+}
+
+// For forward mean
+template <typename T>
+__global__ void ManipulateMeanCUDAKernel(T* output,
+                                         int32_t* count,
+                                         size_t input_size,
+                                         size_t slice_size) {
+  CUDA_KERNEL_LOOP_TYPE(i, input_size * slice_size, int64_t) {
+    int64_t c_index = i / slice_size;
+    if (*(count + c_index) > 1) {
+      *(output + i) = *(output + i) / *(count + c_index);
+    }
+  }
+}
+
+// For backward mean
+template <typename T, typename IndexT>
+__global__ void ManipulateMeanGradCUDAKernel(const T* params,
+                                             const IndexT* src_indices,
+                                             const IndexT* dst_indices,
+                                             T* output,
+                                             size_t index_size,
+                                             size_t slice_size,
+                                             const int32_t* dst_count) {
+  CUDA_KERNEL_LOOP_TYPE(i, index_size * slice_size, int64_t) {
+    int64_t indices_i = i / slice_size;
+    int64_t slice_i = i - indices_i * slice_size;
+    IndexT src_i = src_indices[indices_i];
+    IndexT dst_i = dst_indices[indices_i];
+    int64_t in_i = src_i * slice_size + slice_i;
+    int64_t out_i = dst_i * slice_size + slice_i;
+    paddle::platform::CudaAtomicAdd(output + out_i,
+                                    *(params + in_i) / dst_count[src_i]);
+  }
+}
+
+// For backward min and max
+template <typename T, typename IndexT>
+__global__ void ManipulateMinMaxGradCUDAKernel(const T* params,
+                                               const IndexT* src_indices,
+                                               const IndexT* dst_indices,
+                                               T* output,
+                                               size_t index_size,
+                                               size_t slice_size,
+                                               const T* ptr_input,
+                                               const T* ptr_output) {
+  CUDA_KERNEL_LOOP_TYPE(i, index_size * slice_size, int64_t) {
+    int64_t indices_i = i / slice_size;
+    int64_t slice_i = i - indices_i * slice_size;
+    IndexT src_i = src_indices[indices_i];
+    IndexT dst_i = dst_indices[indices_i];
+    int64_t in_i = src_i * slice_size + slice_i;
+    int64_t out_i = dst_i * slice_size + slice_i;
+    paddle::platform::CudaAtomicAdd(
+        output + out_i,
+        *(params + in_i) * (*(ptr_input + out_i) == *(ptr_output + in_i)));
+  }
+}
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/gpu/graph_send_recv_grad_kernel.cu b/paddle/phi/kernels/gpu/graph_send_recv_grad_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..75692966b4662c12feaefdf5bfc7311ed9d628b8
--- /dev/null
+++ b/paddle/phi/kernels/gpu/graph_send_recv_grad_kernel.cu
@@ -0,0 +1,148 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/gpu/graph_send_recv_funcs.h"
+#include "paddle/phi/kernels/graph_send_recv_grad_kernel.h"
+
+#include <algorithm>
+#include <vector>
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/hostdevice.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+namespace phi {
+
+template <typename Context, typename T, typename IndexT>
+void GraphSendRecvGradOpCUDAKernelLaunchHelper(
+    const Context& ctx,
+    const DenseTensor& out_grad,
+    const DenseTensor& src_index,
+    const DenseTensor& dst_index,
+    const std::string& pool_type,
+    DenseTensor* x_grad,
+    const DenseTensor* dst_count = nullptr,
+    const DenseTensor* x = nullptr,
+    const DenseTensor* out = nullptr) {
+  const int& index_size = dst_index.dims()[0];
+
+  ctx.template Alloc<T>(x_grad);
+  T* p_output = x_grad->data<T>();
+
+  const auto& src_dims = out_grad.dims();
+  int64_t memset_size = 1;
+  for (int i = 0; i < src_dims.size(); ++i) {
+    memset_size *= src_dims[i];
+  }
+  const size_t& memset_bytes = memset_size * sizeof(T);
+
+#ifdef PADDLE_WITH_HIP
+  hipMemset(p_output, 0, memset_bytes);
+#else
+  cudaMemset(p_output, 0, memset_bytes);
+#endif
+
+  if (index_size == 0) return;
+
+  int64_t slice_size = 1;
+  for (int i = 1; i < src_dims.size(); ++i) {
+    slice_size *= src_dims[i];
+  }
+  const T* p_src = out_grad.data<T>();
+  const IndexT* s_index = src_index.data<IndexT>();
+  const IndexT* d_index = dst_index.data<IndexT>();
+
+#ifdef PADDLE_WITH_HIP
+  int block = 256;
+#else
+  int block = 1024;
+#endif
+  int64_t n = slice_size * index_size;
+  int64_t max_grid_dimx = ctx.GetCUDAMaxGridDimSize()[0];
+  int64_t grid_tmp = (n + block - 1) / block;
+  int64_t grid = grid_tmp < max_grid_dimx ? grid_tmp : max_grid_dimx;
+  int64_t input_size = src_dims[0];
+  if (pool_type == "SUM") {
+    GraphSendRecvSumCUDAFunctor<T, IndexT> functor;
+    GraphSendRecvCUDAKernel<
+        T,
+        IndexT,
+        GraphSendRecvSumCUDAFunctor<T,
+                                    IndexT>><<<grid, block, 0, ctx.stream()>>>(
+        p_src, d_index, s_index, p_output, index_size, slice_size, functor);
+  } else if (pool_type == "MEAN") {
+    const int32_t* s_count = dst_count->data<int32_t>();
+    ManipulateMeanGradCUDAKernel<T, IndexT><<<grid, block, 0, ctx.stream()>>>(
+        p_src, d_index, s_index, p_output, index_size, slice_size, s_count);
+  } else if (pool_type == "MAX" || pool_type == "MIN") {
+    const T* ptr_input = x->data<T>();
+    const T* ptr_output = out->data<T>();
+    ManipulateMinMaxGradCUDAKernel<T, IndexT><<<grid, block, 0, ctx.stream()>>>(
+        p_src,
+        d_index,
+        s_index,
+        p_output,
+        index_size,
+        slice_size,
+        ptr_input,
+        ptr_output);
+  }
+}
+
+template <typename T, typename Context>
+void GraphSendRecvGradKernel(const Context& ctx,
+                             const DenseTensor& out_grad,
+                             paddle::optional<const DenseTensor&> x,
+                             paddle::optional<const DenseTensor&> out,
+                             const DenseTensor& src_index,
+                             const DenseTensor& dst_index,
+                             paddle::optional<const DenseTensor&> dst_count,
+                             const std::string& pool_type,
+                             DenseTensor* x_grad) {
+  auto index_type = src_index.dtype();
+  if (index_type == phi::DataType::INT32) {
+    GraphSendRecvGradOpCUDAKernelLaunchHelper<Context, T, int32_t>(
+        ctx,
+        out_grad,
+        src_index,
+        dst_index,
+        pool_type,
+        x_grad,
+        dst_count.get_ptr(),
+        x.get_ptr(),
+        out.get_ptr());
+  } else if (index_type == phi::DataType::INT64) {
+    GraphSendRecvGradOpCUDAKernelLaunchHelper<Context, T, int64_t>(
+        ctx,
+        out_grad,
+        src_index,
+        dst_index,
+        pool_type,
+        x_grad,
+        dst_count.get_ptr(),
+        x.get_ptr(),
+        out.get_ptr());
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(graph_send_recv_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::GraphSendRecvGradKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t) {}
diff --git a/paddle/phi/kernels/gpu/graph_send_recv_kernel.cu b/paddle/phi/kernels/gpu/graph_send_recv_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..fab306f831a6f401b858b0dfe11011463827a50b
--- /dev/null
+++ b/paddle/phi/kernels/gpu/graph_send_recv_kernel.cu
@@ -0,0 +1,179 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/gpu/graph_send_recv_funcs.h"
+#include "paddle/phi/kernels/graph_send_recv_kernel.h"
+
+#include <thrust/device_vector.h>
+#include <thrust/fill.h>
+#include <algorithm>
+#include <vector>
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/hostdevice.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+namespace phi {
+
+template <typename Context, typename T, typename IndexT>
+void GraphSendRecvOpCUDAKernelLaunchHelper(const Context& ctx,
+                                           const DenseTensor& x,
+                                           const DenseTensor& src_index,
+                                           const DenseTensor& dst_index,
+                                           const std::string& pool_type,
+                                           DenseTensor* out,
+                                           DenseTensor* dst_count = nullptr) {
+  const int& index_size = src_index.dims()[0];
+  ctx.template Alloc<T>(out);
+  T* p_output = out->data<T>();
+  const auto& src_dims = x.dims();
+  int64_t memset_size = 1;
+  for (int i = 0; i < src_dims.size(); ++i) {
+    memset_size *= src_dims[i];
+  }
+  const size_t& memset_bytes = memset_size * sizeof(T);
+  if (pool_type == "SUM" || pool_type == "MEAN") {
+#ifdef PADDLE_WITH_HIP
+    hipMemset(p_output, 0, memset_bytes);
+#else
+    cudaMemset(p_output, 0, memset_bytes);
+#endif
+  } else if (pool_type == "MAX") {
+    thrust::device_ptr<T> p_output_ptr(p_output);
+    thrust::fill(thrust::device,
+                 p_output_ptr,
+                 p_output_ptr + memset_size,
+                 std::numeric_limits<T>::min());
+  } else if (pool_type == "MIN") {
+    thrust::device_ptr<T> p_output_ptr(p_output);
+    thrust::fill(thrust::device,
+                 p_output_ptr,
+                 p_output_ptr + memset_size,
+                 std::numeric_limits<T>::max());
+  }
+
+  if (index_size == 0) return;
+
+  int64_t slice_size = 1;
+  for (int i = 1; i < src_dims.size(); ++i) {
+    slice_size *= src_dims[i];
+  }
+  const T* p_src = x.data<T>();
+  const IndexT* s_index = src_index.data<IndexT>();
+  const IndexT* d_index = dst_index.data<IndexT>();
+
+#ifdef PADDLE_WITH_HIP
+  int block = 256;
+#else
+  int block = 1024;
+#endif
+  int64_t n = slice_size * index_size;
+  int64_t max_grid_dimx = ctx.GetCUDAMaxGridDimSize()[0];
+  int64_t grid_tmp = (n + block - 1) / block;
+  int64_t grid = grid_tmp < max_grid_dimx ? grid_tmp : max_grid_dimx;
+  int64_t input_size = src_dims[0];
+  if (pool_type == "SUM") {
+    GraphSendRecvSumCUDAFunctor<T, IndexT> functor;
+    GraphSendRecvCUDAKernel<
+        T,
+        IndexT,
+        GraphSendRecvSumCUDAFunctor<T,
+                                    IndexT>><<<grid, block, 0, ctx.stream()>>>(
+        p_src, s_index, d_index, p_output, index_size, slice_size, functor);
+  } else if (pool_type == "MAX") {
+    GraphSendRecvMaxCUDAFunctor<T, IndexT> functor;
+    GraphSendRecvCUDAKernel<
+        T,
+        IndexT,
+        GraphSendRecvMaxCUDAFunctor<T,
+                                    IndexT>><<<grid, block, 0, ctx.stream()>>>(
+        p_src, s_index, d_index, p_output, index_size, slice_size, functor);
+
+    int64_t grid_max_tmp = (input_size * slice_size + block - 1) / block;
+    int64_t grid_max =
+        grid_max_tmp < max_grid_dimx ? grid_max_tmp : max_grid_dimx;
+    InputResetMaxCUDAKernel<T><<<grid_max, block, 0, ctx.stream()>>>(
+        p_output, input_size, slice_size);
+  } else if (pool_type == "MIN") {
+    GraphSendRecvMinCUDAFunctor<T, IndexT> functor;
+    GraphSendRecvCUDAKernel<
+        T,
+        IndexT,
+        GraphSendRecvMinCUDAFunctor<T,
+                                    IndexT>><<<grid, block, 0, ctx.stream()>>>(
+        p_src, s_index, d_index, p_output, index_size, slice_size, functor);
+
+    int64_t grid_min_tmp = (input_size * slice_size + block - 1) / block;
+    int64_t grid_min =
+        grid_min_tmp < max_grid_dimx ? grid_min_tmp : max_grid_dimx;
+    InputResetMinCUDAKernel<T><<<grid_min, block, 0, ctx.stream()>>>(
+        p_output, input_size, slice_size);
+  } else if (pool_type == "MEAN") {
+    GraphSendRecvSumCUDAFunctor<T, IndexT> functor;
+    GraphSendRecvCUDAKernel<
+        T,
+        IndexT,
+        GraphSendRecvSumCUDAFunctor<T,
+                                    IndexT>><<<grid, block, 0, ctx.stream()>>>(
+        p_src, s_index, d_index, p_output, index_size, slice_size, functor);
+
+    ctx.template Alloc<int32_t>(dst_count);
+    int32_t* p_dst_count = dst_count->data<int32_t>();
+
+#ifdef PADDLE_WITH_HIP
+    hipMemset(p_dst_count, 0, input_size * sizeof(int));
+#else
+    cudaMemset(p_dst_count, 0, input_size * sizeof(int));
+#endif
+
+    int64_t grid_count = (index_size + block - 1) / block;
+    ComputeCountCUDAKernel<T, IndexT><<<grid_count, block, 0, ctx.stream()>>>(
+        p_dst_count, d_index, index_size);
+
+    int64_t grid_mean_tmp = (input_size * slice_size + block - 1) / block;
+    int64_t grid_mean =
+        grid_mean_tmp < max_grid_dimx ? grid_mean_tmp : max_grid_dimx;
+    ManipulateMeanCUDAKernel<T><<<grid_mean, block, 0, ctx.stream()>>>(
+        p_output, p_dst_count, input_size, slice_size);
+  }
+}
+
+template <typename T, typename Context>
+void GraphSendRecvKernel(const Context& ctx,
+                         const DenseTensor& x,
+                         const DenseTensor& src_index,
+                         const DenseTensor& dst_index,
+                         const std::string& pool_type,
+                         DenseTensor* out,
+                         DenseTensor* dst_count) {
+  auto index_type = src_index.dtype();
+  if (index_type == phi::DataType::INT32) {
+    GraphSendRecvOpCUDAKernelLaunchHelper<Context, T, int32_t>(
+        ctx, x, src_index, dst_index, pool_type, out, dst_count);
+  } else if (index_type == phi::DataType::INT64) {
+    GraphSendRecvOpCUDAKernelLaunchHelper<Context, T, int64_t>(
+        ctx, x, src_index, dst_index, pool_type, out, dst_count);
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(graph_send_recv,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::GraphSendRecvKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t) {}
diff --git a/paddle/phi/kernels/gpu/isclose_kernel.cu b/paddle/phi/kernels/gpu/isclose_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..34774ec715c48de953945f94624c2c3cfe742d30
--- /dev/null
+++ b/paddle/phi/kernels/gpu/isclose_kernel.cu
@@ -0,0 +1,22 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/isclose_kernel.h"
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/isclose_kernel_impl.h"
+
+PD_REGISTER_KERNEL(
+    isclose, GPU, ALL_LAYOUT, phi::IscloseKernel, float, double) {}
diff --git a/paddle/phi/kernels/gpu/isfinite_kernel.cu b/paddle/phi/kernels/gpu/isfinite_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..4b41ed1e55d39b537d89aa882ae10df1decbdd14
--- /dev/null
+++ b/paddle/phi/kernels/gpu/isfinite_kernel.cu
@@ -0,0 +1,61 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/isfinite_kernel_impl.h"
+#include "paddle/phi/kernels/isfinite_kernel.h"
+
+namespace phi {
+
+template <typename T, typename Context, typename Functor>
+inline void IsfiniteKernelImpl(const Context& dev_ctx,
+                               const DenseTensor& x,
+                               DenseTensor* out) {
+  dev_ctx.template Alloc<T>(out);
+  Functor functor;
+  functor(x, out);
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(isinf,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::IsinfKernel,
+                   float,
+                   double,
+                   phi::dtype::float16,
+                   int,
+                   int64_t) {}
+
+PD_REGISTER_KERNEL(isnan,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::IsnanKernel,
+                   float,
+                   double,
+                   phi::dtype::float16,
+                   int,
+                   int64_t) {}
+
+PD_REGISTER_KERNEL(isfinite,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::IsfiniteKernel,
+                   float,
+                   double,
+                   phi::dtype::float16,
+                   int,
+                   int64_t) {}
diff --git a/paddle/phi/kernels/gpu/kldiv_loss_grad_kernel.cu b/paddle/phi/kernels/gpu/kldiv_loss_grad_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..8ca53f021f054278b813fedb69db31f4d2c5aaf6
--- /dev/null
+++ b/paddle/phi/kernels/gpu/kldiv_loss_grad_kernel.cu
@@ -0,0 +1,22 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/kldiv_loss_grad_kernel.h"
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/kldiv_loss_grad_kernel_impl.h"
+PD_REGISTER_KERNEL(
+    kldiv_loss_grad, GPU, ALL_LAYOUT, phi::KLDivLossGradKernel, float, double) {
+}
diff --git a/paddle/phi/kernels/gpu/kldiv_loss_kernel.cu b/paddle/phi/kernels/gpu/kldiv_loss_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..9388ac7071c3197c2a8774ffd5f0d670cdf8a8b2
--- /dev/null
+++ b/paddle/phi/kernels/gpu/kldiv_loss_kernel.cu
@@ -0,0 +1,21 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/kldiv_loss_kernel.h"
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/kldiv_loss_kernel_impl.h"
+PD_REGISTER_KERNEL(
+    kldiv_loss, GPU, ALL_LAYOUT, phi::KLDivLossKernel, float, double) {}
diff --git a/paddle/phi/kernels/gpu/kron_grad_kernel.cu b/paddle/phi/kernels/gpu/kron_grad_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..13ef2adaab3f32791e5c108b3f12b217e5dcea07
--- /dev/null
+++ b/paddle/phi/kernels/gpu/kron_grad_kernel.cu
@@ -0,0 +1,31 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/kron_grad_kernel.h"
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/kron_grad_kernel_impl.h"
+
+PD_REGISTER_KERNEL(kron_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::KronGradKernel,
+                   int,
+                   int64_t,
+                   float,
+                   double,
+                   phi::dtype::float16,
+                   phi::dtype::complex<float>,
+                   phi::dtype::complex<double>) {}
diff --git a/paddle/phi/kernels/gpu/kron_kernel.cu b/paddle/phi/kernels/gpu/kron_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..a2124fd5af7d79cf6d1227a73105dd3e5b729547
--- /dev/null
+++ b/paddle/phi/kernels/gpu/kron_kernel.cu
@@ -0,0 +1,31 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/kron_kernel.h"
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/kron_kernel_impl.h"
+
+PD_REGISTER_KERNEL(kron,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::KronKernel,
+                   int,
+                   int64_t,
+                   float,
+                   double,
+                   phi::dtype::float16,
+                   phi::dtype::complex<float>,
+                   phi::dtype::complex<double>) {}
diff --git a/paddle/phi/kernels/gpu/lgamma_grad_kernel.cu b/paddle/phi/kernels/gpu/lgamma_grad_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..3e4cd21a658f103aca9bc611a2d42518245e4401
--- /dev/null
+++ b/paddle/phi/kernels/gpu/lgamma_grad_kernel.cu
@@ -0,0 +1,21 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/lgamma_grad_kernel.h"
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/lgamma_grad_kernel_impl.h"
+PD_REGISTER_KERNEL(
+    lgamma_grad, GPU, ALL_LAYOUT, phi::LgammaGradKernel, float, double) {}
diff --git a/paddle/phi/kernels/gpu/lgamma_kernel.cu b/paddle/phi/kernels/gpu/lgamma_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..e94d67f4ce324ad9d8237a377d70a920cdbd30af
--- /dev/null
+++ b/paddle/phi/kernels/gpu/lgamma_kernel.cu
@@ -0,0 +1,41 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/lgamma_kernel.h"
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/elementwise_base.h"
+
+namespace phi {
+template <typename T>
+struct CudaLgammaFunctor {
+  __device__ __forceinline__ T operator()(const T x) const {
+    return Eigen::numext::lgamma(x);
+  }
+};
+template <typename T, typename Context>
+void LgammaKernel(const Context& dev_ctx,
+                  const DenseTensor& x,
+                  DenseTensor* out) {
+  // XKTODO( add gpu kernel implementation. )
+  dev_ctx.template Alloc<T>(out);
+  std::vector<const DenseTensor*> ins = {&x};
+  std::vector<DenseTensor*> outs = {out};
+  auto functor = CudaLgammaFunctor<T>();
+  phi::funcs::ElementwiseKernel<T>(dev_ctx, ins, &outs, functor);
+}
+}  // namespace phi
+
+PD_REGISTER_KERNEL(lgamma, GPU, ALL_LAYOUT, phi::LgammaKernel, float, double) {}
diff --git a/paddle/phi/kernels/gpu/linspace_kernel.cu b/paddle/phi/kernels/gpu/linspace_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..3a6ff365c11db8fa4940cacb5fc75c5ebe50ebbb
--- /dev/null
+++ b/paddle/phi/kernels/gpu/linspace_kernel.cu
@@ -0,0 +1,97 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/linspace_kernel.h"
+
+#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/copy_kernel.h"
+#include "paddle/phi/kernels/funcs/data_type_transform.h"
+#include "paddle/phi/kernels/funcs/math_function.h"
+
+namespace phi {
+
+template <typename T>
+__global__ void LinspaceKernelInner(
+    T start, T stop, double step, int64_t size, T* out) {
+  int64_t index = blockIdx.x * blockDim.x + threadIdx.x;
+
+  for (; index < size; index += blockDim.x * gridDim.x) {
+    if (index < size / 2) {
+      out[index] = static_cast<T>(start + step * index);
+    } else {
+      out[index] = static_cast<T>(stop - step * (size - index - 1));
+    }
+  }
+}
+
+template <typename T>
+__global__ void LinspaceSpecialKernel(T start, T* out) {
+  out[0] = static_cast<T>(start);
+}
+
+template <typename T, typename Context>
+void LinspaceKernel(const Context& ctx,
+                    const DenseTensor& start,
+                    const DenseTensor& stop,
+                    const DenseTensor& number,
+                    DataType dtype,
+                    DenseTensor* out) {
+  auto start_t = phi::funcs::TransDataType(ctx, start, dtype);
+  auto stop_t = phi::funcs::TransDataType(ctx, stop, dtype);
+
+  DenseTensor n_start;
+  DenseTensor n_stop;
+  DenseTensor n_num;
+  phi::Copy(ctx, start_t, phi::CPUPlace(), false, &n_start);
+  T start_data = n_start.data<T>()[0];
+  phi::Copy(ctx, stop_t, phi::CPUPlace(), false, &n_stop);
+  T stop_data = n_stop.data<T>()[0];
+  phi::Copy(ctx, number, phi::CPUPlace(), false, &n_num);
+  int64_t num = static_cast<int64_t>(n_num.data<int32_t>()[0]);
+
+  PADDLE_ENFORCE_GT(
+      num,
+      0,
+      phi::errors::InvalidArgument("The num of linspace op should be larger "
+                                   "than 0, but received num is %d",
+                                   num));
+
+  out->Resize(phi::make_ddim({num}));
+  T* out_data = ctx.template Alloc<T>(out);
+
+  double step = 0;
+  auto stream = ctx.stream();
+  int block = 512;
+  int grid = (num + block - 1) / block;
+  if (num != 1) {
+    step = (static_cast<double>(stop_data - start_data)) / (num - 1);
+    LinspaceKernelInner<T><<<grid, block, 0, stream>>>(
+        start_data, stop_data, step, num, out_data);
+  } else {
+    LinspaceSpecialKernel<T><<<grid, block, 0, stream>>>(start_data, out_data);
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(linspace,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::LinspaceKernel,
+                   float,
+                   int32_t,
+                   int64_t,
+                   double) {}
diff --git a/paddle/phi/kernels/gpu/log_loss_grad_kernel.cu b/paddle/phi/kernels/gpu/log_loss_grad_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..3bb256ad0326fdf1532fb3bf2162e0e64daba480
--- /dev/null
+++ b/paddle/phi/kernels/gpu/log_loss_grad_kernel.cu
@@ -0,0 +1,22 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/log_loss_grad_kernel.h"
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/log_loss_grad_kernel_impl.h"
+
+PD_REGISTER_KERNEL(
+    log_loss_grad, GPU, ALL_LAYOUT, phi::LogLossGradKernel, float) {}
diff --git a/paddle/phi/kernels/gpu/log_loss_kernel.cu b/paddle/phi/kernels/gpu/log_loss_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..0934520ea4ad14ce9450125b5fae7130140efa28
--- /dev/null
+++ b/paddle/phi/kernels/gpu/log_loss_kernel.cu
@@ -0,0 +1,21 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/log_loss_kernel.h"
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/log_loss_kernel_impl.h"
+
+PD_REGISTER_KERNEL(log_loss, GPU, ALL_LAYOUT, phi::LogLossKernel, float) {}
diff --git a/paddle/phi/kernels/gpu/matrix_power_grad_kernel.cu b/paddle/phi/kernels/gpu/matrix_power_grad_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..25a9de8f8bed42121bcc6d52f4d2b2da565e4dfa
--- /dev/null
+++ b/paddle/phi/kernels/gpu/matrix_power_grad_kernel.cu
@@ -0,0 +1,26 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/phi/kernels/matrix_power_grad_kernel.h"
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/matrix_power_grad_kernel_impl.h"
+
+PD_REGISTER_KERNEL(matrix_power_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::MatrixPowerGradKernel,
+                   float,
+                   double) {}
diff --git a/paddle/phi/kernels/gpu/matrix_power_kernel.cu b/paddle/phi/kernels/gpu/matrix_power_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..d7ae7d8a3f745c001c7ca82a1baffc8c1f6291c3
--- /dev/null
+++ b/paddle/phi/kernels/gpu/matrix_power_kernel.cu
@@ -0,0 +1,22 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/phi/kernels/matrix_power_kernel.h"
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/matrix_power_kernel_impl.h"
+
+PD_REGISTER_KERNEL(
+    matrix_power, GPU, ALL_LAYOUT, phi::MatrixPowerKernel, float, double) {}
diff --git a/paddle/phi/kernels/gpu/matrix_rank_kernel.cu b/paddle/phi/kernels/gpu/matrix_rank_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..9b889a9b4c0069efcbf38a10ce00f20072560a36
--- /dev/null
+++ b/paddle/phi/kernels/gpu/matrix_rank_kernel.cu
@@ -0,0 +1,52 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef PADDLE_WITH_HIP
+// HIP not support cusolver
+
+#include "paddle/phi/kernels/matrix_rank_kernel.h"
+#include "paddle/phi/kernels/matrix_rank_tol_kernel.h"
+
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/full_kernel.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void MatrixRankKernel(const Context& dev_ctx,
+                      const DenseTensor& x,
+                      float tol,
+                      bool use_default_tol,
+                      bool hermitian,
+                      DenseTensor* out) {
+  DenseTensor atol_tensor;
+  if (use_default_tol) {
+    atol_tensor = phi::Full<T, Context>(dev_ctx, {1}, static_cast<T>(0));
+  } else {
+    atol_tensor = phi::Full<T, Context>(dev_ctx, {1}, static_cast<T>(tol));
+  }
+  MatrixRankTolKernel<T, Context>(
+      dev_ctx, x, atol_tensor, use_default_tol, hermitian, out);
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(matrix_rank,  // cuda_only
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::MatrixRankKernel,
+                   float,
+                   double) {}
+
+#endif  // not PADDLE_WITH_HIP
diff --git a/paddle/phi/kernels/gpu/matrix_rank_tol_kernel.cu b/paddle/phi/kernels/gpu/matrix_rank_tol_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..2009547fc8d6f18c488faab5fd57cc985990229b
--- /dev/null
+++ b/paddle/phi/kernels/gpu/matrix_rank_tol_kernel.cu
@@ -0,0 +1,433 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef PADDLE_WITH_HIP
+// HIP not support cusolver
+
+#include "paddle/phi/kernels/matrix_rank_tol_kernel.h"
+
+#include <algorithm>
+#include <vector>
+#include "paddle/fluid/memory/memory.h"
+#include "paddle/phi/backends/dynload/cusolver.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/abs_kernel.h"
+#include "paddle/phi/kernels/full_kernel.h"
+#include "paddle/phi/kernels/funcs/broadcast_function.h"
+#include "paddle/phi/kernels/funcs/compare_functors.h"
+#include "paddle/phi/kernels/impl/matrix_rank_kernel_impl.h"
+#include "paddle/phi/kernels/math_kernel.h"
+#include "paddle/phi/kernels/reduce_max_kernel.h"
+
+namespace phi {
+
+template <typename T>
+void GesvdjBatched(const phi::GPUContext& dev_ctx,
+                   int batchSize,
+                   int m,
+                   int n,
+                   int k,
+                   T* A,
+                   T* U,
+                   T* V,
+                   T* S,
+                   int* info,
+                   int thin_UV = 1);
+
+template <typename T>
+void SyevjBatched(const phi::GPUContext& dev_ctx,
+                  int batchSize,
+                  int n,
+                  T* A,
+                  T* W,
+                  int* info);
+
+template <>
+void GesvdjBatched<float>(const phi::GPUContext& dev_ctx,
+                          int batchSize,
+                          int m,
+                          int n,
+                          int k,
+                          float* A,
+                          float* U,
+                          float* V,
+                          float* S,
+                          int* info,
+                          int thin_UV) {
+  // do not compute singular vectors
+  const cusolverEigMode_t jobz = CUSOLVER_EIG_MODE_NOVECTOR;
+  gesvdjInfo_t gesvdj_params = NULL;
+  int lda = m;
+  int ldu = m;
+  int ldt = n;
+  int lwork = 0;
+  auto handle = dev_ctx.cusolver_dn_handle();
+  PADDLE_ENFORCE_GPU_SUCCESS(
+      dynload::cusolverDnCreateGesvdjInfo(&gesvdj_params));
+  PADDLE_ENFORCE_GPU_SUCCESS(
+      dynload::cusolverDnSgesvdj_bufferSize(handle,
+                                            jobz,
+                                            thin_UV,
+                                            m,
+                                            n,
+                                            A,
+                                            lda,
+                                            S,
+                                            U,
+                                            ldu,
+                                            V,
+                                            ldt,
+                                            &lwork,
+                                            gesvdj_params));
+  auto workspace = paddle::memory::Alloc(dev_ctx, lwork * sizeof(float));
+  float* workspace_ptr = reinterpret_cast<float*>(workspace->ptr());
+  int stride_A = lda * n;
+  int stride_U = ldu * (thin_UV ? k : m);
+  int stride_V = ldt * (thin_UV ? k : n);
+  for (int i = 0; i < batchSize; i++) {
+    PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDnSgesvdj(handle,
+                                                          jobz,
+                                                          thin_UV,
+                                                          m,
+                                                          n,
+                                                          A + stride_A * i,
+                                                          lda,
+                                                          S + k * i,
+                                                          U + stride_U * i,
+                                                          ldu,
+                                                          V + stride_V * i,
+                                                          ldt,
+                                                          workspace_ptr,
+                                                          lwork,
+                                                          info,
+                                                          gesvdj_params));
+    int error_info;
+    paddle::memory::Copy(phi::CPUPlace(),
+                         &error_info,
+                         dev_ctx.GetPlace(),
+                         info,
+                         sizeof(int),
+                         dev_ctx.stream());
+    PADDLE_ENFORCE_EQ(
+        error_info,
+        0,
+        phi::errors::PreconditionNotMet(
+            "For batch [%d]: CUSolver SVD is not zero. [%d]", i, error_info));
+  }
+  PADDLE_ENFORCE_GPU_SUCCESS(
+      dynload::cusolverDnDestroyGesvdjInfo(gesvdj_params));
+}
+
+template <>
+void GesvdjBatched<double>(const phi::GPUContext& dev_ctx,
+                           int batchSize,
+                           int m,
+                           int n,
+                           int k,
+                           double* A,
+                           double* U,
+                           double* V,
+                           double* S,
+                           int* info,
+                           int thin_UV) {
+  // do not compute singular vectors
+  const cusolverEigMode_t jobz = CUSOLVER_EIG_MODE_NOVECTOR;
+  gesvdjInfo_t gesvdj_params = NULL;
+  int lda = m;
+  int ldu = m;
+  int ldt = n;
+  int lwork = 0;
+  auto handle = dev_ctx.cusolver_dn_handle();
+  PADDLE_ENFORCE_GPU_SUCCESS(
+      dynload::cusolverDnCreateGesvdjInfo(&gesvdj_params));
+  PADDLE_ENFORCE_GPU_SUCCESS(
+      dynload::cusolverDnDgesvdj_bufferSize(handle,
+                                            jobz,
+                                            thin_UV,
+                                            m,
+                                            n,
+                                            A,
+                                            lda,
+                                            S,
+                                            U,
+                                            ldu,
+                                            V,
+                                            ldt,
+                                            &lwork,
+                                            gesvdj_params));
+  auto workspace = paddle::memory::Alloc(dev_ctx, lwork * sizeof(double));
+  double* workspace_ptr = reinterpret_cast<double*>(workspace->ptr());
+  int stride_A = lda * n;
+  int stride_U = ldu * (thin_UV ? k : m);
+  int stride_V = ldt * (thin_UV ? k : n);
+  for (int i = 0; i < batchSize; ++i) {
+    PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDnDgesvdj(handle,
+                                                          jobz,
+                                                          thin_UV,
+                                                          m,
+                                                          n,
+                                                          A + stride_A * i,
+                                                          lda,
+                                                          S + k * i,
+                                                          U + stride_U * i,
+                                                          ldu,
+                                                          V + stride_V * i,
+                                                          ldt,
+                                                          workspace_ptr,
+                                                          lwork,
+                                                          info,
+                                                          gesvdj_params));
+    // check the error info
+    int error_info;
+    paddle::memory::Copy(phi::CPUPlace(),
+                         &error_info,
+                         dev_ctx.GetPlace(),
+                         info,
+                         sizeof(int),
+                         dev_ctx.stream());
+    PADDLE_ENFORCE_EQ(
+        error_info,
+        0,
+        phi::errors::PreconditionNotMet(
+            "For batch [%d]: CUSolver SVD is not zero. [%d]", i, error_info));
+  }
+  PADDLE_ENFORCE_GPU_SUCCESS(
+      dynload::cusolverDnDestroyGesvdjInfo(gesvdj_params));
+}
+
+template <>
+void SyevjBatched<float>(const phi::GPUContext& dev_ctx,
+                         int batchSize,
+                         int n,
+                         float* A,
+                         float* W,
+                         int* info) {
+  auto handle = dev_ctx.cusolver_dn_handle();
+  // Compute eigenvalues only
+  const cusolverEigMode_t jobz = CUSOLVER_EIG_MODE_NOVECTOR;
+  // matrix is saved as column-major in cusolver.
+  // numpy and torch use lower triangle to compute eigenvalues, so here use
+  // upper triangle
+  cublasFillMode_t uplo = CUBLAS_FILL_MODE_UPPER;
+  int lda = n;
+  int stride_A = lda * n;
+  int lwork = 0;
+  syevjInfo_t params = NULL;
+  PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDnCreateSyevjInfo(&params));
+  PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDnSsyevj_bufferSize(
+      handle, jobz, uplo, n, A, lda, W, &lwork, params));
+  auto workspace = paddle::memory::Alloc(dev_ctx, lwork * sizeof(float));
+  float* workspace_ptr = reinterpret_cast<float*>(workspace->ptr());
+  for (int i = 0; i < batchSize; i++) {
+    PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDnSsyevj(handle,
+                                                         jobz,
+                                                         uplo,
+                                                         n,
+                                                         A + stride_A * i,
+                                                         lda,
+                                                         W + n * i,
+                                                         workspace_ptr,
+                                                         lwork,
+                                                         info,
+                                                         params));
+
+    int error_info;
+    paddle::memory::Copy(phi::CPUPlace(),
+                         &error_info,
+                         dev_ctx.GetPlace(),
+                         info,
+                         sizeof(int),
+                         dev_ctx.stream());
+    PADDLE_ENFORCE_EQ(
+        error_info,
+        0,
+        phi::errors::PreconditionNotMet(
+            "For batch [%d]: CUSolver eigenvalues is not zero. [%d]",
+            i,
+            error_info));
+  }
+  PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDnDestroySyevjInfo(params));
+}
+
+template <>
+void SyevjBatched<double>(const phi::GPUContext& dev_ctx,
+                          int batchSize,
+                          int n,
+                          double* A,
+                          double* W,
+                          int* info) {
+  auto handle = dev_ctx.cusolver_dn_handle();
+  // Compute eigenvalues only
+  const cusolverEigMode_t jobz = CUSOLVER_EIG_MODE_NOVECTOR;
+  //  upper triangle of A is stored
+  cublasFillMode_t uplo = CUBLAS_FILL_MODE_UPPER;
+  int lda = n;
+  int stride_A = lda * n;
+  int lwork = 0;
+  syevjInfo_t params = NULL;
+  PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDnCreateSyevjInfo(&params));
+  PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDnDsyevj_bufferSize(
+      handle, jobz, uplo, n, A, lda, W, &lwork, params));
+  auto workspace = paddle::memory::Alloc(dev_ctx, lwork * sizeof(double));
+  double* workspace_ptr = reinterpret_cast<double*>(workspace->ptr());
+
+  for (int i = 0; i < batchSize; i++) {
+    PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDnDsyevj(handle,
+                                                         jobz,
+                                                         uplo,
+                                                         n,
+                                                         A + stride_A * i,
+                                                         lda,
+                                                         W + n * i,
+                                                         workspace_ptr,
+                                                         lwork,
+                                                         info,
+                                                         params));
+    int error_info;
+    paddle::memory::Copy(phi::CPUPlace(),
+                         &error_info,
+                         dev_ctx.GetPlace(),
+                         info,
+                         sizeof(int),
+                         dev_ctx.stream());
+    PADDLE_ENFORCE_EQ(
+        error_info,
+        0,
+        phi::errors::PreconditionNotMet(
+            "For batch [%d]: CUSolver eigenvalues is not zero. [%d]",
+            i,
+            error_info));
+  }
+  PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDnDestroySyevjInfo(params));
+}
+
+template <typename T, typename Context>
+void MatrixRankTolKernel(const Context& dev_ctx,
+                         const DenseTensor& x,
+                         const DenseTensor& atol_tensor,
+                         bool use_default_tol,
+                         bool hermitian,
+                         DenseTensor* out) {
+  auto* x_data = x.data<T>();
+  dev_ctx.template Alloc<int64_t>(out);
+
+  auto dim_x = x.dims();
+  auto dim_out = out->dims();
+  int rows = dim_x[dim_x.size() - 2];
+  int cols = dim_x[dim_x.size() - 1];
+  int k = std::min(rows, cols);
+  auto numel = x.numel();
+  int batches = numel / (rows * cols);
+
+  T rtol_T = 0;
+  if (use_default_tol) {
+    rtol_T = std::numeric_limits<T>::epsilon() * std::max(rows, cols);
+  }
+
+  // Must Copy X once, because the gesvdj will destory the content when exit.
+  DenseTensor x_tmp;
+  paddle::framework::TensorCopy(x, dev_ctx.GetPlace(), &x_tmp);
+  auto info = paddle::memory::Alloc(dev_ctx, sizeof(int) * batches);
+  int* info_ptr = reinterpret_cast<int*>(info->ptr());
+
+  DenseTensor eigenvalue_tensor;
+  eigenvalue_tensor.Resize(detail::GetEigenvalueDim(dim_x, k));
+  auto* eigenvalue_data = dev_ctx.template Alloc<T>(&eigenvalue_tensor);
+
+  if (hermitian) {
+    SyevjBatched<T>(
+        dev_ctx, batches, rows, x_tmp.data<T>(), eigenvalue_data, info_ptr);
+
+    phi::AbsKernel<T, Context>(dev_ctx, eigenvalue_tensor, &eigenvalue_tensor);
+
+  } else {
+    DenseTensor U, VH;
+    U.Resize(detail::GetUDDim(dim_x, k));
+    VH.Resize(detail::GetVHDDim(dim_x, k));
+    auto* u_data = dev_ctx.template Alloc<T>(&U);
+    auto* vh_data = dev_ctx.template Alloc<T>(&VH);
+    GesvdjBatched<T>(dev_ctx,
+                     batches,
+                     cols,
+                     rows,
+                     k,
+                     x_tmp.data<T>(),
+                     vh_data,
+                     u_data,
+                     eigenvalue_data,
+                     info_ptr,
+                     1);
+  }
+
+  DenseTensor max_eigenvalue_tensor;
+  dev_ctx.template Alloc<T>(&max_eigenvalue_tensor);
+  max_eigenvalue_tensor.Resize(detail::RemoveLastDim(eigenvalue_tensor.dims()));
+
+  phi::MaxKernel<T, Context>(dev_ctx,
+                             eigenvalue_tensor,
+                             std::vector<int64_t>{-1},
+                             false,
+                             &max_eigenvalue_tensor);
+
+  DenseTensor temp_rtol_tensor;
+  temp_rtol_tensor =
+      phi::Full<T, Context>(dev_ctx, {1}, static_cast<T>(rtol_T));
+
+  DenseTensor rtol_tensor =
+      phi::Multiply<T>(dev_ctx, temp_rtol_tensor, max_eigenvalue_tensor);
+  DenseTensor tol_tensor;
+  tol_tensor.Resize(dim_out);
+  dev_ctx.template Alloc<T>(&tol_tensor);
+
+  funcs::ElementwiseCompute<GreaterElementFunctor<T>, T, T>(
+      dev_ctx,
+      atol_tensor,
+      rtol_tensor,
+      -1,
+      GreaterElementFunctor<T>(),
+      &tol_tensor);
+
+  tol_tensor.Resize(detail::NewAxisDim(tol_tensor.dims(), 1));
+
+  DenseTensor compare_result;
+  compare_result.Resize(detail::NewAxisDim(dim_out, k));
+  dev_ctx.template Alloc<int64_t>(&compare_result);
+
+  int axis = -1;
+  funcs::ElementwiseCompute<funcs::GreaterThanFunctor<T, int64_t>, T, int64_t>(
+      dev_ctx,
+      eigenvalue_tensor,
+      tol_tensor,
+      axis,
+      funcs::GreaterThanFunctor<T, int64_t>(),
+      &compare_result);
+
+  phi::SumKernel<int64_t>(dev_ctx,
+                          compare_result,
+                          std::vector<int64_t>{-1},
+                          compare_result.dtype(),
+                          false,
+                          out);
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(matrix_rank_tol,  // cuda_only
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::MatrixRankTolKernel,
+                   float,
+                   double) {}
+
+#endif  // not PADDLE_WITH_HIP
diff --git a/paddle/phi/kernels/gpu/maxout_grad_kernel.cu b/paddle/phi/kernels/gpu/maxout_grad_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..86ff09fd74b065bed765f51c0f5019350ab04636
--- /dev/null
+++ b/paddle/phi/kernels/gpu/maxout_grad_kernel.cu
@@ -0,0 +1,20 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/impl/maxout_grad_kernel_impl.h"
+
+#include "paddle/phi/core/kernel_registry.h"
+
+PD_REGISTER_KERNEL(
+    maxout_grad, GPU, ALL_LAYOUT, phi::MaxOutGradKernel, float, double) {}
diff --git a/paddle/phi/kernels/gpu/maxout_kernel.cu b/paddle/phi/kernels/gpu/maxout_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..88776a49f19b2bf36d4dd4c5f6c610ce69a7f40a
--- /dev/null
+++ b/paddle/phi/kernels/gpu/maxout_kernel.cu
@@ -0,0 +1,19 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/impl/maxout_kernel_impl.h"
+
+#include "paddle/phi/core/kernel_registry.h"
+
+PD_REGISTER_KERNEL(maxout, GPU, ALL_LAYOUT, phi::MaxOutKernel, float, double) {}
diff --git a/paddle/phi/kernels/gpu/multi_dot_grad_kernel.cu b/paddle/phi/kernels/gpu/multi_dot_grad_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..6761d945e952eb0a4f6498f484a75766e913d74d
--- /dev/null
+++ b/paddle/phi/kernels/gpu/multi_dot_grad_kernel.cu
@@ -0,0 +1,30 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/phi/kernels/impl/multi_dot_kernel_impl.h"
+#include "paddle/phi/kernels/multi_dot_grad_kernel.h"
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/common/float16.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+using float16 = phi::dtype::float16;
+
+PD_REGISTER_KERNEL(multi_dot_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::MultiDotGradKernel,
+                   float,
+                   double,
+                   float16) {}
diff --git a/paddle/phi/kernels/gpu/multi_dot_kernel.cu b/paddle/phi/kernels/gpu/multi_dot_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..60b1fce5ddd8905b15bcb3023e1512aedcf32585
--- /dev/null
+++ b/paddle/phi/kernels/gpu/multi_dot_kernel.cu
@@ -0,0 +1,25 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/phi/kernels/impl/multi_dot_kernel_impl.h"
+#include "paddle/phi/kernels/multi_dot_kernel.h"
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/common/float16.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+using float16 = phi::dtype::float16;
+
+PD_REGISTER_KERNEL(
+    multi_dot, GPU, ALL_LAYOUT, phi::MultiDotKernel, float, double, float16) {}
diff --git a/paddle/fluid/operators/nll_loss_op.cu b/paddle/phi/kernels/gpu/nll_loss.h
similarity index 50%
rename from paddle/fluid/operators/nll_loss_op.cu
rename to paddle/phi/kernels/gpu/nll_loss.h
index fd8a44cc05d7c89ec887ddc08c4f504a7629b5e2..a457264498feb2a9b3a1107ff086156a39f7fe4a 100644
--- a/paddle/fluid/operators/nll_loss_op.cu
+++ b/paddle/phi/kernels/gpu/nll_loss.h
@@ -1,37 +1,39 @@
-/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <thrust/functional.h>
 #include <algorithm>
 #include <functional>
 #include <string>
 #include "paddle/fluid/operators/math.h"
-#include "paddle/fluid/operators/nll_loss_op.h"
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/hostdevice.h"
 
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-
+namespace phi {
 static constexpr int kNumCUDAThreads = 512;
 static constexpr int kNumMaxinumNumBlocks = 4096;
 static const int NTHREADS = 32;
-
 static inline int NumBlocks(const int N) {
   return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads,
                   kNumMaxinumNumBlocks);
 }
 
 template <typename T>
-__global__ void GPUNLLLossForward1D_no_reduce(T* out_data, const T* x_data,
+__global__ void GPUNLLLossForward1D_no_reduce(T* out_data,
+                                              const T* x_data,
                                               const int64_t* label_data,
                                               const T* weight_data,
                                               const int64_t batch_size,
@@ -51,11 +53,15 @@ __global__ void GPUNLLLossForward1D_no_reduce(T* out_data, const T* x_data,
 }
 
 template <typename T>
-__global__ void GPUNLLLossForward1D_with_reduce(
-    T* out_data, T* total_weight_data, const T* x_data,
-    const int64_t* label_data, const T* weight_data, const int64_t batch_size,
-    const int64_t n_classes, const int64_t size_average,
-    const int64_t ignore_index) {
+__global__ void GPUNLLLossForward1D_with_reduce(T* out_data,
+                                                T* total_weight_data,
+                                                const T* x_data,
+                                                const int64_t* label_data,
+                                                const T* weight_data,
+                                                const int64_t batch_size,
+                                                const int64_t n_classes,
+                                                const int64_t size_average,
+                                                const int64_t ignore_index) {
   __shared__ T sharedInputs[NTHREADS], sharedWeights[NTHREADS];
   sharedInputs[threadIdx.x] = 0;
   sharedWeights[threadIdx.x] = 0;
@@ -99,9 +105,11 @@ __global__ void GPUNLLLossForward1D_with_reduce(
 // then __syncthreads is needed either before or afterwards to prevent non-0
 // threads overriding smem in the next loop before num-0 thread reads from it.
 template <typename T, typename ReduceOp, int N>
-__device__ void reduceNValuesInBlock(T* smem, T threadVals[N],
+__device__ void reduceNValuesInBlock(T* smem,
+                                     T threadVals[N],
                                      const unsigned int numVals,
-                                     ReduceOp reduceOp, T init) {
+                                     ReduceOp reduceOp,
+                                     T init) {
   if (numVals == 0) {
 #pragma unroll
     for (int i = 0; i < N; ++i) {
@@ -175,18 +183,26 @@ __device__ void reduceNValuesInBlock(T* smem, T threadVals[N],
 // then __syncthreads is needed either before or afterwards to prevent non-0
 // threads overriding smem in the next loop before num-0 thread reads from it.
 template <typename T, typename ReduceOp>
-__device__ T reduceBlock(T* smem, const unsigned int numVals, T threadVal,
-                         ReduceOp reduceOp, T init) {
-  reduceNValuesInBlock<T, ReduceOp, 1>(smem, &threadVal, numVals, reduceOp,
-                                       init);
+__device__ T reduceBlock(T* smem,
+                         const unsigned int numVals,
+                         T threadVal,
+                         ReduceOp reduceOp,
+                         T init) {
+  reduceNValuesInBlock<T, ReduceOp, 1>(
+      smem, &threadVal, numVals, reduceOp, init);
   return threadVal;
 }
 
 template <typename T>
-__global__ void GPUNLLLossForward2D_no_reduce(
-    T* out_data, const T* x_data, const int64_t* label_data,
-    const T* weight_data, const int64_t batch_size, const int64_t n_classes,
-    const int64_t in_dim2, const int64_t in_dim3, const int64_t ignore_index) {
+__global__ void GPUNLLLossForward2D_no_reduce(T* out_data,
+                                              const T* x_data,
+                                              const int64_t* label_data,
+                                              const T* weight_data,
+                                              const int64_t batch_size,
+                                              const int64_t n_classes,
+                                              const int64_t in_dim2,
+                                              const int64_t in_dim3,
+                                              const int64_t ignore_index) {
   const int64_t map_size = in_dim2 * in_dim3;
   const int64_t sample_size = n_classes * map_size;
   const int64_t out_numel = batch_size * map_size;
@@ -211,11 +227,16 @@ __global__ void GPUNLLLossForward2D_no_reduce(
 }
 
 template <typename T>
-__global__ void GPUNLLLossForward2D_with_reduce(
-    T* out_data, T* total_weight_data, const T* x_data,
-    const int64_t* label_data, const T* weight_data, const int64_t batch_size,
-    const int64_t n_classes, const int64_t map_nelem,
-    const int64_t blocks_per_sample, const int64_t ignore_index) {
+__global__ void GPUNLLLossForward2D_with_reduce(T* out_data,
+                                                T* total_weight_data,
+                                                const T* x_data,
+                                                const int64_t* label_data,
+                                                const T* weight_data,
+                                                const int64_t batch_size,
+                                                const int64_t n_classes,
+                                                const int64_t map_nelem,
+                                                const int64_t blocks_per_sample,
+                                                const int64_t ignore_index) {
   __shared__ T partial_sums[kNumCUDAThreads];
   int64_t i;
   T input_sum = 0;
@@ -228,7 +249,8 @@ __global__ void GPUNLLLossForward2D_with_reduce(
   int64_t ioffset = sample * map_nelem * n_classes;
   int64_t step = blockDim.x * blocks_per_sample;
   for (i = (blockIdx.x % blocks_per_sample) * blockDim.x + threadIdx.x;
-       i < map_nelem; i += step) {
+       i < map_nelem;
+       i += step) {
     const int64_t cur_label = label_data[toffset + i];
     if (cur_label != ignore_index) {
       PADDLE_ENFORCE(cur_label >= 0 && cur_label < n_classes,
@@ -242,8 +264,8 @@ __global__ void GPUNLLLossForward2D_with_reduce(
   input_sum =
       reduceBlock(partial_sums, blockDim.x, input_sum, thrust::plus<T>(), (T)0);
   __syncthreads();
-  acc_weight = reduceBlock(partial_sums, blockDim.x, acc_weight,
-                           thrust::plus<T>(), (T)0);
+  acc_weight = reduceBlock(
+      partial_sums, blockDim.x, acc_weight, thrust::plus<T>(), (T)0);
 
   if (threadIdx.x == 0) {
     paddle::platform::CudaAtomicAdd(total_weight_data, acc_weight);
@@ -258,12 +280,14 @@ __global__ void GPUNLLLossForward2D_size_average(T* out_data,
     *out_data /= *total_weight_data;
   }
 }
-
 template <typename T>
-__global__ void GPUNLLLossBackward1D_no_reduce(
-    T* dx_data, const int64_t* label_data, const T* weight_data,
-    const T* dout_data, const int64_t batch_size, const int64_t n_classes,
-    const int64_t ignore_index) {
+__global__ void GPUNLLLossBackward1D_no_reduce(T* dx_data,
+                                               const int64_t* label_data,
+                                               const T* weight_data,
+                                               const T* dout_data,
+                                               const int64_t batch_size,
+                                               const int64_t n_classes,
+                                               const int64_t ignore_index) {
   CUDA_KERNEL_LOOP(i, batch_size) {
     const int64_t cur_label = label_data[i];
     if (cur_label == ignore_index) {
@@ -275,11 +299,15 @@ __global__ void GPUNLLLossBackward1D_no_reduce(
 }
 
 template <typename T>
-__global__ void GPUNLLLossBackward1D_with_reduce(
-    T* dx_data, const T* total_weight_data, const int64_t* label_data,
-    const T* weight_data, const T* dout_data, const int64_t batch_size,
-    const int64_t n_classes, const int64_t size_average,
-    const int64_t ignore_index) {
+__global__ void GPUNLLLossBackward1D_with_reduce(T* dx_data,
+                                                 const T* total_weight_data,
+                                                 const int64_t* label_data,
+                                                 const T* weight_data,
+                                                 const T* dout_data,
+                                                 const int64_t batch_size,
+                                                 const int64_t n_classes,
+                                                 const int64_t size_average,
+                                                 const int64_t ignore_index) {
   if (*total_weight_data <= 0) {
     return;
   }
@@ -295,10 +323,15 @@ __global__ void GPUNLLLossBackward1D_with_reduce(
 }
 
 template <typename T>
-__global__ void GPUNLLLossBackward2D_no_reduce(
-    T* dx_data, const int64_t* label_data, const T* weight_data,
-    const T* dout_data, const int64_t batch_size, const int64_t n_classes,
-    const int64_t in_dim2, const int64_t in_dim3, const int64_t ignore_index) {
+__global__ void GPUNLLLossBackward2D_no_reduce(T* dx_data,
+                                               const int64_t* label_data,
+                                               const T* weight_data,
+                                               const T* dout_data,
+                                               const int64_t batch_size,
+                                               const int64_t n_classes,
+                                               const int64_t in_dim2,
+                                               const int64_t in_dim3,
+                                               const int64_t ignore_index) {
   const int64_t map_size = in_dim2 * in_dim3;
   const int64_t sample_size = n_classes * map_size;
   const int64_t out_numel = batch_size * map_size;
@@ -319,10 +352,16 @@ __global__ void GPUNLLLossBackward2D_no_reduce(
 
 template <typename T>
 __global__ void GPUNLLLossBackward2D_with_reduce(
-    T* dx_data, const T* total_weight_data, const int64_t* label_data,
-    const T* weight_data, const T* dout_data, const int64_t batch_size,
-    const int64_t n_classes, const int64_t map_nelem,
-    const int64_t blocks_per_sample, const int64_t size_average,
+    T* dx_data,
+    const T* total_weight_data,
+    const int64_t* label_data,
+    const T* weight_data,
+    const T* dout_data,
+    const int64_t batch_size,
+    const int64_t n_classes,
+    const int64_t map_nelem,
+    const int64_t blocks_per_sample,
+    const int64_t size_average,
     const int64_t ignore_index) {
   if (*total_weight_data <= 0) {
     return;
@@ -334,7 +373,8 @@ __global__ void GPUNLLLossBackward2D_with_reduce(
   int toffset = sample * map_nelem;
   int ioffset = sample * map_nelem * n_classes;
   for (i = (blockIdx.x % blocks_per_sample) * blockDim.x + threadIdx.x;
-       i < map_nelem; i += step) {
+       i < map_nelem;
+       i += step) {
     const int64_t cur_label = label_data[toffset + i];
     if (cur_label != ignore_index) {
       dx_data[ioffset + i + map_nelem * cur_label] =
@@ -343,158 +383,4 @@ __global__ void GPUNLLLossBackward2D_with_reduce(
   }
 }
 
-template <typename DeviceContext, typename T>
-class NLLLossCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* x = ctx.Input<Tensor>("X");
-    auto* labels = ctx.Input<Tensor>("Label");
-    auto* weight = ctx.Input<Tensor>("Weight");
-    auto* out = ctx.Output<Tensor>("Out");
-    auto* total_weight = ctx.Output<Tensor>("Total_weight");
-    auto ignore_index = ctx.Attr<int64_t>("ignore_index");
-    auto reduction = ctx.Attr<std::string>("reduction");
-
-    auto x_data = x->data<T>();
-    auto out_data = out->mutable_data<T>(ctx.GetPlace());
-    auto total_weight_data = total_weight->mutable_data<T>(ctx.GetPlace());
-    auto label_data = labels->data<int64_t>();
-    auto weight_data = weight ? weight->data<T>() : nullptr;
-#ifdef PADDLE_WITH_HIP
-    hipMemset(total_weight_data, 0, sizeof(T));
-#else
-    cudaMemset(total_weight_data, 0, sizeof(T));
-#endif
-    auto x_dims = x->dims();
-    auto batch_size = x_dims[0];
-    auto n_classes = x_dims[1];
-    int64_t size_average = (int64_t)(reduction == "mean");
-
-    if (x_dims.size() == 2) {
-      int blocks = NumBlocks(batch_size);
-      int threads = kNumCUDAThreads;
-      auto& dev_ctx = ctx.cuda_device_context();
-      if (reduction == "none") {
-        GPUNLLLossForward1D_no_reduce<
-            T><<<blocks, threads, 0, dev_ctx.stream()>>>(
-            out_data, x_data, label_data, weight_data, batch_size, n_classes,
-            ignore_index);
-      } else {
-        GPUNLLLossForward1D_with_reduce<
-            T><<<1, NTHREADS, 0, dev_ctx.stream()>>>(
-            out_data, total_weight_data, x_data, label_data, weight_data,
-            batch_size, n_classes, size_average, ignore_index);
-      }
-    } else if (x_dims.size() == 4) {
-      const auto in_dim2 = x_dims[2];
-      const auto in_dim3 = x_dims[3];
-      const auto map_size = in_dim2 * in_dim3;
-      const auto out_numel = batch_size * in_dim2 * in_dim3;
-      int blocks = NumBlocks(out_numel);
-      int threads = kNumCUDAThreads;
-      auto& dev_ctx = ctx.cuda_device_context();
-      if (reduction == "none") {
-        GPUNLLLossForward2D_no_reduce<
-            T><<<blocks, threads, 0, dev_ctx.stream()>>>(
-            out_data, x_data, label_data, weight_data, batch_size, n_classes,
-            in_dim2, in_dim3, ignore_index);
-      } else {
-        int blocks_per_sample = NumBlocks(map_size) / 128;
-        blocks_per_sample = (blocks_per_sample == 0) ? 1 : blocks_per_sample;
-        int total_blocks = blocks_per_sample * batch_size;
-        GPUNLLLossForward2D_with_reduce<
-            T><<<total_blocks, threads, 0, dev_ctx.stream()>>>(
-            out_data, total_weight_data, x_data, label_data, weight_data,
-            batch_size, n_classes, map_size, blocks_per_sample, ignore_index);
-        if (size_average) {
-          GPUNLLLossForward2D_size_average<T><<<1, 1, 0, dev_ctx.stream()>>>(
-              out_data, total_weight_data);
-        }
-      }
-    }
-  }
-};
-
-template <typename DeviceContext, typename T>
-class NLLLossGradCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* x = ctx.Input<Tensor>("X");
-    auto* labels = ctx.Input<Tensor>("Label");
-    auto* weight = ctx.Input<Tensor>("Weight");
-    auto* total_weight = ctx.Input<Tensor>("Total_weight");
-    auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
-    auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
-    auto dx_data = dx->mutable_data<T>(ctx.GetPlace());
-    auto dout_data = dout->data<T>();
-    auto label_data = labels->data<int64_t>();
-    auto weight_data = weight ? weight->data<T>() : nullptr;
-    auto total_weight_data = total_weight->data<T>();
-    auto ignore_index = ctx.Attr<int64_t>("ignore_index");
-    auto reduction = ctx.Attr<std::string>("reduction");
-#ifdef PADDLE_WITH_HIP
-    hipMemset(dx_data, 0, dx->numel() * sizeof(T));
-#else
-    cudaMemset(dx_data, 0, dx->numel() * sizeof(T));
-#endif
-
-    int64_t size_average = (int64_t)(reduction == "mean");
-    auto x_dims = x->dims();
-    auto batch_size = x_dims[0];
-    auto n_classes = x_dims[1];
-
-    if (x_dims.size() == 2) {
-      int blocks = NumBlocks(batch_size);
-      int threads = kNumCUDAThreads;
-      auto& dev_ctx = ctx.cuda_device_context();
-      if (reduction == "none") {
-        GPUNLLLossBackward1D_no_reduce<
-            T><<<blocks, threads, 0, dev_ctx.stream()>>>(
-            dx_data, label_data, weight_data, dout_data, batch_size, n_classes,
-            ignore_index);
-      } else {
-        GPUNLLLossBackward1D_with_reduce<
-            T><<<1, NTHREADS, 0, dev_ctx.stream()>>>(
-            dx_data, total_weight_data, label_data, weight_data, dout_data,
-            batch_size, n_classes, size_average, ignore_index);
-      }
-    } else if (x_dims.size() == 4) {
-      const auto in_dim2 = x_dims[2];
-      const auto in_dim3 = x_dims[3];
-      const auto map_size = in_dim2 * in_dim3;
-      const auto out_numel = batch_size * in_dim2 * in_dim3;
-
-      int blocks = NumBlocks(out_numel);
-      int threads = kNumCUDAThreads;
-      auto& dev_ctx = ctx.cuda_device_context();
-      if (reduction == "none") {
-        GPUNLLLossBackward2D_no_reduce<
-            T><<<blocks, threads, 0, dev_ctx.stream()>>>(
-            dx_data, label_data, weight_data, dout_data, batch_size, n_classes,
-            in_dim2, in_dim3, ignore_index);
-      } else {
-        int blocks_per_sample = NumBlocks(map_size) / 128;
-        blocks_per_sample = (blocks_per_sample == 0) ? 1 : blocks_per_sample;
-        int total_blocks = blocks_per_sample * batch_size;
-        GPUNLLLossBackward2D_with_reduce<
-            T><<<total_blocks, threads, 0, dev_ctx.stream()>>>(
-            dx_data, total_weight_data, label_data, weight_data, dout_data,
-            batch_size, n_classes, map_size, blocks_per_sample, size_average,
-            ignore_index);
-      }
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    nll_loss,
-    ops::NLLLossCUDAKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::NLLLossCUDAKernel<paddle::platform::CUDADeviceContext, double>);
-REGISTER_OP_CUDA_KERNEL(
-    nll_loss_grad,
-    ops::NLLLossGradCUDAKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::NLLLossGradCUDAKernel<paddle::platform::CUDADeviceContext, double>);
+}  // namespace phi
diff --git a/paddle/phi/kernels/gpu/nll_loss_grad_kernel.cu b/paddle/phi/kernels/gpu/nll_loss_grad_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..9a2d9c6e479aa43b64b4d34bd2adc41da94a87ef
--- /dev/null
+++ b/paddle/phi/kernels/gpu/nll_loss_grad_kernel.cu
@@ -0,0 +1,114 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/nll_loss_grad_kernel.h"
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/gpu/nll_loss.h"
+
+namespace phi {
+template <typename T, typename Context>
+void NllLossGradKernel(const Context& dev_ctx,
+                       const DenseTensor& x,
+                       const DenseTensor& labels,
+                       const DenseTensor& total_weight,
+                       paddle::optional<const DenseTensor&> weight,
+                       const DenseTensor& dout,
+                       int64_t ignore_index,
+                       const std::string& reduction,
+                       DenseTensor* dx) {
+  auto dx_data = dev_ctx.template Alloc<T>(dx);
+  auto dout_data = dout.data<T>();
+  auto label_data = labels.data<int64_t>();
+  auto weight_data = weight.get_ptr() ? weight.get_ptr()->data<T>() : nullptr;
+  auto total_weight_data = total_weight.data<T>();
+#ifdef PADDLE_WITH_HIP
+  hipMemset(dx_data, 0, dx->numel() * sizeof(T));
+#else
+  cudaMemset(dx_data, 0, dx->numel() * sizeof(T));
+#endif
+
+  int64_t size_average = (int64_t)(reduction == "mean");
+  auto x_dims = x.dims();
+  auto batch_size = x_dims[0];
+  auto n_classes = x_dims[1];
+
+  if (x_dims.size() == 2) {
+    int blocks = NumBlocks(batch_size);
+    int threads = kNumCUDAThreads;
+    if (reduction == "none") {
+      GPUNLLLossBackward1D_no_reduce<
+          T><<<blocks, threads, 0, dev_ctx.stream()>>>(dx_data,
+                                                       label_data,
+                                                       weight_data,
+                                                       dout_data,
+                                                       batch_size,
+                                                       n_classes,
+                                                       ignore_index);
+    } else {
+      GPUNLLLossBackward1D_with_reduce<T><<<1, NTHREADS, 0, dev_ctx.stream()>>>(
+          dx_data,
+          total_weight_data,
+          label_data,
+          weight_data,
+          dout_data,
+          batch_size,
+          n_classes,
+          size_average,
+          ignore_index);
+    }
+  } else if (x_dims.size() == 4) {
+    const auto in_dim2 = x_dims[2];
+    const auto in_dim3 = x_dims[3];
+    const auto map_size = in_dim2 * in_dim3;
+    const auto out_numel = batch_size * in_dim2 * in_dim3;
+
+    int blocks = NumBlocks(out_numel);
+    int threads = kNumCUDAThreads;
+    if (reduction == "none") {
+      GPUNLLLossBackward2D_no_reduce<
+          T><<<blocks, threads, 0, dev_ctx.stream()>>>(dx_data,
+                                                       label_data,
+                                                       weight_data,
+                                                       dout_data,
+                                                       batch_size,
+                                                       n_classes,
+                                                       in_dim2,
+                                                       in_dim3,
+                                                       ignore_index);
+    } else {
+      int blocks_per_sample = NumBlocks(map_size) / 128;
+      blocks_per_sample = (blocks_per_sample == 0) ? 1 : blocks_per_sample;
+      int total_blocks = blocks_per_sample * batch_size;
+      GPUNLLLossBackward2D_with_reduce<
+          T><<<total_blocks, threads, 0, dev_ctx.stream()>>>(dx_data,
+                                                             total_weight_data,
+                                                             label_data,
+                                                             weight_data,
+                                                             dout_data,
+                                                             batch_size,
+                                                             n_classes,
+                                                             map_size,
+                                                             blocks_per_sample,
+                                                             size_average,
+                                                             ignore_index);
+    }
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(
+    nll_loss_grad, GPU, ALL_LAYOUT, phi::NllLossGradKernel, float, double) {}
diff --git a/paddle/phi/kernels/gpu/nll_loss_kernel.cu b/paddle/phi/kernels/gpu/nll_loss_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..6b0e1fef7ba9a5f1814ba996237d3fdc08882d3f
--- /dev/null
+++ b/paddle/phi/kernels/gpu/nll_loss_kernel.cu
@@ -0,0 +1,116 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/nll_loss_kernel.h"
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/gpu/nll_loss.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void NllLossRawKernel(const Context& dev_ctx,
+                      const DenseTensor& input,
+                      const DenseTensor& label,
+                      paddle::optional<const DenseTensor&> weight,
+                      int64_t ignore_index,
+                      const std::string& reduction,
+                      DenseTensor* out,
+                      DenseTensor* total_weight) {
+  auto* x = &input;
+  auto x_data = x->data<T>();
+  auto out_data = dev_ctx.template Alloc<T>(out);
+  auto total_weight_data = dev_ctx.template Alloc<T>(total_weight);
+  auto label_data = label.data<int64_t>();
+  auto weight_data = weight.get_ptr() ? weight.get_ptr()->data<T>() : nullptr;
+#ifdef PADDLE_WITH_HIP
+  hipMemset(total_weight_data, 0, sizeof(T));
+#else
+  cudaMemset(total_weight_data, 0, sizeof(T));
+#endif
+  auto x_dims = x->dims();
+  auto batch_size = x_dims[0];
+  auto n_classes = x_dims[1];
+  int64_t size_average = (int64_t)(reduction == "mean");
+
+  if (x_dims.size() == 2) {
+    int blocks = NumBlocks(batch_size);
+    int threads = kNumCUDAThreads;
+    if (reduction == "none") {
+      GPUNLLLossForward1D_no_reduce<
+          T><<<blocks, threads, 0, dev_ctx.stream()>>>(out_data,
+                                                       x_data,
+                                                       label_data,
+                                                       weight_data,
+                                                       batch_size,
+                                                       n_classes,
+                                                       ignore_index);
+    } else {
+      GPUNLLLossForward1D_with_reduce<T><<<1, NTHREADS, 0, dev_ctx.stream()>>>(
+          out_data,
+          total_weight_data,
+          x_data,
+          label_data,
+          weight_data,
+          batch_size,
+          n_classes,
+          size_average,
+          ignore_index);
+    }
+  } else if (x_dims.size() == 4) {
+    const auto in_dim2 = x_dims[2];
+    const auto in_dim3 = x_dims[3];
+    const auto map_size = in_dim2 * in_dim3;
+    const auto out_numel = batch_size * in_dim2 * in_dim3;
+    int blocks = NumBlocks(out_numel);
+    int threads = kNumCUDAThreads;
+    if (reduction == "none") {
+      GPUNLLLossForward2D_no_reduce<
+          T><<<blocks, threads, 0, dev_ctx.stream()>>>(out_data,
+                                                       x_data,
+                                                       label_data,
+                                                       weight_data,
+                                                       batch_size,
+                                                       n_classes,
+                                                       in_dim2,
+                                                       in_dim3,
+                                                       ignore_index);
+    } else {
+      int blocks_per_sample = NumBlocks(map_size) / 128;
+      blocks_per_sample = (blocks_per_sample == 0) ? 1 : blocks_per_sample;
+      int total_blocks = blocks_per_sample * batch_size;
+      GPUNLLLossForward2D_with_reduce<
+          T><<<total_blocks, threads, 0, dev_ctx.stream()>>>(out_data,
+                                                             total_weight_data,
+                                                             x_data,
+                                                             label_data,
+                                                             weight_data,
+                                                             batch_size,
+                                                             n_classes,
+                                                             map_size,
+                                                             blocks_per_sample,
+                                                             ignore_index);
+      if (size_average) {
+        GPUNLLLossForward2D_size_average<T><<<1, 1, 0, dev_ctx.stream()>>>(
+            out_data, total_weight_data);
+      }
+    }
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(
+    nll_loss, GPU, ALL_LAYOUT, phi::NllLossRawKernel, float, double) {}
diff --git a/paddle/phi/kernels/gpu/norm_grad_kernel.cu b/paddle/phi/kernels/gpu/norm_grad_kernel.cu
index ab38a82eceb1e73bddbe07a37d72cab99929852c..43a08b0603e652ab78ba6c28a8b57d238f878915 100644
--- a/paddle/phi/kernels/gpu/norm_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/norm_grad_kernel.cu
@@ -75,9 +75,9 @@ __global__ void NormalizeGradient(const T* x,
 
 template <typename T, typename Context>
 void NormGradKernel(const Context& ctx,
-                    const DenseTensor& out_grad,
                     const DenseTensor& x,
                     const DenseTensor& norm,
+                    const DenseTensor& out_grad,
                     int axis,
                     float epsilon,
                     bool is_test,
diff --git a/paddle/phi/kernels/gpu/one_hot_kernel.cu b/paddle/phi/kernels/gpu/one_hot_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..32c7fa1e85d150b99e7a05d169b01cd8727c1a98
--- /dev/null
+++ b/paddle/phi/kernels/gpu/one_hot_kernel.cu
@@ -0,0 +1,86 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/one_hot_kernel.h"
+
+#include "paddle/fluid/platform/device/gpu/gpu_info.h"
+#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/math_function.h"
+
+namespace phi {
+
+using paddle::platform::PADDLE_CUDA_NUM_THREADS;
+
+template <typename InT, typename OutT>
+__global__ void FillOutputKernel(const InT* p_in_data,
+                                 OutT* p_out_data,
+                                 const int64_t numel,
+                                 const int depth) {
+  int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx < numel && p_in_data[idx] >= 0 && p_in_data[idx] < depth) {
+    *(p_out_data + (idx * depth) + p_in_data[idx]) = 1.0;
+  }
+}
+
+template <typename DeviceContext, typename InT>
+struct OneHotV2OpCUDAFunctor {
+  const DenseTensor* in_;
+  DenseTensor* out_;
+  const DeviceContext& ctx_;
+  int depth_;
+
+  OneHotV2OpCUDAFunctor(const DenseTensor* in,
+                        DenseTensor* out,
+                        int depth,
+                        const DeviceContext& ctx)
+      : in_(in), out_(out), depth_(depth), ctx_(ctx) {}
+
+  template <typename OutT>
+  void apply() const {
+    auto* p_in_data = in_->data<InT>();
+    auto numel = in_->numel();
+    auto* p_out_data = ctx_.template Alloc<OutT>(out_);
+    auto stream = ctx_.stream();
+    funcs::set_constant(ctx_, out_, 0.0);
+
+    FillOutputKernel<<<(numel + PADDLE_CUDA_NUM_THREADS - 1) /
+                           PADDLE_CUDA_NUM_THREADS,
+                       PADDLE_CUDA_NUM_THREADS,
+                       0,
+                       stream>>>(p_in_data, p_out_data, numel, depth_);
+  }
+};
+
+template <typename T, typename Context>
+void OneHotRawKernel(const Context& dev_ctx,
+                     const DenseTensor& x,
+                     int32_t depth,
+                     DataType dtype,
+                     bool allow_out_of_range,
+                     DenseTensor* out) {
+  auto out_dims = out->dims();
+  if (out_dims[out_dims.size() - 1] == -1) {
+    out_dims[out_dims.size() - 1] = depth;
+    out->Resize(out_dims);
+  }
+
+  phi::VisitDataType(
+      dtype, OneHotV2OpCUDAFunctor<Context, T>(&x, out, depth, dev_ctx));
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(
+    one_hot_raw, GPU, ALL_LAYOUT, phi::OneHotRawKernel, int, int64_t) {}
diff --git a/paddle/phi/kernels/gpu/pad_grad_kernel.cu b/paddle/phi/kernels/gpu/pad_grad_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..a25472d122b837fcc3928af15e0678f0362abf0c
--- /dev/null
+++ b/paddle/phi/kernels/gpu/pad_grad_kernel.cu
@@ -0,0 +1,29 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/pad_grad_kernel.h"
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/pad_grad_kernel_impl.h"
+
+PD_REGISTER_KERNEL(pad_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::PadGradKernel,
+                   float,
+                   double,
+                   phi::dtype::float16,
+                   phi::dtype::complex<float>,
+                   phi::dtype::complex<double>) {}
diff --git a/paddle/phi/kernels/gpu/pad_kernel.cu b/paddle/phi/kernels/gpu/pad_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..2b77a5f1aeb6cb3f24f274beb6939c480022fe49
--- /dev/null
+++ b/paddle/phi/kernels/gpu/pad_kernel.cu
@@ -0,0 +1,31 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/platform/complex.h"
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/pad_kernel_impl.h"
+#include "paddle/phi/kernels/pad_kernel.h"
+
+PD_REGISTER_KERNEL(pad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::PadKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t,
+                   phi::dtype::float16,
+                   phi::dtype::complex<float>,
+                   phi::dtype::complex<double>) {}
diff --git a/paddle/phi/kernels/gpu/pool_grad_kernel.cu b/paddle/phi/kernels/gpu/pool_grad_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..a5ab6a1ccd49f2a88835bf1dd63c2d874db4e2a7
--- /dev/null
+++ b/paddle/phi/kernels/gpu/pool_grad_kernel.cu
@@ -0,0 +1,60 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/pool_grad_kernel.h"
+
+#include "paddle/phi/kernels/impl/pool_grad_kernel_impl.h"
+
+#include "paddle/phi/common/float16.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+PD_REGISTER_KERNEL(pool2d_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::Pool2dGradKernel,
+                   float,
+                   double,
+                   phi::dtype::float16) {}
+PD_REGISTER_KERNEL(pool2d_double_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::Pool2dDoubleGradKernel,
+                   float,
+                   double) {}
+PD_REGISTER_KERNEL(max_pool2d_with_index_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::MaxPool2dWithIndexGradKernel,
+                   float,
+                   double) {
+  kernel->InputAt(1).SetDataType(
+      paddle::experimental::CppTypeToDataType<int>::Type());
+}
+
+PD_REGISTER_KERNEL(pool3d_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::Pool3dGradKernel,
+                   float,
+                   double,
+                   phi::dtype::float16) {}
+PD_REGISTER_KERNEL(max_pool3d_with_index_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::MaxPool3dWithIndexGradKernel,
+                   float,
+                   double) {
+  kernel->InputAt(1).SetDataType(
+      paddle::experimental::CppTypeToDataType<int>::Type());
+}
diff --git a/paddle/phi/kernels/gpu/pool_kernel.cu b/paddle/phi/kernels/gpu/pool_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..e8641395bef927b7e8f7c9ba522af84c0b34680e
--- /dev/null
+++ b/paddle/phi/kernels/gpu/pool_kernel.cu
@@ -0,0 +1,54 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/pool_kernel.h"
+
+#include "paddle/phi/kernels/impl/pool_kernel_impl.h"
+
+#include "paddle/phi/common/float16.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+PD_REGISTER_KERNEL(pool2d,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::Pool2dKernel,
+                   float,
+                   double,
+                   phi::dtype::float16) {}
+PD_REGISTER_KERNEL(max_pool2d_with_index,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::MaxPool2dWithIndexKernel,
+                   float,
+                   double) {
+  kernel->OutputAt(1).SetDataType(
+      paddle::experimental::CppTypeToDataType<int>::Type());
+}
+
+PD_REGISTER_KERNEL(pool3d,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::Pool3dKernel,
+                   float,
+                   double,
+                   phi::dtype::float16) {}
+PD_REGISTER_KERNEL(max_pool3d_with_index,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::MaxPool3dWithIndexKernel,
+                   float,
+                   double) {
+  kernel->OutputAt(1).SetDataType(
+      paddle::experimental::CppTypeToDataType<int>::Type());
+}
diff --git a/paddle/phi/kernels/gpu/psroi_pool_grad_kernel.cu b/paddle/phi/kernels/gpu/psroi_pool_grad_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..6745653eba7d175447eb54c319919fd6d87fb5dd
--- /dev/null
+++ b/paddle/phi/kernels/gpu/psroi_pool_grad_kernel.cu
@@ -0,0 +1,193 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/psroi_pool_kernel.h"
+
+#include <algorithm>
+#include <vector>
+#include "paddle/fluid/memory/memory.h"
+#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/phi/common/place.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/copy_kernel.h"
+#include "paddle/phi/kernels/funcs/math_function.h"
+
+namespace phi {
+
+static constexpr int kNumCUDAThreads = 512;
+static constexpr int kNumMaximumNumBlocks = 4096;
+
+static inline int NumBlocks(const int N) {
+  return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads,
+                  kNumMaximumNumBlocks);
+}
+
+template <typename T>
+__global__ void GPUPSROIPoolBackward(const int nthreads,
+                                     const T* input_rois,
+                                     const T* dout_data,
+                                     const float spatial_scale,
+                                     const int input_channels,
+                                     const int height,
+                                     const int width,
+                                     const int output_channels,
+                                     const int pooled_height,
+                                     const int pooled_width,
+                                     const int* rois_batch_id_data,
+                                     T* dx_data) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int offset = blockDim.x * gridDim.x;
+  for (int i = index; i < nthreads; i += offset) {
+    // The output is in order (n, c, ph, pw)
+    int pw = i % pooled_width;
+    int ph = (i / pooled_width) % pooled_height;
+    int c = (i / pooled_width / pooled_height) % output_channels;
+    int n = i / pooled_width / pooled_height / output_channels;
+
+    // set roi_batch_id
+    int roi_batch_id = rois_batch_id_data[n];
+    int input_channel = (c * pooled_height + ph) * pooled_width + pw;
+    int input_offset =
+        (roi_batch_id * input_channels + input_channel) * height * width;
+    T* offset_dx_data = dx_data + input_offset;
+
+    // [start, end) interval for spatial sampling
+    const T* offset_input_rois = input_rois + n * 4;
+    T roi_start_w = static_cast<T>(round(offset_input_rois[0])) * spatial_scale;
+    T roi_start_h = static_cast<T>(round(offset_input_rois[1])) * spatial_scale;
+    T roi_end_w =
+        static_cast<T>(round(offset_input_rois[2]) + 1.) * spatial_scale;
+    T roi_end_h =
+        static_cast<T>(round(offset_input_rois[3]) + 1.) * spatial_scale;
+
+    // Force too small ROIs to be 1x1
+    T roi_height = max(roi_end_h - roi_start_h, (T)0.1);  // avoid 0
+    T roi_width = max(roi_end_w - roi_start_w, (T)0.1);
+
+    // Compute w and h at input feature map
+    T bin_size_h = roi_height / static_cast<T>(pooled_height);
+    T bin_size_w = roi_width / static_cast<T>(pooled_width);
+
+    int hstart = floor(bin_size_h * static_cast<T>(ph) + roi_start_h);
+    int wstart = floor(bin_size_w * static_cast<T>(pw) + roi_start_w);
+    int hend = ceil(bin_size_h * static_cast<T>(ph + 1) + roi_start_h);
+    int wend = ceil(bin_size_w * static_cast<T>(pw + 1) + roi_start_w);
+
+    // Add roi offsets and clip to input boundaries
+    hstart = min(max(hstart, 0), height);
+    hend = min(max(hend, 0), height);
+    wstart = min(max(wstart, 0), width);
+    wend = min(max(wend, 0), width);
+    bool is_empty = (hend <= hstart) || (wend <= wstart);
+
+    // Accumulate diff_val into input data
+    T bin_area = static_cast<T>((hend - hstart) * (wend - wstart));
+    T diff_val = is_empty ? 0. : dout_data[i] / bin_area;
+    for (int ih = hstart; ih < hend; ++ih) {
+      for (int iw = wstart; iw < wend; ++iw) {
+        int input_index = ih * width + iw;
+        paddle::platform::CudaAtomicAdd(offset_dx_data + input_index, diff_val);
+      }
+    }
+  }
+}
+
+template <typename T, typename Context>
+void PsroiPoolGradKernel(const Context& ctx,
+                         const DenseTensor& x,
+                         const DenseTensor& rois,
+                         paddle::optional<const DenseTensor&> rois_num,
+                         const DenseTensor& dout,
+                         int pooled_height,
+                         int pooled_width,
+                         int output_channels,
+                         float spatial_scale,
+                         DenseTensor* dx) {
+  int rois_num_t = rois.dims()[0];
+  int input_channels = x.dims()[1];
+  int height = x.dims()[2];
+  int width = x.dims()[3];
+
+  if (dx) {
+    // set roi batch id
+    DenseTensor rois_batch_id_list;
+    rois_batch_id_list.Resize({rois_num_t});
+    int* rois_batch_id_data = ctx.template HostAlloc<int>(&rois_batch_id_list);
+    int rois_batch_size;
+    if (rois_num.get_ptr()) {
+      rois_batch_size = rois_num->numel();
+      std::vector<int> rois_num_list(rois_batch_size);
+      paddle::memory::Copy(CPUPlace(),
+                           rois_num_list.data(),
+                           ctx.GetPlace(),
+                           rois_num->data<int>(),
+                           sizeof(int) * rois_batch_size,
+                           0);
+      int start = 0;
+      for (int n = 0; n < rois_batch_size; ++n) {
+        for (int i = start; i < start + rois_num_list[n]; ++i) {
+          rois_batch_id_data[i] = n;
+        }
+        start += rois_num_list[n];
+      }
+    } else {
+      auto rois_lod = rois.lod().back();
+      rois_batch_size = rois_lod.size() - 1;
+      for (int n = 0; n < rois_batch_size; ++n) {
+        for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
+          rois_batch_id_data[i] = n;
+        }
+      }
+    }
+
+    DenseTensor rois_batch_id_list_gpu;
+    Copy(ctx,
+         rois_batch_id_list,
+         ctx.GetPlace(),
+         false,
+         &rois_batch_id_list_gpu);
+
+    ctx.template Alloc<T>(dx);
+    funcs::SetConstant<Context, T> set_zero;
+    set_zero(ctx, dx, static_cast<T>(0));
+
+    int dout_size = dout.numel();
+    int blocks = NumBlocks(dout_size);
+    int threads = kNumCUDAThreads;
+
+    if (dout_size > 0) {
+      GPUPSROIPoolBackward<T><<<blocks, threads, 0, ctx.stream()>>>(
+          dout_size,
+          rois.data<T>(),
+          dout.data<T>(),
+          spatial_scale,
+          input_channels,
+          height,
+          width,
+          output_channels,
+          pooled_height,
+          pooled_width,
+          rois_batch_id_list_gpu.data<int>(),
+          ctx.template Alloc<T>(dx));
+    }
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(
+    psroi_pool_grad, GPU, ALL_LAYOUT, phi::PsroiPoolGradKernel, float, double) {
+  kernel->InputAt(2).SetDataType(
+      paddle::experimental::CppTypeToDataType<int>::Type());
+}
diff --git a/paddle/phi/kernels/gpu/psroi_pool_kernel.cu b/paddle/phi/kernels/gpu/psroi_pool_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..8f9be001ba763d323ad93fdfd4cc06e97e266188
--- /dev/null
+++ b/paddle/phi/kernels/gpu/psroi_pool_kernel.cu
@@ -0,0 +1,231 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/psroi_pool_kernel.h"
+
+#include <algorithm>
+#include <vector>
+#include "paddle/fluid/memory/memory.h"
+#include "paddle/phi/common/place.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/copy_kernel.h"
+
+namespace phi {
+
+static constexpr int kNumCUDAThreads = 512;
+static constexpr int kNumMaximumNumBlocks = 4096;
+
+static inline int NumBlocks(const int N) {
+  return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads,
+                  kNumMaximumNumBlocks);
+}
+
+template <typename T>
+__global__ void GPUPSROIPoolForward(const int nthreads,
+                                    const T* input_data,
+                                    const T* input_rois,
+                                    const float spatial_scale,
+                                    const int input_channels,
+                                    const int height,
+                                    const int width,
+                                    const int output_channels,
+                                    const int pooled_height,
+                                    const int pooled_width,
+                                    const int* rois_batch_id_data,
+                                    T* output_data) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int offset = blockDim.x * gridDim.x;
+  for (size_t i = index; i < nthreads; i += offset) {
+    // The output is in order (n, c, ph, pw)
+    int pw = i % pooled_width;
+    int ph = (i / pooled_width) % pooled_height;
+    int c = (i / pooled_width / pooled_height) % output_channels;
+    int n = i / pooled_width / pooled_height / output_channels;
+
+    // set roi_batch_id
+    int roi_batch_id = rois_batch_id_data[n];
+
+    // [start, end) interval for spatial sampling
+    const T* offset_input_rois = input_rois + n * 4;
+    T roi_start_w = static_cast<T>(round(offset_input_rois[0])) * spatial_scale;
+    T roi_start_h = static_cast<T>(round(offset_input_rois[1])) * spatial_scale;
+    T roi_end_w =
+        static_cast<T>(round(offset_input_rois[2]) + 1.) * spatial_scale;
+    T roi_end_h =
+        static_cast<T>(round(offset_input_rois[3]) + 1.) * spatial_scale;
+
+    // Force too small ROIs to be 1x1
+    T roi_height = max(roi_end_h - roi_start_h, (T)0.1);  // avoid 0
+    T roi_width = max(roi_end_w - roi_start_w, (T)0.1);
+
+    // Compute w and h at input feature map
+    T bin_size_h = roi_height / static_cast<T>(pooled_height);
+    T bin_size_w = roi_width / static_cast<T>(pooled_width);
+
+    int hstart = floor(bin_size_h * static_cast<T>(ph) + roi_start_h);
+    int wstart = floor(bin_size_w * static_cast<T>(pw) + roi_start_w);
+    int hend = ceil(bin_size_h * static_cast<T>(ph + 1) + roi_start_h);
+    int wend = ceil(bin_size_w * static_cast<T>(pw + 1) + roi_start_w);
+
+    // Add roi offsets and clip to input boundaries
+    hstart = min(max(hstart, 0), height);
+    hend = min(max(hend, 0), height);
+    wstart = min(max(wstart, 0), width);
+    wend = min(max(wend, 0), width);
+    bool is_empty = (hend <= hstart) || (wend <= wstart);
+
+    int input_channel = (c * pooled_height + ph) * pooled_width + pw;
+    const T* offset_input_data =
+        input_data +
+        (roi_batch_id * input_channels + input_channel) * height * width;
+    T outsum = 0;
+
+    for (int ih = hstart; ih < hend; ++ih) {
+      for (int iw = wstart; iw < wend; ++iw) {
+        int input_index = ih * width + iw;
+        outsum += offset_input_data[input_index];
+      }
+    }
+
+    T bin_area = static_cast<T>((hend - hstart) * (wend - wstart));
+    output_data[i] = is_empty ? 0. : outsum / bin_area;
+  }
+}
+
+template <typename T, typename Context>
+void PsroiPoolKernel(const Context& ctx,
+                     const DenseTensor& x,
+                     const DenseTensor& rois,
+                     paddle::optional<const DenseTensor&> rois_num,
+                     int pooled_height,
+                     int pooled_width,
+                     int output_channels,
+                     float spatial_scale,
+                     DenseTensor* out) {
+  auto in_dims = x.dims();
+  int batch_size = in_dims[0];
+  int input_channels = in_dims[1];
+  int height = in_dims[2];
+  int width = in_dims[3];
+
+  PADDLE_ENFORCE_EQ(
+      input_channels,
+      output_channels * pooled_height * pooled_width,
+      errors::InvalidArgument(
+          "The channels %d of input X should equal the product of "
+          "output_channels %d x pooled_height %d x pooled_width %d.",
+          input_channels,
+          output_channels,
+          pooled_height,
+          pooled_width));
+
+  int rois_num_t = rois.dims()[0];
+  if (rois_num_t == 0) return;
+  int rois_batch_size;
+  DenseTensor rois_batch_id_list;
+  rois_batch_id_list.Resize({rois_num_t});
+  int* rois_batch_id_data = ctx.template HostAlloc<int>(&rois_batch_id_list);
+
+  if (rois_num.get_ptr()) {
+    rois_batch_size = rois_num->numel();
+    auto* rois_num_data = rois_num->data<int>();
+    PADDLE_ENFORCE_EQ(rois_batch_size,
+                      batch_size,
+                      errors::InvalidArgument(
+                          "The batch size of input(ROIs) and input(X) must be "
+                          "the same but received batch size of input(ROIs) and "
+                          "input(X) is %d and %d respectively.",
+                          rois_batch_size,
+                          batch_size));
+    std::vector<int> rois_num_list(rois_batch_size);
+    paddle::memory::Copy(CPUPlace(),
+                         rois_num_list.data(),
+                         ctx.GetPlace(),
+                         rois_num_data,
+                         sizeof(int) * rois_batch_size,
+                         0);
+    int rois_num_count = 0;
+    for (int i = 0; i < rois_batch_size; ++i) {
+      rois_num_count += rois_num_list[i];
+    }
+    PADDLE_ENFORCE_EQ(
+        rois_num_count,
+        rois_num_t,
+        errors::InvalidArgument(
+            "the rois_num from input and RoisNum must be the same"));
+    int start = 0;
+    for (int n = 0; n < rois_batch_size; ++n) {
+      for (int i = start; i < start + rois_num_list[n]; ++i) {
+        rois_batch_id_data[i] = n;
+      }
+      start += rois_num_list[n];
+    }
+  } else {
+    auto rois_lod = rois.lod().back();
+    rois_batch_size = rois_lod.size() - 1;
+    PADDLE_ENFORCE_EQ(rois_batch_size,
+                      batch_size,
+                      errors::InvalidArgument(
+                          "The batch size of input(ROIs) and input(X) must be "
+                          "the same but received batch size of input(ROIs) and "
+                          "input(X) is %d and %d respectively.",
+                          rois_batch_size,
+                          batch_size));
+    int rois_num_with_lod = rois_lod[rois_batch_size];
+    PADDLE_ENFORCE_EQ(rois_num_t,
+                      rois_num_with_lod,
+                      errors::InvalidArgument(
+                          "The number of rois from input(ROIs) and its LOD "
+                          "must be the same. Received rois %d of input(ROIs) "
+                          "but the number of rois %d from its LOD is %d",
+                          rois_num,
+                          rois_num_with_lod));
+
+    // set rois batch id
+    for (int n = 0; n < rois_batch_size; ++n) {
+      for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
+        rois_batch_id_data[i] = n;
+      }
+    }
+  }
+  DenseTensor rois_batch_id_list_gpu;
+  Copy(ctx, rois_batch_id_list, ctx.GetPlace(), false, &rois_batch_id_list_gpu);
+
+  int output_size = out->numel();
+  int blocks = NumBlocks(output_size);
+  int threads = kNumCUDAThreads;
+
+  // call cuda kernel function
+  GPUPSROIPoolForward<T><<<blocks, threads, 0, ctx.stream()>>>(
+      output_size,
+      x.data<T>(),
+      rois.data<T>(),
+      spatial_scale,
+      input_channels,
+      height,
+      width,
+      output_channels,
+      pooled_height,
+      pooled_width,
+      rois_batch_id_list_gpu.data<int>(),
+      ctx.template Alloc<T>(out));
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(
+    psroi_pool, GPU, ALL_LAYOUT, phi::PsroiPoolKernel, float, double) {
+  kernel->InputAt(2).SetDataType(
+      paddle::experimental::CppTypeToDataType<int>::Type());
+}
diff --git a/paddle/phi/kernels/gpu/put_along_axis_grad_kernel.cu b/paddle/phi/kernels/gpu/put_along_axis_grad_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..f553da361f1fe825a17962ed2ac5c9463b509f6b
--- /dev/null
+++ b/paddle/phi/kernels/gpu/put_along_axis_grad_kernel.cu
@@ -0,0 +1,79 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/put_along_axis_grad_kernel.h"
+
+#include "paddle/fluid/framework/convert_utils.h"
+#include "paddle/fluid/operators/gather_scatter_kernel.h"
+#include "paddle/fluid/platform/place.h"
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/copy_kernel.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void PutAlongAxisGradKernel(const Context& dev_ctx,
+                            const DenseTensor& x,
+                            const DenseTensor& index,
+                            const DenseTensor& out_grad,
+                            int axis,
+                            const std::string& reduce,
+                            DenseTensor* x_grad,
+                            DenseTensor* value_grad) {
+  PADDLE_ENFORCE_EQ(paddle::platform::is_gpu_place(dev_ctx.GetPlace()),
+                    true,
+                    errors::PreconditionNotMet(
+                        "PutAlongAxisGradOpCUDAKernel only runs on GPU."));
+
+  const auto& index_type =
+      paddle::framework::TransToProtoVarType(index.dtype());
+  if (x_grad) {
+    phi::Copy(dev_ctx, out_grad, dev_ctx.GetPlace(), false, x_grad);
+    if (index_type == paddle::framework::proto::VarType::INT32) {
+      paddle::operators::gpu_scatter_input_grad_kernel<T, int32_t>(
+          out_grad, axis, index, *x_grad, dev_ctx);
+    } else {
+      paddle::operators::gpu_scatter_input_grad_kernel<T, int64_t>(
+          out_grad, axis, index, *x_grad, dev_ctx);
+    }
+  }
+  if (value_grad) {
+    value_grad->Resize(index.dims());
+    value_grad->mutable_data<T>(dev_ctx.GetPlace());
+    if (index_type == paddle::framework::proto::VarType::INT32) {
+      paddle::operators::gpu_gather_kernel<T, int32_t>(
+          out_grad,
+          axis,
+          index,
+          *value_grad,
+          dev_ctx);  // the gradient of scatter is gather
+    } else if (index_type == paddle::framework::proto::VarType::INT64) {
+      paddle::operators::gpu_gather_kernel<T, int64_t>(
+          out_grad, axis, index, *value_grad, dev_ctx);
+    }
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(put_along_axis_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::PutAlongAxisGradKernel,
+                   float,
+                   double,
+                   int64_t,
+                   int,
+                   phi::dtype::float16) {}
diff --git a/paddle/phi/kernels/gpu/put_along_axis_kernel.cu b/paddle/phi/kernels/gpu/put_along_axis_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..d363c0c28364c065117fe53967234484871979af
--- /dev/null
+++ b/paddle/phi/kernels/gpu/put_along_axis_kernel.cu
@@ -0,0 +1,86 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/put_along_axis_kernel.h"
+
+#include "paddle/fluid/framework/convert_utils.h"
+#include "paddle/fluid/operators/gather_scatter_kernel.h"
+#include "paddle/fluid/platform/place.h"
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/copy_kernel.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void PutAlongAxisKernel(const Context& dev_ctx,
+                        const DenseTensor& x,
+                        const DenseTensor& index,
+                        const DenseTensor& value,
+                        int axis,
+                        const std::string& reduce,
+                        DenseTensor* out) {
+  PADDLE_ENFORCE_EQ(paddle::platform::is_gpu_place(dev_ctx.GetPlace()),
+                    true,
+                    errors::PreconditionNotMet(
+                        "PutAlongAxisCUDAKernel only runs on GPU device."));
+
+  const auto& index_type =
+      paddle::framework::TransToProtoVarType(index.dtype());
+
+  phi::Copy(dev_ctx, x, dev_ctx.GetPlace(), false, out);
+  if (reduce == "add") {
+    if (index_type == paddle::framework::proto::VarType::INT32) {
+      paddle::operators::gpu_scatter_add_kernel<T, int32_t>(
+          *out, axis, index, value, dev_ctx);
+    } else if (index_type == paddle::framework::proto::VarType::INT64) {
+      paddle::operators::gpu_scatter_add_kernel<T, int64_t>(
+          *out, axis, index, value, dev_ctx);
+    }
+  } else if (reduce == "multiply" || reduce == "mul") {
+    if (index_type == paddle::framework::proto::VarType::INT32) {
+      paddle::operators::gpu_scatter_mul_kernel<T, int32_t>(
+          *out, axis, index, value, dev_ctx);
+    } else if (index_type == paddle::framework::proto::VarType::INT64) {
+      paddle::operators::gpu_scatter_mul_kernel<T, int64_t>(
+          *out, axis, index, value, dev_ctx);
+    }
+  } else if (reduce == "assign") {
+    if (index_type == paddle::framework::proto::VarType::INT32) {
+      paddle::operators::gpu_scatter_assign_kernel<T, int32_t>(
+          *out, axis, index, value, dev_ctx);
+    } else if (index_type == paddle::framework::proto::VarType::INT64) {
+      paddle::operators::gpu_scatter_assign_kernel<T, int64_t>(
+          *out, axis, index, value, dev_ctx);
+    }
+  } else {
+    PADDLE_THROW(errors::InvalidArgument(
+        "can not support reduce: '%s' for scatter kernel, only "
+        "support reduce op: 'add', 'assign', 'mul' and 'multiply', the "
+        "defalut reduce op is 'assign' ",
+        reduce));
+    return;
+  }
+}
+}  // namespace phi
+
+PD_REGISTER_KERNEL(put_along_axis,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::PutAlongAxisKernel,
+                   float,
+                   double,
+                   int64_t,
+                   int,
+                   phi::dtype::float16) {}
diff --git a/paddle/phi/kernels/gpu/reduce.h b/paddle/phi/kernels/gpu/reduce.h
index 9223a94c12aeb0912f634d1d5ca8b2c03653e8b9..da5315f34479f92bfb0e5d807e28882eafa3d2ac 100644
--- a/paddle/phi/kernels/gpu/reduce.h
+++ b/paddle/phi/kernels/gpu/reduce.h
@@ -17,1202 +17,14 @@
 // CUDA and HIP use same api
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 
-#include <algorithm>
-#include <cmath>
-#include <numeric>
-#include <set>
-#include <vector>
-
-#ifdef __NVCC__
-#include "cub/cub.cuh"
-#endif
-
-#ifdef __HIPCC__
-#include <hipcub/hipcub.hpp>
-namespace cub = hipcub;
-#endif
-
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/amp/fp16_type_traits.h"
-#include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
-#include "paddle/fluid/platform/device/gpu/gpu_info.h"
-#include "paddle/fluid/platform/fast_divmod.h"
-#include "paddle/fluid/string/string_helper.h"
-#include "paddle/phi/api/ext/dispatch.h"
-#include "paddle/phi/backends/gpu/gpu_context.h"
-#include "paddle/phi/core/dense_tensor.h"
-#include "paddle/phi/core/enforce.h"
-#include "paddle/phi/core/utils/array.h"
-#include "paddle/phi/kernels/cast_kernel.h"
-#include "paddle/phi/kernels/funcs/elementwise_base.h"
-#include "paddle/phi/kernels/primitive/kernel_primitives.h"
-
-// Reduce split or not, Whether to use ReduceHigherDim
-#define REDUCE_SPLIT_BOUNDARY 512
-#define REDUCE_VEC_SIZE 4
-
-namespace kps = phi::kps;
+#include "paddle/phi/kernels/funcs/reduce_function.h"
 
 namespace phi {
-namespace kernels {
-
-namespace details {
-
-static inline int GetLastPow2(int n) {
-  n |= (n >> 1);
-  n |= (n >> 2);
-  n |= (n >> 4);
-  n |= (n >> 8);
-  n |= (n >> 16);
-  return std::max(1, n - (n >> 1));
-}
-
-static inline int64_t AlignUp(int64_t a, int64_t b) { return (a + b - 1) / b; }
-
-// get strides of x_dim, reduce_dim and left_dim for reduceLastDim and reduceAny
-static inline std::vector<int> GetDimStrides(const std::vector<int>& dims,
-                                             const std::vector<int>& idx) {
-  int n = static_cast<int>(idx.size());
-  if (n == 0) return std::vector<int>();
-  std::vector<int> strides(n);
-  strides.back() = 1;
-  for (int i = n - 2; i >= 0; --i) {
-    strides[i] = strides[i + 1] * dims[idx[i + 1]];
-  }
-  return strides;
-}
-
-// get blockDim for reduceLastDim and reduceAny
-static inline int GetBlockDim(int block_dim) {
-  return block_dim >= kps::details::kReduceMaxThread
-             ? kps::details::kReduceMaxThread
-             : GetLastPow2(block_dim);
-}
-
-// check reduce rand is valid
-static inline void CheckReduceRank(int reduce_rank, int rank) {
-  if (rank % 2 == 0) {
-    PADDLE_ENFORCE_EQ(reduce_rank,
-                      rank / 2,
-                      phi::errors::InvalidArgument(
-                          "ReduceOp: invalid reduce rank. When rank = %d, "
-                          "reduce_rank must be %d, but got %d.",
-                          rank,
-                          rank / 2,
-                          reduce_rank));
-  } else {
-    auto lower_rank = (rank - 1) / 2;
-    auto upper_rank = (rank + 1) / 2;
-    PADDLE_ENFORCE_EQ(
-        reduce_rank == lower_rank || reduce_rank == upper_rank,
-        true,
-        phi::errors::InvalidArgument(
-            "ReduceOp: invalid reduce rank. When rank = %d, reduce_rank "
-            "must be %d or %d, but got %d.",
-            rank,
-            lower_rank,
-            upper_rank,
-            reduce_rank));
-  }
-}
-
-// convert dims from vector to array
-template <typename T, size_t ElementCount, typename VectorLikeType>
-static inline phi::Array<T, ElementCount> VectorToArray(
-    const VectorLikeType& vec) {
-  PADDLE_ENFORCE_LE(
-      vec.size(),
-      ElementCount,
-      phi::errors::InvalidArgument("Cub reduce Array: size not match. Received "
-                                   "vec.size() %d > ElementCount %d.",
-                                   vec.size(),
-                                   ElementCount));
-  size_t n = static_cast<size_t>(vec.size());
-  phi::Array<T, ElementCount> ret;
-  for (size_t i = 0; i < n; ++i) {
-    ret[i] = vec[i];
-  }
-  return ret;
-}
-
-static inline std::vector<int> GetReduceDim(const std::vector<int64_t>& dims,
-                                            int dim_size,
-                                            bool reduce_all) {
-  std::vector<int> reduce_dims;
-  if (reduce_all) {
-    reduce_dims.resize(dim_size);
-    int reduce_size = reduce_dims.size();
-    for (int i = 0; i < reduce_size; ++i) {
-      reduce_dims[i] = i;
-    }
-  } else {
-    for (auto e : dims) {
-      PADDLE_ENFORCE_LT(e,
-                        dim_size,
-                        phi::errors::InvalidArgument(
-                            "ReduceOp: invalid axis, when x_dims is %d, "
-                            "axis[i] should less than x_dims, but got %d.",
-                            dim_size,
-                            e));
-      reduce_dims.push_back(e >= 0 ? e : e + dim_size);
-    }
-  }
-  return reduce_dims;
-}
-
-}  // namespace details
-
-constexpr int kMaxRank = phi::DDim::kMaxRank;
-
-enum ReduceType {
-  kReduceLastDim = 0x01,    // when reduce_dim[0] == x_dim.size() - 1;
-  kReduceHigherDim = 0x02,  // ReduceFirstDim or reduceSecondDim
-  kReduceAny = 0x03,        // when reduce_dim.size() > 1
-};
-
-struct IndexCalculator {
-  IndexCalculator(int dim,
-                  const std::vector<int>& cal_dims,
-                  const std::vector<int>& cal_strides,
-                  const std::vector<int>& full_strides)
-      : dim(dim) {
-    dims = details::VectorToArray<int, kMaxRank>(cal_dims);
-    strides = details::VectorToArray<int, kMaxRank>(full_strides);
-    std::vector<paddle::platform::FastDivMod> cal_divmoders;
-    // fast divmod
-    for (auto i : cal_strides) {
-      cal_divmoders.push_back(paddle::platform::FastDivMod(i));
-    }
-    divmoders = details::VectorToArray<paddle::platform::FastDivMod, kMaxRank>(
-        cal_divmoders);
-  }
-
-  __device__ inline int operator()(int offset) const {
-    int index = 0;
-#pragma unroll
-    for (int i = 0; i < kMaxRank; ++i) {
-      if (i == dim) {
-        break;
-      }
-      auto divmod = divmoders[i].Divmod(offset);
-      index += (divmod.val[0] * strides[dims[i]]);
-      offset = divmod.val[1];
-    }
-    return index;
-  }
-
-  int dim;
-  phi::Array<int, kMaxRank> dims;
-  phi::Array<int, kMaxRank> strides;
-  phi::Array<paddle::platform::FastDivMod, kMaxRank> divmoders;
-};
-
-template <bool ReduceLastDim = false>
-struct ReduceIndexMapping {
-  const kps::DimConfig dim;
-  HOSTDEVICE explicit ReduceIndexMapping(const kps::DimConfig& dims)
-      : dim(dims) {}
-
-  __device__ __forceinline__ int BlockIdX() {
-#ifdef PADDLE_WITH_XPU2
-    if (ReduceLastDim) {
-      return (cluster_id() / dim.split_num_x % dim.split_num_y);
-    } else {
-      return cluster_id() % dim.split_num_x;
-    }
-#else
-    return blockIdx.x;
-#endif
-  }
-
-  __device__ __forceinline__ int BlockIdY() {
-#ifdef PADDLE_WITH_XPU2
-    if (ReduceLastDim) {
-      return (cluster_id() % dim.split_num_x);
-    } else {
-      return (cluster_id() / dim.split_num_x % dim.split_num_y);
-    }
-#else
-    return blockIdx.y;
-#endif
-  }
-
-  __device__ __forceinline__ int BlockDimX() {
-#ifdef PADDLE_WITH_XPU2
-    return dim.deal_size_x;
-#else
-    return blockDim.x;
-#endif
-  }
-
-  __device__ __forceinline__ int BlockDimY() {
-#ifdef PADDLE_WITH_XPU2
-    return dim.deal_size_y;
-#else
-    return blockDim.y;
-#endif
-  }
-
-  __device__ __forceinline__ int GridDimX() {
-#ifdef PADDLE_WITH_XPU2
-    if (ReduceLastDim) {
-      return dim.split_num_y;
-    } else {
-      return dim.split_num_x;
-    }
-#else
-    return gridDim.x;
-#endif
-  }
-
-  __device__ __forceinline__ int GridDimY() {
-#ifdef PADDLE_WITH_XPU2
-    if (ReduceLastDim) {
-      return dim.split_num_x;
-    } else {
-      return dim.split_num_y;
-    }
-#else
-    return gridDim.y;
-#endif
-  }
-
-  __device__ __forceinline__ int GetLoopSize() {
-#ifdef PADDLE_WITH_XPU2
-    if (ReduceLastDim) {
-      return dim.deal_size_y;
-    } else {
-      return dim.deal_size_x;
-    }
-#else
-    return 1;
-#endif
-  }
-};
-
-// when reduce_type == kReduceLastDim this struct will be used
-// for higher performance
-struct OneDimIndexCal {
-  explicit OneDimIndexCal(int num) : stride(num) {}
-
-  __device__ inline int operator()(int index) const { return index * stride; }
-  int stride;
-};
-
-// reduce config
-template <typename Ty>
-struct ReduceConfig {
-  ReduceConfig(const std::vector<int>& origin_reduce_dims,
-               const std::vector<int>& origin_x_dim)
-      : reduce_dims_origin(origin_reduce_dims), x_dim(origin_x_dim) {}
-
-  // get the parameters of reduceKernel
-  void Run() {
-    // step1: update the reduce_dim left_dim and x_dim
-    SetReduceDim();
-
-    // step2: get the strides of dim for reduceAny and reduceLastDim
-    SetStrides();
-
-    // step3: get the type of reduce
-    SetReduceType();
-
-    // step4: set the block and grid for launch kernel
-    SetBlockDim();
-  }
-
-  // when should_reduce_again is true, we need malloc temp space for temp data
-  void SetOutputData(Ty* y_data,
-                     const paddle::platform::Place& place,
-                     phi::DenseTensor* tmp) {
-    if (should_reduce_again) {
-      tmp->ResizeAndAllocate(phi::make_ddim(
-          {static_cast<int64_t>(left_num * grid.z * grid.y * sizeof(Ty))}));
-      output_data = tmp->mutable_data<Ty>(place);
-    } else {
-      output_data = y_data;
-    }
-  }
-
- private:
-  // set reduce_dim, left_dim and update x_dim
-  // eg: x_dim = [2, 4, 6] origin_reduce_dims = [0, 1]
-  //     --SetReduceDim--> x_dim = [8,6], reduce_dim = [0], left_dim = [1]
-  void SetReduceDim() {
-    std::set<int> reduce_set;
-    for (auto e : reduce_dims_origin) {
-      auto pos = e >= 0 ? e : e + x_dim.size();
-      reduce_set.insert(pos);
-    }
-
-    std::vector<int> reduce_dim_temp(reduce_set.begin(), reduce_set.end());
-    std::sort(reduce_dim_temp.begin(), reduce_dim_temp.end());
-
-    // update reduce_dim and x_dim
-    std::vector<int> x_new_dim;
-
-    reduce_dim.push_back(reduce_dim_temp[0]);
-    x_new_dim.push_back(x_dim[0]);
-
-    int idx_reduce = 1;
-    int num = 0;
-
-    if (reduce_dim_temp.size() > 1) {
-      for (int i = 1; i < x_dim.size(); i++) {
-        if ((idx_reduce < reduce_dim_temp.size()) &&
-            (i == reduce_dim_temp[idx_reduce])) {
-          int result =
-              reduce_dim_temp[idx_reduce] - reduce_dim[reduce_dim.size() - 1];
-          bool is_equal = ((result - num) == 1);
-          if (is_equal) {
-            x_new_dim[x_new_dim.size() - 1] *= x_dim[i];
-            num++;
-          } else {
-            reduce_dim.push_back(reduce_dim_temp[idx_reduce] - num);
-            x_new_dim.push_back(x_dim[i]);
-          }
-          idx_reduce++;
-        } else {
-          x_new_dim.push_back(x_dim[i]);
-        }
-      }
-    } else {
-      x_new_dim = x_dim;
-    }
-
-    // update x_dim
-    x_dim = x_new_dim;
-    std::vector<int>().swap(x_new_dim);
-
-    std::vector<int> reduce_dim_new;
-    int is_reduced = 0;
-    for (auto e : reduce_dim) {
-      is_reduced |= 1 << e;
-    }
-
-    std::vector<int>().swap(reduce_dim);
-
-    for (int i = 0; i < x_dim.size(); i++) {
-      if ((i == 0) || (((is_reduced >> i) ^ (is_reduced >> (i - 1))) & 1)) {
-        x_new_dim.push_back(x_dim[i]);
-        if ((is_reduced >> i) & 1)
-          reduce_dim_new.push_back(x_new_dim.size() - 1);
-      } else {
-        x_new_dim[x_new_dim.size() - 1] *= x_dim[i];
-      }
-    }
-
-    x_dim = x_new_dim;
-    reduce_dim = reduce_dim_new;
-
-    int x_rank = static_cast<int>(x_dim.size());
-    std::set<int> left_set;
-
-    for (int i = 0; i < x_rank; ++i) {
-      left_set.insert(i);
-    }
-
-    for (auto e : reduce_dim) {
-      left_set.erase(e);
-    }
-
-    left_dim.assign(left_set.begin(), left_set.end());
-
-    // if the last dim gets involved in reduction
-    reduce_last_dim = (reduce_dim.back() == x_dim.size() - 1);
-  }
-
-  // set x_strides, reduce_strides, left_strides for reduceLastDim and reduceAny
-  // eg: x_dim = [8, 6], reduce_dim = [0], left_dim = [1]
-  //     --SetStrides--> x_strides= [6,1], reduce_strides = [1],
-  //     left_strides = [1]
-  void SetStrides() {
-    std::vector<int> idx_dim;
-    for (int i = 0; i < x_dim.size(); i++) {
-      idx_dim.push_back(i);
-    }
-
-    x_strides = details::GetDimStrides(x_dim, idx_dim);
-    reduce_strides = details::GetDimStrides(x_dim, reduce_dim);
-    left_strides = details::GetDimStrides(x_dim, left_dim);
-    reduce_num = reduce_strides[0] * x_dim[reduce_dim[0]];
-
-    left_num = 1;
-    if (left_dim.size()) {
-      left_num = left_strides[0] * x_dim[left_dim[0]];
-    }
-  }
-
-  // get the reduceType
-  // eg: x_dim = [8, 6] reduce_dim = [0] --> ReduceHigherDim -->reduceFirstDim
-  //     x_dim = [8, 6] reduce_dim = [1] --> reduceLastDim
-  //     x_dim = [8] reduce_dim = [0] --> reduceAll
-  //     x_dim = [8, 6, 4, 2] reduce_dim = [0, 2] --> reduceAny
-  void SetReduceType() {
-    int rank = x_dim.size();
-    int reduce_rank = reduce_dim.size();
-    bool is_last_dim =
-        (rank == 2) && (reduce_rank == 1) && (reduce_dim[0] == 1);
-    if (rank == reduce_rank || is_last_dim) {
-      reduce_type = static_cast<int>(ReduceType::kReduceLastDim);
-    } else if (reduce_rank == 1) {
-// ReduceFirstDim and reduceSecondDim
-#ifdef PADDLE_WITH_XPU2
-      if (reduce_dim[0] == 0) {
-        reduce_type = static_cast<int>(ReduceType::kReduceHigherDim);
-      } else {
-        reduce_type = static_cast<int>(ReduceType::kReduceAny);
-      }
-#else
-      reduce_type = static_cast<int>(ReduceType::kReduceHigherDim);
-#endif
-    } else {
-      reduce_type = static_cast<int>(ReduceType::kReduceAny);
-    }
-  }
-
-  void SetBlockDimForReduceAny(dim3* block_dim, dim3* grid_dim) {
-    constexpr int min_reduce_num_per_thread = 16;
-    constexpr int max_reduce_num_per_thread = 256;
-    constexpr int max_num_threads = kps::details::kReduceMaxThread;
-
-    // set block size.
-    // 1. If reduce_last_dim == true, all the threads whose threadIdx.y are same
-    //    will process the reduction for one output.
-    //    The number of output for one block is blockDim.y;
-    // 2. If reduce_last_dim == false, different threadIdx.x will process
-    //    different reduction and gets the output separately. If it is
-    //    necessary, it should reduce in block y.
-    //    The number of output for one block is blockDim.x;
-    int block_x, block_y;
-    int grid_num, reduce_num_per_thread;
-    if (reduce_last_dim) {
-      block_x = details::GetBlockDim(reduce_num);
-      block_y = details::GetBlockDim(left_num);
-      block_dim->x = block_x;
-      block_dim->y =
-          std::min(block_y, static_cast<int>(max_num_threads / block_dim->x));
-      grid_num = details::AlignUp(left_num, block_dim->y);
-      reduce_num_per_thread = details::AlignUp(reduce_num, block_dim->x);
-    } else {
-      block_x = details::GetBlockDim(left_num);
-      block_y = details::GetBlockDim(reduce_num);
-      block_dim->x = std::min(block_x, 32);
-      block_dim->y =
-          std::min(block_y, static_cast<int>(max_num_threads / block_dim->x));
-      block_dim->x =
-          std::min(block_x, static_cast<int>(max_num_threads / block_dim->y));
-      grid_num = details::AlignUp(left_num, block_dim->x);
-      reduce_num_per_thread = details::AlignUp(reduce_num, block_dim->y);
-    }
-    int device_id = paddle::platform::GetCurrentDeviceId();
-    int max_mp = paddle::platform::GetGPUMultiProcessors(device_id);
-    int max_threads_per_mp =
-        paddle::platform::GetGPUMaxThreadsPerMultiProcessor(device_id);
-    int max_threads = max_threads_per_mp * max_mp;
-    int num_threads = block_dim->x * block_dim->y;
-    int max_num_blocks = max_threads / num_threads;
-
-    // set grid size.
-    // Whether to set grid.y larger than 1, there are 3 following rules:
-    // 1. The number that each thread process should no less than
-    //    min_reduce_num_per_threadbut no more than max_reduce_num_per_thread;
-    // 2. It should maximize the utilization of SM.
-    // So we choose the minimum between input_split_num_1 and input_split_num_3
-    // to make each thread process as mush data as possible. Meanwhile,
-    // the number cannot be larger than max_reduce_num_per_thread, so we
-    // choose the maximum between the result above and input_split_num_2.
-    int input_split_num_1 =
-        details::AlignUp(reduce_num_per_thread, min_reduce_num_per_thread);
-    int input_split_num_2 =
-        details::AlignUp(reduce_num_per_thread, max_reduce_num_per_thread);
-    int input_split_num_3 = details::AlignUp(max_num_blocks, grid_num);
-
-    grid_dim->x = grid_num;
-    grid_dim->y = std::max(std::min(input_split_num_1, input_split_num_3),
-                           input_split_num_2);
-    // if grid.y > 1, we need launch reduce kernel again.
-    if (grid_dim->y > 1) {
-      should_reduce_again = true;
-    }
-  }
-
-  // set block and grid for launch kernel
-  // for ReduceHigherDim: if block is enough -> splite reduce_num
-  //                     else init block(32, 1) grid(block_num, 1)
-  // for others: block(block_num, 1) , grid(left_num, 1)
-  void SetBlockDimForHigher(dim3* block_dim, dim3* grid_dim) {
-    int last_dim_num = x_dim.back();
-    // update left_num
-    int grid_z = left_num / last_dim_num;
-    left_num = last_dim_num;
-    grid_dim->z = grid_z;
-    int device_id = paddle::platform::GetCurrentDeviceId();
-    int max_mp = paddle::platform::GetGPUMultiProcessors(device_id);
-    int max_threads_per_mp =
-        paddle::platform::GetGPUMaxThreadsPerMultiProcessor(device_id);
-    int max_threads = max_threads_per_mp * max_mp;
-    // init
-    int num_block = (max_threads / left_num);
-    block_dim->x = details::GetBlockDim(left_num);
-    grid_dim->x = details::AlignUp(left_num, block_dim->x);
-    blocking_size = reduce_num;
-
-    if (num_block > 1 && reduce_num >= REDUCE_SPLIT_BOUNDARY) {
-      blocking_size = details::GetLastPow2(reduce_num / num_block);
-      if (blocking_size <= 1) {
-        blocking_size = details::GetLastPow2(sqrt(reduce_num));
-      } else if (blocking_size * 2 < reduce_num) {
-        blocking_size *= 2;
-      }
-      should_reduce_again = true;
-      grid_dim->y = details::AlignUp(reduce_num, blocking_size);
-    }
-  }
-
-  void SetBlockDim() {
-    // init
-    int block_num = details::GetBlockDim(reduce_num);
-    should_reduce_again = false;
-    dim3 block_dim(block_num, 1, 1);
-    dim3 grid_dim(left_num, 1, 1);
-    blocking_size = reduce_num;
-#ifdef PADDLE_WITH_XPU2
-    if (reduce_last_dim) {
-      block_dim.x = 128;
-      block_dim.y = reduce_num;
-      grid_dim.x = 8;
-      grid_dim.y = 1;
-    } else {
-      block_dim.x = 128;
-      block_dim.y = left_num;
-      grid_dim.x = 8;
-      grid_dim.y = 1;
-    }
-#else
-    if (reduce_type == ReduceType::kReduceHigherDim) {
-      SetBlockDimForHigher(&block_dim, &grid_dim);
-    } else {
-      SetBlockDimForReduceAny(&block_dim, &grid_dim);
-    }
-#endif
-
-    block = block_dim;
-    grid = grid_dim;
-  }
-
- public:
-  std::vector<int> reduce_dims_origin;
-  std::vector<int> reduce_dim;
-  std::vector<int> x_dim;
-  std::vector<int> left_dim;
-  std::vector<int> x_strides;
-  std::vector<int> left_strides;
-  std::vector<int> reduce_strides;
-
-  int reduce_type;
-  int reduce_num;
-  int left_num;
-  int blocking_size;
-  bool should_reduce_again;
-  bool reduce_last_dim;
-
-  Ty* output_data;
-
-  dim3 block;
-  dim3 grid;
-};
-
-// when reduce_dim.size() == 1 and reduce_dim[0] == x_dim.size() - 1, or
-// when reduce_dim.size() != 1 and reduce_dim.size() != x_dim.size(), this
-// function will be used
-template <typename Tx,
-          typename Ty,
-          typename MPType,
-          typename ReduceOp,
-          typename TransformOp,
-          typename Calculator>
-__global__ void ReduceAnyKernel(const Tx* x,
-                                Ty* y,
-                                ReduceOp reducer,
-                                TransformOp transformer,
-                                MPType init,
-                                int reduce_num,
-                                int left_num,
-                                bool reduce_last_dim,
-                                const Calculator reduce_index_calculator,
-                                const Calculator left_index_calculator,
-                                const kps::DimConfig dim) {
-  int input_idx, left_idx, stride;
-  int block_size = 0;
-  bool need_store = true;
-  int loop_left = 0;
-  int tid = 0;
-  // the last dim gets involved in reduction
-  int store_offset = 0;
-  int stride_left = 0;
-  if (reduce_last_dim) {
-    auto block = ReduceIndexMapping<true>(dim);
-    input_idx = block.BlockIdY() * block.BlockDimX();
-    left_idx = block.BlockIdX() * block.BlockDimY() + THREAD_ID_Y;
-    stride = block.GridDimY() * block.BlockDimX();
-    block_size = block.BlockDimX();
-    need_store = (THREAD_ID_X == 0) && (left_idx < left_num);
-    store_offset = block.BlockIdY() * left_num + left_idx;
-    loop_left = min(block.GetLoopSize(), left_num - left_idx);
-    stride_left = 1;
-    tid = threadIdx.x;
-  } else {
-    auto block = ReduceIndexMapping<false>(dim);
-    input_idx = block.BlockIdY() * block.BlockDimY();
-    left_idx = block.BlockIdX() * block.BlockDimX() + THREAD_ID_X;
-    stride = block.GridDimY() * block.BlockDimY();
-    block_size = block.BlockDimY();
-    need_store = (THREAD_ID_Y == 0) && (left_idx < left_num);
-    loop_left = min(block.GetLoopSize(), left_num - left_idx);
-    stride_left = block.BlockDimX() * block.GridDimX();
-    store_offset = block.BlockIdY() * left_num + left_idx;
-    tid = threadIdx.y;
-  }
-  // calculate the offset, means the addr where each thread really start.
-  // 1. reduce for each thread
-  MPType input_compute[REDUCE_VEC_SIZE];
-  Tx input_reg[REDUCE_VEC_SIZE];
-  for (int i = 0; i < loop_left; i += stride_left) {
-    int input_offset = left_index_calculator(left_idx + i);
-    const Tx* input = x + input_offset;
-    MPType reduce_var = init;
-    // load REDUCE_VEC_SIZE data once, and then compute
-    int bound = reduce_num - (REDUCE_VEC_SIZE - 1) * stride;
-    for (; input_idx + block_size < bound;
-         input_idx += REDUCE_VEC_SIZE * stride) {
-      kps::ReadDataReduce<Tx,
-                          Tx,
-                          1,
-                          REDUCE_VEC_SIZE,
-                          1,
-                          1,
-                          Calculator,
-                          kps::IdentityFunctor<Tx>,
-                          false>(&input_reg[0],
-                                 input,
-                                 input_idx,
-                                 reduce_index_calculator,
-                                 1,
-                                 reduce_num,
-                                 1,
-                                 stride,
-                                 kps::IdentityFunctor<Tx>(),
-                                 reduce_last_dim);
-      kps::ElementwiseUnary<Tx, MPType, REDUCE_VEC_SIZE, 1, 1, TransformOp>(
-          &input_compute[0], &input_reg[0], transformer);
-      kps::Reduce<MPType,
-                  REDUCE_VEC_SIZE,
-                  1,
-                  1,
-                  ReduceOp,
-                  kps::details::ReduceMode::kLocalMode>(
-          &reduce_var, &input_compute[0], reducer, reduce_last_dim);
-    }
-
-    kps::Init<MPType, REDUCE_VEC_SIZE>(&input_compute[0], init);
-    kps::ReadDataReduce<Tx,
-                        MPType,
-                        1,
-                        REDUCE_VEC_SIZE,
-                        1,
-                        1,
-                        Calculator,
-                        TransformOp,
-                        true>(&input_compute[0],
-                              input,
-                              input_idx,
-                              reduce_index_calculator,
-                              1,
-                              reduce_num - input_idx,
-                              1,
-                              stride,
-                              transformer,
-                              reduce_last_dim);
-    kps::Reduce<MPType,
-                REDUCE_VEC_SIZE,
-                1,
-                1,
-                ReduceOp,
-                kps::details::ReduceMode::kLocalMode>(
-        &reduce_var, &input_compute[0], reducer, reduce_last_dim);
-
-    kps::Reduce<MPType, 1, 1, 1, ReduceOp, kps::details::kGlobalMode>(
-        &reduce_var, &reduce_var, reducer, reduce_last_dim);
-    if (need_store) {
-      y[store_offset + i] = static_cast<Ty>(reduce_var);
-    }
-  }
-}
-
-template <typename Tx,
-          typename Ty,
-          typename MPType,
-          typename ReduceOp,
-          typename TransformOp>
-__global__ void ReduceHigherDimKernel(const Tx* x,
-                                      Ty* y,
-                                      ReduceOp reducer,
-                                      TransformOp transformer,
-                                      MPType init,
-                                      int reduce_num,
-                                      int left_num,
-                                      int blocking_size,
-                                      const kps::DimConfig dim) {
-  // when reduce_dim.size() == 1 and reduce_dim[0] != x_dim.size() - 1, this
-  // function will be used
-  auto block = ReduceIndexMapping<false>(dim);
-  int idy = block.BlockIdY() * blocking_size;
-  int idx = block.BlockIdX() * block.BlockDimX();
-  int idz = BLOCK_ID_Z * left_num;
-  int stride = dim.split_num_x * dim.deal_size_x;
-  int size = left_num - dim.rem_x;
-  int loop_size = min(reduce_num - idy, blocking_size);
-  int store_offset = block.BlockIdY() * left_num + idz * block.GridDimY();
-  int block_offset = idy * left_num + idz * reduce_num;
-  const Tx* input = x + block_offset;
-  Tx reduce_input;
-  for (; idx < size; idx += stride) {
-    MPType reduce_var = init;
-    MPType reduce_compute = init;
-    for (int loop_idx = 0; loop_idx < loop_size; ++loop_idx) {
-      kps::ReadData<Tx, Tx, 1, 1, 1, false>(&reduce_input,
-                                            input + loop_idx * left_num + idx,
-                                            block.BlockDimX(),
-                                            1,
-                                            1,
-                                            left_num);
-      kps::ElementwiseUnary<Tx, MPType, REDUCE_VEC_SIZE, 1, 1, TransformOp>(
-          &reduce_compute, &reduce_input, transformer);
-      kps::Reduce<MPType,
-                  1,
-                  1,
-                  1,
-                  ReduceOp,
-                  kps::details::ReduceMode::kLocalMode>(
-          &reduce_var, &reduce_compute, reducer, false);
-    }
-    Ty result = static_cast<Ty>(reduce_var);
-    kps::WriteData<Ty, 1, 1, 1, false>(
-        y + store_offset + idx, &result, block.BlockDimX());
-  }
-
-  if (idx < left_num) {
-    MPType reduce_var = init;
-    MPType reduce_compute = init;
-    for (int loop_idx = 0; loop_idx < loop_size; ++loop_idx) {
-      kps::ReadData<Tx, Tx, 1, 1, 1, true>(&reduce_input,
-                                           input + loop_idx * left_num + idx,
-                                           dim.rem_x,
-                                           1,
-                                           1,
-                                           left_num);
-      kps::ElementwiseUnary<Tx, MPType, REDUCE_VEC_SIZE, 1, 1, TransformOp>(
-          &reduce_compute, &reduce_input, transformer);
-      kps::Reduce<MPType,
-                  1,
-                  1,
-                  1,
-                  ReduceOp,
-                  kps::details::ReduceMode::kLocalMode>(
-          &reduce_var, &reduce_compute, reducer, false);
-    }
-    Ty result = static_cast<Ty>(reduce_var);
-    kps::WriteData<Ty, 1, 1, 1, true>(
-        y + store_offset + idx, &result, dim.rem_x);
-  }
-}
-
-template <typename Tx,
-          typename Ty,
-          typename MPType,
-          typename ReduceOp,
-          typename TransformOp>
-static void LaunchReduceKernel(const Tx* x_data,
-                               Ty* y_data,
-                               const ReduceOp& reducer,
-                               const TransformOp& transform,
-                               MPType init,
-                               gpuStream_t stream,
-                               ReduceConfig<Ty> config) {
-  if (config.reduce_type == kReduceLastDim) {
-    int stride_reduce = 1;
-    int stride_left = config.reduce_num;
-    // for higher performance
-    auto reduce_index_calculator = OneDimIndexCal(stride_reduce);
-    auto left_index_calculator = OneDimIndexCal(stride_left);
-
-    kps::DimConfig dim = kps::DimConfig(config.grid.x,
-                                        config.grid.y,
-                                        config.grid.z,
-                                        config.block.x,
-                                        config.block.y,
-                                        0);
-    dim.SetRem(config.reduce_num % config.block.x, 0, 0);
-
-#ifdef PADDLE_WITH_XPU2
-    ReduceAnyKernel<Tx,
-                    Ty,
-                    MPType,
-                    ReduceOp,
-                    TransformOp,
-                    OneDimIndexCal><<<8, 128, stream>>>(x_data,
-                                                        config.output_data,
-                                                        reducer,
-                                                        transform,
-                                                        init,
-                                                        config.reduce_num,
-                                                        config.left_num,
-                                                        config.reduce_last_dim,
-                                                        reduce_index_calculator,
-                                                        left_index_calculator,
-                                                        dim);
-#else
-    ReduceAnyKernel<Tx,
-                    Ty,
-                    MPType,
-                    ReduceOp,
-                    TransformOp,
-                    OneDimIndexCal><<<config.grid, config.block, 0, stream>>>(
-        x_data,
-        config.output_data,
-        reducer,
-        transform,
-        init,
-        config.reduce_num,
-        config.left_num,
-        config.reduce_last_dim,
-        reduce_index_calculator,
-        left_index_calculator,
-        dim);
-#endif
-
-  } else {
-    int reduce_rank = config.reduce_strides.size();
-    int left_rank = config.left_strides.size();
-    auto reduce_index_calculator = IndexCalculator(reduce_rank,
-                                                   config.reduce_dim,
-                                                   config.reduce_strides,
-                                                   config.x_strides);
-    auto left_index_calculator = IndexCalculator(
-        left_rank, config.left_dim, config.left_strides, config.x_strides);
-
-    kps::DimConfig dim = kps::DimConfig(config.grid.x,
-                                        config.grid.y,
-                                        config.grid.z,
-                                        config.block.x,
-                                        config.block.y,
-                                        0);
-    dim.SetRem(config.reduce_num % config.block.x, 0, 0);
-
-#ifdef PADDLE_WITH_XPU2
-    ReduceAnyKernel<Tx,
-                    Ty,
-                    MPType,
-                    ReduceOp,
-                    TransformOp,
-                    IndexCalculator><<<8, 128, stream>>>(
-        x_data,
-        config.output_data,
-        reducer,
-        transform,
-        init,
-        config.reduce_num,
-        config.left_num,
-        config.reduce_last_dim,
-        reduce_index_calculator,
-        left_index_calculator,
-        dim);
-#else
-    ReduceAnyKernel<Tx,
-                    Ty,
-                    MPType,
-                    ReduceOp,
-                    TransformOp,
-                    IndexCalculator><<<config.grid, config.block, 0, stream>>>(
-        x_data,
-        config.output_data,
-        reducer,
-        transform,
-        init,
-        config.reduce_num,
-        config.left_num,
-        config.reduce_last_dim,
-        reduce_index_calculator,
-        left_index_calculator,
-        dim);
-#endif
-  }
-
-  if (config.should_reduce_again) {
-    dim3 block;
-    dim3 grid;
-    if (config.reduce_last_dim) {
-      block = dim3(32, 1, 1);
-      grid = dim3(details::AlignUp(config.left_num, 32), 1, 1);
-    } else {
-      block = dim3(config.block.x, 1, 1);
-      grid = dim3(config.grid.x, 1, config.grid.z);
-    }
-
-    auto last_index = OneDimIndexCal(1);
-    auto first_index = OneDimIndexCal(config.left_num);
-    kps::DimConfig dim =
-        kps::DimConfig(grid.x, grid.y, grid.z, block.x, config.grid.y, 0);
-    dim.SetRem(config.left_num % block.x, 0, 0);
-#ifdef PADDLE_WITH_XPU2
-    ReduceHigherDimKernel<Ty,
-                          Ty,
-                          MPType,
-                          ReduceOp,
-                          kps::IdentityFunctor<Ty, MPType>><<<8, 128, stream>>>(
-        config.output_data,
-        y_data,
-        reducer,
-        kps::IdentityFunctor<Ty, MPType>(),
-        init,
-        config.grid.y,
-        config.left_num,
-        config.grid.y,
-        dim);
-#else
-    ReduceHigherDimKernel<
-        Ty,
-        Ty,
-        MPType,
-        ReduceOp,
-        kps::IdentityFunctor<Ty, MPType>><<<grid, block, 0, stream>>>(
-        config.output_data,
-        y_data,
-        reducer,
-        kps::IdentityFunctor<Ty, MPType>(),
-        init,
-        config.grid.y,
-        config.left_num,
-        config.grid.y,
-        dim);
-#endif
-  }
-}
-
-template <typename Tx,
-          typename Ty,
-          template <typename> class ReduceOp,
-          typename TransformOp>
-static typename std::enable_if<!std::is_same<Tx, phi::dtype::float16>::value,
-                               void>::type
-CubTensorReduceImpl(const Tx* x_data,
-                    Ty* y_data,
-                    const TransformOp& transform,
-                    int reduce_num,
-                    const paddle::platform::Place& place,
-                    gpuStream_t stream) {
-  auto reducer = ReduceOp<Ty>();
-  cub::TransformInputIterator<Ty, TransformOp, const Tx*> trans_x(x_data,
-                                                                  transform);
-  size_t temp_storage_bytes = 0;
-  cub::DeviceReduce::Reduce(nullptr,
-                            temp_storage_bytes,
-                            trans_x,
-                            y_data,
-                            reduce_num,
-                            reducer,
-                            reducer.initial(),
-                            stream);
-
-  phi::DenseTensor tmp = phi::DenseTensor(
-      phi::make_intrusive<paddle::experimental::SharedStorage>(place),
-      phi::DenseTensorMeta(
-          phi::DataType::UINT8,
-          phi::make_ddim({static_cast<int64_t>(temp_storage_bytes)})));
-
-  auto* temp_storage = tmp.mutable_data<uint8_t>(place);
-
-  cub::DeviceReduce::Reduce(temp_storage,
-                            temp_storage_bytes,
-                            trans_x,
-                            y_data,
-                            reduce_num,
-                            reducer,
-                            reducer.initial(),
-                            stream);
-}
-
-template <typename Tx,
-          typename Ty,
-          template <typename> class ReduceOp,
-          typename TransformOp>
-static typename std::enable_if<std::is_same<Tx, phi::dtype::float16>::value,
-                               void>::type
-CubTensorReduceImpl(const Tx* x_data,
-                    Ty* y_data,
-                    const TransformOp& transform,
-                    int reduce_num,
-                    const paddle::platform::Place& place,
-                    gpuStream_t stream) {
-  PADDLE_THROW(phi::errors::InvalidArgument(
-      "Tx should not be float16 when using cub::DeviceReduce::Reduce()."));
-}
-
-template <typename Tx,
-          typename Ty,
-          template <typename> class ReduceOp,
-          typename TransformOp>
-void TensorReduceImpl(const phi::GPUContext& dev_ctx,
-                      const phi::DenseTensor& x,
-                      phi::DenseTensor* y,
-                      const TransformOp& transform,
-                      const std::vector<int>& origin_reduce_dims,
-                      gpuStream_t stream) {
-  y->mutable_data<Ty>(x.place());
-
-  auto x_dim = phi::vectorize<int>(x.dims());
-  auto config = ReduceConfig<Ty>(origin_reduce_dims, x_dim);
-  config.Run();
-  int numel = x.numel();
-  // after config.run()
-  // SetOutputData for ReduceHigherDim when should_reduce_again is true,
-  // temp_output should be stored temp_data in output_data space or stored in
-  // y_data;
-
-  phi::DDim tmp_ddim;
-  phi::DenseTensor tmp = phi::DenseTensor(
-      phi::make_intrusive<paddle::experimental::SharedStorage>(y->place()),
-      phi::DenseTensorMeta(y->dtype(), tmp_ddim, y->layout()));
-
-  auto x_data = x.data<Tx>();
-  auto y_data = y->data<Ty>();
-
-  if (config.reduce_num == 1) {
-    std::vector<const DenseTensor*> inputs = {&x};
-    std::vector<DenseTensor*> outputs = {y};
-    funcs::ElementwiseKernel<Ty>(dev_ctx, inputs, &outputs, transform);
-    return;
-  }
-
-  config.SetOutputData(y_data, x.place(), &tmp);
-  constexpr bool kIsTxFP16 = std::is_same<Tx, phi::dtype::float16>::value;
-  bool use_cub_reduce = config.reduce_num == numel && !kIsTxFP16;
-  if (use_cub_reduce) {
-    CubTensorReduceImpl<Tx, Ty, ReduceOp, TransformOp>(
-        x_data, y_data, transform, config.reduce_num, x.place(), stream);
-    return;
-  }
-
-  using MPType = typename kps::details::MPTypeTrait<Ty>::Type;
-  auto reducer = ReduceOp<MPType>();
-  // launch ReduceHigherDimKernel
-  // when reduce_dim.size() == 1 and reduce_dim[0] != x_dim.size() - 1, this
-  // function will be used
-  // eg: x_dim = {nz, ny, nx}, nx != 1, axis can be 0 or 1
-  //     if axis = 1 then grid.z = nz, grid.y = ny / block_size, grid.x = nx /
-  //     32
-  //     else grid.z = 1, grid.y = ny / block_size, grid.x = nx /32
-  if (config.reduce_type == ReduceType::kReduceHigherDim) {
-    kps::DimConfig dim = kps::DimConfig(config.grid.x,
-                                        config.grid.y,
-                                        config.grid.z,
-                                        config.block.x,
-                                        config.blocking_size,
-                                        0);
-    dim.SetRem(config.left_num % config.block.x,
-               config.reduce_num % config.blocking_size,
-               0);
-
-#ifdef PADDLE_WITH_XPU2
-    ReduceHigherDimKernel<Tx,
-                          Ty,
-                          MPType,
-                          ReduceOp<MPType>,
-                          TransformOp><<<8, 128, stream>>>(x_data,
-                                                           config.output_data,
-                                                           reducer,
-                                                           transform,
-                                                           reducer.initial(),
-                                                           config.reduce_num,
-                                                           config.left_num,
-                                                           config.blocking_size,
-                                                           dim);
-#else
-    ReduceHigherDimKernel<
-        Tx,
-        Ty,
-        MPType,
-        ReduceOp<MPType>,
-        TransformOp><<<config.grid, config.block, 0, stream>>>(
-        x_data,
-        config.output_data,
-        reducer,
-        transform,
-        reducer.initial(),
-        config.reduce_num,
-        config.left_num,
-        config.blocking_size,
-        dim);
-#endif
-
-    if (config.should_reduce_again) {
-      dim3 block = dim3(config.block.x, 1, 1);
-      dim3 grid = dim3(config.grid.x, 1, config.grid.z);
-      kps::DimConfig dim2 =
-          kps::DimConfig(grid.x, grid.y, grid.z, block.x, config.grid.y, 0);
-      dim2.SetRem(config.left_num % config.block.x, 0, 0);
-
-#ifdef PADDLE_WITH_XPU2
-      ReduceHigherDimKernel<
-          Ty,
-          Ty,
-          MPType,
-          ReduceOp<MPType>,
-          kps::IdentityFunctor<Ty, MPType>><<<8, 128, stream>>>(
-          config.output_data,
-          y_data,
-          reducer,
-          kps::IdentityFunctor<Ty, MPType>(config.grid.y),
-          reducer.initial(),
-          config.grid.y,
-          config.left_num,
-          config.grid.y,
-          dim2);
-#else
-      ReduceHigherDimKernel<
-          Ty,
-          Ty,
-          MPType,
-          ReduceOp<MPType>,
-          kps::IdentityFunctor<Ty, MPType>><<<grid, block, 0, stream>>>(
-          config.output_data,
-          y_data,
-          reducer,
-          kps::IdentityFunctor<Ty, MPType>(config.grid.y),
-          reducer.initial(),
-          config.grid.y,
-          config.left_num,
-          config.grid.y,
-          dim2);
-#endif
-    }
-    return;
-  }
-
-  // when reduce_dim.size() == 1 and reduce_dim[0] == x_dim.size() - 1, or
-  // when reduce_dim.size() != 1 and reduce_dim.size() != x_dim.size(), this
-  // function will be used
-  LaunchReduceKernel<Tx, Ty, MPType, ReduceOp<MPType>, TransformOp>(
-      x_data, y_data, reducer, transform, reducer.initial(), stream, config);
-}
-
-}  // namespace kernels
 
 template <typename T,
           template <typename> class ReduceOp,
           template <typename, typename> class TransformOp>
-void Reduce(const GPUContext& dev_ctx,
+void Reduce(const KPDevice& dev_ctx,
             const DenseTensor& x,
             bool reduce_all,
             const std::vector<int64_t>& dims,
@@ -1220,15 +32,13 @@ void Reduce(const GPUContext& dev_ctx,
             DataType out_dtype,
             DenseTensor* out) {
   std::vector<int> reduce_dims =
-      phi::kernels::details::GetReduceDim(dims, x.dims().size(), reduce_all);
+      phi::funcs::details::GetReduceDim(dims, x.dims().size(), reduce_all);
 
   int reduce_num = 1;
   for (auto i : reduce_dims) {
     reduce_num *= (x.dims())[i];
   }
 
-  gpuStream_t stream = dev_ctx.stream();
-
   if (out_dtype != phi::DataType::UNDEFINED && out_dtype != x.dtype()) {
     auto tmp_tensor = phi::Cast<T>(dev_ctx, x, out_dtype);
     PD_VISIT_BOOL_AND_FLOATING_AND_COMPLEX_AND_3_TYPES(
@@ -1236,29 +46,23 @@ void Reduce(const GPUContext& dev_ctx,
         phi::DataType::INT64,
         phi::DataType::FLOAT16,
         out_dtype,
-        "TensorReduceImpl",
+        "ReduceKernel",
         ([&] {
           using MPType = typename kps::details::MPTypeTrait<data_t>::Type;
-          phi::kernels::TensorReduceImpl<data_t,
-                                         data_t,
-                                         ReduceOp,
-                                         TransformOp<data_t, MPType>>(
+          phi::funcs::ReduceKernel<data_t,
+                                   data_t,
+                                   ReduceOp,
+                                   TransformOp<data_t, MPType>>(
               dev_ctx,
               tmp_tensor,
               out,
               TransformOp<data_t, MPType>(reduce_num),
-              reduce_dims,
-              stream);
+              reduce_dims);
         }));
   } else {
     using MPType = typename kps::details::MPTypeTrait<T>::Type;
-    phi::kernels::TensorReduceImpl<T, T, ReduceOp, TransformOp<T, MPType>>(
-        dev_ctx,
-        x,
-        out,
-        TransformOp<T, MPType>(reduce_num),
-        reduce_dims,
-        stream);
+    phi::funcs::ReduceKernel<T, T, ReduceOp, TransformOp<T, MPType>>(
+        dev_ctx, x, out, TransformOp<T, MPType>(reduce_num), reduce_dims);
   }
 }
 }  // namespace phi
diff --git a/paddle/phi/kernels/gpu/reduce_all_kernel.cu b/paddle/phi/kernels/gpu/reduce_all_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..2963d3f206c2d7737e1ca13c91f69ae94a6a6f77
--- /dev/null
+++ b/paddle/phi/kernels/gpu/reduce_all_kernel.cu
@@ -0,0 +1,36 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/reduce_all_kernel.h"
+
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/gpu/reduce.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void AllRawKernel(const Context& dev_ctx,
+                  const DenseTensor& x,
+                  const std::vector<int64_t>& dims,
+                  bool keep_dim,
+                  bool reduce_all,
+                  DenseTensor* out) {
+  auto out_dtype = x.dtype();
+  phi::Reduce<T, kps::LogicalAndFunctor, kps::IdentityFunctor>(
+      dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out);
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(all_raw, GPU, ALL_LAYOUT, phi::AllRawKernel, bool) {}
diff --git a/paddle/phi/kernels/gpu/reduce_any_kernel.cu b/paddle/phi/kernels/gpu/reduce_any_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..39c8cbe442cbd33db5da3c4311abd68641aafcd7
--- /dev/null
+++ b/paddle/phi/kernels/gpu/reduce_any_kernel.cu
@@ -0,0 +1,36 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/reduce_any_kernel.h"
+
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/gpu/reduce.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void AnyRawKernel(const Context& dev_ctx,
+                  const DenseTensor& x,
+                  const std::vector<int64_t>& dims,
+                  bool keep_dim,
+                  bool reduce_all,
+                  DenseTensor* out) {
+  auto out_dtype = x.dtype();
+  phi::Reduce<T, kps::LogicalOrFunctor, kps::IdentityFunctor>(
+      dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out);
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(any_raw, GPU, ALL_LAYOUT, phi::AnyRawKernel, bool) {}
diff --git a/paddle/phi/kernels/gpu/reduce_grad.h b/paddle/phi/kernels/gpu/reduce_grad.h
index a2b1c8631c7b44fefff5871515b77d9a67d992e2..d21c8a3fa46f81c046c722db50ac62fb57cf64f4 100644
--- a/paddle/phi/kernels/gpu/reduce_grad.h
+++ b/paddle/phi/kernels/gpu/reduce_grad.h
@@ -23,6 +23,7 @@
 #include <set>
 #include <vector>
 
+#include "paddle/phi/api/ext/dispatch.h"
 #include "paddle/phi/kernels/funcs/broadcast_function.h"
 
 namespace phi {
diff --git a/paddle/phi/kernels/gpu/reduce_max_kernel.cu b/paddle/phi/kernels/gpu/reduce_max_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..98c3986c51dd6829287f5316ae9eb52f328372ab
--- /dev/null
+++ b/paddle/phi/kernels/gpu/reduce_max_kernel.cu
@@ -0,0 +1,37 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/reduce_max_kernel.h"
+
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/gpu/reduce.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void MaxRawKernel(const Context& dev_ctx,
+                  const DenseTensor& x,
+                  const std::vector<int64_t>& dims,
+                  bool keep_dim,
+                  bool reduce_all,
+                  DenseTensor* out) {
+  auto out_dtype = x.dtype();
+  phi::Reduce<T, kps::MaxFunctor, kps::IdentityFunctor>(
+      dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out);
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(
+    max_raw, GPU, ALL_LAYOUT, phi::MaxRawKernel, float, double, int, int64_t) {}
diff --git a/paddle/phi/kernels/gpu/reduce_min_kernel.cu b/paddle/phi/kernels/gpu/reduce_min_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..ba37d54895d0d079a4153775ad80314be5a043ba
--- /dev/null
+++ b/paddle/phi/kernels/gpu/reduce_min_kernel.cu
@@ -0,0 +1,37 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/reduce_min_kernel.h"
+
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/gpu/reduce.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void MinRawKernel(const Context& dev_ctx,
+                  const DenseTensor& x,
+                  const std::vector<int64_t>& dims,
+                  bool keep_dim,
+                  bool reduce_all,
+                  DenseTensor* out) {
+  auto out_dtype = x.dtype();
+  phi::Reduce<T, kps::MinFunctor, kps::IdentityFunctor>(
+      dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out);
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(
+    min_raw, GPU, ALL_LAYOUT, phi::MinRawKernel, float, double, int, int64_t) {}
diff --git a/paddle/phi/kernels/gpu/reduce_prod_kernel.cu b/paddle/phi/kernels/gpu/reduce_prod_kernel.cu
index 14084d0f4f3c6fbd4edeb335e15704ce2b4e6e15..278d4a6e5ab79a7519e1052a2d05c6ecda62692f 100644
--- a/paddle/phi/kernels/gpu/reduce_prod_kernel.cu
+++ b/paddle/phi/kernels/gpu/reduce_prod_kernel.cu
@@ -12,10 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/reduce_prod_kernel.h"
 
+#include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/gpu/reduce.h"
-#include "paddle/phi/kernels/reduce_prod_kernel.h"
 
 namespace phi {
 
diff --git a/paddle/phi/kernels/gpu/reduce_sum_grad_kernel.cu b/paddle/phi/kernels/gpu/reduce_sum_grad_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..9f4ddc3cf37a744355f6f79b7cd18b3d06b80062
--- /dev/null
+++ b/paddle/phi/kernels/gpu/reduce_sum_grad_kernel.cu
@@ -0,0 +1,90 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/reduce_sum_grad_kernel.h"
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/reduce_function.h"
+#include "paddle/phi/kernels/gpu/reduce_grad.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void ReduceSumGradKernel(const Context& dev_ctx,
+                         const DenseTensor& x,
+                         const DenseTensor& out_grad,
+                         const std::vector<int64_t>& dims,
+                         bool keep_dim,
+                         bool reduce_all,
+                         DataType in_dtype,
+                         DataType out_dtype,
+                         DenseTensor* x_grad) {
+  auto* in_x = &x;
+  auto* d_out = &out_grad;
+  auto* d_x = x_grad;
+
+  auto pt_out_dtype = in_dtype;
+
+  // get reduce_dim and reduce_num for reduce_mean_grad
+  int dim_size = in_x->dims().size();
+  std::vector<int> reduce_dims =
+      funcs::details::GetReduceDim(dims, dim_size, reduce_all);
+
+  auto update_dims = vectorize(d_x->dims());
+  int reduce_num = 1;
+  for (auto i : reduce_dims) {
+    reduce_num *= (in_x->dims())[i];
+    update_dims[i] = 1;
+  }
+  // make new tensor
+  DenseTensor new_d_out(d_out->dtype());
+  new_d_out.ShareDataWith(*d_out);
+  new_d_out.Resize(phi::make_ddim(update_dims));
+  if (in_dtype != DataType::UNDEFINED) {
+    dev_ctx.Alloc(d_x, in_dtype);
+  } else {
+    dev_ctx.Alloc(d_x, d_out->dtype());
+  }
+
+  auto pt_d_out = new_d_out;
+  auto pt_d_x = *d_x;
+  if (in_dtype == DataType::UNDEFINED) {
+    pt_out_dtype = d_out->dtype();
+  }
+  using MPType = typename kps::details::MPTypeTrait<T>::Type;
+
+  phi::ReduceGrad<T, kps::IdentityFunctor<T, MPType>>(
+      dev_ctx,
+      &pt_d_out,
+      &pt_d_x,
+      pt_out_dtype,
+      kps::IdentityFunctor<T, MPType>(reduce_num));
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(sum_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::ReduceSumGradKernel,
+                   bool,
+                   float,
+                   double,
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16,
+                   int,
+                   int64_t,
+                   phi::dtype::complex<float>,
+                   phi::dtype::complex<double>) {}
diff --git a/paddle/phi/kernels/gpu/roi_align_kernel.cu b/paddle/phi/kernels/gpu/roi_align_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..2f906fa4f663b6da65a3e986af2214dfb49f2ec0
--- /dev/null
+++ b/paddle/phi/kernels/gpu/roi_align_kernel.cu
@@ -0,0 +1,255 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/roi_align_kernel.h"
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/backends/gpu/gpu_launch_config.h"
+#include "paddle/phi/common/place.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/empty_kernel.h"
+
+#include "paddle/fluid/memory/memory.h"
+
+namespace phi {
+
+static constexpr int kNumCUDAThreads = 512;
+static constexpr int kNumMaxinumNumBlocks = 4096;
+static constexpr int kROISize = 4;
+
+static inline int NumBlocks(const int N) {
+  return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads,
+                  kNumMaxinumNumBlocks);
+}
+
+template <class T>
+__device__ T BilinearInterpolate(
+    const T* input_data, const int height, const int width, T y, T x) {
+  if (y < -1.0 || y > height || x < -1.0 || x > width) {
+    return 0;
+  }
+  y = y <= 0 ? 0 : y;
+  x = x <= 0 ? 0 : x;
+  int y_low = static_cast<int>(y);
+  int x_low = static_cast<int>(x);
+  int y_high;
+  int x_high;
+  if (y_low >= height - 1) {
+    y_high = y_low = height - 1;
+    y = static_cast<T>(y_low);
+  } else {
+    y_high = y_low + 1;
+  }
+  if (x_low >= width - 1) {
+    x_high = x_low = width - 1;
+    x = static_cast<T>(x_low);
+  } else {
+    x_high = x_low + 1;
+  }
+  T ly = y - y_low, lx = x - x_low;
+  T hy = 1. - ly, hx = 1. - lx;
+
+  T v1 = input_data[y_low * width + x_low];
+  T v2 = input_data[y_low * width + x_high];
+  T v3 = input_data[y_high * width + x_low];
+  T v4 = input_data[y_high * width + x_high];
+  T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
+
+  T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
+  return val;
+}
+
+template <class T>
+__global__ void GPUROIAlignForward(const int nthreads,
+                                   const T* input_data,
+                                   const T* input_rois,
+                                   const float spatial_scale,
+                                   const int channels,
+                                   const int height,
+                                   const int width,
+                                   const int pooled_height,
+                                   const int pooled_width,
+                                   const int sampling_ratio,
+                                   int* roi_batch_id_data,
+                                   T* output_data,
+                                   const bool continuous_coordinate) {
+  CUDA_KERNEL_LOOP(i, nthreads) {
+    int pw = i % pooled_width;
+    int ph = (i / pooled_width) % pooled_height;
+    int c = (i / pooled_width / pooled_height) % channels;
+    int n = i / pooled_width / pooled_height / channels;
+
+    const T* offset_input_rois = input_rois + n * kROISize;
+    int roi_batch_ind = roi_batch_id_data[n];
+
+    T roi_offset = continuous_coordinate ? static_cast<T>(0.5) : 0;
+    T roi_xmin = offset_input_rois[0] * spatial_scale - roi_offset;
+    T roi_ymin = offset_input_rois[1] * spatial_scale - roi_offset;
+    T roi_xmax = offset_input_rois[2] * spatial_scale - roi_offset;
+    T roi_ymax = offset_input_rois[3] * spatial_scale - roi_offset;
+
+    T roi_width = roi_xmax - roi_xmin;
+    T roi_height = roi_ymax - roi_ymin;
+    if (!continuous_coordinate) {
+      roi_width = max(roi_width, static_cast<T>(1.));
+      roi_height = max(roi_height, static_cast<T>(1.));
+    }
+
+    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
+    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
+
+    const T* offset_input_data =
+        input_data + (roi_batch_ind * channels + c) * height * width;
+
+    int roi_bin_grid_h = (sampling_ratio > 0)
+                             ? sampling_ratio
+                             : ceil(roi_height / pooled_height);
+    int roi_bin_grid_w =
+        (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
+    const T count = max(roi_bin_grid_h * roi_bin_grid_w, 1);
+    T output_val = 0;
+    for (int iy = 0; iy < roi_bin_grid_h; iy++) {
+      const T y = roi_ymin + ph * bin_size_h +
+                  static_cast<T>(iy + .5f) * bin_size_h /
+                      static_cast<T>(roi_bin_grid_h);
+      for (int ix = 0; ix < roi_bin_grid_w; ix++) {
+        const T x = roi_xmin + pw * bin_size_w +
+                    static_cast<T>(ix + .5f) * bin_size_w /
+                        static_cast<T>(roi_bin_grid_w);
+        T val = BilinearInterpolate(offset_input_data, height, width, y, x);
+        output_val += val;
+      }
+    }
+    output_val /= count;
+    output_data[i] = output_val;
+  }
+}
+
+template <typename T, typename Context>
+void ROIAlignKernel(const Context& dev_ctx,
+                    const DenseTensor& x,
+                    const DenseTensor& boxes,
+                    paddle::optional<const DenseTensor&> boxes_num,
+                    int pooled_height,
+                    int pooled_width,
+                    float spatial_scale,
+                    int sampling_ratio,
+                    bool aligned,
+                    DenseTensor* out) {
+  auto in_dims = x.dims();
+  int batch_size = in_dims[0];
+  int channels = in_dims[1];
+  int height = in_dims[2];
+  int width = in_dims[3];
+
+  int rois_num = boxes.dims()[0];
+
+  if (rois_num == 0) return;
+
+  int output_size = out->numel();
+  int blocks = NumBlocks(output_size);
+  int threads = kNumCUDAThreads;
+#ifdef WITH_NV_JETSON
+  backends::gpu::ChangeThreadNum(dev_ctx, &threads, 256);
+#endif
+  DenseTensor roi_batch_id_list;
+  roi_batch_id_list.Resize({rois_num});
+  int* roi_batch_id_data = dev_ctx.template HostAlloc<int>(&roi_batch_id_list);
+  auto cplace = phi::CPUPlace();
+  auto gplace = dev_ctx.GetPlace();
+  if (boxes_num) {
+    int boxes_batch_size = boxes_num->numel();
+    PADDLE_ENFORCE_EQ(
+        boxes_batch_size,
+        batch_size,
+        errors::InvalidArgument(
+            "The boxes_batch_size and imgs "
+            "batch_size must be the same. But received boxes_batch_size = %d, "
+            "batch_size = %d",
+            boxes_batch_size,
+            batch_size));
+
+    std::vector<int> boxes_num_list(boxes_batch_size);
+    paddle::memory::Copy(cplace,
+                         boxes_num_list.data(),
+                         gplace,
+                         boxes_num->data<int>(),
+                         sizeof(int) * boxes_batch_size,
+                         0);
+    int start = 0;
+    for (int n = 0; n < boxes_batch_size; ++n) {
+      for (int i = start; i < start + boxes_num_list[n]; ++i) {
+        roi_batch_id_data[i] = n;
+      }
+      start += boxes_num_list[n];
+    }
+  } else {
+    auto lod = boxes.lod();
+    PADDLE_ENFORCE_EQ(lod.empty(),
+                      false,
+                      errors::InvalidArgument("Input(ROIs) in ROIAlignOp does "
+                                              "not contain LoD information."));
+    auto boxes_lod = lod.back();
+    int boxes_batch_size = boxes_lod.size() - 1;
+    PADDLE_ENFORCE_EQ(
+        boxes_batch_size,
+        batch_size,
+        errors::InvalidArgument(
+            "The batch size of rois and batch size "
+            "of images must be the same. But received rois batch size = %d, "
+            "and images batch size = %d",
+            boxes_batch_size,
+            batch_size));
+    int boxes_num_with_lod = boxes_lod[boxes_batch_size];
+    PADDLE_ENFORCE_EQ(
+        rois_num,
+        boxes_num_with_lod,
+        errors::InvalidArgument(
+            "The actual number of rois and the number of rois "
+            "provided from Input(RoIsLoD) in RoIAlign must be the same."
+            " But received actual number of rois is %d, and the number "
+            "of rois from RoIsLoD is %d",
+            rois_num,
+            boxes_num_with_lod));
+    for (int n = 0; n < boxes_batch_size; ++n) {
+      for (size_t i = boxes_lod[n]; i < boxes_lod[n + 1]; ++i) {
+        roi_batch_id_data[i] = n;
+      }
+    }
+  }
+  int bytes = roi_batch_id_list.numel() * sizeof(int);
+  auto roi_ptr = paddle::memory::Alloc(dev_ctx, bytes);
+  int* roi_id_data = reinterpret_cast<int*>(roi_ptr->ptr());
+  paddle::memory::Copy(
+      gplace, roi_id_data, cplace, roi_batch_id_data, bytes, dev_ctx.stream());
+  GPUROIAlignForward<T><<<blocks, threads, 0, dev_ctx.stream()>>>(
+      output_size,
+      x.data<T>(),
+      boxes.data<T>(),
+      spatial_scale,
+      channels,
+      height,
+      width,
+      pooled_height,
+      pooled_width,
+      sampling_ratio,
+      roi_id_data,
+      dev_ctx.template Alloc<T>(out),
+      aligned);
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(
+    roi_align, GPU, ALL_LAYOUT, phi::ROIAlignKernel, float, double) {}
diff --git a/paddle/phi/kernels/gpu/scale_kernel.cu b/paddle/phi/kernels/gpu/scale_kernel.cu
index 930c50a24be8fae40535c2d5e6dbbe85e7ced990..6f96a697b2f2db6c2097640f34c30142939f80e0 100644
--- a/paddle/phi/kernels/gpu/scale_kernel.cu
+++ b/paddle/phi/kernels/gpu/scale_kernel.cu
@@ -15,10 +15,9 @@ limitations under the License. */
 #include "paddle/phi/kernels/scale_kernel.h"
 
 #include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/common/float16.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/elementwise_base.h"
-// See Note [ Why still include the fluid headers? ]
-#include "paddle/phi/common/float16.h"
 
 namespace phi {
 
diff --git a/paddle/phi/kernels/gpu/scatter_grad_kernel.cu b/paddle/phi/kernels/gpu/scatter_grad_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..75506e2a0a17b269cb1327c01c1c4f3825eadb37
--- /dev/null
+++ b/paddle/phi/kernels/gpu/scatter_grad_kernel.cu
@@ -0,0 +1,74 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/copy_kernel.h"
+#include "paddle/phi/kernels/funcs/gather.cu.h"
+#include "paddle/phi/kernels/funcs/scatter.cu.h"
+#include "paddle/phi/kernels/scatter_grad_kernel.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void ScatterGradKernel(const Context &ctx,
+                       const DenseTensor &index,
+                       const DenseTensor &updates,
+                       const DenseTensor &out_grad,
+                       bool overwrite,
+                       DenseTensor *x_grad,
+                       DenseTensor *updates_grad) {
+  auto index_type = index.dtype();
+  bool index_type_match =
+      index_type == phi::DataType::INT32 || index_type == phi::DataType::INT64;
+  PADDLE_ENFORCE_EQ(index_type_match,
+                    true,
+                    phi::errors::InvalidArgument(
+                        "scatter_op index holds the wrong type, it holds [%s],"
+                        "but desires to be [%s] or [%s]",
+                        index_type,
+                        phi::DataType::INT32,
+                        phi::DataType::INT64));
+
+  if (x_grad) {
+    phi::Copy(ctx, out_grad, ctx.GetPlace(), false, x_grad);
+    if (index_type == phi::DataType::INT32) {
+      phi::funcs::GPUScatterGradForX<T, int32_t>(ctx, index, x_grad);
+    } else {
+      phi::funcs::GPUScatterGradForX<T, int64_t>(ctx, index, x_grad);
+    }
+  }
+
+  if (updates_grad) {
+    ctx.template Alloc<T>(updates_grad);
+    // Gradient by Gather: dUpdates = dO[Ids]
+    if (index_type == phi::DataType::INT32) {
+      phi::funcs::GPUGather<T, int32_t>(ctx, out_grad, index, updates_grad);
+    } else {
+      phi::funcs::GPUGather<T, int64_t>(ctx, out_grad, index, updates_grad);
+    }
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(scatter_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::ScatterGradKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t,
+                   phi::dtype::float16) {}
diff --git a/paddle/phi/kernels/gpu/scatter_kernel.cu b/paddle/phi/kernels/gpu/scatter_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..811eae1bc028ec5484d173d1b4373111546d73b4
--- /dev/null
+++ b/paddle/phi/kernels/gpu/scatter_kernel.cu
@@ -0,0 +1,62 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/copy_kernel.h"
+#include "paddle/phi/kernels/funcs/scatter.cu.h"
+#include "paddle/phi/kernels/scatter_kernel.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void ScatterKernel(const Context &ctx,
+                   const DenseTensor &x,
+                   const DenseTensor &index,
+                   const DenseTensor &updates,
+                   bool overwrite,
+                   DenseTensor *out) {
+  phi::Copy(ctx, x, ctx.GetPlace(), false, out);
+  // use template class to support int32_t and int64_t
+  auto index_type = index.dtype();
+  bool index_type_match =
+      index_type == phi::DataType::INT32 || index_type == phi::DataType::INT64;
+  PADDLE_ENFORCE_EQ(index_type_match,
+                    true,
+                    phi::errors::InvalidArgument(
+                        "scatter_op Index holds the wrong type, it holds [%s],"
+                        "but desires to be [%s] or [%s].",
+                        index_type,
+                        phi::DataType::INT32,
+                        phi::DataType::INT64));
+  if (index_type == phi::DataType::INT32) {
+    phi::funcs::GPUScatterAssign<T, int32_t>(
+        ctx, updates, index, out, overwrite);
+  } else {
+    phi::funcs::GPUScatterAssign<T, int64_t>(
+        ctx, updates, index, out, overwrite);
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(scatter,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::ScatterKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t,
+                   phi::dtype::float16) {}
diff --git a/paddle/phi/kernels/gpu/scatter_nd_add_grad_kernel.cu b/paddle/phi/kernels/gpu/scatter_nd_add_grad_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..71924befe8cf9383c523eee059c3efa79dd6a262
--- /dev/null
+++ b/paddle/phi/kernels/gpu/scatter_nd_add_grad_kernel.cu
@@ -0,0 +1,55 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/copy_kernel.h"
+#include "paddle/phi/kernels/funcs/gather.cu.h"
+#include "paddle/phi/kernels/scatter_nd_add_grad_kernel.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void ScatterNdAddGradKernel(const Context &ctx,
+                            const DenseTensor &index,
+                            const DenseTensor &updates,
+                            const DenseTensor &out_grad,
+                            DenseTensor *x_grad,
+                            DenseTensor *updates_grad) {
+  if (x_grad) {
+    Copy(ctx, out_grad, ctx.GetPlace(), false, x_grad);
+  }
+  if (updates_grad) {
+    ctx.template Alloc<T>(updates_grad);
+    // Gradient by Gather
+    const auto &index_type = index.dtype();
+    if (index_type == phi::DataType::INT32) {
+      phi::funcs::GPUGatherNd<T, int32_t>(ctx, out_grad, index, updates_grad);
+    } else {
+      phi::funcs::GPUGatherNd<T, int64_t>(ctx, out_grad, index, updates_grad);
+    }
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(scatter_nd_add_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::ScatterNdAddGradKernel,
+                   float,
+                   double,
+                   int64_t,
+                   int,
+                   phi::dtype::float16) {}
diff --git a/paddle/phi/kernels/gpu/scatter_nd_add_kernel.cu b/paddle/phi/kernels/gpu/scatter_nd_add_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..eadd91773c00810e3f4187d079926028733a4945
--- /dev/null
+++ b/paddle/phi/kernels/gpu/scatter_nd_add_kernel.cu
@@ -0,0 +1,58 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/copy_kernel.h"
+#include "paddle/phi/kernels/funcs/scatter.cu.h"
+#include "paddle/phi/kernels/scatter_nd_add_kernel.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void ScatterNdAddKernel(const Context &ctx,
+                        const DenseTensor &x,
+                        const DenseTensor &index,
+                        const DenseTensor &updates,
+                        DenseTensor *out) {
+  Copy(ctx, x, ctx.GetPlace(), true, out);
+  const auto &index_type = index.dtype();
+  bool index_type_match =
+      index_type == phi::DataType::INT32 || index_type == phi::DataType::INT64;
+  PADDLE_ENFORCE_EQ(index_type_match,
+                    true,
+                    phi::errors::InvalidArgument(
+                        "Index holds the wrong type, it holds [%s], but "
+                        "desires to be [%s] or [%s].",
+                        index_type,
+                        phi::DataType::INT32,
+                        phi::DataType::INT64));
+  if (index_type == phi::DataType::INT32) {
+    phi::funcs::GPUScatterNdAdd<T, int32_t>(ctx, updates, index, out);
+  } else {
+    phi::funcs::GPUScatterNdAdd<T, int64_t>(ctx, updates, index, out);
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(scatter_nd_add,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::ScatterNdAddKernel,
+                   float,
+                   double,
+                   int64_t,
+                   int,
+                   phi::dtype::float16) {}
diff --git a/paddle/phi/kernels/gpu/searchsorted_kernel.cu b/paddle/phi/kernels/gpu/searchsorted_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..4a2ce2241c22dc5c1cab391fe24a502ba845802b
--- /dev/null
+++ b/paddle/phi/kernels/gpu/searchsorted_kernel.cu
@@ -0,0 +1,28 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/searchsorted_kernel.h"
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/searchsorted_kernel_impl.h"
+
+PD_REGISTER_KERNEL(searchsorted,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::SearchsortedKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t) {}
diff --git a/paddle/phi/kernels/gpu/segment_pool_grad_kernel.cu b/paddle/phi/kernels/gpu/segment_pool_grad_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..d9618dc159a6d3f5b24bdfcfdb219ec649e051f9
--- /dev/null
+++ b/paddle/phi/kernels/gpu/segment_pool_grad_kernel.cu
@@ -0,0 +1,27 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/impl/segment_pool_grad_kernel_impl.h"
+#include "paddle/phi/kernels/segment_pool_grad_kernel.h"
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+PD_REGISTER_KERNEL(segment_pool_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::SegmentPoolGradKernel,
+                   float,
+                   double) {}
diff --git a/paddle/phi/kernels/gpu/segment_pool_kernel.cu b/paddle/phi/kernels/gpu/segment_pool_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..c38e935adf837ef00c48fa31bc1e37eea2948673
--- /dev/null
+++ b/paddle/phi/kernels/gpu/segment_pool_kernel.cu
@@ -0,0 +1,23 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/impl/segment_pool_kernel_impl.h"
+#include "paddle/phi/kernels/segment_pool_kernel.h"
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+PD_REGISTER_KERNEL(
+    segment_pool, GPU, ALL_LAYOUT, phi::SegmentPoolKernel, float, double) {}
diff --git a/paddle/phi/kernels/gpu/set_value_grad_kernel.cu b/paddle/phi/kernels/gpu/set_value_grad_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..7eed96699e720870577c3d5246ce07c12c37335c
--- /dev/null
+++ b/paddle/phi/kernels/gpu/set_value_grad_kernel.cu
@@ -0,0 +1,29 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/set_value_grad_kernel.h"
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/set_value_grad_kernel_impl.h"
+
+PD_REGISTER_KERNEL(set_value_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::SetValueGradKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t,
+                   bool) {}
diff --git a/paddle/phi/kernels/gpu/set_value_kernel.cu b/paddle/phi/kernels/gpu/set_value_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..f788da010b6827d18ea455bad57d775da4049acf
--- /dev/null
+++ b/paddle/phi/kernels/gpu/set_value_kernel.cu
@@ -0,0 +1,38 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/set_value_kernel.h"
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/set_value_kernel_impl.h"
+
+PD_REGISTER_KERNEL(set_value,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::SetValueKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t,
+                   bool) {}
+PD_REGISTER_KERNEL(set_value_with_tensor,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::SetTensorValueKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t,
+                   bool) {}
diff --git a/paddle/phi/kernels/gpu/shape_kernel.cu b/paddle/phi/kernels/gpu/shape_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..39b6eaeaef2a8e80d204941dc1f3ac92907aa786
--- /dev/null
+++ b/paddle/phi/kernels/gpu/shape_kernel.cu
@@ -0,0 +1,35 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/phi/kernels/shape_kernel.h"
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/common/float16.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/shape_kernel_impl.h"
+
+PD_REGISTER_KERNEL(shape,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::ShapeKernel,
+                   bool,
+                   int,
+                   int8_t,
+                   uint8_t,
+                   int64_t,
+                   float,
+                   double,
+                   phi::dtype::complex<float>,
+                   phi::dtype::complex<double>,
+                   phi::dtype::float16) {}
diff --git a/paddle/phi/kernels/gpu/shard_index_kernel.cu b/paddle/phi/kernels/gpu/shard_index_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..0bd7b93f68928ae268feb2aad9c2aa0304ca4790
--- /dev/null
+++ b/paddle/phi/kernels/gpu/shard_index_kernel.cu
@@ -0,0 +1,99 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/shard_index_kernel.h"
+
+#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+namespace phi {
+
+using paddle::platform::PADDLE_CUDA_NUM_THREADS;
+
+template <typename T>
+__global__ void ShardIndexInner(const T* in_data,
+                                T* out_data,
+                                const int64_t numel,
+                                const int index_num,
+                                const int nshards,
+                                const int shard_id,
+                                const int ignore_value) {
+  int shard_size = (index_num + nshards - 1) / nshards;
+  int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx < numel) {
+    assert(in_data[idx] >= 0 && in_data[idx] < index_num);
+    if (in_data[idx] / shard_size == shard_id) {
+      out_data[idx] = in_data[idx] % shard_size;
+    } else {
+      out_data[idx] = ignore_value;
+    }
+  }
+}
+
+template <typename T, typename Context>
+void ShardIndexKernel(const Context& dev_ctx,
+                      const DenseTensor& in,
+                      int index_num,
+                      int nshards,
+                      int shard_id,
+                      int ignore_value,
+                      DenseTensor* out) {
+  PADDLE_ENFORCE_GT(
+      index_num,
+      0,
+      phi::errors::InvalidArgument(
+          "The value 'index_num' for Op(shard_index) must be greater than 0, "
+          "but the value given is %d.",
+          index_num));
+  PADDLE_ENFORCE_GT(nshards,
+                    0,
+                    phi::errors::InvalidArgument(
+                        "The value 'nshard' for Op(shard_index) must be "
+                        "greater than 0, but the value given is %d.",
+                        nshards));
+  PADDLE_ENFORCE_GE(
+      shard_id,
+      0,
+      phi::errors::InvalidArgument(
+          "The value 'shard_id' for Op(shard_index) must be greater or "
+          "equal to 0, but the value given is %d.",
+          shard_id));
+  PADDLE_ENFORCE_LT(
+      shard_id,
+      nshards,
+      phi::errors::InvalidArgument(
+          "The value 'shard_id' for Op(shard_index) must be less than "
+          "nshards (%d), but the value given is %d.",
+          nshards,
+          shard_id));
+
+  out->Resize(in.dims());
+  out->set_lod(in.lod());
+  auto* in_data = in.data<T>();
+  auto* out_data = dev_ctx.template Alloc<T>(out);
+  int64_t numel = in.numel();
+  auto stream = dev_ctx.stream();
+  ShardIndexInner<
+      T><<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS,
+           PADDLE_CUDA_NUM_THREADS,
+           0,
+           stream>>>(
+      in_data, out_data, numel, index_num, nshards, shard_id, ignore_value);
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(
+    shard_index, GPU, ALL_LAYOUT, phi::ShardIndexKernel, int, int64_t) {}
diff --git a/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits.h b/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits.h
new file mode 100644
index 0000000000000000000000000000000000000000..6f9cda83a9a9876db302aa16c823beb3ba78b60b
--- /dev/null
+++ b/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits.h
@@ -0,0 +1,69 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <algorithm>
+#include "paddle/fluid/memory/malloc.h"
+#include "paddle/fluid/operators/math.h"
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/backends/gpu/gpu_helper.h"
+#include "paddle/phi/core/hostdevice.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/copy_kernel.h"
+#include "paddle/phi/kernels/funcs/elementwise_base.h"
+#include "paddle/phi/kernels/gpu/reduce.h"
+
+#ifdef __NVCC__
+#include "cub/cub.cuh"
+#endif
+#ifdef __HIPCC__
+#include <hipcub/hipcub.hpp>
+namespace cub = hipcub;
+#endif
+
+namespace phi {
+
+#ifdef __HIPCC__
+static constexpr int kNumCUDAThreads = 256;
+#else
+static constexpr int kNumCUDAThreads = 512;
+#endif
+static constexpr int kNumMaxinumNumBlocks = 4096;
+
+static inline int NumBlocks(const int N) {
+  return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads,
+                  kNumMaxinumNumBlocks);
+}
+
+template <typename T>
+struct NonzeroFunctor {
+  HOSTDEVICE explicit inline NonzeroFunctor() {}
+  HOSTDEVICE inline T operator()(const T x) const {
+    return static_cast<T>(static_cast<double>(x) != 0);
+  }
+};
+
+template <typename T>
+struct DivFunctor {
+  const T norm_;
+  HOSTDEVICE inline DivFunctor(const T norm) : norm_(norm) {}
+
+  HOSTDEVICE inline T operator()(T loss) {
+    loss /= norm_;
+    return loss;
+  }
+};
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits_grad_kernel.cu b/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits_grad_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..f61cd2c39674ecdad5553e8472fb85da73ebd534
--- /dev/null
+++ b/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits_grad_kernel.cu
@@ -0,0 +1,119 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/sigmoid_cross_entropy_with_logits_grad_kernel.h"
+
+#include "paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits.h"
+
+namespace phi {
+
+template <typename T>
+struct SigmoidBwdFunctor {
+  T ignore_index_;
+  T eps = static_cast<T>(1e-5);
+
+  HOSTDEVICE inline SigmoidBwdFunctor(const T ignore_index)
+      : ignore_index_(ignore_index) {}
+
+  HOSTDEVICE inline phi::Array<T, 2> operator()(const T x,
+                                                const T label,
+                                                const T dout) {
+    T counts;
+    T dx_data;
+
+    T diff = label - static_cast<T>(ignore_index_);
+    if ((diff > -eps) && (diff < eps)) {
+      dx_data = static_cast<T>(0.);
+      counts = 0;
+    } else {
+      T simoid_x = static_cast<T>(1) /
+                   (static_cast<T>(1) + paddle::operators::real_exp(-x));
+      T diff = simoid_x - label;
+      dx_data = dout * diff;
+      counts = 1;
+    }
+    phi::Array<T, 2> outs;
+
+    outs[0] = dx_data;
+    outs[1] = counts;
+    return outs;
+  }
+};
+
+template <typename T, typename Context>
+void SigmoidCrossEntropyWithLogitsGradKernel(const Context &dev_ctx,
+                                             const DenseTensor &x,
+                                             const DenseTensor &label,
+                                             const DenseTensor &out_grad,
+                                             bool normalize,
+                                             int ignore_index,
+                                             DenseTensor *in_grad) {
+  auto dx_data = dev_ctx.template Alloc<T>(in_grad);
+
+  // Temporary memory
+  DenseTensor *counts_tensor = new DenseTensor();
+
+  int64_t out_dims = label.numel() * sizeof(T);
+  counts_tensor->Resize({out_dims});
+  dev_ctx.template Alloc<T>(counts_tensor);
+  counts_tensor->Resize(in_grad->dims());
+
+  std::vector<const DenseTensor *> ins = {&x, &label, &out_grad};
+  std::vector<DenseTensor *> outs = {in_grad, counts_tensor};
+  auto functor = SigmoidBwdFunctor<T>(ignore_index);
+  phi::funcs::ElementwiseKernel<T, decltype(functor), 2>(
+      dev_ctx, ins, &outs, functor);
+  if (normalize) {
+    DenseTensor *norm_tensor = new DenseTensor();
+    norm_tensor->Resize({sizeof(T)});
+    dev_ctx.template Alloc<T>(norm_tensor);
+    auto dims = phi::vectorize(counts_tensor->dims());
+    std::vector<int> reduce_dim = {};
+    for (int i = 0; i < dims.size(); i++) {
+      reduce_dim.push_back(i);
+    }
+
+    funcs::ReduceKernel<T, T, kps::AddFunctor, NonzeroFunctor<T>>(
+        dev_ctx, *counts_tensor, norm_tensor, NonzeroFunctor<T>(), reduce_dim);
+    T *norm = dev_ctx.template Alloc<T>(norm_tensor);
+    auto norm_cpu_mem = paddle::memory::Alloc(phi::CPUPlace(), sizeof(T));
+    T *norm_cpu_ptr = reinterpret_cast<T *>(norm_cpu_mem->ptr());
+    paddle::memory::Copy(phi::CPUPlace(),
+                         norm_cpu_ptr,
+                         dev_ctx.GetPlace(),
+                         norm,
+                         sizeof(T),
+                         dev_ctx.stream());
+    dev_ctx.Wait();
+    auto eps = static_cast<T>(1e-5);
+    *norm_cpu_ptr = *norm_cpu_ptr > eps ? *norm_cpu_ptr : eps;
+
+    std::vector<const DenseTensor *> div_ins = {in_grad};
+    std::vector<DenseTensor *> div_outs = {in_grad};
+    auto div_functor = DivFunctor<T>(*norm_cpu_ptr);
+    phi::funcs::ElementwiseKernel<T>(dev_ctx, div_ins, &div_outs, div_functor);
+
+    delete norm_tensor;
+  }
+  delete counts_tensor;
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(sigmoid_cross_entropy_with_logits_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::SigmoidCrossEntropyWithLogitsGradKernel,
+                   float,
+                   double) {}
diff --git a/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits_kernel.cu b/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..b0e9efe5bbafe671ae6f594564260895a2015a45
--- /dev/null
+++ b/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits_kernel.cu
@@ -0,0 +1,119 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/sigmoid_cross_entropy_with_logits_kernel.h"
+
+#include "paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits.h"
+
+namespace phi {
+
+template <typename T>
+struct SigmoidFwdFunctor {
+  T ignore_index_;
+  T eps = static_cast<T>(1e-5);
+
+  HOSTDEVICE inline SigmoidFwdFunctor(const T ignore_index)
+      : ignore_index_(ignore_index) {}
+
+  HOSTDEVICE inline phi::Array<T, 2> operator()(const T x, const T label) {
+    T counts;
+    T out_data;
+
+    T diff = label - static_cast<T>(ignore_index_);
+    if ((diff > -eps) && (diff < eps)) {
+      out_data = static_cast<T>(0.);
+      counts = 0;
+    } else {
+      T term1 = (x > 0) ? x : 0;
+      T term2 = x * label;
+      T term3 = paddle::operators::real_log(
+          static_cast<T>(1) +
+          paddle::operators::real_exp(static_cast<T>(-abs(x))));
+
+      out_data = term1 - term2 + term3;
+      counts = 1;
+    }
+    phi::Array<T, 2> outs;
+
+    outs[0] = out_data;
+    outs[1] = counts;
+    return outs;
+  }
+};
+
+template <typename T, typename Context>
+void SigmoidCrossEntropyWithLogitsKernel(const Context &dev_ctx,
+                                         const DenseTensor &x,
+                                         const DenseTensor &label,
+                                         bool normalize,
+                                         int ignore_index,
+                                         DenseTensor *out) {
+  auto out_data = dev_ctx.template Alloc<T>(out);
+
+  // Temporary memory
+  DenseTensor *counts_tensor = new DenseTensor();
+
+  int64_t out_dims = label.numel() * sizeof(T);
+  counts_tensor->Resize({out_dims});
+  dev_ctx.template Alloc<T>(counts_tensor);
+  counts_tensor->Resize(out->dims());
+
+  std::vector<const DenseTensor *> ins = {&x, &label};
+  std::vector<DenseTensor *> outs = {out, counts_tensor};
+  auto functor = SigmoidFwdFunctor<T>(ignore_index);
+  phi::funcs::ElementwiseKernel<T, decltype(functor), 2>(
+      dev_ctx, ins, &outs, functor);
+  if (normalize) {
+    DenseTensor *norm_tensor = new DenseTensor();
+    norm_tensor->Resize({sizeof(T)});
+    dev_ctx.template Alloc<T>(norm_tensor);
+    auto dims = phi::vectorize(counts_tensor->dims());
+    std::vector<int> reduce_dim = {};
+    for (int i = 0; i < dims.size(); i++) {
+      reduce_dim.push_back(i);
+    }
+
+    funcs::ReduceKernel<T, T, kps::AddFunctor, NonzeroFunctor<T>>(
+        dev_ctx, *counts_tensor, norm_tensor, NonzeroFunctor<T>(), reduce_dim);
+    T *norm = dev_ctx.template Alloc<T>(norm_tensor);
+    auto norm_cpu_mem = paddle::memory::Alloc(phi::CPUPlace(), sizeof(T));
+    T *norm_cpu_ptr = reinterpret_cast<T *>(norm_cpu_mem->ptr());
+    paddle::memory::Copy(phi::CPUPlace(),
+                         norm_cpu_ptr,
+                         dev_ctx.GetPlace(),
+                         norm,
+                         sizeof(T),
+                         dev_ctx.stream());
+    dev_ctx.Wait();
+    auto eps = static_cast<T>(1e-5);
+    *norm_cpu_ptr = *norm_cpu_ptr > eps ? *norm_cpu_ptr : eps;
+
+    std::vector<const DenseTensor *> div_ins = {out};
+    std::vector<DenseTensor *> div_outs = {out};
+    auto div_functor = DivFunctor<T>(*norm_cpu_ptr);
+    phi::funcs::ElementwiseKernel<T>(dev_ctx, div_ins, &div_outs, div_functor);
+
+    delete norm_tensor;
+  }
+  delete counts_tensor;
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(sigmoid_cross_entropy_with_logits,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::SigmoidCrossEntropyWithLogitsKernel,
+                   float,
+                   double) {}
diff --git a/paddle/phi/kernels/gpu/softmax_kernel.cu b/paddle/phi/kernels/gpu/softmax_kernel.cu
index 03c5714b967841ef1bd124bd9191830a79567514..4a02f438c7e7e4f0c0212ee613ce78a7fac20909 100644
--- a/paddle/phi/kernels/gpu/softmax_kernel.cu
+++ b/paddle/phi/kernels/gpu/softmax_kernel.cu
@@ -23,7 +23,7 @@ limitations under the License. */
 PD_REGISTER_KERNEL(softmax,
                    GPU,
                    ALL_LAYOUT,
-                   phi::SoftmaxRawKernel,
+                   phi::SoftmaxKernel,
                    float,
                    double,
                    phi::dtype::float16,
diff --git a/paddle/phi/kernels/gpu/split_kernel.cu b/paddle/phi/kernels/gpu/split_kernel.cu
index d2473d5b0b110a122247c32c779b7a700c3249b1..83c2ec4b6e99d675bfbcab58abd265cc8595259c 100644
--- a/paddle/phi/kernels/gpu/split_kernel.cu
+++ b/paddle/phi/kernels/gpu/split_kernel.cu
@@ -27,6 +27,23 @@ void SplitKernel(const Context& dev_ctx,
                  const ScalarArray& num_or_sections,
                  const Scalar& axis_scalar,
                  std::vector<DenseTensor*> outs) {
+  // need to infershape output
+  if (num_or_sections.FromTensor() || axis_scalar.FromTensor()) {
+    std::vector<MetaTensor> out_metas;
+    out_metas.reserve(outs.size());
+    std::vector<MetaTensor*> out_metas_ptr;
+    for (size_t i = 0; i < outs.size(); ++i) {
+      out_metas.push_back(outs[i]);
+      out_metas_ptr.push_back(&out_metas.back());
+    }
+
+    phi::SplitInferMeta(x, num_or_sections, axis_scalar, out_metas_ptr);
+
+    for (size_t i = 0; i < out_metas.size(); ++i) {
+      outs[i]->Resize(out_metas[i].dims());
+    }
+  }
+
   std::vector<const DenseTensor*> shape_refer;
   for (size_t j = 0; j < outs.size(); ++j) {
     dev_ctx.template Alloc<T>(outs[j]);
diff --git a/paddle/phi/kernels/gpu/take_along_axis_grad_kernel.cu b/paddle/phi/kernels/gpu/take_along_axis_grad_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..e09cfd370a4f0d87aaa081eaf50d519f536d2a72
--- /dev/null
+++ b/paddle/phi/kernels/gpu/take_along_axis_grad_kernel.cu
@@ -0,0 +1,72 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/take_along_axis_grad_kernel.h"
+
+#include "paddle/fluid/framework/convert_utils.h"
+#include "paddle/fluid/operators/gather_scatter_kernel.h"
+#include "paddle/fluid/platform/place.h"
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/math_function.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void TakeAlongAxisGradKernel(const Context& dev_ctx,
+                             const DenseTensor& x,
+                             const DenseTensor& index,
+                             const DenseTensor& out_grad,
+                             int axis,
+                             DenseTensor* x_grad) {
+  PADDLE_ENFORCE_EQ(
+      paddle::platform::is_gpu_place(dev_ctx.GetPlace()),
+      true,
+      errors::PreconditionNotMet("This kernel only runs on GPU."));
+
+  // We need to know the shape of input matrix to determine the shape of grad
+  // matrix of input.
+  x_grad->Resize(x.dims());
+  dev_ctx.template Alloc<T>(x_grad);
+
+  // Set to zero tensor.
+  phi::funcs::SetConstant<Context, T> functor;
+  functor(dev_ctx, x_grad, static_cast<T>(0));
+  const auto& index_type =
+      paddle::framework::TransToProtoVarType(index.dtype());
+
+  if (index_type == paddle::framework::proto::VarType::INT32) {
+    paddle::operators::gpu_scatter_add_kernel<T, int32_t>(
+        *x_grad,
+        axis,
+        index,
+        out_grad,
+        dev_ctx);  // the gradient of gather is scatter
+  } else if (index_type == paddle::framework::proto::VarType::INT64) {
+    paddle::operators::gpu_scatter_add_kernel<T, int64_t>(
+        *x_grad, axis, index, out_grad, dev_ctx);
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(take_along_axis_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::TakeAlongAxisGradKernel,
+                   float,
+                   double,
+                   int64_t,
+                   int,
+                   phi::dtype::float16) {}
diff --git a/paddle/phi/kernels/gpu/take_along_axis_kernel.cu b/paddle/phi/kernels/gpu/take_along_axis_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..9665a917d9dc4a5fa0d350f8fb635ecec79b7832
--- /dev/null
+++ b/paddle/phi/kernels/gpu/take_along_axis_kernel.cu
@@ -0,0 +1,60 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/take_along_axis_kernel.h"
+
+#include "paddle/fluid/framework/convert_utils.h"
+#include "paddle/fluid/operators/gather_scatter_kernel.h"
+#include "paddle/fluid/platform/place.h"
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void TakeAlongAxisKernel(const Context& dev_ctx,
+                         const DenseTensor& x,
+                         const DenseTensor& index,
+                         int axis,
+                         DenseTensor* out) {
+  PADDLE_ENFORCE_EQ(
+      paddle::platform::is_gpu_place(dev_ctx.GetPlace()),
+      true,
+      errors::PreconditionNotMet("This kernel only runs on GPU device."));
+
+  out->Resize(index.dims());
+  dev_ctx.template Alloc<T>(out);
+
+  const auto& index_type =
+      paddle::framework::TransToProtoVarType(index.dtype());
+  if (index_type == paddle::framework::proto::VarType::INT32) {
+    paddle::operators::gpu_gather_kernel<T, int32_t>(
+        x, axis, index, *out, dev_ctx);
+  } else if (index_type == paddle::framework::proto::VarType::INT64) {
+    paddle::operators::gpu_gather_kernel<T, int64_t>(
+        x, axis, index, *out, dev_ctx);
+  }
+}
+
+}  // namespace  phi
+
+PD_REGISTER_KERNEL(take_along_axis,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::TakeAlongAxisKernel,
+                   float,
+                   double,
+                   int64_t,
+                   int,
+                   phi::dtype::float16) {}
diff --git a/paddle/phi/kernels/gpu/tile_grad_kernel.cu b/paddle/phi/kernels/gpu/tile_grad_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..c092609e623d3f4f3dc4b3d77b1c973e6ddfbcf3
--- /dev/null
+++ b/paddle/phi/kernels/gpu/tile_grad_kernel.cu
@@ -0,0 +1,30 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/tile_grad_kernel.h"
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/tile_grad_kernel_impl.h"
+
+PD_REGISTER_KERNEL(tile_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::TileGradKernel,
+                   bool,
+                   float,
+                   double,
+                   int,
+                   int64_t,
+                   phi::dtype::float16) {}
diff --git a/paddle/phi/kernels/gpu/tile_kernel.cu b/paddle/phi/kernels/gpu/tile_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..0c3c29e82c42aefae33a4a9be9e9a7d9ec0c1e99
--- /dev/null
+++ b/paddle/phi/kernels/gpu/tile_kernel.cu
@@ -0,0 +1,30 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/tile_kernel.h"
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/tile_kernel_impl.h"
+
+PD_REGISTER_KERNEL(tile,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::TileKernel,
+                   bool,
+                   float,
+                   double,
+                   int,
+                   int64_t,
+                   phi::dtype::float16) {}
diff --git a/paddle/phi/kernels/gpu/top_k_grad_kernel.cu b/paddle/phi/kernels/gpu/top_k_grad_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..b0b45223489e93dedf20d2ffa019a87a0a0e88bc
--- /dev/null
+++ b/paddle/phi/kernels/gpu/top_k_grad_kernel.cu
@@ -0,0 +1,87 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/top_k_grad_kernel.h"
+
+#include "paddle/fluid/operators/top_k_function_cuda.h"
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/math_function.h"
+
+namespace phi {
+
+namespace ops = paddle::operators;
+
+template <typename T, typename Context>
+void TopkGradKernel(const Context& dev_ctx,
+                    const DenseTensor& out_grad,
+                    const DenseTensor& x,
+                    const DenseTensor& indices,
+                    int k,
+                    int axis,
+                    bool largest,
+                    bool sorted,
+                    DenseTensor* x_grad) {
+  const auto& in_dims = x.dims();
+  const auto& out_dims = indices.dims();
+
+  // get the real the axis and the k
+  if (axis < 0) {
+    axis += in_dims.size();
+  }
+  const int& raw_height = in_dims[axis];
+
+  // allocate the cuda memory for the x_grad
+  T* x_grad_data = dev_ctx.template Alloc<T>(x_grad);
+  const T* out_grad_data = out_grad.data<T>();
+  const int64_t* indices_data = indices.data<int64_t>();
+
+  int pre, n, post;
+  ops::GetDims(in_dims, axis, &pre, &n, &post);
+
+  // calcluate the block and grid num
+  auto ComputeBlockSize = [](int col) {
+    if (col > 512)
+      return 1024;
+    else if (col > 256 && col <= 512)
+      return 512;
+    else if (col > 128 && col <= 256)
+      return 256;
+    else if (col > 64 && col <= 128)
+      return 128;
+    else
+      return 64;
+  };
+  int block_size = ComputeBlockSize(post * k);
+  int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
+  const int max_blocks = std::max(((max_threads - 1) / block_size + 1), 1);
+  int grid_size = std::min(max_blocks, pre);
+
+  // lanuch the cuda kernel to assign the grad
+  ops::AssignGradWithAxis<
+      T><<<grid_size, block_size, 64 * 4, dev_ctx.stream()>>>(
+      out_grad_data, indices_data, x_grad_data, pre, post, n, k);
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(top_k_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::TopkGradKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t,
+                   phi::dtype::float16) {}
diff --git a/paddle/phi/kernels/gpu/top_k_kernel.cu b/paddle/phi/kernels/gpu/top_k_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..7f06af7de43f7ee234831203c485eaa0b8c86cbf
--- /dev/null
+++ b/paddle/phi/kernels/gpu/top_k_kernel.cu
@@ -0,0 +1,266 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/top_k_kernel.h"
+
+#include "paddle/fluid/operators/top_k_function_cuda.h"
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/copy_kernel.h"
+#include "paddle/phi/kernels/funcs/math_function.h"
+
+namespace phi {
+
+namespace ops = paddle::operators;
+
+#define FIXED_BLOCK_DIM_BASE(dim, ...) \
+  case (dim): {                        \
+    constexpr auto kBlockDim = (dim);  \
+    __VA_ARGS__;                       \
+  } break
+
+#define FIXED_BLOCK_DIM(...)                \
+  FIXED_BLOCK_DIM_BASE(256, ##__VA_ARGS__); \
+  FIXED_BLOCK_DIM_BASE(128, ##__VA_ARGS__); \
+  FIXED_BLOCK_DIM_BASE(64, ##__VA_ARGS__);  \
+  FIXED_BLOCK_DIM_BASE(32, ##__VA_ARGS__)
+
+template <typename T, typename Context>
+void TopkKernel(const Context& dev_ctx,
+                const DenseTensor& x,
+                const Scalar& k_scalar,
+                int axis,
+                bool largest,
+                bool sorted,
+                DenseTensor* out,
+                DenseTensor* indices) {
+  const auto* input = &x;
+  // get the input dims
+  const auto& in_dims = input->dims();
+  // calcluate the real axis
+  if (axis < 0) axis += in_dims.size();
+
+  int k = k_scalar.to<int>();
+  if (k_scalar.FromTensor()) {
+    phi::DDim out_dims = out->dims();
+    out_dims[axis] = k;
+    out->Resize(out_dims);
+    indices->Resize(out_dims);
+  }
+
+  const auto& out_dims = out->dims();
+
+  const T* input_data = input->data<T>();
+  T* output_data = dev_ctx.template Alloc<T>(out);
+  int64_t* indices_data = dev_ctx.template Alloc<int64_t>(indices);
+
+  if (axis == in_dims.size() - 1) {
+    // if get the topK from the last axis
+    const int64_t& input_height =
+        phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1));
+    const int64_t& input_width = in_dims[in_dims.size() - 1];
+
+    if (k > input_width) {
+      k = input_width;
+    }
+
+    // The conclusion is drawn from the data through multiple sets of
+    // statistics
+    if (input_width >= 128 && k >= input_width * 0.75) {
+      auto* ctx = reinterpret_cast<const paddle::platform::CUDADeviceContext*>(
+          &dev_ctx);
+      if (ops::SortTopk<T>(*ctx,
+                           input,
+                           input_width,
+                           input_height,
+                           k,
+                           out,
+                           indices,
+                           largest)) {
+        // Successed, return.
+        return;
+      } else {
+        LOG(INFO) << "TopKOP: Some errors happened when use cub sorting, use "
+                     "default topk kernel.";
+      }
+    }
+
+    // NOTE: pass lds and dim same to input width.
+    // NOTE: old matrix implementation of stride is different to eigen.
+    const int kMaxHeight = 2048;
+    int gridx = input_height < kMaxHeight ? input_height : kMaxHeight;
+    switch (ops::GetDesiredBlockDim(input_width)) {
+#ifdef PADDLE_WITH_HIP
+      FIXED_BLOCK_DIM(ops::KeMatrixTopK<
+                      T,
+                      20,
+                      kBlockDim><<<gridx, kBlockDim, 0, dev_ctx.stream()>>>(
+          output_data,
+          k,
+          indices_data,
+          input_data,
+          input_width,
+          input_width,
+          static_cast<int>(k),
+          gridx,
+          input_height,
+          largest));
+#else
+      FIXED_BLOCK_DIM(ops::KeMatrixTopK<
+                      T,
+                      5,
+                      kBlockDim><<<gridx, kBlockDim, 0, dev_ctx.stream()>>>(
+          output_data,
+          k,
+          indices_data,
+          input_data,
+          input_width,
+          input_width,
+          static_cast<int>(k),
+          gridx,
+          input_height,
+          largest));
+#endif
+      default:
+        PADDLE_THROW(errors::Fatal(
+            "the input data shape has error in the topk cuda kernel."));
+    }
+  } else {
+    // if get topK not from the last axis, will tranpose the tensor and get
+    // TopK
+
+    // first step, prepare the trans args for the tranpose
+    std::vector<int> trans;
+    for (int i = 0; i < axis; i++) {
+      trans.emplace_back(i);
+    }
+    trans.emplace_back(in_dims.size() - 1);
+    for (int i = axis + 1; i < in_dims.size() - 1; i++) {
+      trans.emplace_back(i);
+    }
+    trans.emplace_back(axis);
+
+    phi::DDim trans_dims(in_dims);
+    phi::DDim trans_out_dims(out->dims());
+    for (int i = 0; i < trans.size(); i++) {
+      trans_dims[i] = in_dims[trans[i]];
+      trans_out_dims[i] = out_dims[trans[i]];
+    }
+    // second step, tranpose the input
+    DenseTensor trans_input;
+    trans_input.Resize(trans_dims);
+    dev_ctx.template Alloc<T>(&trans_input);
+    int ndims = trans.size();
+    funcs::TransCompute<phi::GPUContext, T>(
+        ndims, dev_ctx, *input, &trans_input, trans);
+    // third step, calcluate the topk
+    // allocate the tmp cuda memory for the tmp result
+    DenseTensor trans_ind;
+    DenseTensor trans_out;
+    trans_ind.Resize(trans_out_dims);
+    trans_out.Resize(trans_out_dims);
+    dev_ctx.template Alloc<int64_t>(&trans_ind);
+    dev_ctx.template Alloc<T>(&trans_out);
+
+    const int64_t input_height =
+        phi::product(phi::slice_ddim(trans_dims, 0, trans_dims.size() - 1));
+    const int64_t input_width = trans_dims[trans_dims.size() - 1];
+
+    if (k > input_width) k = input_width;
+
+    // The conclusion is drawn from the data through multiple sets of
+    // statistics
+    if (input_width >= 128 && k >= input_width * 0.75) {
+      auto* ctx = reinterpret_cast<const paddle::platform::CUDADeviceContext*>(
+          &dev_ctx);
+      if (ops::SortTopk<T>(*ctx,
+                           &trans_input,
+                           input_width,
+                           input_height,
+                           k,
+                           &trans_out,
+                           &trans_ind,
+                           largest)) {
+        // last step, tranpose back the indices and output
+        funcs::TransCompute<phi::GPUContext, int64_t>(
+            ndims, dev_ctx, trans_ind, indices, trans);
+        funcs::TransCompute<phi::GPUContext, T>(
+            ndims, dev_ctx, trans_out, out, trans);
+        return;
+      } else {
+        LOG(INFO) << "TopKOP: Some errors happened when use cub sorting, use "
+                     "default topk kernel.";
+      }
+    }
+
+    const int kMaxHeight = 2048;
+    int gridx = input_height < kMaxHeight ? input_height : kMaxHeight;
+    switch (ops::GetDesiredBlockDim(input_width)) {
+#ifdef PADDLE_WITH_HIP
+      FIXED_BLOCK_DIM(ops::KeMatrixTopK<
+                      T,
+                      20,
+                      kBlockDim><<<gridx, kBlockDim, 0, dev_ctx.stream()>>>(
+          trans_out.data<T>(),
+          k,
+          trans_ind.data<int64_t>(),
+          trans_input.data<T>(),
+          input_width,
+          input_width,
+          static_cast<int>(k),
+          gridx,
+          input_height,
+          largest));
+#else
+      FIXED_BLOCK_DIM(ops::KeMatrixTopK<
+                      T,
+                      5,
+                      kBlockDim><<<gridx, kBlockDim, 0, dev_ctx.stream()>>>(
+          trans_out.data<T>(),
+          k,
+          trans_ind.data<int64_t>(),
+          trans_input.data<T>(),
+          input_width,
+          input_width,
+          static_cast<int>(k),
+          gridx,
+          input_height,
+          largest));
+#endif
+      default:
+        PADDLE_THROW(errors::Fatal(
+            "the input data shape has error in the topk cuda kernel."));
+    }
+
+    // last step, tranpose back the indices and output
+    funcs::TransCompute<phi::GPUContext, int64_t>(
+        ndims, dev_ctx, trans_ind, indices, trans);
+    funcs::TransCompute<phi::GPUContext, T>(
+        ndims, dev_ctx, trans_out, out, trans);
+  }
+}
+#undef FIXED_BLOCK_DIM_BASE
+#undef FIXED_BLOCK_DIM
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(top_k,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::TopkKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t,
+                   phi::dtype::float16) {}
diff --git a/paddle/phi/kernels/gpu/trace_kernel.cu b/paddle/phi/kernels/gpu/trace_kernel.cu
index 7ac7c451b00542c3e0511692dc7cad470374f2ae..4a749c5b3347da24c8aba35d33673801b4b7e407 100644
--- a/paddle/phi/kernels/gpu/trace_kernel.cu
+++ b/paddle/phi/kernels/gpu/trace_kernel.cu
@@ -17,7 +17,7 @@
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/diagonal.h"
-#include "paddle/phi/kernels/gpu/reduce.h"
+#include "paddle/phi/kernels/funcs/reduce_function.h"
 
 namespace phi {
 
@@ -31,11 +31,10 @@ void TraceKernel(const Context& ctx,
   T* out_data = ctx.template Alloc<T>(out);
   auto diag = funcs::Diagonal<T, Context>(ctx, &x, offset, axis1, axis2);
   if (diag.numel() > 0) {
-    auto stream = ctx.stream();
     std::vector<int> reduce_dims;
     reduce_dims.push_back(out->dims().size());
-    kernels::TensorReduceImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
-        ctx, diag, out, kps::IdentityFunctor<T>(), reduce_dims, stream);
+    funcs::ReduceKernel<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
+        ctx, diag, out, kps::IdentityFunctor<T>(), reduce_dims);
   } else {
     phi::funcs::SetConstant<Context, T> functor;
     functor(ctx, out, static_cast<T>(0));
diff --git a/paddle/phi/kernels/gpu/triangular_solve_grad_kernel.cu b/paddle/phi/kernels/gpu/triangular_solve_grad_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..f7eaa485797947ae7b6a60378737d8c955718466
--- /dev/null
+++ b/paddle/phi/kernels/gpu/triangular_solve_grad_kernel.cu
@@ -0,0 +1,23 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/triangular_solve_grad_kernel_impl.h"
+
+PD_REGISTER_KERNEL(triangular_solve_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::TriangularSolveGradKernel,
+                   float,
+                   double) {}
diff --git a/paddle/phi/kernels/gpu/triangular_solve_kernel.cu b/paddle/phi/kernels/gpu/triangular_solve_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..f137d8e1c260387686cbf3d0fbadf686d9e13019
--- /dev/null
+++ b/paddle/phi/kernels/gpu/triangular_solve_kernel.cu
@@ -0,0 +1,132 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/triangular_solve_kernel.h"
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/ddim.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/empty_kernel.h"
+#include "paddle/phi/kernels/expand_kernel.h"
+#include "paddle/phi/kernels/funcs/blas/blas.h"
+#include "paddle/phi/kernels/funcs/common_shape.h"
+
+// See Note [ Why still include the fluid headers? ]
+#include "paddle/fluid/memory/allocation/allocator.h"
+#include "paddle/fluid/memory/memory.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void TriangularSolveKernel(const Context& dev_ctx,
+                           const DenseTensor& x,
+                           const DenseTensor& y,
+                           bool upper,
+                           bool transpose,
+                           bool unitriangular,
+                           DenseTensor* out) {
+  // get broadcast dim
+  std::vector<int64_t> x_bst_dims_vec;
+  std::vector<int64_t> y_bst_dims_vec;
+  std::tie(x_bst_dims_vec, y_bst_dims_vec) =
+      funcs::MatrixGetBroadcastDims(x, y);
+  int x_bst_ndim = x_bst_dims_vec.size();
+  int y_bst_ndim = y_bst_dims_vec.size();
+
+  // Tensor broadcast to 'out' and temp 'x_bst'
+  ScalarArray x_bst_dims(x_bst_dims_vec);
+  DenseTensor x_bst = phi::Empty<T, Context>(dev_ctx, x_bst_dims);
+  const T* x_bst_data = x_bst.data<T>();
+  ExpandKernel<T, Context>(dev_ctx, x, x_bst_dims, &x_bst);
+
+  out->Resize(phi::make_ddim(y_bst_dims_vec));
+  T* out_data = dev_ctx.template Alloc<T>(out);
+  ScalarArray y_bst_dims(y_bst_dims_vec);
+  ExpandKernel<T, Context>(dev_ctx, y, y_bst_dims, out);
+
+  // calculate use cublas library
+  CBLAS_UPLO uplo = upper ? CblasUpper : CblasLower;
+  CBLAS_TRANSPOSE transA = transpose ? CblasTrans : CblasNoTrans;
+  CBLAS_DIAG diag = unitriangular ? CblasUnit : CblasNonUnit;
+
+  int M = static_cast<int>(y_bst_dims_vec[y_bst_ndim - 2]);
+  int N = static_cast<int>(y_bst_dims_vec[y_bst_ndim - 1]);
+  auto lda = std::max(1, M);
+  auto ldb = std::max(1, N);
+
+  int batch_size = 1;
+  for (int i = 0; i < x_bst_ndim - 2; i++) {
+    batch_size *= x_bst_dims_vec[i];
+  }
+
+  auto blas = phi::funcs::GetBlas<GPUContext, T>(dev_ctx);
+  if (batch_size <= 8 && M >= 64) {
+    for (auto i = 0; i < batch_size; i++) {
+      blas.TRSM(CblasLeft,
+                uplo,
+                transA,
+                diag,
+                M,
+                N,
+                T(1),
+                x_bst_data + i * M * M,
+                lda,
+                out_data + i * N * M,
+                ldb);
+    }
+  } else {
+    std::vector<const T*> cpu_ptrs(batch_size * 2);
+    for (int i = 0; i < batch_size; ++i) {
+      cpu_ptrs[i] = x_bst_data + i * M * M;
+      cpu_ptrs[i + batch_size] = out_data + i * M * N;
+    }
+
+    // Copy the addresses of A and tmp_b from host to device.
+    paddle::memory::allocation::AllocationPtr tmp_gpu_ptrs_data =
+        paddle::memory::Alloc(dev_ctx, cpu_ptrs.size() * sizeof(T*));
+
+    paddle::memory::Copy(dev_ctx.GetPlace(),
+                         tmp_gpu_ptrs_data->ptr(),
+                         paddle::platform::CPUPlace(),
+                         static_cast<void*>(cpu_ptrs.data()),
+                         cpu_ptrs.size() * sizeof(T*),
+                         dev_ctx.stream());
+
+    const T** gpu_a_ptrs =
+        reinterpret_cast<const T**>(tmp_gpu_ptrs_data->ptr());
+    T** gpu_b_ptrs =
+        reinterpret_cast<T**>(tmp_gpu_ptrs_data->ptr()) + batch_size;
+    blas.BatchedTRSM(CblasLeft,
+                     uplo,
+                     transA,
+                     diag,
+                     M,
+                     N,
+                     static_cast<T>(1.0),
+                     gpu_a_ptrs,
+                     lda,
+                     gpu_b_ptrs,
+                     ldb,
+                     batch_size);
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(triangular_solve,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::TriangularSolveKernel,
+                   float,
+                   double) {}
diff --git a/paddle/phi/kernels/gpu/truncated_gaussian_random_kernel.cu b/paddle/phi/kernels/gpu/truncated_gaussian_random_kernel.cu
index 12c1bf791e1691bb6eee81750b337adea713b794..bb04e7ee8515bb6320860e4fd20366995d26c991 100644
--- a/paddle/phi/kernels/gpu/truncated_gaussian_random_kernel.cu
+++ b/paddle/phi/kernels/gpu/truncated_gaussian_random_kernel.cu
@@ -25,7 +25,6 @@
 #include "paddle/phi/core/kernel_registry.h"
 
 #include "paddle/fluid/framework/generator.h"
-// #include "paddle/phi/core/generator.h"
 
 namespace phi {
 
@@ -34,23 +33,27 @@ struct GPUTruncatedNormal {
   T mean, std;
   T a_normal_cdf;
   T b_normal_cdf;
+
   unsigned int seed;
   T numeric_min;
 
   __host__ __device__ GPUTruncatedNormal(T mean, T std, T numeric_min, int seed)
       : mean(mean), std(std), seed(seed), numeric_min(numeric_min) {
-    a_normal_cdf = (1.0 + erff(-2.0 / sqrtf(2.0))) / 2.0;
-    b_normal_cdf = (1.0 + erff(2.0 / sqrtf(2.0))) / 2.0;
+    auto normal_cdf = [](float x) {
+      return (1.0 + std::erf(x / std::sqrt(2.0))) / 2.0;
+    };
+    a_normal_cdf = normal_cdf((-2.0 - mean) / std);
+    b_normal_cdf = normal_cdf((2.0 - mean) / std);
   }
 
   __host__ __device__ T operator()(const unsigned int n) const {
     thrust::minstd_rand rng;
     rng.seed(seed);
-    thrust::uniform_real_distribution<T> dist(numeric_min, 1);
+    thrust::uniform_real_distribution<T> dist(2.0 * a_normal_cdf - 1.0,
+                                              2.0 * b_normal_cdf - 1.0);
     rng.discard(n);
     T value = dist(rng);
-    auto p = a_normal_cdf + (b_normal_cdf - a_normal_cdf) * value;
-    return std::sqrt(2.0) * erfinvf(2 * p - 1) * std + mean;
+    return std::sqrt(2.0) * erfinvf(value) * std + mean;
   }
 };
 
@@ -70,24 +73,27 @@ struct TruncatedNormalOffset {
         seed(seed),
         numeric_min(numeric_min),
         offset_(offset) {
-    a_normal_cdf = (1.0 + erff(-2.0 / sqrtf(2.0))) / 2.0;
-    b_normal_cdf = (1.0 + erff(2.0 / sqrtf(2.0))) / 2.0;
+    auto normal_cdf = [](float x) {
+      return (1.0 + std::erf(x / std::sqrt(2.0))) / 2.0;
+    };
+    a_normal_cdf = normal_cdf((-2.0 - mean) / std);
+    b_normal_cdf = normal_cdf((2.0 - mean) / std);
   }
 
   __host__ __device__ T operator()(const unsigned int n) const {
     thrust::minstd_rand rng;
     rng.seed(seed);
-    thrust::uniform_real_distribution<T> dist(numeric_min, 1);
+    thrust::uniform_real_distribution<T> dist(2.0 * a_normal_cdf - 1.0,
+                                              2.0 * b_normal_cdf - 1.0);
     rng.discard(n + offset_);
     T value = dist(rng);
-    auto p = a_normal_cdf + (b_normal_cdf - a_normal_cdf) * value;
-    return std::sqrt(2.0) * erfinvf(2 * p - 1) * std + mean;
+    return std::sqrt(2.0) * erfinvf(value) * std + mean;
   }
 };
 
 template <typename T, typename Context>
 void TruncatedGaussianRandomKernel(const Context& dev_ctx,
-                                   const ScalarArray& shape,
+                                   const std::vector<int>& shape,
                                    float mean,
                                    float std,
                                    int seed,
diff --git a/paddle/phi/kernels/gpu/viterbi_decode_kernel.cu b/paddle/phi/kernels/gpu/viterbi_decode_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..25d6d46c20b9fd1f58a1299a82214641d10a4149
--- /dev/null
+++ b/paddle/phi/kernels/gpu/viterbi_decode_kernel.cu
@@ -0,0 +1,402 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/viterbi_decode_kernel.h"
+
+#ifdef __NVCC__
+#include "cub/cub.cuh"
+#endif
+#ifdef __HIPCC__
+#include <hipcub/hipcub.hpp>
+namespace cub = hipcub;
+#endif
+#ifdef PADDLE_WITH_MKLML
+#include <omp.h>
+#endif
+
+#include <algorithm>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/copy_kernel.h"
+#include "paddle/phi/kernels/empty_kernel.h"
+#include "paddle/phi/kernels/funcs/compare_functors.h"
+#include "paddle/phi/kernels/funcs/concat_and_split_functor.h"
+#include "paddle/phi/kernels/funcs/elementwise_functor.h"
+#include "paddle/phi/kernels/funcs/gather.cu.h"
+#include "paddle/phi/kernels/funcs/viterbi_decode_functor.h"
+#include "paddle/phi/kernels/transpose_kernel.h"
+
+namespace phi {
+
+#define FIXED_BLOCK_DIM_CASE_BASE(log2_block_dim, ...)  \
+  case (1 << (log2_block_dim)): {                       \
+    constexpr auto kBlockDim = (1 << (log2_block_dim)); \
+    __VA_ARGS__;                                        \
+  } break
+
+#define FIXED_BLOCK_DIM_CASE(...)               \
+  FIXED_BLOCK_DIM_CASE_BASE(10, ##__VA_ARGS__); \
+  FIXED_BLOCK_DIM_CASE_BASE(9, ##__VA_ARGS__);  \
+  FIXED_BLOCK_DIM_CASE_BASE(8, ##__VA_ARGS__);  \
+  FIXED_BLOCK_DIM_CASE_BASE(7, ##__VA_ARGS__);  \
+  FIXED_BLOCK_DIM_CASE_BASE(6, ##__VA_ARGS__);  \
+  FIXED_BLOCK_DIM_CASE_BASE(5, ##__VA_ARGS__);  \
+  FIXED_BLOCK_DIM_CASE_BASE(4, ##__VA_ARGS__);  \
+  FIXED_BLOCK_DIM_CASE_BASE(3, ##__VA_ARGS__);
+
+int64_t ComputeBlockSize(int64_t col) {
+  if (col > 512)
+    return 1024;
+  else if (col > 256)
+    return 512;
+  else if (col > 128)
+    return 256;
+  else if (col > 64)
+    return 128;
+  else if (col > 32)
+    return 64;
+  else if (col > 16)
+    return 32;
+  else if (col > 8)
+    return 16;
+  else
+    return 8;
+}
+
+template <typename Context,
+          template <typename T> typename BinaryFunctor,
+          typename T>
+struct BinaryOperation {
+  void operator()(const Context& dev_ctx,
+                  const DenseTensor& lhs,
+                  const DenseTensor& rhs,
+                  DenseTensor* output) {
+    std::vector<const DenseTensor*> ins{&lhs, &rhs};
+    std::vector<DenseTensor*> outs{output};
+    paddle::operators::LaunchElementwiseCudaKernel<ElementwiseType::kBinary,
+                                                   T,
+                                                   T>(
+        dev_ctx, ins, &outs, -1, BinaryFunctor<T>());
+  }
+};
+
+template <typename Context,
+          template <typename InT, typename OutT> typename CompareFunctor,
+          typename T>
+struct GetMask {
+  void operator()(const Context& dev_ctx,
+                  const DenseTensor& lhs,
+                  const DenseTensor& rhs,
+                  DenseTensor* mask) {
+    std::vector<const DenseTensor*> ins = {&lhs, &rhs};
+    std::vector<DenseTensor*> outs = {mask};
+    paddle::operators::LaunchSameDimsElementwiseCudaKernel<T>(
+        dev_ctx, ins, &outs, CompareFunctor<int64_t, T>());
+  }
+};
+
+template <typename T, typename IndType, size_t BlockDim>
+__global__ void ArgmaxCUDAKernel(const int64_t height,     // n * h
+                                 const int64_t width,      // c
+                                 const int64_t post_size,  // h
+                                 const T* in,
+                                 IndType* out_idx,
+                                 T* out) {
+  typedef cub::BlockReduce<cub::KeyValuePair<int, T>, BlockDim> BlockReduce;
+  __shared__ typename BlockReduce::TempStorage temp_storage;
+  cub::ArgMax reducer;
+  T init = (std::numeric_limits<T>::lowest)();  // for windows compile
+  for (int idx = blockIdx.x; idx < height; idx += gridDim.x) {
+    cub::KeyValuePair<int, T> kv_pair = {-1, init};
+    int h = idx / post_size;
+    int w = idx % post_size;
+    for (int k = threadIdx.x; k < width; k += blockDim.x) {
+      kv_pair =
+          reducer({k, in[h * width * post_size + k * post_size + w]}, kv_pair);
+    }
+    kv_pair = BlockReduce(temp_storage).Reduce(kv_pair, reducer);
+    if (threadIdx.x == 0) {
+      // return max, argmax
+      if (out_idx != nullptr) out_idx[idx] = static_cast<IndType>(kv_pair.key);
+      if (out != nullptr) out[idx] = kv_pair.value;
+    }
+    __syncthreads();
+  }
+}
+
+__global__ void ARangeKernel(int64_t* data, int num, int64_t scale) {
+  int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  for (int start = idx; idx < num; idx += gridDim.x) {
+    data[idx] = idx * scale;
+  }
+}
+
+template <typename Context>
+struct ARange {
+  void operator()(const Context& dev_ctx,
+                  int64_t* data,
+                  int num,
+                  int64_t scale) {
+    int64_t kBlockDim = ComputeBlockSize(num);
+    // kBlockDim > num at most of time, so we can set grid = 1
+    ARangeKernel<<<1, kBlockDim, 0, dev_ctx.stream()>>>(data, num, scale);
+  }
+};
+
+template <typename Context, typename T, typename IndType>
+struct Argmax {
+  void operator()(const Context& dev_ctx,
+                  const DenseTensor& input,
+                  DenseTensor* out_idx,
+                  DenseTensor* out,
+                  int axis) {
+    phi::DDim input_dims = input.dims();
+    int64_t numel = input.numel();
+    int64_t groups = numel / input_dims[axis];
+    int64_t pre = 1;
+    int64_t post = 1;
+    int64_t n = input_dims[axis];
+    for (int i = 0; i < axis; i++) {
+      pre *= input_dims[i];
+    }
+    for (int i = axis + 1; i < input_dims.size(); i++) {
+      post *= input_dims[i];
+    }
+    auto cu_stream = dev_ctx.stream();
+    int64_t max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize()[0];
+    int64_t height = pre * post;
+    int64_t width = n;
+    int64_t grid_size = height < max_grid_dimx ? height : max_grid_dimx;
+    const T* in_data = input.data<T>();
+    IndType* out_idx_data = out_idx->data<IndType>();
+    T* out_data = out->data<T>();
+    switch (ComputeBlockSize(width)) {
+      FIXED_BLOCK_DIM_CASE(
+          ArgmaxCUDAKernel<T,
+                           IndType,
+                           kBlockDim><<<grid_size, kBlockDim, 0, cu_stream>>>(
+              height, width, post, in_data, out_idx_data, out_data));
+    }
+  }
+};
+
+template <typename Context, typename T>
+struct GetMaxValue {
+  void operator()(const Context& dev_ctx,
+                  const DenseTensor& input,
+                  T* max_value) {
+    DenseTensor out_data;
+    out_data.Resize(phi::make_ddim({1}));
+    dev_ctx.template Alloc<T>(&out_data);
+    switch (ComputeBlockSize(input.numel())) {
+      FIXED_BLOCK_DIM_CASE(
+          ArgmaxCUDAKernel<T,
+                           T,
+                           kBlockDim><<<1, kBlockDim, 0, dev_ctx.stream()>>>(
+              1,
+              input.numel(),
+              1,
+              input.data<int64_t>(),
+              nullptr,
+              out_data.data<int64_t>()));
+    }
+    DenseTensor max_value_tensor;
+    phi::Copy(dev_ctx, out_data, phi::CPUPlace(), false, &max_value_tensor);
+    *max_value = max_value_tensor.data<T>()[0];
+  }
+};
+
+template <typename Context, typename T, typename IndexT>
+struct Gather {
+  void operator()(const Context& dev_ctx,
+                  const DenseTensor& src,
+                  const DenseTensor& index,
+                  DenseTensor* output) {
+    phi::funcs::GPUGather<T, IndexT>(dev_ctx, src, index, output);
+  }
+};
+
+template <typename T, typename Context>
+void ViterbiDecodeKernel(const Context& dev_ctx,
+                         const DenseTensor& input,
+                         const DenseTensor& transition,
+                         const DenseTensor& length,
+                         bool include_bos_eos_tag,
+                         DenseTensor* scores,
+                         DenseTensor* path) {
+  auto curr_place = dev_ctx.GetPlace();
+  auto batch_size = static_cast<int>(input.dims()[0]);
+  auto seq_len = static_cast<int>(input.dims()[1]);
+  auto n_labels = static_cast<int>(input.dims()[2]);
+  phi::funcs::SetConstant<Context, T> float_functor;
+  phi::funcs::SetConstant<Context, int64_t> int_functor;
+  std::vector<DenseTensor> historys;
+  // We create tensor buffer in order to avoid allocating memory frequently
+  // 10 means allocate 10*batch_size bytes memory, such as int_mask, zero...
+  int buffer_size = batch_size * (n_labels + 1) * seq_len + 10 * batch_size;
+  DenseTensor int_buffer = Empty<int64_t>(dev_ctx, {buffer_size});
+  funcs::TensorBuffer int_tensor_buffer(int_buffer);
+  // create float tensor buffer
+  // 10 means allocate 10*batch_size*n_labels bytes, such as alpha, alpha_max
+  buffer_size = batch_size * (seq_len + 10) * n_labels +
+                (batch_size + 2) * n_labels * n_labels;
+  DenseTensor float_buffer = Empty<T>(dev_ctx, {buffer_size});
+  funcs::TensorBuffer float_tensor_buffer(float_buffer);
+  DenseTensor left_length = int_tensor_buffer.GetBufferBlock({batch_size, 1});
+  phi::Copy(dev_ctx, length, curr_place, false, &left_length);
+  int64_t max_seq_len = 0;
+  GetMaxValue<Context, int64_t> get_max_value;
+  get_max_value(dev_ctx, left_length, &max_seq_len);
+  dev_ctx.template Alloc<T>(scores);
+  path->Resize({batch_size, max_seq_len});
+  dev_ctx.template Alloc<int64_t>(path);
+  DenseTensor tpath =
+      int_tensor_buffer.GetBufferBlock({max_seq_len, batch_size});
+  auto batch_path = funcs::Unbind(tpath);
+  for (auto it = batch_path.begin(); it != batch_path.end(); ++it) {
+    it->Resize({batch_size});
+  }
+  // create and init required tensor
+  DenseTensor input_exp =
+      float_tensor_buffer.GetBufferBlock({seq_len, batch_size, n_labels});
+  TransposeKernel<T, Context>(dev_ctx, input, {1, 0, 2}, &input_exp);
+  DenseTensor trans_exp =
+      float_tensor_buffer.GetBufferBlock({n_labels, n_labels});
+  phi::Copy(dev_ctx, transition, curr_place, false, &trans_exp);
+  trans_exp.Resize({1, n_labels, n_labels});
+  DenseTensor alpha =
+      float_tensor_buffer.GetBufferBlock({batch_size, n_labels});
+  DenseTensor zero = int_tensor_buffer.GetBufferBlock({batch_size, 1});
+  int_functor(dev_ctx, &zero, 0);
+  DenseTensor one = int_tensor_buffer.GetBufferBlock({batch_size, 1});
+  int_functor(dev_ctx, &one, 1);
+  DenseTensor float_one = float_tensor_buffer.GetBufferBlock({batch_size, 1});
+  float_functor(dev_ctx, &float_one, static_cast<T>(1.0));
+  DenseTensor alpha_trn_sum =
+      float_tensor_buffer.GetBufferBlock({batch_size, n_labels, n_labels});
+  DenseTensor alpha_max =
+      float_tensor_buffer.GetBufferBlock({batch_size, n_labels});
+  DenseTensor alpha_argmax =
+      int_tensor_buffer.GetBufferBlock({seq_len, batch_size, n_labels});
+  auto alpha_argmax_unbind = funcs::Unbind(alpha_argmax);
+  DenseTensor alpha_nxt =
+      float_tensor_buffer.GetBufferBlock({batch_size, n_labels});
+  DenseTensor int_mask = int_tensor_buffer.GetBufferBlock({batch_size});
+  DenseTensor zero_len_mask = int_tensor_buffer.GetBufferBlock({batch_size});
+  DenseTensor float_mask = float_tensor_buffer.GetBufferBlock({batch_size, 1});
+  DenseTensor stop_trans = float_tensor_buffer.GetBufferBlock({1, 1, n_labels});
+  DenseTensor start_trans =
+      float_tensor_buffer.GetBufferBlock({1, 1, n_labels});
+  DenseTensor rest_trans =
+      float_tensor_buffer.GetBufferBlock({1, n_labels - 2, n_labels});
+  DenseTensor last_ids = int_tensor_buffer.GetBufferBlock({batch_size});
+  DenseTensor last_ids_tmp = int_tensor_buffer.GetBufferBlock({batch_size});
+  DenseTensor batch_offset = int_tensor_buffer.GetBufferBlock({batch_size});
+  DenseTensor gather_idx = int_tensor_buffer.GetBufferBlock({batch_size});
+  std::vector<const DenseTensor*> shape{&rest_trans, &stop_trans, &start_trans};
+  std::vector<DenseTensor*> outputs{&rest_trans, &stop_trans, &start_trans};
+  phi::funcs::SplitFunctor<Context, T> split_functor;
+  split_functor(dev_ctx, trans_exp, shape, 1, &outputs);
+  stop_trans.Resize({1, n_labels});
+  start_trans.Resize({1, n_labels});
+  auto logit0 = input_exp.Slice(0, 1);
+  logit0.Resize({batch_size, n_labels});
+  BinaryOperation<Context, phi::funcs::AddFunctor, T> AddFloat;
+  BinaryOperation<Context, phi::funcs::AddFunctor, int64_t> AddInt;
+  BinaryOperation<Context, phi::funcs::MultiplyFunctor, T> MulFloat;
+  BinaryOperation<Context, phi::funcs::MultiplyFunctor, int64_t> MulInt;
+  BinaryOperation<Context, phi::funcs::SubtractFunctor, T> SubFloat;
+  BinaryOperation<Context, phi::funcs::SubtractFunctor, int64_t> SubInt;
+  if (include_bos_eos_tag) {
+    AddFloat(dev_ctx, logit0, start_trans, &alpha);
+    GetMask<Context, phi::funcs::EqualFunctor, T>()(
+        dev_ctx, left_length, one, &float_mask);
+    MulFloat(dev_ctx, stop_trans, float_mask, &alpha_nxt);
+    AddFloat(dev_ctx, alpha, alpha_nxt, &alpha);
+  } else {
+    alpha = logit0;
+  }
+  SubInt(dev_ctx, left_length, one, &left_length);
+  Argmax<Context, T, int64_t> argmax;
+  for (int64_t i = 1; i < max_seq_len; ++i) {
+    DenseTensor logit = input_exp.Slice(i, i + 1);
+    logit.Resize({batch_size, n_labels});
+    DenseTensor& alpha_exp = alpha.Resize({batch_size, n_labels, 1});
+    AddFloat(dev_ctx, alpha_exp, trans_exp, &alpha_trn_sum);
+    auto alpha_argmax_temp = alpha_argmax_unbind[i - 1];
+    alpha_argmax_temp.Resize({batch_size, n_labels});
+    argmax(dev_ctx, alpha_trn_sum, &alpha_argmax_temp, &alpha_max, 1);
+    historys.emplace_back(alpha_argmax_temp);
+    AddFloat(dev_ctx, alpha_max, logit, &alpha_nxt);
+    alpha.Resize({batch_size, n_labels});
+    GetMask<Context, phi::funcs::GreaterThanFunctor, T>()(
+        dev_ctx, left_length, zero, &float_mask);
+    MulFloat(dev_ctx, alpha_nxt, float_mask, &alpha_nxt);
+    SubFloat(dev_ctx, float_one, float_mask, &float_mask);
+    MulFloat(dev_ctx, alpha, float_mask, &alpha);
+    AddFloat(dev_ctx, alpha, alpha_nxt, &alpha);
+    if (include_bos_eos_tag) {
+      GetMask<Context, phi::funcs::EqualFunctor, T>()(
+          dev_ctx, left_length, one, &float_mask);
+      MulFloat(dev_ctx, stop_trans, float_mask, &alpha_nxt);
+      AddFloat(dev_ctx, alpha, alpha_nxt, &alpha);
+    }
+    SubInt(dev_ctx, left_length, one, &left_length);
+  }
+  argmax(dev_ctx, alpha, &last_ids, scores, 1);
+  left_length.Resize({batch_size});
+  GetMask<Context, phi::funcs::GreaterEqualFunctor, int64_t>()(
+      dev_ctx, left_length, zero, &int_mask);
+  // last_ids_update = last_ids * tag_mask
+  int last_ids_index = 1;
+  int actual_len = (std::min)(seq_len, static_cast<int>(max_seq_len));
+  MulInt(dev_ctx, last_ids, int_mask, &batch_path[actual_len - last_ids_index]);
+  // The algorithm below can refer to
+  // https://github.com/PaddlePaddle/PaddleNLP/blob/develop/paddlenlp/layers/crf.py#L438
+  ARange<Context> arange;
+  arange(dev_ctx, batch_offset.data<int64_t>(), batch_size, n_labels);
+  Gather<Context, int64_t, int64_t> gather;
+  for (auto hist = historys.rbegin(); hist != historys.rend(); ++hist) {
+    ++last_ids_index;
+    AddInt(dev_ctx, left_length, one, &left_length);
+    AddInt(dev_ctx, batch_offset, last_ids, &gather_idx);
+    DenseTensor& last_ids_update = batch_path[actual_len - last_ids_index];
+    hist->Resize({batch_size * n_labels});
+    gather(dev_ctx, *hist, gather_idx, &last_ids_update);
+    GetMask<Context, phi::funcs::GreaterThanFunctor, int64_t>()(
+        dev_ctx, left_length, zero, &int_mask);
+    MulInt(dev_ctx, last_ids_update, int_mask, &last_ids_update);
+    GetMask<Context, phi::funcs::EqualFunctor, int64_t>()(
+        dev_ctx, left_length, zero, &zero_len_mask);
+    MulInt(dev_ctx, last_ids, zero_len_mask, &last_ids_tmp);
+    SubInt(dev_ctx, one, zero_len_mask, &zero_len_mask);
+    MulInt(dev_ctx, last_ids_update, zero_len_mask, &last_ids_update);
+    AddInt(dev_ctx, last_ids_update, last_ids_tmp, &last_ids_update);
+    GetMask<Context, phi::funcs::LessThanFunctor, int64_t>()(
+        dev_ctx, left_length, zero, &int_mask);
+    MulInt(dev_ctx, last_ids, int_mask, &last_ids);
+    AddInt(dev_ctx, last_ids_update, last_ids, &last_ids);
+  }
+  TransposeKernel<int64_t, Context>(dev_ctx, tpath, {1, 0}, path);
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(
+    viterbi_decode, GPU, ALL_LAYOUT, phi::ViterbiDecodeKernel, float, double) {}
diff --git a/paddle/phi/kernels/gpu/where_index_kernel.cu b/paddle/phi/kernels/gpu/where_index_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..535cb812a20ea90bdb3f07b731af52c2822f0ec2
--- /dev/null
+++ b/paddle/phi/kernels/gpu/where_index_kernel.cu
@@ -0,0 +1,178 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifdef __NVCC__
+#include "cub/cub.cuh"
+#endif
+#ifdef __HIPCC__
+#include <hipcub/hipcub.hpp>
+namespace cub = hipcub;
+#endif
+
+#include "paddle/phi/kernels/where_index_kernel.h"
+
+#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/phi/core/ddim.h"
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/math_function.h"
+
+namespace phi {
+
+template <typename T>
+__global__ void GetTrueNum(const T *cond_data,
+                           const int64_t numel,
+                           int64_t *true_num_array) {
+  const int64_t tid = blockIdx.x * blockDim.x + threadIdx.x;
+
+  for (int64_t idx = tid; idx < numel; idx += gridDim.x * blockDim.x) {
+    true_num_array[idx] =
+        static_cast<int64_t>(static_cast<bool>(cond_data[idx]));
+  }
+}
+
+template <typename T>
+__global__ void SetTrueIndex(int64_t *out_ptr,
+                             const T *cond_data,
+                             const int64_t numel,
+                             const int64_t *stride_array,
+                             const int64_t rank,
+                             const int64_t *true_num_array) {
+  const int64_t tid = blockIdx.x * blockDim.x + threadIdx.x;
+
+  for (int64_t idx = tid; idx < numel; idx += gridDim.x * blockDim.x) {
+    // true_num_array is calculated by cub::InclusiveSum,
+    // cause the first element of true_num_array is 1,
+    // so we need substract 1 to get true index.
+    const int64_t true_index = true_num_array[idx] - 1;
+    if (static_cast<bool>(cond_data[idx])) {
+      int64_t rank_index = idx;
+      for (int j = 0; j < rank; j++) {
+        const int64_t out_index = rank_index / stride_array[j];
+        out_ptr[true_index * rank + j] = out_index;
+        rank_index -= out_index * stride_array[j];
+      }
+    }
+  }
+}
+
+template <typename T, typename Context>
+void WhereIndexKernel(const Context &dev_ctx,
+                      const DenseTensor &condition,
+                      DenseTensor *out) {
+  const T *cond_data = condition.data<T>();
+  const int64_t numel = condition.numel();
+  auto dims = condition.dims();
+  const int rank = dims.size();
+
+  auto d_array_mem =
+      paddle::memory::Alloc(dev_ctx, (numel + rank) * sizeof(int64_t));
+  auto h_array_mem =
+      paddle::memory::Alloc(phi::CPUPlace(), (rank + 1) * sizeof(int64_t));
+
+  // "stride_array" is an array and len(stride_array)==rank,
+  // each element is the stride of each dimension -- the length from i to i+1.
+  int64_t *h_stride_array = reinterpret_cast<int64_t *>(h_array_mem->ptr());
+  int64_t *d_stride_array = reinterpret_cast<int64_t *>(d_array_mem->ptr());
+
+  // "true_num_array" is an array and len(stride_array)==numel,
+  // at the beginning,
+  // "true_num_array" will set 1 if condition[i] == true else 0,
+  // then it will be calculated by cub::InclusiveSum,
+  // so that we can get the true number before i as the out index
+  int64_t *d_true_num_array = d_stride_array + rank;
+
+  // the total_true_num is the total number of condition[i] == true
+  int64_t *h_total_true_num = h_stride_array + rank;
+
+  // alloce cub memory
+  size_t cub_size = 0;
+  cub::DeviceScan::InclusiveSum(nullptr,
+                                cub_size,
+                                d_true_num_array,
+                                d_true_num_array,
+                                numel,
+                                dev_ctx.stream());
+  auto cub_mem = paddle::memory::Alloc(dev_ctx, cub_size * sizeof(int64_t));
+  void *cub_data = cub_mem->ptr();
+
+  // set d_true_num_array[i]=1 if cond_data[i]==true else 0
+  const int threads = std::min(numel, static_cast<int64_t>(128));
+  const int64_t need_grids = (numel + threads - 1) / threads;
+  const int grids = std::min(need_grids, static_cast<int64_t>(256));
+  GetTrueNum<T><<<grids, threads, 0, dev_ctx.stream()>>>(
+      cond_data, numel, d_true_num_array);
+
+  // calculate the inclusive prefix sum of "true_num_array"
+  // to get the index of "out" tensor,
+  // and the total number of cond_data[i]==true.
+  // Example:
+  // condition: F T T F F F T T
+  // before:    0 1 1 0 0 0 1 1
+  // after:     0 1 2 2 2 2 3 4
+  // out:       1 2 6 7
+  cub::DeviceScan::InclusiveSum(cub_data,
+                                cub_size,
+                                d_true_num_array,
+                                d_true_num_array,
+                                numel,
+                                dev_ctx.stream());
+
+  // calculate each dimension's stride
+  h_stride_array[rank - 1] = 1;
+  for (int i = rank - 2; i >= 0; i--) {
+    h_stride_array[i] = h_stride_array[i + 1] * dims[i + 1];
+  }
+  paddle::memory::Copy(dev_ctx.GetPlace(),
+                       d_stride_array,
+                       phi::CPUPlace(),
+                       h_stride_array,
+                       rank * sizeof(int64_t),
+                       dev_ctx.stream());
+
+  // get total ture number and set output size
+  // the last element of cub::InclusiveSum is the total number
+  paddle::memory::Copy(phi::CPUPlace(),
+                       h_total_true_num,
+                       dev_ctx.GetPlace(),
+                       d_true_num_array + numel - 1,
+                       sizeof(int64_t),
+                       dev_ctx.stream());
+  dev_ctx.Wait();
+
+  int64_t true_num = *h_total_true_num;
+  out->Resize(phi::make_ddim({static_cast<int64_t>(true_num), rank}));
+  auto *out_data = dev_ctx.template Alloc<int64_t>(out);
+
+  if (true_num == 0) {
+    return;
+  }
+
+  // using true_num_array and stride_array to calculate the output index
+  SetTrueIndex<T><<<grids, threads, 0, dev_ctx.stream()>>>(
+      out_data, cond_data, numel, d_stride_array, rank, d_true_num_array);
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(where_index,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::WhereIndexKernel,
+                   int64_t,
+                   int,
+                   int16_t,
+                   bool,
+                   float,
+                   double) {}
diff --git a/paddle/phi/kernels/gpu/yolo_box_kernel.cu b/paddle/phi/kernels/gpu/yolo_box_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..2719dcd9e54094f2b6af0ad18eabb445081d60a6
--- /dev/null
+++ b/paddle/phi/kernels/gpu/yolo_box_kernel.cu
@@ -0,0 +1,182 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/memory/malloc.h"
+#include "paddle/fluid/memory/memcpy.h"
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/backends/gpu/gpu_launch_config.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/math_function.h"
+#include "paddle/phi/kernels/funcs/yolo_box_util.h"
+#include "paddle/phi/kernels/yolo_box_kernel.h"
+
+namespace phi {
+
+template <typename T>
+__global__ void KeYoloBoxFw(const T* input,
+                            const int* imgsize,
+                            T* boxes,
+                            T* scores,
+                            const float conf_thresh,
+                            const int* anchors,
+                            const int n,
+                            const int h,
+                            const int w,
+                            const int an_num,
+                            const int class_num,
+                            const int box_num,
+                            int input_size_h,
+                            int input_size_w,
+                            bool clip_bbox,
+                            const float scale,
+                            const float bias,
+                            bool iou_aware,
+                            const float iou_aware_factor) {
+  int tid = blockIdx.x * blockDim.x + threadIdx.x;
+  int stride = blockDim.x * gridDim.x;
+  T box[4];
+  for (; tid < n * box_num; tid += stride) {
+    int grid_num = h * w;
+    int i = tid / box_num;
+    int j = (tid % box_num) / grid_num;
+    int k = (tid % grid_num) / w;
+    int l = tid % w;
+
+    int an_stride = (5 + class_num) * grid_num;
+    int img_height = imgsize[2 * i];
+    int img_width = imgsize[2 * i + 1];
+
+    int obj_idx = funcs::GetEntryIndex(
+        i, j, k * w + l, an_num, an_stride, grid_num, 4, iou_aware);
+    T conf = funcs::sigmoid<T>(input[obj_idx]);
+    if (iou_aware) {
+      int iou_idx =
+          funcs::GetIoUIndex(i, j, k * w + l, an_num, an_stride, grid_num);
+      T iou = funcs::sigmoid<T>(input[iou_idx]);
+      conf = pow(conf, static_cast<T>(1. - iou_aware_factor)) *
+             pow(iou, static_cast<T>(iou_aware_factor));
+    }
+    if (conf < conf_thresh) {
+      continue;
+    }
+
+    int box_idx = funcs::GetEntryIndex(
+        i, j, k * w + l, an_num, an_stride, grid_num, 0, iou_aware);
+    funcs::GetYoloBox<T>(box,
+                         input,
+                         anchors,
+                         l,
+                         k,
+                         j,
+                         h,
+                         w,
+                         input_size_h,
+                         input_size_w,
+                         box_idx,
+                         grid_num,
+                         img_height,
+                         img_width,
+                         scale,
+                         bias);
+    box_idx = (i * box_num + j * grid_num + k * w + l) * 4;
+    funcs::CalcDetectionBox<T>(
+        boxes, box, box_idx, img_height, img_width, clip_bbox);
+
+    int label_idx = funcs::GetEntryIndex(
+        i, j, k * w + l, an_num, an_stride, grid_num, 5, iou_aware);
+    int score_idx = (i * box_num + j * grid_num + k * w + l) * class_num;
+    funcs::CalcLabelScore<T>(
+        scores, input, label_idx, score_idx, class_num, conf, grid_num);
+  }
+}
+
+template <typename T, typename Context>
+void YoloBoxKernel(const Context& dev_ctx,
+                   const DenseTensor& x,
+                   const DenseTensor& img_size,
+                   const std::vector<int>& anchors,
+                   int class_num,
+                   float conf_thresh,
+                   int downsample_ratio,
+                   bool clip_bbox,
+                   float scale_x_y,
+                   bool iou_aware,
+                   float iou_aware_factor,
+                   DenseTensor* boxes,
+                   DenseTensor* scores) {
+  auto* input = &x;
+  float scale = scale_x_y;
+  float bias = -0.5 * (scale - 1.);
+
+  const int n = input->dims()[0];
+  const int h = input->dims()[2];
+  const int w = input->dims()[3];
+  const int box_num = boxes->dims()[1];
+  const int an_num = anchors.size() / 2;
+  int input_size_h = downsample_ratio * h;
+  int input_size_w = downsample_ratio * w;
+
+  int bytes = sizeof(int) * anchors.size();
+  auto anchors_ptr =
+      paddle::memory::Alloc(dev_ctx, sizeof(int) * anchors.size());
+  int* anchors_data = reinterpret_cast<int*>(anchors_ptr->ptr());
+  const auto gplace = dev_ctx.GetPlace();
+  const auto cplace = phi::CPUPlace();
+  paddle::memory::Copy(
+      gplace, anchors_data, cplace, anchors.data(), bytes, dev_ctx.stream());
+
+  const T* input_data = input->data<T>();
+  const int* imgsize_data = img_size.data<int>();
+  T* boxes_data = boxes->mutable_data<T>({n, box_num, 4}, dev_ctx.GetPlace());
+  T* scores_data =
+      scores->mutable_data<T>({n, box_num, class_num}, dev_ctx.GetPlace());
+  phi::funcs::SetConstant<phi::GPUContext, T> set_zero;
+  set_zero(dev_ctx, boxes, static_cast<T>(0));
+  set_zero(dev_ctx, scores, static_cast<T>(0));
+  backends::gpu::GpuLaunchConfig config =
+      backends::gpu::GetGpuLaunchConfig1D(dev_ctx, n * box_num);
+
+  dim3 thread_num = config.thread_per_block;
+#ifdef WITH_NV_JETSON
+  if (config.compute_capability == 53 || config.compute_capability == 62) {
+    thread_num = 512;
+  }
+#endif
+
+  KeYoloBoxFw<T><<<config.block_per_grid, thread_num, 0, dev_ctx.stream()>>>(
+      input_data,
+      imgsize_data,
+      boxes_data,
+      scores_data,
+      conf_thresh,
+      anchors_data,
+      n,
+      h,
+      w,
+      an_num,
+      class_num,
+      box_num,
+      input_size_h,
+      input_size_w,
+      clip_bbox,
+      scale,
+      bias,
+      iou_aware,
+      iou_aware_factor);
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(
+    yolo_box, GPU, ALL_LAYOUT, phi::YoloBoxKernel, float, double) {}
diff --git a/paddle/phi/kernels/gpudnn/conv_grad_grad_kernel.cu b/paddle/phi/kernels/gpudnn/conv_grad_grad_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..b4a6fe337c8d21e37beb0d6e5219e1a5edf1f9e8
--- /dev/null
+++ b/paddle/phi/kernels/gpudnn/conv_grad_grad_kernel.cu
@@ -0,0 +1,834 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/kernels/conv_grad_grad_kernel.h"
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+#include "paddle/fluid/framework/eigen.h"
+#ifdef PADDLE_WITH_HIP
+#include "paddle/fluid/operators/conv_miopen_helper.h"
+#else
+#include "paddle/fluid/operators/conv_cudnn_helper.h"
+#endif
+
+#include "paddle/fluid/platform/cudnn_workspace_helper.h"
+#include "paddle/fluid/platform/float16.h"
+#include "paddle/fluid/platform/profiler.h"
+#include "paddle/phi/kernels/funcs/padding.h"
+
+#include "paddle/phi/kernels/cpu/conv_util.h"
+#include "paddle/phi/kernels/funcs/batch_norm_utils.h"
+
+#include "paddle/phi/kernels/impl/conv_cudnn_impl.h"
+
+#include "paddle/phi/common/bfloat16.h"
+#include "paddle/phi/common/float16.h"
+#include "paddle/phi/kernels/funcs/math_function.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void ConvCudnnGradGradKernel(
+    const Context& ctx,
+    paddle::optional<const DenseTensor&> input_grad_grad,
+    paddle::optional<const DenseTensor&> filter_grad_grad,
+    const DenseTensor& out_grad,
+    const DenseTensor& input,
+    const DenseTensor& filter,
+    const std::vector<int>& strides,
+    const std::vector<int>& paddings_t,
+    const std::string& padding_algorithm,
+    int groups,
+    const std::vector<int>& dilations_t,
+    const std::string& data_format,
+    bool use_addto,
+    int workspace_size_MB,
+    bool exhaustive_search_t,
+    DenseTensor* out_grad_grad,
+    DenseTensor* input_grad,
+    DenseTensor* filter_grad) {
+  auto X = &input;
+  auto W = &filter;
+  auto dO = &out_grad;
+  auto ddX = input_grad_grad.get_ptr();
+  auto ddW = filter_grad_grad.get_ptr();
+
+  auto ddO = out_grad_grad;
+  auto dW = filter_grad;
+  auto dX = input_grad;
+  if (ddO) {
+    ddO->mutable_data<T>(ctx.GetPlace());
+    phi::funcs::SetConstant<Context, T> set_zero;
+    set_zero(ctx, ddO, static_cast<T>(0));
+  }
+  if (dW) {
+    dW->mutable_data<T>(ctx.GetPlace());
+  }
+  if (dX) {
+    dX->mutable_data<T>(ctx.GetPlace());
+  }
+
+  // const T* x = X->data<T>();
+  const T* dy = dO->data<T>();
+  const T* w = W->data<T>();
+
+  const T* ddx = nullptr;
+  const T* ddw = nullptr;
+  T *dw, *dx, *ddy;
+  dw = dx = ddy = nullptr;
+  T* transformed_dx = nullptr;
+  std::vector<int> dilations = dilations_t;
+
+  bool exhaustive_search = FLAGS_cudnn_exhaustive_search || exhaustive_search_t;
+  bool deterministic = FLAGS_cudnn_deterministic;
+  auto exhaustive_deterministic = exhaustive_search && deterministic;
+  PADDLE_ENFORCE_EQ(exhaustive_deterministic,
+                    false,
+                    phi::errors::InvalidArgument(
+                        "Cann't set exhaustive_search True and "
+                        "FLAGS_cudnn_deterministic True at same time."));
+
+  std::vector<int> paddings = paddings_t;
+
+  const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC");
+
+  // transform Tensors to channel first-----------
+  DenseTensor transformed_X_channel(X->type());
+  DenseTensor transformed_dO_channel(dO->type());
+  DenseTensor transformed_ddX_channel(X->type());
+
+  DenseTensor transformed_ddO_channel(dO->type());
+  DenseTensor transformed_dX_channel(X->type());
+
+  if (channel_last) {
+    ResizeToChannelFirst<Context, T>(ctx, X, &transformed_X_channel);
+    TransToChannelFirst<Context, T>(ctx, X, &transformed_X_channel);
+
+    ResizeToChannelFirst<Context, T>(ctx, dO, &transformed_dO_channel);
+    TransToChannelFirst<Context, T>(ctx, dO, &transformed_dO_channel);
+
+    if (ddX) {
+      ResizeToChannelFirst<Context, T>(ctx, ddX, &transformed_ddX_channel);
+      TransToChannelFirst<Context, T>(ctx, ddX, &transformed_ddX_channel);
+    }
+
+    if (ddO) {
+      ResizeToChannelFirst<Context, T>(ctx, ddO, &transformed_ddO_channel);
+    }
+    if (dX) {
+      ResizeToChannelFirst<Context, T>(ctx, dX, &transformed_dX_channel);
+      transformed_dX_channel.mutable_data<T>(ctx.GetPlace());
+    }
+
+  } else {
+    transformed_X_channel = *X;
+    transformed_dO_channel = *dO;
+    if (ddX) {
+      transformed_ddX_channel = *ddX;
+    }
+    if (ddO) {
+      transformed_ddO_channel.ShareDataWith(*ddO);
+    }
+    if (dX) {
+      transformed_dX_channel.ShareDataWith(*dX);
+    }
+  }
+
+  auto in_dims = transformed_X_channel.dims();
+  auto filter_dims = W->dims();
+  DDim in_data_dims = slice_ddim(in_dims, 2, in_dims.size());
+  DDim filter_data_dims = slice_ddim(filter_dims, 2, filter_dims.size());
+  std::vector<int> ksize = vectorize<int>(filter_data_dims);
+  UpdatePaddingAndDilation(
+      &paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize);
+
+  int data_dim = strides.size();  // 2d or 3d
+  bool is_sys_pad = funcs::IsSymmetricPadding(paddings, data_dim);
+  DenseTensor transformed_X(X->type());
+  DenseTensor transformed_ddX(X->type());
+
+  DenseTensor transformed_dX(X->type());
+
+  std::vector<int> padding_common(data_dim, 0);
+  std::vector<int> input_pad(X->dims().size() * 2, 0);
+
+  if (!is_sys_pad) {
+    // get pad
+    std::vector<int> padding_diff(data_dim);
+    std::vector<int> new_input_shape_vec(data_dim + 2);
+    new_input_shape_vec[0] = transformed_X_channel.dims()[0];
+    new_input_shape_vec[1] = transformed_X_channel.dims()[1];
+
+    for (size_t i = 0; i < data_dim; ++i) {
+      padding_diff[i] = std::abs(paddings[2 * i] - paddings[2 * i + 1]);
+      padding_common[i] = std::min(paddings[2 * i], paddings[2 * i + 1]);
+      new_input_shape_vec[i + 2] =
+          transformed_X_channel.dims()[i + 2] + padding_diff[i];
+      input_pad[2 * i + 4] = paddings[2 * i] - padding_common[i];
+      input_pad[2 * i + 4 + 1] = paddings[2 * i + 1] - padding_common[i];
+    }
+    DDim new_input_shape(make_ddim(new_input_shape_vec));
+    transformed_X.Resize(new_input_shape);
+    transformed_ddX.Resize(new_input_shape);
+    transformed_dX.Resize(new_input_shape);
+
+    transformed_X.mutable_data<T>(ctx.GetPlace());
+
+    if (ddX) {
+      transformed_ddX.mutable_data<T>(ctx.GetPlace());
+    }
+    if (dX) {
+      transformed_dX.mutable_data<T>(ctx.GetPlace());
+    }
+
+    // pad for input
+    const int rank = X->dims().size();
+    T pad_value(0.0);
+    switch (rank) {
+      case 4: {
+        funcs::PadFunction<Context, T, 4>(
+            ctx, input_pad, transformed_X_channel, pad_value, &transformed_X);
+        if (ddX) {
+          funcs::PadFunction<Context, T, 4>(ctx,
+                                            input_pad,
+                                            transformed_ddX_channel,
+                                            pad_value,
+                                            &transformed_ddX);
+        }
+      } break;
+      case 5: {
+        funcs::PadFunction<Context, T, 5>(
+            ctx, input_pad, transformed_X_channel, pad_value, &transformed_X);
+        if (ddX) {
+          funcs::PadFunction<Context, T, 5>(ctx,
+                                            input_pad,
+                                            transformed_ddX_channel,
+                                            pad_value,
+                                            &transformed_ddX);
+        }
+      } break;
+      default:
+        PADDLE_THROW(phi::errors::InvalidArgument(
+            "ConvOp only support tensors with 4 or 5 dimensions."));
+    }
+
+  } else {
+    transformed_X.ShareDataWith(transformed_X_channel);
+    if (ddX) {
+      transformed_ddX.ShareDataWith(transformed_ddX_channel);
+    }
+    if (dX) {
+      transformed_dX.ShareDataWith(transformed_dX_channel);
+    }
+
+    if (paddings.size() == data_dim) {
+      for (size_t i = 0; i < data_dim; ++i) {
+        padding_common[i] = paddings[i];
+      }
+    } else {
+      for (size_t i = 0; i < data_dim; ++i) {
+        padding_common[i] = paddings[2 * i];
+      }
+    }
+  }
+
+  const T* x = transformed_X.data<T>();
+
+  int iwo_group = groups;
+  int c_group = 1;
+#if defined(PADDLE_WITH_HIP) || CUDNN_VERSION_MIN(7, 0, 1)
+  iwo_group = 1;
+  c_group = groups;
+  groups = 1;
+#endif
+  auto dtype = paddle::platform::CudnnDataType<T>::type;
+
+  auto handle = ctx.cudnn_handle();
+
+  paddle::operators::ConvArgs args1{&transformed_ddX,
+                                    W,
+                                    &transformed_ddO_channel,
+                                    strides,
+                                    padding_common,
+                                    dilations,
+                                    dtype};
+  paddle::operators::ConvArgs args2{&transformed_X,
+                                    ddW,
+                                    &transformed_ddO_channel,
+                                    strides,
+                                    padding_common,
+                                    dilations,
+                                    dtype};
+  paddle::operators::ConvArgs args3{&transformed_ddX,
+                                    dW,
+                                    &transformed_dO_channel,
+                                    strides,
+                                    padding_common,
+                                    dilations,
+                                    dtype};
+  paddle::operators::ConvArgs args4{&transformed_dX,
+                                    ddW,
+                                    &transformed_dO_channel,
+                                    strides,
+                                    padding_common,
+                                    dilations,
+                                    dtype};
+
+#ifdef PADDLE_WITH_HIP
+  miopenConvFwdAlgorithm_t fwd_algo1 = static_cast<miopenConvFwdAlgorithm_t>(0);
+  miopenConvFwdAlgorithm_t fwd_algo2 = static_cast<miopenConvFwdAlgorithm_t>(0);
+  miopenConvBwdDataAlgorithm_t data_algo =
+      static_cast<miopenConvBwdDataAlgorithm_t>(0);
+  miopenConvBwdWeightsAlgorithm_t filter_algo =
+      static_cast<miopenConvBwdWeightsAlgorithm_t>(0);
+#else
+  cudnnConvolutionFwdAlgo_t fwd_algo1 =
+      static_cast<cudnnConvolutionFwdAlgo_t>(0);
+  cudnnConvolutionFwdAlgo_t fwd_algo2 =
+      static_cast<cudnnConvolutionFwdAlgo_t>(0);
+  cudnnConvolutionBwdDataAlgo_t data_algo =
+      static_cast<cudnnConvolutionBwdDataAlgo_t>(0);
+  cudnnConvolutionBwdFilterAlgo_t filter_algo =
+      static_cast<cudnnConvolutionBwdFilterAlgo_t>(0);
+#endif
+
+  auto layout = paddle::platform::GetCudnnTensorFormat(
+      paddle::platform::DataLayout::kNCHW);
+
+  // ddo = conv(ddI, W) + conv(I, ddW)
+  size_t workspace_size = 0;
+
+  T* transformed_ddy_channel = nullptr;
+  if (ddO) {
+    ddy = ddO->data<T>();
+    transformed_ddy_channel = transformed_ddO_channel.data<T>();
+    if (ddX) {
+      args1.handle = handle;
+      args1.idesc.set(transformed_ddX, iwo_group);
+      args1.wdesc.set(*W, layout, iwo_group);
+      args1.odesc.set(transformed_ddO_channel, iwo_group);
+      args1.cdesc.set(dtype,
+                      padding_common,
+                      strides,
+                      dilations,
+                      paddle::platform::AllowTF32Cudnn(),
+                      c_group);
+
+#ifdef PADDLE_WITH_HIP
+      using search1 =
+          paddle::operators::SearchAlgorithm<miopenConvFwdAlgorithm_t>;
+      workspace_size = search1::GetWorkspaceSize(args1);
+      fwd_algo1 = search1::Find<T>(
+          args1, exhaustive_search, false, workspace_size, ctx);
+#else
+      using search1 =
+          paddle::operators::SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>;
+      fwd_algo1 = search1::Find<T>(args1, exhaustive_search, false, ctx);
+      workspace_size = search1::GetWorkspaceSize(args1, fwd_algo1);
+#endif
+    }
+
+    if (ddW) {
+      ddw = ddW->data<T>();
+      args2.handle = handle;
+      args2.idesc.set(transformed_X, iwo_group);
+      args2.wdesc.set(*ddW, layout, iwo_group);
+      args2.odesc.set(transformed_ddO_channel, iwo_group);
+      args2.cdesc.set(dtype,
+                      padding_common,
+                      strides,
+                      dilations,
+                      paddle::platform::AllowTF32Cudnn(),
+                      c_group);
+
+#ifdef PADDLE_WITH_HIP
+      using search2 =
+          paddle::operators::SearchAlgorithm<miopenConvFwdAlgorithm_t>;
+      workspace_size =
+          std::max(workspace_size, search2::GetWorkspaceSize(args2));
+      fwd_algo2 = search2::Find<T>(
+          args2, exhaustive_search, false, workspace_size, ctx);
+#else
+      using search2 =
+          paddle::operators::SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>;
+      fwd_algo2 = search2::Find<T>(args2, exhaustive_search, false, ctx);
+      workspace_size =
+          std::max(workspace_size, search2::GetWorkspaceSize(args2, fwd_algo2));
+#endif
+    }
+  }
+
+  if (dW && ddX) {
+    dw = dW->data<T>();
+    args3.handle = handle;
+    args3.idesc.set(transformed_ddX, iwo_group);
+    args3.wdesc.set(*dW, layout, iwo_group);
+    args3.odesc.set(transformed_dO_channel, iwo_group);
+    args3.cdesc.set(dtype,
+                    padding_common,
+                    strides,
+                    dilations,
+                    paddle::platform::AllowTF32Cudnn(),
+                    c_group);
+
+#ifdef PADDLE_WITH_HIP
+    using search3 =
+        paddle::operators::SearchAlgorithm<miopenConvBwdWeightsAlgorithm_t>;
+    workspace_size = std::max(workspace_size, search3::GetWorkspaceSize(args3));
+    filter_algo = search3::Find<T>(
+        args3, exhaustive_search, deterministic, workspace_size, ctx);
+#else
+    using search3 =
+        paddle::operators::SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t>;
+    filter_algo =
+        search3::Find<T>(args3, exhaustive_search, deterministic, ctx);
+    workspace_size =
+        std::max(workspace_size, search3::GetWorkspaceSize(args3, filter_algo));
+#endif
+  }
+
+  if (ddW && dX) {
+    transformed_dx = transformed_dX.data<T>();
+
+    args4.handle = handle;
+    args4.idesc.set(transformed_dX, iwo_group);
+    args4.wdesc.set(*ddW, layout, iwo_group);
+    args4.odesc.set(transformed_dO_channel, iwo_group);
+    args4.cdesc.set(dtype,
+                    padding_common,
+                    strides,
+                    dilations,
+                    paddle::platform::AllowTF32Cudnn(),
+                    c_group);
+
+#ifdef PADDLE_WITH_HIP
+    using search4 =
+        paddle::operators::SearchAlgorithm<miopenConvBwdDataAlgorithm_t>;
+    workspace_size = std::max(workspace_size, search4::GetWorkspaceSize(args4));
+    data_algo = search4::Find<T>(
+        args4, exhaustive_search, deterministic, workspace_size, ctx);
+#else
+    using search4 =
+        paddle::operators::SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>;
+    data_algo = search4::Find<T>(args4, exhaustive_search, deterministic, ctx);
+    workspace_size =
+        std::max(workspace_size, search4::GetWorkspaceSize(args4, data_algo));
+#endif
+  }
+
+  int i_n, i_c, i_d, i_h, i_w;
+  GetNCDHW(
+      transformed_X.dims(), DataLayout::kNCHW, &i_n, &i_c, &i_d, &i_h, &i_w);
+
+  int o_n, o_c, o_d, o_h, o_w;
+  GetNCDHW(transformed_dO_channel.dims(),
+           DataLayout::kNCHW,
+           &o_n,
+           &o_c,
+           &o_d,
+           &o_h,
+           &o_w);
+
+  int group_offset_in = i_c / groups * i_h * i_w * i_d;
+  int group_offset_out = o_c / groups * o_h * o_w * o_d;
+  int group_offset_filter = W->numel() / groups;
+
+  paddle::operators::ScalingParamType<T> alpha = 1.0f;
+  paddle::operators::ScalingParamType<T> beta = 0.0f;
+
+  // NOTE(zhiqiu): inplace addto is not supportted in double grad yet.
+  // ScalingParamType<T> beta = ctx.Attr<bool>("use_addto") ? 1.0f :
+  // 0.0f;
+  // VLOG(4) << "Conv_grad_grad: use_addto = " << ctx.Attr<bool>("use_addto");
+  auto wkspace_handle = ctx.cudnn_workspace_handle();
+
+  if (ddO) {
+    if (ddX) {
+      ddx = transformed_ddX.data<T>();
+#ifdef PADDLE_WITH_HIP
+      wkspace_handle.RunFunc(
+          [&](void* workspace_ptr) {
+            PADDLE_ENFORCE_GPU_SUCCESS(
+                paddle::platform::dynload::miopenConvolutionForward(
+                    handle,
+                    &alpha,
+                    args1.idesc.desc(),
+                    ddx,
+                    args1.wdesc.desc(),
+                    w,
+                    args1.cdesc.desc(),
+                    fwd_algo1,
+                    &beta,
+                    args1.odesc.desc(),
+                    transformed_ddy_channel,
+                    workspace_ptr,
+                    workspace_size));
+          },
+          workspace_size);
+#else
+      for (int i = 0; i < groups; i++) {
+        wkspace_handle.RunFunc(
+            [&](void* workspace_ptr) {
+              PADDLE_ENFORCE_GPU_SUCCESS(
+                  paddle::platform::dynload::cudnnConvolutionForward(
+                      handle,
+                      &alpha,
+                      args1.idesc.desc(),
+                      ddx + i * group_offset_in,
+                      args1.wdesc.desc(),
+                      w + i * group_offset_filter,
+                      args1.cdesc.desc(),
+                      fwd_algo1,
+                      workspace_ptr,
+                      workspace_size,
+                      &beta,
+                      args1.odesc.desc(),
+                      transformed_ddy_channel + i * group_offset_out));
+            },
+            workspace_size);
+      }
+#endif
+    }
+    if (ddW) {
+#ifdef PADDLE_WITH_HIP
+      // MIOPEN ONLY support beta to be 0.0f
+      wkspace_handle.RunFunc(
+          [&](void* workspace_ptr) {
+            PADDLE_ENFORCE_GPU_SUCCESS(
+                paddle::platform::dynload::miopenConvolutionForward(
+                    handle,
+                    &alpha,
+                    args2.idesc.desc(),
+                    x,
+                    args2.wdesc.desc(),
+                    ddw,
+                    args2.cdesc.desc(),
+                    fwd_algo2,
+                    &beta,
+                    args2.odesc.desc(),
+                    transformed_ddy_channel,
+                    workspace_ptr,
+                    workspace_size));
+          },
+          workspace_size);
+#else
+      for (int i = 0; i < groups; i++) {
+        wkspace_handle.RunFunc(
+            [&](void* workspace_ptr) {
+              PADDLE_ENFORCE_GPU_SUCCESS(
+                  paddle::platform::dynload::cudnnConvolutionForward(
+                      handle,
+                      &alpha,
+                      args2.idesc.desc(),
+                      x + i * group_offset_in,
+                      args2.wdesc.desc(),
+                      ddw + i * group_offset_filter,
+                      args2.cdesc.desc(),
+                      fwd_algo2,
+                      workspace_ptr,
+                      workspace_size,
+                      &alpha,
+                      args2.odesc.desc(),
+                      transformed_ddy_channel + i * group_offset_out));
+            },
+            workspace_size);
+      }
+#endif
+    }
+    if (channel_last) {
+      TransToChannelLast<Context, T>(ctx, &transformed_ddO_channel, ddO);
+    }
+  }
+  T* transformed_dy_channel = transformed_dO_channel.data<T>();
+  if (dW && ddX) {
+    ddx = transformed_ddX.data<T>();
+#ifdef PADDLE_WITH_HIP
+    wkspace_handle.RunFunc(
+        [&](void* workspace_ptr) {
+          PADDLE_ENFORCE_GPU_SUCCESS(
+              paddle::platform::dynload::miopenConvolutionBackwardWeights(
+                  handle,
+                  &alpha,
+                  args3.odesc.desc(),
+                  transformed_dy_channel,
+                  args3.idesc.desc(),
+                  ddx,
+                  args3.cdesc.desc(),
+                  filter_algo,
+                  &beta,
+                  args3.wdesc.desc(),
+                  dw,
+                  workspace_ptr,
+                  workspace_size));
+        },
+        workspace_size);
+#else
+    for (int i = 0; i < groups; i++) {
+      wkspace_handle.RunFunc(
+          [&](void* workspace_ptr) {
+            PADDLE_ENFORCE_GPU_SUCCESS(
+                paddle::platform::dynload::cudnnConvolutionBackwardFilter(
+                    handle,
+                    &alpha,
+                    args3.idesc.desc(),
+                    ddx + i * group_offset_in,
+                    args3.odesc.desc(),
+                    transformed_dy_channel + i * group_offset_out,
+                    args3.cdesc.desc(),
+                    filter_algo,
+                    workspace_ptr,
+                    workspace_size,
+                    &beta,
+                    args3.wdesc.desc(),
+                    dw + i * group_offset_filter));
+          },
+          workspace_size);
+    }
+#endif
+  }
+
+  if (dX && ddW) {
+    ddw = ddW->data<T>();
+#ifdef PADDLE_WITH_HIP
+    wkspace_handle.RunFunc(
+        [&](void* workspace_ptr) {
+          PADDLE_ENFORCE_GPU_SUCCESS(
+              paddle::platform::dynload::miopenConvolutionBackwardData(
+                  handle,
+                  &alpha,
+                  args4.odesc.desc(),
+                  transformed_dy_channel,
+                  args4.wdesc.desc(),
+                  ddw,
+                  args4.cdesc.desc(),
+                  data_algo,
+                  &beta,
+                  args4.idesc.desc(),
+                  transformed_dx,
+                  workspace_ptr,
+                  workspace_size));
+        },
+        workspace_size);
+#else
+    for (int i = 0; i < groups; i++) {
+      wkspace_handle.RunFunc(
+          [&](void* workspace_ptr) {
+            PADDLE_ENFORCE_GPU_SUCCESS(
+                paddle::platform::dynload::cudnnConvolutionBackwardData(
+                    handle,
+                    &alpha,
+                    args4.wdesc.desc(),
+                    ddw + i * group_offset_filter,
+                    args4.odesc.desc(),
+                    transformed_dy_channel + i * group_offset_out,
+                    args4.cdesc.desc(),
+                    data_algo,
+                    workspace_ptr,
+                    workspace_size,
+                    &beta,
+                    args4.idesc.desc(),
+                    transformed_dx + i * group_offset_in));
+          },
+          workspace_size);
+    }
+#endif
+
+    if (!is_sys_pad) {
+      // reverse padded input
+      std::vector<int> starts(X->dims().size(), 0);
+      std::vector<int> axes(X->dims().size(), 0);
+
+      for (size_t i = 0; i < X->dims().size(); ++i) {
+        starts[i] = input_pad[2 * i];
+        axes[i] = i;
+      }
+      if (X->dims().size() == 4) {
+        paddle::operators::RemovePaddingSlice<Context, T, 4>(
+            ctx, &transformed_dX, &transformed_dX_channel, starts, axes);
+      } else {
+        paddle::operators::RemovePaddingSlice<Context, T, 5>(
+            ctx, &transformed_dX, &transformed_dX_channel, starts, axes);
+      }
+    }
+    if (channel_last) {
+      TransToChannelLast<Context, T>(ctx, &transformed_dX_channel, dX);
+    }
+  }
+}
+
+template <typename T, typename Context>
+void DepthwiseConvCudnnGradGradKernel(
+    const Context& ctx,
+    paddle::optional<const DenseTensor&> input_grad_grad,
+    paddle::optional<const DenseTensor&> filter_grad_grad,
+    const DenseTensor& out_grad,
+    const DenseTensor& input,
+    const DenseTensor& filter,
+    const std::vector<int>& strides,
+    const std::vector<int>& paddings_t,
+    const std::string& padding_algorithm,
+    int groups,
+    const std::vector<int>& dilations_t,
+    const std::string& data_format,
+    bool use_addto,
+    int workspace_size_MB,
+    bool exhaustive_search_t,
+    bool fuse_relu,
+    DenseTensor* out_grad_grad,
+    DenseTensor* input_grad,
+    DenseTensor* filter_grad) {
+  ConvCudnnGradGradKernel<T>(ctx,
+                             input_grad_grad,
+                             filter_grad_grad,
+                             out_grad,
+                             input,
+                             filter,
+                             strides,
+                             paddings_t,
+                             padding_algorithm,
+                             groups,
+                             dilations_t,
+                             data_format,
+                             use_addto,
+                             workspace_size_MB,
+                             exhaustive_search_t,
+                             out_grad_grad,
+                             input_grad,
+                             filter_grad);
+}
+
+template <typename T, typename Context>
+void Conv3DCudnnGradGradKernel(
+    const Context& ctx,
+    paddle::optional<const DenseTensor&> input_grad_grad,
+    paddle::optional<const DenseTensor&> filter_grad_grad,
+    const DenseTensor& out_grad,
+    const DenseTensor& input,
+    const DenseTensor& filter,
+    const std::vector<int>& strides,
+    const std::vector<int>& paddings_t,
+    const std::string& padding_algorithm,
+    int groups,
+    const std::vector<int>& dilations_t,
+    const std::string& data_format,
+    bool use_addto,
+    int workspace_size_MB,
+    bool exhaustive_search_t,
+    DenseTensor* out_grad_grad,
+    DenseTensor* input_grad,
+    DenseTensor* filter_grad) {
+  ConvCudnnGradGradKernel<T>(ctx,
+                             input_grad_grad,
+                             filter_grad_grad,
+                             out_grad,
+                             input,
+                             filter,
+                             strides,
+                             paddings_t,
+                             padding_algorithm,
+                             groups,
+                             dilations_t,
+                             data_format,
+                             use_addto,
+                             workspace_size_MB,
+                             exhaustive_search_t,
+                             out_grad_grad,
+                             input_grad,
+                             filter_grad);
+}
+
+}  // namespace phi
+
+#ifdef PADDLE_WITH_HIP
+PD_REGISTER_KERNEL(conv2d_grad_grad,
+                   GPUDNN,
+                   ALL_LAYOUT,
+                   phi::ConvCudnnGradGradKernel,
+                   float,
+                   phi::dtype::float16) {}
+
+PD_REGISTER_KERNEL(conv3d_grad_grad,
+                   GPUDNN,
+                   ALL_LAYOUT,
+                   phi::Conv3DCudnnGradGradKernel,
+                   float,
+                   phi::dtype::float16) {}
+
+PD_REGISTER_KERNEL(depthwise_conv2d_grad_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::DepthwiseConvCudnnGradGradKernel,
+                   float,
+                   phi::dtype::float16) {}
+#else
+#if CUDNN_VERSION_MIN(8, 1, 0)
+PD_REGISTER_KERNEL(conv2d_grad_grad,
+                   GPUDNN,
+                   ALL_LAYOUT,
+                   phi::ConvCudnnGradGradKernel,
+                   float,
+                   double,
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {}
+
+PD_REGISTER_KERNEL(conv3d_grad_grad,
+                   GPUDNN,
+                   ALL_LAYOUT,
+                   phi::Conv3DCudnnGradGradKernel,
+                   float,
+                   double,
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {}
+
+PD_REGISTER_KERNEL(depthwise_conv2d_grad_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::DepthwiseConvCudnnGradGradKernel,
+                   float,
+                   double,
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {}
+
+#else
+
+PD_REGISTER_KERNEL(conv2d_grad_grad,
+                   GPUDNN,
+                   ALL_LAYOUT,
+                   phi::ConvCudnnGradGradKernel,
+                   float,
+                   double,
+                   phi::dtype::float16) {}
+
+PD_REGISTER_KERNEL(conv3d_grad_grad,
+                   GPUDNN,
+                   ALL_LAYOUT,
+                   phi::Conv3DCudnnGradGradKernel,
+                   float,
+                   double,
+                   phi::dtype::float16) {}
+
+PD_REGISTER_KERNEL(depthwise_conv2d_grad_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::DepthwiseConvCudnnGradGradKernel,
+                   float,
+                   double,
+                   phi::dtype::float16) {}
+
+#endif
+
+#endif
diff --git a/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu b/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..64148e902fdb2123aa3f81846999b5d90f356cd6
--- /dev/null
+++ b/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu
@@ -0,0 +1,683 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/conv_grad_kernel.h"
+
+#include "paddle/phi/core/dense_tensor.h"
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+#include "paddle/fluid/framework/eigen.h"
+#ifdef PADDLE_WITH_HIP
+#include "paddle/fluid/operators/conv_miopen_helper.h"
+#else
+#include "paddle/fluid/operators/conv_cudnn_helper.h"
+#endif
+
+#include "paddle/fluid/platform/cudnn_workspace_helper.h"
+#include "paddle/fluid/platform/float16.h"
+#include "paddle/fluid/platform/profiler.h"
+#include "paddle/phi/kernels/funcs/padding.h"
+
+#include "paddle/phi/kernels/cpu/conv_util.h"
+#include "paddle/phi/kernels/funcs/batch_norm_utils.h"
+
+#include "paddle/phi/kernels/impl/conv_cudnn_impl.h"
+
+#include "paddle/phi/common/bfloat16.h"
+#include "paddle/phi/common/float16.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void ConvCudnnGradKernel(const Context& ctx,
+                         const DenseTensor& output_grad,
+                         const DenseTensor& input,
+                         const DenseTensor& filter,
+                         const std::vector<int>& strides_t,
+                         const std::vector<int>& paddings_t,
+                         const std::string& padding_algorithm,
+                         int groups,
+                         const std::vector<int>& dilations_t,
+                         const std::string& data_format,
+                         bool use_addto,
+                         int workspace_size_MB,
+                         bool exhaustive_search_t,
+                         DenseTensor* input_grad,
+                         DenseTensor* filter_grad) {
+  if (input_grad) {
+    input_grad->mutable_data<T>(ctx.GetPlace());
+  }
+  if (filter_grad) {
+    filter_grad->mutable_data<T>(ctx.GetPlace());
+  }
+
+  std::vector<int> dilations = dilations_t;
+  std::vector<int> strides = strides_t;
+  std::vector<int> paddings = paddings_t;
+
+  bool exhaustive_search = FLAGS_cudnn_exhaustive_search || exhaustive_search_t;
+  bool deterministic = FLAGS_cudnn_deterministic;
+  auto exhaustive_deterministic = exhaustive_search && deterministic;
+  PADDLE_ENFORCE_EQ(exhaustive_deterministic,
+                    false,
+                    phi::errors::InvalidArgument(
+                        "Cann't set exhaustive_search True and "
+                        "FLAGS_cudnn_deterministic True at same time."));
+
+  const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC");
+
+  auto dtype = paddle::platform::CudnnDataType<T>::type;
+
+#ifdef PADDLE_WITH_HIP
+  // HIP MIOPEN ONLY SUPPORT NCHW format
+  auto compute_format = paddle::platform::DataLayout::kNCHW;
+#else
+  const bool compute_in_nhwc = dtype == CUDNN_DATA_HALF && IsVoltaOrLater(ctx);
+  auto compute_format = compute_in_nhwc && channel_last
+                            ? paddle::platform::DataLayout::kNHWC
+                            : paddle::platform::DataLayout::kNCHW;
+#endif
+  VLOG(3) << "Compute ConvGradOp with cuDNN:"
+          << " data_format=" << data_format << " compute_format="
+          << (compute_format == paddle::platform::DataLayout::kNHWC ? "NHWC"
+                                                                    : "NCHW");
+
+  // transform Tensor
+  DenseTensor transformed_input_channel(input.type());
+  DenseTensor transformed_output_grad_channel(output_grad.type());
+  DenseTensor transformed_input_grad_channel(input.type());
+  DenseTensor transformed_filter_channel(filter.type());
+  DenseTensor transformed_filter_grad_channel(filter.type());
+
+  if (channel_last && compute_format == paddle::platform::DataLayout::kNCHW) {
+    VLOG(3) << "Transform input, output_grad, input_grad and tensor from "
+               "NHWC to NCHW.";
+    ResizeToChannelFirst<Context, T>(ctx, &input, &transformed_input_channel);
+    TransToChannelFirst<Context, T>(ctx, &input, &transformed_input_channel);
+
+    ResizeToChannelFirst<Context, T>(
+        ctx, &output_grad, &transformed_output_grad_channel);
+    TransToChannelFirst<Context, T>(
+        ctx, &output_grad, &transformed_output_grad_channel);
+
+    if (input_grad) {
+      ResizeToChannelFirst<Context, T>(
+          ctx, input_grad, &transformed_input_grad_channel);
+      // NOTE(zhiqiu): If inplace_addto strategy is enabled, we need to copy
+      // the data of input_grad to transformed_input_grad_channel.
+      if (use_addto) {
+        TransToChannelFirst<Context, T>(
+            ctx, input_grad, &transformed_input_grad_channel);
+      }
+    }
+  } else {
+    transformed_input_channel.ShareDataWith(input);
+    transformed_output_grad_channel.ShareDataWith(output_grad);
+    if (input_grad) {
+      transformed_input_grad_channel.ShareDataWith(*input_grad);
+    }
+  }
+
+  if (compute_format == paddle::platform::DataLayout::kNHWC) {
+    VLOG(3) << "Transform filter and filter_grad tensor from NCHW to NHWC.";
+    ResizeToChannelLast<Context, T>(ctx, &filter, &transformed_filter_channel);
+    TransToChannelLast<Context, T>(ctx, &filter, &transformed_filter_channel);
+
+    if (filter_grad) {
+      ResizeToChannelLast<Context, T>(
+          ctx, filter_grad, &transformed_filter_grad_channel);
+    }
+  } else {
+    transformed_filter_channel.ShareDataWith(filter);
+    if (filter_grad) {
+      transformed_filter_grad_channel.ShareDataWith(*filter_grad);
+    }
+  }
+
+  //  update paddings
+  auto in_dims = transformed_input_channel.dims();
+  auto filter_dims = transformed_filter_channel.dims();
+  DDim in_data_dims;
+  DDim filter_data_dims;
+  if (compute_format == paddle::platform::DataLayout::kNCHW) {
+    in_data_dims = slice_ddim(in_dims, 2, in_dims.size());
+    filter_data_dims = slice_ddim(filter_dims, 2, filter_dims.size());
+  } else {
+    in_data_dims = slice_ddim(in_dims, 1, in_dims.size() - 1);
+    filter_data_dims = slice_ddim(filter_dims, 1, filter_dims.size() - 1);
+  }
+  std::vector<int> ksize = vectorize<int>(filter_data_dims);
+  UpdatePaddingAndDilation(
+      &paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize);
+
+  // cuDNN only supports padding the same amount on every dimension.
+  // So we create a new padded input tensor.
+  int data_dim = strides.size();  // 2d or 3d
+  bool is_sys_pad = funcs::IsSymmetricPadding(paddings, data_dim);
+  Tensor transformed_input(input.type());
+  Tensor transformed_input_grad(input.type());
+  std::vector<int> padding_common(data_dim, 0);
+  std::vector<int> input_pad(transformed_input_channel.dims().size() * 2, 0);
+
+  if (!is_sys_pad) {
+    // get pad
+    std::vector<int> padding_diff(data_dim);
+    std::vector<int> new_input_shape_vec(data_dim + 2);
+    new_input_shape_vec[0] = transformed_input_channel.dims()[0];
+    if (compute_format == paddle::platform::DataLayout::kNCHW) {
+      new_input_shape_vec[1] = transformed_input_channel.dims()[1];
+    } else {
+      new_input_shape_vec[data_dim + 1] =
+          transformed_input_channel.dims()[data_dim + 1];
+    }
+
+    for (size_t i = 0; i < data_dim; ++i) {
+      padding_diff[i] = std::abs(paddings[2 * i] - paddings[2 * i + 1]);
+      padding_common[i] = std::min(paddings[2 * i], paddings[2 * i + 1]);
+      if (compute_format == paddle::platform::DataLayout::kNCHW) {
+        new_input_shape_vec[i + 2] =
+            transformed_input_channel.dims()[i + 2] + padding_diff[i];
+      } else {
+        new_input_shape_vec[i + 1] =
+            transformed_input_channel.dims()[i + 1] + padding_diff[i];
+      }
+      if (compute_format == paddle::platform::DataLayout::kNCHW) {
+        input_pad[2 * i + 4] = paddings[2 * i] - padding_common[i];
+        input_pad[2 * i + 4 + 1] = paddings[2 * i + 1] - padding_common[i];
+      } else {
+        input_pad[2 * i + 2] = paddings[2 * i] - padding_common[i];
+        input_pad[2 * i + 2 + 1] = paddings[2 * i + 1] - padding_common[i];
+      }
+    }
+    DDim new_input_shape(make_ddim(new_input_shape_vec));
+    transformed_input.Resize(new_input_shape);
+    transformed_input.mutable_data<T>(ctx.GetPlace());
+
+    transformed_input_grad.Resize(new_input_shape);
+
+    if (input_grad) {
+      transformed_input_grad.mutable_data<T>(ctx.GetPlace());
+    }
+    // pad for input
+    const int rank = transformed_input_channel.dims().size();
+    T pad_value(0.0);
+    switch (rank) {
+      case 4: {
+        funcs::PadFunction<Context, T, 4>(ctx,
+                                          input_pad,
+                                          transformed_input_channel,
+                                          pad_value,
+                                          &transformed_input);
+      } break;
+      case 5: {
+        funcs::PadFunction<Context, T, 5>(ctx,
+                                          input_pad,
+                                          transformed_input_channel,
+                                          pad_value,
+                                          &transformed_input);
+      } break;
+      default:
+        PADDLE_THROW(phi::errors::InvalidArgument(
+            "ConvOp only support tensors with 4 or 5 dimensions."));
+    }
+  } else {
+    transformed_input.ShareDataWith(transformed_input_channel);
+    if (input_grad) {
+      transformed_input_grad.ShareDataWith(transformed_input_grad_channel);
+    }
+    if (paddings.size() == data_dim) {
+      for (size_t i = 0; i < data_dim; ++i) {
+        padding_common[i] = paddings[i];
+      }
+    } else {
+      for (size_t i = 0; i < data_dim; ++i) {
+        padding_common[i] = paddings[2 * i];
+      }
+    }
+  }
+
+  const T* input_data = transformed_input.data<T>();
+  const T* output_grad_data = transformed_output_grad_channel.data<T>();
+  const T* filter_data = transformed_filter_channel.data<T>();
+  T* filter_grad_data = nullptr;
+  T* input_grad_data = nullptr;
+  T* transformed_input_grad_data = nullptr;
+
+  paddle::operators::ConvArgs args1{&transformed_input_grad,
+                                    &transformed_filter_channel,
+                                    &transformed_output_grad_channel,
+                                    strides,
+                                    padding_common,
+                                    dilations,
+                                    dtype};
+  paddle::operators::ConvArgs args2{&transformed_input,
+                                    &transformed_filter_grad_channel,
+                                    &transformed_output_grad_channel,
+                                    strides,
+                                    padding_common,
+                                    dilations,
+                                    dtype};
+
+  auto handle = ctx.cudnn_handle();
+  // TODO(phlrain): replace paddle::platform::DataLaytout to phi::DataLayout
+  paddle::platform::DataLayout layout =
+      compute_format == paddle::platform::DataLayout::kNHWC
+          ? paddle::platform::DataLayout::kNHWC
+          : paddle::platform::DataLayout::kNCHW;
+  if (transformed_input.dims().size() == 5) {
+    layout = compute_format == paddle::platform::DataLayout::kNHWC
+                 ? paddle::platform::DataLayout::kNDHWC
+                 : paddle::platform::DataLayout::kNCDHW;
+  }
+  auto layout_tensor = paddle::platform::GetCudnnTensorFormat(layout);
+  auto workspace_handle = ctx.cudnn_workspace_handle();
+
+  int i_n, i_c, i_d, i_h, i_w;
+  int o_n, o_c, o_d, o_h, o_w;
+  if (compute_format == paddle::platform::DataLayout::kNHWC) {
+    paddle::operators::GetNCDHW(transformed_input.dims(),
+                                paddle::platform::DataLayout::kNHWC,
+                                &i_n,
+                                &i_c,
+                                &i_d,
+                                &i_h,
+                                &i_w);
+    paddle::operators::GetNCDHW(transformed_output_grad_channel.dims(),
+                                paddle::platform::DataLayout::kNHWC,
+                                &o_n,
+                                &o_c,
+                                &o_d,
+                                &o_h,
+                                &o_w);
+  } else {
+    paddle::operators::GetNCDHW(transformed_input.dims(),
+                                paddle::platform::DataLayout::kNCHW,
+                                &i_n,
+                                &i_c,
+                                &i_d,
+                                &i_h,
+                                &i_w);
+    paddle::operators::GetNCDHW(transformed_output_grad_channel.dims(),
+                                paddle::platform::DataLayout::kNCHW,
+                                &o_n,
+                                &o_c,
+                                &o_d,
+                                &o_h,
+                                &o_w);
+  }
+
+  int group_offset_in = i_c / groups * i_h * i_w * i_d;
+  int group_offset_out = o_c / groups * o_h * o_w * o_d;
+  int group_offset_filter = transformed_filter_channel.numel() / groups;
+// ------------------- cudnn backward algorithm ---------------------
+#ifdef PADDLE_WITH_HIP
+  miopenConvBwdDataAlgorithm_t data_algo =
+      static_cast<miopenConvBwdDataAlgorithm_t>(0);
+  miopenConvBwdWeightsAlgorithm_t filter_algo =
+      static_cast<miopenConvBwdWeightsAlgorithm_t>(0);
+#else
+  cudnnConvolutionBwdDataAlgo_t data_algo =
+      static_cast<cudnnConvolutionBwdDataAlgo_t>(0);
+  cudnnConvolutionBwdFilterAlgo_t filter_algo =
+      static_cast<cudnnConvolutionBwdFilterAlgo_t>(0);
+#endif
+  // input data workspace_size
+  size_t workspace_size_d = 0;
+  // weight workspace_size
+  size_t workspace_size_w = 0;
+  int iwo_groups = groups;
+  int c_groups = 1;
+
+#if defined(PADDLE_WITH_HIP) || CUDNN_VERSION_MIN(7, 0, 1)
+  iwo_groups = 1;
+  c_groups = groups;
+  groups = 1;
+#endif
+
+  if (input_grad) {
+    // ------------------- cudnn descriptors ---------------------
+    input_grad_data = input_grad->data<T>();
+    transformed_input_grad_data = transformed_input_grad.data<T>();
+
+    args1.handle = handle;
+    args1.idesc.set(transformed_input_grad, layout_tensor);
+    args1.wdesc.set(transformed_filter_channel, layout_tensor, iwo_groups);
+    args1.odesc.set(transformed_output_grad_channel, layout_tensor);
+    args1.cdesc.set(dtype,
+                    padding_common,
+                    strides,
+                    dilations,
+                    paddle::platform::AllowTF32Cudnn(),
+                    c_groups);
+
+#ifdef PADDLE_WITH_HIP
+    using search1 =
+        paddle::operators::SearchAlgorithm<miopenConvBwdDataAlgorithm_t>;
+    workspace_size_d =
+        std::max(workspace_size_d, search1::GetWorkspaceSize(args1));
+    data_algo = search1::Find<T>(
+        args1, exhaustive_search, deterministic, workspace_size_d, ctx);
+#else
+    using search1 =
+        paddle::operators::SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>;
+    data_algo = search1::Find<T>(args1, exhaustive_search, deterministic, ctx);
+    workspace_size_d =
+        std::max(workspace_size_d, search1::GetWorkspaceSize(args1, data_algo));
+#endif
+  }
+
+  if (filter_grad) {
+    // ------------------- cudnn descriptors ---------------------
+    filter_grad_data = transformed_filter_grad_channel.data<T>();
+    args2.handle = handle;
+    args2.idesc.set(transformed_input, layout_tensor);
+    args2.wdesc.set(transformed_filter_grad_channel, layout_tensor, iwo_groups);
+    args2.odesc.set(transformed_output_grad_channel, layout_tensor);
+    args2.cdesc.set(dtype,
+                    padding_common,
+                    strides,
+                    dilations,
+                    paddle::platform::AllowTF32Cudnn(),
+                    c_groups);
+#ifdef PADDLE_WITH_HIP
+    using search2 =
+        paddle::operators::SearchAlgorithm<miopenConvBwdWeightsAlgorithm_t>;
+    workspace_size_w =
+        std::max(workspace_size_w, search2::GetWorkspaceSize(args2));
+    filter_algo = search2::Find<T>(
+        args2, exhaustive_search, deterministic, workspace_size_w, ctx);
+#else
+    using search2 =
+        paddle::operators::SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t>;
+    filter_algo =
+        search2::Find<T>(args2, exhaustive_search, deterministic, ctx);
+    workspace_size_w = std::max(workspace_size_w,
+                                search2::GetWorkspaceSize(args2, filter_algo));
+#endif
+  }
+
+  // ------------------- cudnn conv backward data ---------------------
+  paddle::operators::ScalingParamType<T> alpha = 1.0f;
+#ifdef PADDLE_WITH_HIP
+  // MIOPEN ONLY support beta to be 0.0f
+  paddle::operators::ScalingParamType<T> beta = 0.0f;
+#else
+  paddle::operators::ScalingParamType<T> beta = use_addto ? 1.0f : 0.0f;
+
+#endif
+  VLOG(4) << "Conv_grad: use_addto = " << use_addto;
+
+  if (input_grad) {
+// When beta is 0, it is unnecessary to reset input_grad.
+// When beta is 1, the output cannot be reset since addt strategy used.
+#ifdef PADDLE_WITH_HIP
+    if (use_addto) {
+      DenseTensor temp_tensor(transformed_input_grad.type());
+      temp_tensor.Resize(transformed_input_grad.dims());
+      T* temp_tensor_data = temp_tensor.mutable_data<T>(ctx.GetPlace());
+      workspace_handle.RunFunc(
+          [&](void* cudnn_workspace_ptr) {
+            PADDLE_ENFORCE_GPU_SUCCESS(
+                paddle::platform::dynload::miopenConvolutionBackwardData(
+                    handle,
+                    &alpha,
+                    args1.odesc.desc(),
+                    output_grad_data,
+                    args1.wdesc.desc(),
+                    filter_data,
+                    args1.cdesc.desc(),
+                    data_algo,
+                    &beta,
+                    args1.idesc.desc(),
+                    temp_tensor_data,
+                    cudnn_workspace_ptr,
+                    workspace_size_d));
+          },
+          workspace_size_d);
+      PADDLE_ENFORCE_GPU_SUCCESS(paddle::platform::dynload::miopenOpTensor(
+          handle,
+          miopenTensorOpAdd,
+          &alpha,
+          args1.idesc.desc(),
+          transformed_input_grad_data,
+          &alpha,
+          args1.idesc.desc(),
+          temp_tensor_data,
+          &beta,
+          args1.idesc.desc(),
+          transformed_input_grad_data));
+    } else {
+      workspace_handle.RunFunc(
+          [&](void* cudnn_workspace_ptr) {
+            PADDLE_ENFORCE_GPU_SUCCESS(
+                paddle::platform::dynload::miopenConvolutionBackwardData(
+                    handle,
+                    &alpha,
+                    args1.odesc.desc(),
+                    output_grad_data,
+                    args1.wdesc.desc(),
+                    filter_data,
+                    args1.cdesc.desc(),
+                    data_algo,
+                    &beta,
+                    args1.idesc.desc(),
+                    transformed_input_grad_data,
+                    cudnn_workspace_ptr,
+                    workspace_size_d));
+          },
+          workspace_size_d);
+    }
+
+#else
+    for (int i = 0; i < groups; i++) {
+      workspace_handle.RunFunc(
+          [&](void* cudnn_workspace_ptr) {
+            PADDLE_ENFORCE_GPU_SUCCESS(
+                paddle::platform::dynload::cudnnConvolutionBackwardData(
+                    handle,
+                    &alpha,
+                    args1.wdesc.desc(),
+                    filter_data + i * group_offset_filter,
+                    args1.odesc.desc(),
+                    output_grad_data + i * group_offset_out,
+                    args1.cdesc.desc(),
+                    data_algo,
+                    cudnn_workspace_ptr,
+                    workspace_size_d,
+                    &beta,
+                    args1.idesc.desc(),
+                    transformed_input_grad_data + i * group_offset_in));
+          },
+          workspace_size_d);
+    }
+#endif
+    if (!is_sys_pad) {
+      std::vector<int> starts(transformed_input_channel.dims().size(), 0);
+      std::vector<int> axes(transformed_input_channel.dims().size(), 0);
+
+      for (size_t i = 0; i < transformed_input_channel.dims().size(); ++i) {
+        starts[i] = input_pad[2 * i];
+        axes[i] = i;
+      }
+
+      transformed_input_grad_channel.mutable_data(ctx.GetPlace());
+      if (transformed_input_channel.dims().size() == 4) {
+        paddle::operators::RemovePaddingSlice<Context, T, 4>(
+            ctx,
+            &transformed_input_grad,
+            &transformed_input_grad_channel,
+            starts,
+            axes);
+      } else {
+        paddle::operators::RemovePaddingSlice<Context, T, 5>(
+            ctx,
+            &transformed_input_grad,
+            &transformed_input_grad_channel,
+            starts,
+            axes);
+      }
+    }
+
+    if (channel_last && compute_format == paddle::platform::DataLayout::kNCHW) {
+      TransToChannelLast<Context, T>(
+          ctx, &transformed_input_grad_channel, input_grad);
+    }
+  }
+
+  // filter_grad do not use inplace addto.
+  paddle::operators::ScalingParamType<T> beta_filter = 0.0f;
+  // ------------------- cudnn conv backward filter ---------------------
+  if (filter_grad) {
+// Because beta is zero, it is unnecessary to reset filter_grad.
+#ifdef PADDLE_WITH_HIP
+    workspace_handle.RunFunc(
+        [&](void* cudnn_workspace_ptr) {
+          PADDLE_ENFORCE_GPU_SUCCESS(
+              paddle::platform::dynload::miopenConvolutionBackwardWeights(
+                  handle,
+                  &alpha,
+                  args2.odesc.desc(),
+                  output_grad_data,
+                  args2.idesc.desc(),
+                  input_data,
+                  args2.cdesc.desc(),
+                  filter_algo,
+                  &beta,
+                  args2.wdesc.desc(),
+                  filter_grad_data,
+                  cudnn_workspace_ptr,
+                  workspace_size_w));
+        },
+        workspace_size_w);
+#else
+    for (int i = 0; i < groups; i++) {
+      workspace_handle.RunFunc(
+          [&](void* cudnn_workspace_ptr) {
+            PADDLE_ENFORCE_GPU_SUCCESS(
+                paddle::platform::dynload::cudnnConvolutionBackwardFilter(
+                    handle,
+                    &alpha,
+                    args2.idesc.desc(),
+                    input_data + i * group_offset_in,
+                    args2.odesc.desc(),
+                    output_grad_data + i * group_offset_out,
+                    args2.cdesc.desc(),
+                    filter_algo,
+                    cudnn_workspace_ptr,
+                    workspace_size_w,
+                    &beta_filter,
+                    args2.wdesc.desc(),
+                    filter_grad_data + i * group_offset_filter));
+          },
+          workspace_size_w);
+    }
+#endif
+
+    if (compute_format == paddle::platform::DataLayout::kNHWC) {
+      TransToChannelFirst<Context, T>(
+          ctx, &transformed_filter_grad_channel, filter_grad);
+    }
+  }
+}
+
+template <typename T, typename Context>
+void Conv3DCudnnGradKernel(const Context& dev_ctx,
+                           const DenseTensor& out_grad,
+                           const DenseTensor& input,
+                           const DenseTensor& filter,
+                           const std::vector<int>& strides,
+                           const std::vector<int>& paddings,
+                           const std::string& paddding_algorithm,
+                           int groups,
+                           const std::vector<int>& dilations,
+                           const std::string& data_format,
+                           bool use_addto,
+                           int workspace_size_MB,
+                           bool exhaustive_search,
+                           DenseTensor* input_grad,
+                           DenseTensor* filter_grad) {
+  ConvCudnnGradKernel<T>(dev_ctx,
+                         out_grad,
+                         input,
+                         filter,
+                         strides,
+                         paddings,
+                         paddding_algorithm,
+                         groups,
+                         dilations,
+                         data_format,
+                         use_addto,
+                         workspace_size_MB,
+                         exhaustive_search,
+                         input_grad,
+                         filter_grad);
+}
+
+}  // namespace phi
+
+#ifdef PADDLE_WITH_HIP
+PD_REGISTER_KERNEL(conv2d_grad,
+                   GPUDNN,
+                   ALL_LAYOUT,
+                   phi::ConvCudnnGradKernel,
+                   float,
+                   phi::dtype::float16) {}
+
+PD_REGISTER_KERNEL(conv3d_grad,
+                   GPUDNN,
+                   ALL_LAYOUT,
+                   phi::Conv3DCudnnGradKernel,
+                   float,
+                   phi::dtype::float16) {}
+#else
+#if CUDNN_VERSION_MIN(8, 1, 0)
+PD_REGISTER_KERNEL(conv2d_grad,
+                   GPUDNN,
+                   ALL_LAYOUT,
+                   phi::ConvCudnnGradKernel,
+                   float,
+                   double,
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {}
+
+PD_REGISTER_KERNEL(conv3d_grad,
+                   GPUDNN,
+                   ALL_LAYOUT,
+                   phi::Conv3DCudnnGradKernel,
+                   float,
+                   double,
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {}
+#else
+PD_REGISTER_KERNEL(conv2d_grad,
+                   GPUDNN,
+                   ALL_LAYOUT,
+                   phi::ConvCudnnGradKernel,
+                   float,
+                   double,
+                   phi::dtype::float16) {}
+
+PD_REGISTER_KERNEL(conv3d_grad,
+                   GPUDNN,
+                   ALL_LAYOUT,
+                   phi::Conv3DCudnnGradKernel,
+                   float,
+                   double,
+                   phi::dtype::float16) {}
+
+#endif
+
+#endif
diff --git a/paddle/phi/kernels/gpudnn/conv_kernel.cu b/paddle/phi/kernels/gpudnn/conv_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..931b6d68845e27297784603c2427178eae6b6f7d
--- /dev/null
+++ b/paddle/phi/kernels/gpudnn/conv_kernel.cu
@@ -0,0 +1,476 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/kernels/conv_kernel.h"
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+#include "paddle/fluid/framework/eigen.h"
+#ifdef PADDLE_WITH_HIP
+#include "paddle/fluid/operators/conv_miopen_helper.h"
+#else
+#include "paddle/fluid/operators/conv_cudnn_helper.h"
+#endif
+
+#include "paddle/fluid/platform/cudnn_workspace_helper.h"
+#include "paddle/fluid/platform/float16.h"
+#include "paddle/fluid/platform/profiler.h"
+#include "paddle/phi/kernels/funcs/padding.h"
+
+#include "paddle/phi/kernels/cpu/conv_util.h"
+#include "paddle/phi/kernels/funcs/batch_norm_utils.h"
+
+#include "paddle/phi/kernels/impl/conv_cudnn_impl.h"
+
+#include "paddle/phi/common/bfloat16.h"
+#include "paddle/phi/common/float16.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void ConvCudnnKernel(const Context& ctx,
+                     const DenseTensor& input,
+                     const DenseTensor& filter,
+                     const std::vector<int>& strides,
+                     const std::vector<int>& paddings_t,
+                     const std::string& padding_algorithm,
+                     int groups,
+                     const std::vector<int>& dilations_t,
+                     const std::string& data_format,
+                     bool use_addto,
+                     int workspace_size_MB,
+                     bool exhaustive_search_t,
+                     DenseTensor* output) {
+  output->mutable_data<T>(ctx.GetPlace());
+  std::vector<int> paddings = paddings_t;
+  std::vector<int> dilations = dilations_t;
+
+  bool exhaustive_search = FLAGS_cudnn_exhaustive_search || exhaustive_search_t;
+  bool deterministic = FLAGS_cudnn_deterministic;
+  auto exhaustive_deterministic = exhaustive_search && deterministic;
+  PADDLE_ENFORCE_EQ(exhaustive_deterministic,
+                    false,
+                    phi::errors::InvalidArgument(
+                        "Cann't set exhaustive_search True and "
+                        "FLAGS_cudnn_deterministic True at same time."));
+
+  const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC");
+
+  auto dtype = paddle::platform::CudnnDataType<T>::type;
+
+#ifdef PADDLE_WITH_HIP
+  // HIP MIOPEN ONLY SUPPORT NCHW format
+  auto compute_format = paddle::platform::DataLayout::kNCHW;
+#else
+  // Tensor Core introduced from Volta GPUs supports more faster conv op
+  // with FP16 in NHWC data format.
+  const bool compute_in_nhwc = dtype == CUDNN_DATA_HALF && IsVoltaOrLater(ctx);
+  // We will only do data format conversion from NHWC to NCHW.
+  // cudnn will convert NCHW to NHWC automatically on Tensor Core.
+  auto compute_format = compute_in_nhwc && channel_last
+                            ? paddle::platform::DataLayout::kNHWC
+                            : paddle::platform::DataLayout::kNCHW;
+#endif
+  VLOG(3) << "Compute ConvOp with cuDNN:"
+          << " data_format=" << data_format << " compute_format="
+          << (compute_format == paddle::platform::DataLayout::kNHWC ? "NHWC"
+                                                                    : "NCHW");
+
+  // ------------ transformed tensor -----------
+  DenseTensor transformed_input_channel(input.type());
+  DenseTensor transformed_output(output->type());
+  DenseTensor transformed_filter_channel(filter.type());
+  T* output_data = nullptr;
+  if (channel_last && compute_format == paddle::platform::DataLayout::kNCHW) {
+    VLOG(3) << "Transform input tensor from NHWC to NCHW.";
+    ResizeToChannelFirst<Context, T>(ctx, &input, &transformed_input_channel);
+    TransToChannelFirst<Context, T>(ctx, &input, &transformed_input_channel);
+
+    ResizeToChannelFirst<Context, T>(ctx, output, &transformed_output);
+
+  } else {
+    transformed_input_channel.ShareDataWith(input);
+    transformed_output.ShareDataWith(*output);
+  }
+  if (compute_format == paddle::platform::DataLayout::kNHWC) {
+    VLOG(3) << "Transform filter tensor from NCHW to NHWC.";
+    ResizeToChannelLast<Context, T>(ctx, &filter, &transformed_filter_channel);
+    TransToChannelLast<Context, T>(ctx, &filter, &transformed_filter_channel);
+  } else {
+    transformed_filter_channel.ShareDataWith(filter);
+  }
+  output_data = transformed_output.data<T>();
+
+  // update padding and dilation
+  auto in_dims = transformed_input_channel.dims();
+  auto filter_dims = transformed_filter_channel.dims();
+  DDim in_data_dims;
+  DDim filter_data_dims;
+
+  if (compute_format == paddle::platform::DataLayout::kNCHW) {
+    in_data_dims = slice_ddim(in_dims, 2, in_dims.size());
+    filter_data_dims = slice_ddim(filter_dims, 2, filter_dims.size());
+  } else {
+    in_data_dims = slice_ddim(in_dims, 1, in_dims.size() - 1);
+    filter_data_dims = slice_ddim(filter_dims, 1, filter_dims.size() - 1);
+  }
+
+  std::vector<int> ksize = vectorize<int>(filter_data_dims);
+  UpdatePaddingAndDilation(
+      &paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize);
+
+  int data_dim = strides.size();  // 2d or 3d
+  bool is_sys_pad = funcs::IsSymmetricPadding(paddings, data_dim);
+
+  DenseTensor transformed_input;
+  std::vector<int> padding_common(data_dim, 0);
+  if (!is_sys_pad) {
+    std::vector<int> padding_diff(data_dim);
+    std::vector<int> new_input_shape_vec(data_dim + 2);
+    new_input_shape_vec[0] = transformed_input_channel.dims()[0];
+
+    if (compute_format == paddle::platform::DataLayout::kNCHW) {
+      new_input_shape_vec[1] = transformed_input_channel.dims()[1];
+    } else {
+      new_input_shape_vec[data_dim + 1] =
+          transformed_input_channel.dims()[data_dim + 1];
+    }
+
+    std::vector<int> input_pad(transformed_input_channel.dims().size() * 2, 0);
+    for (size_t i = 0; i < data_dim; ++i) {
+      padding_diff[i] = std::abs(paddings[2 * i] - paddings[2 * i + 1]);
+      padding_common[i] = std::min(paddings[2 * i], paddings[2 * i + 1]);
+      if (compute_format == paddle::platform::DataLayout::kNCHW) {
+        new_input_shape_vec[i + 2] =
+            transformed_input_channel.dims()[i + 2] + padding_diff[i];
+      } else {
+        new_input_shape_vec[i + 1] =
+            transformed_input_channel.dims()[i + 1] + padding_diff[i];
+      }
+      if (compute_format == paddle::platform::DataLayout::kNCHW) {
+        input_pad[2 * i + 4] = paddings[2 * i] - padding_common[i];
+        input_pad[2 * i + 4 + 1] = paddings[2 * i + 1] - padding_common[i];
+      } else {
+        input_pad[2 * i + 2] = paddings[2 * i] - padding_common[i];
+        input_pad[2 * i + 2 + 1] = paddings[2 * i + 1] - padding_common[i];
+      }
+    }
+    DDim new_input_shape(make_ddim(new_input_shape_vec));
+    transformed_input.Resize(new_input_shape);
+    transformed_input.mutable_data<T>(ctx.GetPlace());
+
+    const int rank = transformed_input_channel.dims().size();
+    T pad_value(0.0);
+    switch (rank) {
+      case 4: {
+        funcs::PadFunction<Context, T, 4>(ctx,
+                                          input_pad,
+                                          transformed_input_channel,
+                                          pad_value,
+                                          &transformed_input);
+      } break;
+      case 5: {
+        funcs::PadFunction<Context, T, 5>(ctx,
+                                          input_pad,
+                                          transformed_input_channel,
+                                          pad_value,
+                                          &transformed_input);
+      } break;
+      default:
+        PADDLE_THROW(phi::errors::InvalidArgument(
+            "ConvOp only support tensors with 4 or 5 dimensions."));
+    }
+
+  } else {
+    transformed_input.ShareDataWith(transformed_input_channel);
+    if (paddings.size() == data_dim) {
+      for (size_t i = 0; i < data_dim; ++i) {
+        padding_common[i] = paddings[i];
+      }
+    } else {
+      for (size_t i = 0; i < data_dim; ++i) {
+        padding_common[i] = paddings[2 * i];
+      }
+    }
+  }
+
+  const T* input_data = transformed_input.data<T>();
+
+  const T* filter_data = transformed_filter_channel.data<T>();
+
+  // ------------------- cudnn descriptors ---------------------
+  paddle::operators::ConvArgs args{&transformed_input,
+                                   &transformed_filter_channel,
+                                   &transformed_output,
+                                   strides,
+                                   padding_common,
+                                   dilations,
+                                   dtype};
+
+  auto handle = ctx.cudnn_handle();
+  auto workspace_handle = ctx.cudnn_workspace_handle();
+  paddle::platform::DataLayout layout =
+      compute_format == paddle::platform::DataLayout::kNHWC
+          ? paddle::platform::DataLayout::kNHWC
+          : paddle::platform::DataLayout::kNCHW;
+  if (transformed_input.dims().size() == 5) {
+    layout = compute_format == paddle::platform::DataLayout::kNHWC
+                 ? paddle::platform::DataLayout::kNDHWC
+                 : paddle::platform::DataLayout::kNCDHW;
+  }
+  auto layout_format = paddle::platform::GetCudnnTensorFormat(layout);
+
+  args.handle = handle;
+
+#ifdef PADDLE_WITH_HIP
+  // MIOPEN need to set groups in cdesc in miopen_desc.h
+  args.cdesc.set(dtype,
+                 padding_common,
+                 strides,
+                 dilations,
+                 paddle::platform::AllowTF32Cudnn(),
+                 groups);
+#else
+  args.cdesc.set(dtype,
+                 padding_common,
+                 strides,
+                 dilations,
+                 paddle::platform::AllowTF32Cudnn());
+#endif
+
+#if defined(PADDLE_WITH_CUDA) && CUDNN_VERSION_MIN(7, 0, 1)
+  // cudnn 7 can support groups, no need to do it manually
+  // FIXME(typhoonzero): find a better way to disable groups
+  // rather than setting it to 1.
+  PADDLE_ENFORCE_GPU_SUCCESS(
+      paddle::platform::dynload::cudnnSetConvolutionGroupCount(
+          args.cdesc.desc(), groups));
+  groups = 1;
+#endif
+#ifdef PADDLE_WITH_HIP
+  // MIOPEN do not set groups in wdesc after set groups in cdesc
+  groups = 1;
+#endif
+  args.idesc.set(transformed_input, layout_format);
+  args.wdesc.set(transformed_filter_channel, layout_format, groups);
+  args.odesc.set(transformed_output, layout_format);
+  int i_n, i_c, i_d, i_h, i_w;
+  int o_n, o_c, o_d, o_h, o_w;
+
+  if (compute_format == paddle::platform::DataLayout::kNHWC) {
+    paddle::operators::GetNCDHW(transformed_input.dims(),
+                                paddle::platform::DataLayout::kNHWC,
+                                &i_n,
+                                &i_c,
+                                &i_d,
+                                &i_h,
+                                &i_w);
+    paddle::operators::GetNCDHW(transformed_output.dims(),
+                                paddle::platform::DataLayout::kNHWC,
+                                &o_n,
+                                &o_c,
+                                &o_d,
+                                &o_h,
+                                &o_w);
+  } else {
+    paddle::operators::GetNCDHW(transformed_input.dims(),
+                                paddle::platform::DataLayout::kNCHW,
+                                &i_n,
+                                &i_c,
+                                &i_d,
+                                &i_h,
+                                &i_w);
+    paddle::operators::GetNCDHW(transformed_output.dims(),
+                                paddle::platform::DataLayout::kNCHW,
+                                &o_n,
+                                &o_c,
+                                &o_d,
+                                &o_h,
+                                &o_w);
+  }
+
+  int group_offset_in = i_c / groups * i_h * i_w * i_d;
+  int group_offset_out = o_c / groups * o_h * o_w * o_d;
+  int group_offset_filter = transformed_filter_channel.numel() / groups;
+  // ------------------- cudnn conv workspace ---------------------
+  size_t workspace_size = 0;  // final workspace to allocate.
+// ------------------- cudnn conv algorithm ---------------------
+#ifdef PADDLE_WITH_HIP
+  miopenConvFwdAlgorithm_t algo{};
+  using search = paddle::operators::SearchAlgorithm<miopenConvFwdAlgorithm_t>;
+  workspace_size = search::GetWorkspaceSize(args);
+  algo = search::Find<T>(
+      args, exhaustive_search, deterministic, workspace_size, ctx);
+#else
+  cudnnConvolutionFwdAlgo_t algo{};
+  using search =
+      paddle::operators::SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>;
+  algo = search::Find<T>(args, exhaustive_search, deterministic, ctx);
+  workspace_size = search::GetWorkspaceSize(args, algo);
+#endif
+
+#if defined(PADDLE_WITH_CUDA) && CUDNN_VERSION_MIN(7, 0, 1)
+  // when groups > 1, SearchAlgorithm find algo is CUDNN_CONVOLUTION_\
+    // FWD_ALGO_WINOGRAD_NONFUSED, but this kind of algorithm is unstable
+  // in forward computation, so change the algorithm to CUDNN_CONVOLUTION_\
+    // FWD_ALGO_IMPLICIT_GEMM manually.
+  if (groups > 1) {
+    algo = static_cast<cudnnConvolutionFwdAlgo_t>(0);
+  }
+#endif
+
+  // ------------------- cudnn conv forward ---------------------
+  paddle::operators::ScalingParamType<T> alpha = 1.0f;
+  paddle::operators::ScalingParamType<T> beta = 0.0f;
+
+// NOTE(zhiqiu): inplace addto is not supportted in double grad yet.
+// ScalingParamType<T> beta = ctx.Attr<bool>("use_addto") ? 1.0f : 0.0f;
+// VLOG(4) << "Conv: use_addto = " << ctx.Attr<bool>("use_addto");
+
+#ifdef PADDLE_WITH_HIP
+  workspace_handle.RunFunc(
+      [&](void* workspace_ptr) {
+        PADDLE_ENFORCE_GPU_SUCCESS(
+            paddle::platform::dynload::miopenConvolutionForward(
+                handle,
+                &alpha,
+                args.idesc.desc(),
+                input_data,
+                args.wdesc.desc(),
+                filter_data,
+                args.cdesc.desc(),
+                algo,
+                &beta,
+                args.odesc.desc(),
+                output_data,
+                workspace_ptr,
+                workspace_size));
+      },
+      workspace_size);
+#else
+  for (int i = 0; i < groups; i++) {
+    workspace_handle.RunFunc(
+        [&](void* workspace_ptr) {
+          PADDLE_ENFORCE_GPU_SUCCESS(
+              paddle::platform::dynload::cudnnConvolutionForward(
+                  handle,
+                  &alpha,
+                  args.idesc.desc(),
+                  input_data + i * group_offset_in,
+                  args.wdesc.desc(),
+                  filter_data + i * group_offset_filter,
+                  args.cdesc.desc(),
+                  algo,
+                  workspace_ptr,
+                  workspace_size,
+                  &beta,
+                  args.odesc.desc(),
+                  output_data + i * group_offset_out));
+        },
+        workspace_size);
+  }
+#endif
+
+  if (channel_last && compute_format == paddle::platform::DataLayout::kNCHW) {
+    TransToChannelLast<Context, T>(ctx, &transformed_output, output);
+  }
+}
+
+template <typename T, typename Context>
+void Conv3DCudnnKernel(const Context& dev_ctx,
+                       const DenseTensor& input,
+                       const DenseTensor& filter,
+                       const std::vector<int>& strides,
+                       const std::vector<int>& paddings,
+                       const std::string& padding_algorithm,
+                       int groups,
+                       const std::vector<int>& dilations,
+                       const std::string& data_format,
+                       bool use_addto,
+                       int workspace_size_MB,
+                       bool exhaustive_search,
+                       DenseTensor* out) {
+  ConvCudnnKernel<T>(dev_ctx,
+                     input,
+                     filter,
+                     strides,
+                     paddings,
+                     padding_algorithm,
+                     groups,
+                     dilations,
+                     data_format,
+                     use_addto,
+                     workspace_size_MB,
+                     exhaustive_search,
+                     out);
+}
+
+}  // namespace phi
+
+#ifdef PADDLE_WITH_HIP
+PD_REGISTER_KERNEL(conv2d,
+                   GPUDNN,
+                   ALL_LAYOUT,
+                   phi::ConvCudnnKernel,
+                   float,
+                   phi::dtype::float16) {}
+
+PD_REGISTER_KERNEL(conv3d,
+                   GPUDNN,
+                   ALL_LAYOUT,
+                   phi::Conv3DCudnnKernel,
+                   float,
+                   phi::dtype::float16) {}
+#else
+#if CUDNN_VERSION_MIN(8, 1, 0)
+PD_REGISTER_KERNEL(conv2d,
+                   GPUDNN,
+                   ALL_LAYOUT,
+                   phi::ConvCudnnKernel,
+                   float,
+                   double,
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {}
+
+PD_REGISTER_KERNEL(conv3d,
+                   GPUDNN,
+                   ALL_LAYOUT,
+                   phi::Conv3DCudnnKernel,
+                   float,
+                   double,
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {}
+#else
+PD_REGISTER_KERNEL(conv2d,
+                   GPUDNN,
+                   ALL_LAYOUT,
+                   phi::ConvCudnnKernel,
+                   float,
+                   double,
+                   phi::dtype::float16) {}
+
+PD_REGISTER_KERNEL(conv3d,
+                   GPUDNN,
+                   ALL_LAYOUT,
+                   phi::Conv3DCudnnKernel,
+                   float,
+                   double,
+                   phi::dtype::float16) {}
+#endif
+
+#endif
+
+// todo register bfloat16
diff --git a/paddle/phi/kernels/gpudnn/pool_gpudnn.h b/paddle/phi/kernels/gpudnn/pool_gpudnn.h
new file mode 100644
index 0000000000000000000000000000000000000000..0cf2c991464fc6e091eee0bc75641d7abae8598c
--- /dev/null
+++ b/paddle/phi/kernels/gpudnn/pool_gpudnn.h
@@ -0,0 +1,43 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <string>
+#include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
+
+namespace phi {
+
+using GPUDNNDataLayout = paddle::platform::DataLayout;
+using PoolingMode = paddle::platform::PoolingMode;
+using ScopedPoolingDescriptor = paddle::platform::ScopedPoolingDescriptor;
+using ScopedTensorDescriptor = paddle::platform::ScopedTensorDescriptor;
+
+template <typename T>
+using ScalingParamType =
+    typename paddle::platform::CudnnDataType<T>::ScalingParamType;
+
+inline GPUDNNDataLayout GetLayoutFromStr(std::string data_format) {
+  if (data_format == "NHWC") {
+    return GPUDNNDataLayout::kNHWC;
+  } else if (data_format == "NCHW") {
+    return GPUDNNDataLayout::kNCHW;
+  } else if (data_format == "NCDHW") {
+    return GPUDNNDataLayout::kNCDHW;
+  } else {
+    return GPUDNNDataLayout::kNCDHW;
+  }
+}
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/gpudnn/pool_grad_kernel.cu b/paddle/phi/kernels/gpudnn/pool_grad_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..b731d03347024ccd76eafc02c7096f3633948eb5
--- /dev/null
+++ b/paddle/phi/kernels/gpudnn/pool_grad_kernel.cu
@@ -0,0 +1,448 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/phi/kernels/pool_grad_kernel.h"
+
+#include "paddle/phi/kernels/gpudnn/pool_gpudnn.h"
+
+#include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/math_function.h"
+#include "paddle/phi/kernels/funcs/pooling.h"
+#include "paddle/phi/kernels/pool_kernel.h"
+
+#ifdef PADDLE_WITH_HIP
+#include "paddle/phi/kernels/impl/pool_grad_kernel_impl.h"  //  PoolGradRawGPUDNNKernel will call PoolGradRawKernel for pooling type "max" in ROCm
+#endif
+
+namespace phi {
+
+template <typename T, typename Context>
+void PoolGradRawGPUDNNKernel(const Context& ctx,
+                             const DenseTensor& x,
+                             const DenseTensor& out,
+                             const DenseTensor& dout,
+                             const std::vector<int>& kernel_size,
+                             const std::vector<int>& strides,
+                             const std::vector<int>& paddings,
+                             bool exclusive,
+                             const std::string& data_format,
+                             const std::string& pooling_type,
+                             bool global_pooling,
+                             bool adaptive,
+                             const std::string& padding_algorithm,
+                             DenseTensor* dx) {
+  PADDLE_ENFORCE_EQ(
+      paddle::platform::is_gpu_place(ctx.GetPlace()),
+      true,
+      errors::InvalidArgument("Pool operator CUDA kernel must use CUDAPlace "
+                              "rather than CPUPlace."));
+
+  const DenseTensor* input = &x;
+  const DenseTensor* output = &out;
+  const DenseTensor* output_grad = &dout;
+  DenseTensor* input_grad = dx;
+  std::vector<int> paddings_ = paddings;
+  std::vector<int> kernel_size_ = kernel_size;
+
+  const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC");
+
+#ifdef PADDLE_WITH_HIP
+  if (pooling_type == "max") {
+    PoolGradRawKernel<T, GPUContext>(ctx,
+                                     x,
+                                     out,
+                                     dout,
+                                     kernel_size,
+                                     strides,
+                                     paddings_,
+                                     exclusive,
+                                     data_format,
+                                     pooling_type,
+                                     global_pooling,
+                                     adaptive,
+                                     padding_algorithm,
+                                     dx);
+    return;
+  }
+#endif
+
+  // update paddings
+  auto in_x_dims = input->dims();
+  DDim data_dims;
+  if (channel_last) {
+    data_dims = slice_ddim(in_x_dims, 1, in_x_dims.size() - 1);
+  } else {
+    data_dims = slice_ddim(in_x_dims, 2, in_x_dims.size());
+  }
+  funcs::UpdatePadding(&paddings_,
+                       global_pooling,
+                       adaptive,
+                       padding_algorithm,
+                       data_dims,
+                       strides,
+                       kernel_size_);
+  if (data_dims.size() * 2 == static_cast<int>(paddings_.size())) {
+    for (int i = 0; i < data_dims.size(); ++i) {
+      paddings_.erase(paddings_.begin() + i + 1);
+    }
+  }
+
+  if (global_pooling) {
+    funcs::UpdateKernelSize(&kernel_size_, data_dims);
+  }
+
+  // ------- tensor grad --------------
+  DenseTensor transformed_input(input->type());
+  DenseTensor transformed_output(output->type());
+  DenseTensor transformed_output_grad(output_grad->type());
+
+  ctx.template Alloc<T>(input_grad);
+  DenseTensor transformed_input_grad(input_grad->type());
+  GPUDNNDataLayout layout;
+  const std::string str_NCHW = "NCHW", str_NHWC = "NHWC";
+  const std::string str_NCDHW = "NCDHW", str_NDHWC = "NDHWC";
+  if (data_format == str_NDHWC) {
+    layout = GPUDNNDataLayout::kNCDHW;
+    std::vector<int> axis{0, 4, 1, 2, 3};
+
+    // input
+    transformed_input.Resize(input->dims());
+    auto in_dims_vec = vectorize(input->dims());
+    in_dims_vec[1] = input->dims()[4];
+    in_dims_vec[2] = input->dims()[1];
+    in_dims_vec[3] = input->dims()[2];
+    in_dims_vec[4] = input->dims()[3];
+    transformed_input.Resize(make_ddim(in_dims_vec));
+    ctx.Alloc(&transformed_input, input->type());
+
+    funcs::Transpose<Context, T, 5> trans5;
+    trans5(ctx, *input, &transformed_input, axis);
+
+    // output
+    transformed_output.Resize(output->dims());
+    auto out_dims_vec = vectorize(output->dims());
+    out_dims_vec[1] = output->dims()[4];
+    out_dims_vec[2] = output->dims()[1];
+    out_dims_vec[3] = output->dims()[2];
+    out_dims_vec[4] = output->dims()[3];
+    transformed_output.Resize(make_ddim(out_dims_vec));
+
+    ctx.Alloc(&transformed_output, output->type());
+
+    funcs::Transpose<Context, T, 5> trans5_v2;
+    trans5_v2(ctx, *output, &transformed_output, axis);
+
+    // output grad
+    transformed_output_grad.Resize(make_ddim(out_dims_vec));
+    ctx.Alloc(&transformed_output_grad, output_grad->type());
+
+    funcs::Transpose<Context, T, 5> trans5_v3;
+    trans5_v3(ctx, *output_grad, &transformed_output_grad, axis);
+
+    // input grad
+    transformed_input_grad.Resize(make_ddim(in_dims_vec));
+
+#ifdef PADDLE_WITH_HIP
+    // MIOPEN not support NHWC data layout
+  } else if (data_format == str_NHWC) {
+    layout = GPUDNNDataLayout::kNCHW;
+
+    std::vector<int> axis{0, 3, 1, 2};
+
+    // input
+    transformed_input.Resize(input->dims());
+    auto in_dims_vec = vectorize(input->dims());
+    in_dims_vec[1] = input->dims()[3];
+    in_dims_vec[2] = input->dims()[1];
+    in_dims_vec[3] = input->dims()[2];
+    transformed_input.Resize(make_ddim(in_dims_vec));
+    ctx.Alloc(&transformed_input, input->type());
+
+    funcs::Transpose<Context, T, 4> trans4;
+    trans4(ctx, *input, &transformed_input, axis);
+
+    // output
+    transformed_output.Resize(output->dims());
+    auto out_dims_vec = vectorize(output->dims());
+    out_dims_vec[1] = output->dims()[3];
+    out_dims_vec[2] = output->dims()[1];
+    out_dims_vec[3] = output->dims()[2];
+    transformed_output.Resize(make_ddim(out_dims_vec));
+    ctx.Alloc(&transformed_output, output->type());
+
+    funcs::Transpose<Context, T, 4> trans4_v2;
+    trans4_v2(ctx, *output, &transformed_output, axis);
+
+    // output grad
+    transformed_output_grad.Resize(make_ddim(out_dims_vec));
+    ctx.Alloc(&transformed_output_grad, output_grad->type());
+
+    funcs::Transpose<Context, T, 4> trans4_v3;
+    trans4_v3(ctx, *output_grad, &transformed_output_grad, axis);
+
+    // input grad
+    transformed_input_grad.Resize(make_ddim(in_dims_vec));
+#endif
+  } else {
+    layout = GetLayoutFromStr(data_format);
+    transformed_input = *input;
+    transformed_output = *output;
+    transformed_output_grad = *output_grad;
+    transformed_input_grad = *input_grad;
+  }
+
+  const T* input_data = transformed_input.data<T>();
+  const T* output_data = transformed_output.data<T>();
+  const T* output_grad_data = transformed_output_grad.data<T>();
+
+  // ------------------- cudnn descriptors ---------------------
+  ScopedTensorDescriptor input_desc;
+  ScopedTensorDescriptor output_desc;
+  ScopedPoolingDescriptor pool_desc;
+
+#ifdef PADDLE_WITH_HIP
+  miopenTensorDescriptor_t cudnn_input_desc = input_desc.descriptor<T>(
+      layout, vectorize<int>(transformed_input.dims()));
+  miopenTensorDescriptor_t cudnn_output_desc = output_desc.descriptor<T>(
+      layout, vectorize<int>(transformed_output.dims()));
+#else
+  cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor<T>(
+      layout, vectorize<int>(transformed_input.dims()));
+  cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor<T>(
+      layout, vectorize<int>(transformed_output.dims()));
+#endif
+  PoolingMode pooling_mode;
+  if (pooling_type == "max") {
+    if (FLAGS_cudnn_deterministic) {
+      pooling_mode = PoolingMode::kMaximumDeterministic;
+    } else {
+      pooling_mode = PoolingMode::kMaximum;
+    }
+  } else {
+    pooling_mode = exclusive ? PoolingMode::kAverageExclusive
+                             : PoolingMode::kAverageInclusive;
+  }
+
+#ifdef PADDLE_WITH_HIP
+  miopenPoolingDescriptor_t cudnn_pool_desc =
+      pool_desc.descriptor(pooling_mode, kernel_size_, paddings_, strides);
+#else
+  cudnnPoolingDescriptor_t cudnn_pool_desc =
+      pool_desc.descriptor(pooling_mode, kernel_size_, paddings_, strides);
+#endif
+
+  // ------------------- cudnn pool algorithm ---------------------
+  auto handle = ctx.cudnn_handle();
+  ScalingParamType<T> alpha = 1.0f, beta = 0.0f;
+  if (input_grad) {
+    T* input_grad_data = ctx.template Alloc<T>(&transformed_input_grad);
+// Because beta is zero, it is unnecessary to reset input_grad.
+#ifdef PADDLE_WITH_HIP
+    char* pool_workspace;
+    size_t pool_worksize = 0;
+    PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenPoolingGetWorkSpaceSizeV2(
+        cudnn_pool_desc, cudnn_output_desc, &pool_worksize));
+    PADDLE_ENFORCE_GPU_SUCCESS(hipMalloc(&pool_workspace, pool_worksize));
+    PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenPoolingBackward(handle,
+                                                              cudnn_pool_desc,
+                                                              &alpha,
+                                                              cudnn_output_desc,
+                                                              output_data,
+                                                              cudnn_output_desc,
+                                                              output_grad_data,
+                                                              cudnn_input_desc,
+                                                              input_data,
+                                                              &beta,
+                                                              cudnn_input_desc,
+                                                              input_grad_data,
+                                                              pool_workspace));
+    PADDLE_ENFORCE_GPU_SUCCESS(hipFree(pool_workspace));
+#else
+    PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnPoolingBackward(handle,
+                                                             cudnn_pool_desc,
+                                                             &alpha,
+                                                             cudnn_output_desc,
+                                                             output_data,
+                                                             cudnn_output_desc,
+                                                             output_grad_data,
+                                                             cudnn_input_desc,
+                                                             input_data,
+                                                             &beta,
+                                                             cudnn_input_desc,
+                                                             input_grad_data));
+#endif
+
+    if (data_format == str_NDHWC) {
+      std::vector<int> axis{0, 2, 3, 4, 1};
+      funcs::Transpose<Context, T, 5> trans5_v4;
+      trans5_v4(ctx, transformed_input_grad, input_grad, axis);
+    }
+#ifdef PADDLE_WITH_HIP
+    // MIOPEN not support NHWC data layout
+    if (data_format == str_NHWC) {
+      std::vector<int> axis{0, 2, 3, 1};
+      funcs::Transpose<Context, T, 4> trans4_v4;
+      trans4_v4(ctx, transformed_input_grad, input_grad, axis);
+    }
+#endif
+  }
+}
+
+template <typename T, typename Context>
+void Pool2dGradGPUDNNKernel(const Context& ctx,
+                            const DenseTensor& x,
+                            const DenseTensor& out,
+                            const DenseTensor& dout,
+                            const std::vector<int>& kernel_size,
+                            const std::vector<int>& strides,
+                            const std::vector<int>& paddings,
+                            bool ceil_mode,
+                            bool exclusive,
+                            const std::string& data_format,
+                            const std::string& pooling_type,
+                            bool global_pooling,
+                            bool adaptive,
+                            const std::string& padding_algorithm,
+                            DenseTensor* dx) {
+  PoolGradRawGPUDNNKernel<T, Context>(ctx,
+                                      x,
+                                      out,
+                                      dout,
+                                      kernel_size,
+                                      strides,
+                                      paddings,
+                                      exclusive,
+                                      data_format,
+                                      pooling_type,
+                                      global_pooling,
+                                      adaptive,
+                                      padding_algorithm,
+                                      dx);
+}
+
+template <typename T, typename Context>
+void Pool2dDoubleGradGPUDNNKernel(const Context& ctx,
+                                  const DenseTensor& x,
+                                  const std::vector<int>& kernel_size,
+                                  const std::vector<int>& strides,
+                                  const std::vector<int>& paddings,
+                                  bool ceil_mode,
+                                  bool exclusive,
+                                  const std::string& data_format,
+                                  const std::string& pooling_type,
+                                  bool global_pooling,
+                                  bool adaptive,
+                                  const std::string& padding_algorithm,
+                                  DenseTensor* out) {
+  if (pooling_type == "max") {
+    PADDLE_THROW(
+        errors::InvalidArgument("Pool op grad grad only supports avgpool."));
+  } else {
+    Pool2dGPUDNNKernel<T, Context>(ctx,
+                                   x,
+                                   kernel_size,
+                                   strides,
+                                   paddings,
+                                   ceil_mode,
+                                   exclusive,
+                                   data_format,
+                                   pooling_type,
+                                   global_pooling,
+                                   adaptive,
+                                   padding_algorithm,
+                                   out);
+  }
+}
+
+template <typename T, typename Context>
+void Pool3dGradGPUDNNKernel(const Context& ctx,
+                            const DenseTensor& x,
+                            const DenseTensor& out,
+                            const DenseTensor& dout,
+                            const std::vector<int>& kernel_size,
+                            const std::vector<int>& strides,
+                            const std::vector<int>& paddings,
+                            bool ceil_mode,
+                            bool exclusive,
+                            const std::string& data_format,
+                            const std::string& pooling_type,
+                            bool global_pooling,
+                            bool adaptive,
+                            const std::string& padding_algorithm,
+                            DenseTensor* dx) {
+  PoolGradRawGPUDNNKernel<T, Context>(ctx,
+                                      x,
+                                      out,
+                                      dout,
+                                      kernel_size,
+                                      strides,
+                                      paddings,
+                                      exclusive,
+                                      data_format,
+                                      pooling_type,
+                                      global_pooling,
+                                      adaptive,
+                                      padding_algorithm,
+                                      dx);
+}
+
+}  // namespace phi
+
+using phi::dtype::float16;
+
+#ifdef PADDLE_WITH_HIP
+// MIOPEN do not support double
+PD_REGISTER_KERNEL(pool2d_grad,
+                   GPUDNN,
+                   ALL_LAYOUT,
+                   phi::Pool2dGradGPUDNNKernel,
+                   float,
+                   float16) {}
+PD_REGISTER_KERNEL(pool2d_double_grad,
+                   GPUDNN,
+                   ALL_LAYOUT,
+                   phi::Pool2dDoubleGradGPUDNNKernel,
+                   float,
+                   float16) {}
+PD_REGISTER_KERNEL(pool3d_grad,
+                   GPUDNN,
+                   ALL_LAYOUT,
+                   phi::Pool3dGradGPUDNNKernel,
+                   float,
+                   float16) {}
+#else
+PD_REGISTER_KERNEL(pool2d_grad,
+                   GPUDNN,
+                   ALL_LAYOUT,
+                   phi::Pool2dGradGPUDNNKernel,
+                   float,
+                   double,
+                   float16) {}
+PD_REGISTER_KERNEL(pool2d_double_grad,
+                   GPUDNN,
+                   ALL_LAYOUT,
+                   phi::Pool2dDoubleGradGPUDNNKernel,
+                   float,
+                   double,
+                   float16) {}
+PD_REGISTER_KERNEL(pool3d_grad,
+                   GPUDNN,
+                   ALL_LAYOUT,
+                   phi::Pool3dGradGPUDNNKernel,
+                   float,
+                   double,
+                   float16) {}
+#endif
diff --git a/paddle/phi/kernels/gpudnn/pool_kernel.cu b/paddle/phi/kernels/gpudnn/pool_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..d8f965667758b9118635e3c8db4be74f9ff54a6a
--- /dev/null
+++ b/paddle/phi/kernels/gpudnn/pool_kernel.cu
@@ -0,0 +1,312 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/phi/kernels/pool_kernel.h"
+
+#include "paddle/phi/kernels/gpudnn/pool_gpudnn.h"
+
+#include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/math_function.h"
+#include "paddle/phi/kernels/funcs/pooling.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void PoolRawGPUDNNKernel(const Context& ctx,
+                         const DenseTensor& x,
+                         const std::vector<int>& kernel_size,
+                         const std::vector<int>& strides,
+                         const std::vector<int>& paddings,
+                         bool exclusive,
+                         const std::string& data_format,
+                         const std::string& pooling_type,
+                         bool global_pooling,
+                         bool adaptive,
+                         const std::string& padding_algorithm,
+                         DenseTensor* out) {
+  PADDLE_ENFORCE_EQ(
+      paddle::platform::is_gpu_place(ctx.GetPlace()),
+      true,
+      errors::InvalidArgument("Pool operator CUDA kernel must use CUDAPlace "
+                              "rather than CPUPlace."));
+
+  const DenseTensor* input = &x;
+  DenseTensor* output = out;
+  std::vector<int> paddings_ = paddings;
+  std::vector<int> kernel_size_ = kernel_size;
+
+  ctx.template Alloc<T>(output);
+
+  const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC");
+
+  // update paddings_
+  auto x_dims = input->dims();
+  DDim data_dims;
+  if (channel_last) {
+    data_dims = slice_ddim(x_dims, 1, x_dims.size() - 1);
+  } else {
+    data_dims = slice_ddim(x_dims, 2, x_dims.size());
+  }
+  funcs::UpdatePadding(&paddings_,
+                       global_pooling,
+                       adaptive,
+                       padding_algorithm,
+                       data_dims,
+                       strides,
+                       kernel_size_);
+  if (data_dims.size() * 2 == static_cast<int>(paddings_.size())) {
+    for (int i = 0; i < data_dims.size(); ++i) {
+      paddings_.erase(paddings_.begin() + i + 1);
+    }
+  }
+
+  if (global_pooling) {
+    funcs::UpdateKernelSize(&kernel_size_, data_dims);
+  }
+
+  const std::string str_NCHW = "NCHW", str_NHWC = "NHWC";
+  const std::string str_NCDHW = "NCDHW", str_NDHWC = "NDHWC";
+
+  // -----------------transformed tensor ------------------------
+
+  DenseTensor transformed_input(input->type());
+  DenseTensor transformed_output(output->type());
+  GPUDNNDataLayout layout;
+
+  if (data_format == str_NDHWC) {
+    layout = GPUDNNDataLayout::kNCDHW;
+    std::vector<int> axis{0, 4, 1, 2, 3};
+
+    // input
+    transformed_input.Resize(input->dims());
+
+    auto in_dims_vec = vectorize(input->dims());
+    in_dims_vec[1] = input->dims()[4];
+    in_dims_vec[2] = input->dims()[1];
+    in_dims_vec[3] = input->dims()[2];
+    in_dims_vec[4] = input->dims()[3];
+    transformed_input.Resize(make_ddim(in_dims_vec));
+    ctx.Alloc(&transformed_input, input->type());
+
+    funcs::Transpose<Context, T, 5> trans5;
+    trans5(ctx, *input, &transformed_input, axis);
+
+    // output
+    transformed_output.Resize(output->dims());
+
+    auto out_dims_vec = vectorize(output->dims());
+    out_dims_vec[1] = output->dims()[4];
+    out_dims_vec[2] = output->dims()[1];
+    out_dims_vec[3] = output->dims()[2];
+    out_dims_vec[4] = output->dims()[3];
+    transformed_output.Resize(make_ddim(out_dims_vec));
+#ifdef PADDLE_WITH_HIP
+    // MIOPEN not support NHWC data layout
+  } else if (data_format == str_NHWC) {
+    layout = GPUDNNDataLayout::kNCHW;
+
+    std::vector<int> axis{0, 3, 1, 2};
+
+    transformed_input.Resize(input->dims());
+    auto in_dims_vec = vectorize(input->dims());
+    in_dims_vec[1] = input->dims()[3];
+    in_dims_vec[2] = input->dims()[1];
+    in_dims_vec[3] = input->dims()[2];
+    transformed_input.Resize(make_ddim(in_dims_vec));
+    ctx.Alloc(&transformed_input, input->type());
+
+    funcs::Transpose<Context, T, 4> trans;
+    trans(ctx, *input, &transformed_input, axis);
+
+    transformed_output.Resize(output->dims());
+    auto out_dims_vec = vectorize(output->dims());
+    out_dims_vec[1] = output->dims()[3];
+    out_dims_vec[2] = output->dims()[1];
+    out_dims_vec[3] = output->dims()[2];
+    transformed_output.Resize(make_ddim(out_dims_vec));
+#endif
+  } else {
+    layout = GetLayoutFromStr(data_format);
+    transformed_input = *input;
+    transformed_output = *output;
+  }
+
+  const T* tranformed_input_data = transformed_input.data<T>();
+  T* tranformed_output_data = ctx.template Alloc<T>(&transformed_output);
+
+  // ------------------- cudnn descriptors ---------------------
+  ScopedTensorDescriptor input_desc;
+  ScopedTensorDescriptor output_desc;
+  ScopedPoolingDescriptor pool_desc;
+
+#ifdef PADDLE_WITH_HIP
+  miopenTensorDescriptor_t cudnn_input_desc = input_desc.descriptor<T>(
+      layout, vectorize<int>(transformed_input.dims()));
+  miopenTensorDescriptor_t cudnn_output_desc = output_desc.descriptor<T>(
+      layout, vectorize<int>(transformed_output.dims()));
+#else
+  cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor<T>(
+      layout, vectorize<int>(transformed_input.dims()));
+  cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor<T>(
+      layout, vectorize<int>(transformed_output.dims()));
+#endif
+  PoolingMode pooling_mode;
+  if (pooling_type == "max") {
+    pooling_mode = PoolingMode::kMaximum;
+  } else {
+    pooling_mode = exclusive ? PoolingMode::kAverageExclusive
+                             : PoolingMode::kAverageInclusive;
+  }
+
+#ifdef PADDLE_WITH_HIP
+  miopenPoolingDescriptor_t cudnn_pool_desc =
+      pool_desc.descriptor(pooling_mode, kernel_size_, paddings_, strides);
+#else
+  cudnnPoolingDescriptor_t cudnn_pool_desc =
+      pool_desc.descriptor(pooling_mode, kernel_size_, paddings_, strides);
+#endif
+
+  // ------------------- cudnn pool algorithm ---------------------
+  auto handle = ctx.cudnn_handle();
+  ScalingParamType<T> alpha = 1.0f, beta = 0.0f;
+
+#ifdef PADDLE_WITH_HIP
+  char* pool_workspace;
+  size_t pool_workernel_size_ = 0;
+  PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenPoolingGetWorkSpaceSizeV2(
+      cudnn_pool_desc, cudnn_output_desc, &pool_workernel_size_));
+  PADDLE_ENFORCE_GPU_SUCCESS(hipMalloc(&pool_workspace, pool_workernel_size_));
+  PADDLE_ENFORCE_GPU_SUCCESS(
+      dynload::miopenPoolingForward(handle,
+                                    cudnn_pool_desc,
+                                    &alpha,
+                                    cudnn_input_desc,
+                                    tranformed_input_data,
+                                    &beta,
+                                    cudnn_output_desc,
+                                    tranformed_output_data,
+                                    false,
+                                    pool_workspace,
+                                    pool_workernel_size_));
+  PADDLE_ENFORCE_GPU_SUCCESS(hipFree(pool_workspace));
+#else
+  PADDLE_ENFORCE_GPU_SUCCESS(
+      dynload::cudnnPoolingForward(handle,
+                                   cudnn_pool_desc,
+                                   &alpha,
+                                   cudnn_input_desc,
+                                   tranformed_input_data,
+                                   &beta,
+                                   cudnn_output_desc,
+                                   tranformed_output_data));
+#endif
+  // add
+  if (data_format == str_NDHWC) {
+    std::vector<int> axis{0, 2, 3, 4, 1};
+    funcs::Transpose<Context, T, 5> trans5_v2;
+    trans5_v2(ctx, transformed_output, output, axis);
+  }
+#ifdef PADDLE_WITH_HIP
+  // MIOPEN not support NHWC data layout
+  if (data_format == str_NHWC) {
+    std::vector<int> axis{0, 2, 3, 1};
+    funcs::Transpose<Context, T, 4> trans;
+    trans(ctx, transformed_output, output, axis);
+  }
+#endif
+}
+
+template <typename T, typename Context>
+void Pool2dGPUDNNKernel(const Context& ctx,
+                        const DenseTensor& x,
+                        const std::vector<int>& kernel_size,
+                        const std::vector<int>& strides,
+                        const std::vector<int>& paddings,
+                        bool ceil_mode,
+                        bool exclusive,
+                        const std::string& data_format,
+                        const std::string& pooling_type,
+                        bool global_pooling,
+                        bool adaptive,
+                        const std::string& padding_algorithm,
+                        DenseTensor* out) {
+  PoolRawGPUDNNKernel<T, Context>(ctx,
+                                  x,
+                                  kernel_size,
+                                  strides,
+                                  paddings,
+                                  exclusive,
+                                  data_format,
+                                  pooling_type,
+                                  global_pooling,
+                                  adaptive,
+                                  padding_algorithm,
+                                  out);
+}
+
+template <typename T, typename Context>
+void Pool3dGPUDNNKernel(const Context& ctx,
+                        const DenseTensor& x,
+                        const std::vector<int>& kernel_size,
+                        const std::vector<int>& strides,
+                        const std::vector<int>& paddings,
+                        bool ceil_mode,
+                        bool exclusive,
+                        const std::string& data_format,
+                        const std::string& pooling_type,
+                        bool global_pooling,
+                        bool adaptive,
+                        const std::string& padding_algorithm,
+                        DenseTensor* out) {
+  PoolRawGPUDNNKernel<T, Context>(ctx,
+                                  x,
+                                  kernel_size,
+                                  strides,
+                                  paddings,
+                                  exclusive,
+                                  data_format,
+                                  pooling_type,
+                                  global_pooling,
+                                  adaptive,
+                                  padding_algorithm,
+                                  out);
+}
+
+}  // namespace phi
+
+using phi::dtype::float16;
+
+#ifdef PADDLE_WITH_HIP
+// MIOPEN do not support double
+PD_REGISTER_KERNEL(
+    pool2d, GPUDNN, ALL_LAYOUT, phi::Pool2dGPUDNNKernel, float, float16) {}
+PD_REGISTER_KERNEL(
+    pool3d, GPUDNN, ALL_LAYOUT, phi::Pool3dGPUDNNKernel, float, float16) {}
+#else
+PD_REGISTER_KERNEL(pool2d,
+                   GPUDNN,
+                   ALL_LAYOUT,
+                   phi::Pool2dGPUDNNKernel,
+                   float,
+                   double,
+                   float16) {}
+PD_REGISTER_KERNEL(pool3d,
+                   GPUDNN,
+                   ALL_LAYOUT,
+                   phi::Pool3dGPUDNNKernel,
+                   float,
+                   double,
+                   float16) {}
+#endif
diff --git a/paddle/phi/kernels/gpudnn/softmax_gpudnn.h b/paddle/phi/kernels/gpudnn/softmax_gpudnn.h
index c9c549379bbce6ceb4c2314f34f24ad659f5c272..2b2dd5118969cf35c4762f3ab774ce41c04d2e4d 100644
--- a/paddle/phi/kernels/gpudnn/softmax_gpudnn.h
+++ b/paddle/phi/kernels/gpudnn/softmax_gpudnn.h
@@ -79,7 +79,7 @@ class VecT2<phi::dtype::bfloat16> {
   using Type = int;
 };
 
-static inline int log2_ceil(int value) {
+static inline int Log2Ceil(int value) {
   int log2_value = 0;
   while ((1 << log2_value) < value) ++log2_value;
   return log2_value;
@@ -351,8 +351,17 @@ __global__ void WarpSoftmaxForward(T* softmax,
     VecT* softmax_v =
         reinterpret_cast<VecT*>(&softmax[(first_batch + i) * stride]);
     VecT* reg_v = reinterpret_cast<VecT*>(&out_tmp[i][0][0]);
-    kps::ElementwiseUnary<AccT, T, kVItem, 1, 1, UnaryDivFunctor<AccT>>(
-        &out_tmp[i][0][0], &srcdata[i][0][0], UnaryDivFunctor<AccT>(sum[i]));
+    if (LogMode) {
+      kps::ElementwiseUnary<AccT, AccT, kVItem, 1, 1, UnaryLogFunctor<AccT>>(
+          &srcdata[i][0][0], &srcdata[i][0][0], UnaryLogFunctor<AccT>());
+      kps::ElementwiseUnary<AccT, T, kVItem, 1, 1, UnarySubFunctor<AccT>>(
+          &out_tmp[i][0][0],
+          &srcdata[i][0][0],
+          UnarySubFunctor<AccT>(std::log(sum[i])));
+    } else {
+      kps::ElementwiseUnary<AccT, T, kVItem, 1, 1, UnaryDivFunctor<AccT>>(
+          &out_tmp[i][0][0], &srcdata[i][0][0], UnaryDivFunctor<AccT>(sum[i]));
+    }
     kps::WriteData<VecT, VecT, kLoopsV, 1, 1, true>(
         &softmax_v[0], &reg_v[0], idx_max_v[i], 0, kWarpSize, 1);
   }
@@ -434,15 +443,25 @@ __global__ void WarpSoftmaxBackward(T* dst,
   AccT sum_tmp[kBatchSize][kLoopsV][kVSize];
   AccT* gradptr = reinterpret_cast<AccT*>(&grad_tmp[0][0][0]);
   AccT* srcptr = reinterpret_cast<AccT*>(&src_tmp[0][0][0]);
-  kps::ElementwiseBinary<AccT, AccT, kStep, 1, 1, kps::MulFunctor<AccT>>(
-      &sum_tmp[0][0][0], &gradptr[0], &srcptr[0], kps::MulFunctor<AccT>());
-  kps::Reduce<AccT,
-              kVItem,
-              kBatchSize,
-              1,
-              kps::AddFunctor<AccT>,
-              kps::details::ReduceMode::kLocalMode>(
-      &sum[0], &sum_tmp[0][0][0], kps::AddFunctor<AccT>(), true);
+  if (LogMode) {
+    kps::Reduce<AccT,
+                kVItem,
+                kBatchSize,
+                1,
+                kps::AddFunctor<AccT>,
+                kps::details::ReduceMode::kLocalMode>(
+        &sum[0], &grad_tmp[0][0][0], kps::AddFunctor<AccT>(), true);
+  } else {
+    kps::ElementwiseBinary<AccT, AccT, kStep, 1, 1, kps::MulFunctor<AccT>>(
+        &sum_tmp[0][0][0], &gradptr[0], &srcptr[0], kps::MulFunctor<AccT>());
+    kps::Reduce<AccT,
+                kVItem,
+                kBatchSize,
+                1,
+                kps::AddFunctor<AccT>,
+                kps::details::ReduceMode::kLocalMode>(
+        &sum[0], &sum_tmp[0][0][0], kps::AddFunctor<AccT>(), true);
+  }
   WarpReduceSum<AccT, kBatchSize, kWarpSize>(sum);
 
   // write result to global memory
@@ -453,10 +472,23 @@ __global__ void WarpSoftmaxBackward(T* dst,
     if (i >= local_batches) break;
     AccT* gradptr = reinterpret_cast<AccT*>(&grad_tmp[i][0][0]);
     AccT* srcptr = reinterpret_cast<AccT*>(&src_tmp[i][0][0]);
-    kps::ElementwiseUnary<AccT, AccT, kVItem, 1, 1, UnarySubFunctor<AccT>>(
-        &out[i][0][0], &gradptr[0], UnarySubFunctor<AccT>(sum[i]));
-    kps::ElementwiseBinary<AccT, T, kVItem, 1, 1, kps::MulFunctor<AccT>>(
-        &out_tmp[i][0][0], &srcptr[0], &out[i][0][0], kps::MulFunctor<AccT>());
+    if (LogMode) {
+      kps::ElementwiseUnary<AccT, AccT, kVItem, 1, 1, ExpMulFunctor<AccT>>(
+          &out[i][0][0], &srcptr[0], ExpMulFunctor<AccT>(sum[i]));
+      kps::ElementwiseBinary<AccT, T, kVItem, 1, 1, kps::SubFunctor<AccT>>(
+          &out_tmp[i][0][0],
+          &gradptr[0],
+          &out[i][0][0],
+          kps::SubFunctor<AccT>());
+    } else {
+      kps::ElementwiseUnary<AccT, AccT, kVItem, 1, 1, UnarySubFunctor<AccT>>(
+          &out[i][0][0], &gradptr[0], UnarySubFunctor<AccT>(sum[i]));
+      kps::ElementwiseBinary<AccT, T, kVItem, 1, 1, kps::MulFunctor<AccT>>(
+          &out_tmp[i][0][0],
+          &srcptr[0],
+          &out[i][0][0],
+          kps::MulFunctor<AccT>());
+    }
     VecT* dst_v = reinterpret_cast<VecT*>(&dst[(first_batch + i) * stride]);
     VecT* reg_v = reinterpret_cast<VecT*>(&out_tmp[i][0][0]);
     kps::WriteData<VecT, VecT, kLoopsV, 1, 1, true>(
@@ -577,8 +609,8 @@ static void GetBlockDim(int mid_dim, int low_dim, dim3* block) {
 #else
   constexpr int max_num_threads = 1024;
 #endif
-  int block_x = 1 << log2_ceil(low_dim);
-  int block_y = 1 << log2_ceil(mid_dim);
+  int block_x = 1 << Log2Ceil(low_dim);
+  int block_y = 1 << Log2Ceil(mid_dim);
   block->x = std::min(block_x, 32);
   block->y = std::min(block_y, static_cast<int>(max_num_threads / block->x));
   block->x = std::min(block_x, static_cast<int>(max_num_threads / block->y));
@@ -639,7 +671,8 @@ __global__ void NormalSoftmaxForward(
 
 template <typename T,
           typename AccT,
-          template <typename, typename> class Functor>
+          template <typename, typename> class Functor,
+          bool LogMode>
 __global__ void NormalSoftmaxBackward(T* input_grad,
                                       const T* output_grad,
                                       const T* output,
@@ -656,10 +689,17 @@ __global__ void NormalSoftmaxBackward(T* input_grad,
 
       // 1. reduce sum
       AccT sum = 0;
-      for (int mid_id = threadIdx.y; mid_id < mid_dim; mid_id += blockDim.y) {
-        int data_offset = grad_offset + mid_id * mid_stride;
-        sum += static_cast<AccT>(output_grad[data_offset]) *
-               static_cast<AccT>(output[data_offset]);
+      if (LogMode) {
+        for (int mid_id = threadIdx.y; mid_id < mid_dim; mid_id += blockDim.y) {
+          int data_offset = grad_offset + mid_id * mid_stride;
+          sum += static_cast<AccT>(output_grad[data_offset]);
+        }
+      } else {
+        for (int mid_id = threadIdx.y; mid_id < mid_dim; mid_id += blockDim.y) {
+          int data_offset = grad_offset + mid_id * mid_stride;
+          sum += static_cast<AccT>(output_grad[data_offset]) *
+                 static_cast<AccT>(output[data_offset]);
+        }
       }
       if (blockDim.y > 1) {
         kps::Reduce<AccT, 1, 1, 1, kps::AddFunctor<AccT>, kMode::kGlobalMode>(
@@ -715,10 +755,10 @@ void LaunchNormalSoftmaxBackward(const GPUContext& dev_ctx,
   dim3 grid, block;
   GetLaunchConfig(high_dim, mid_dim, low_dim, &grid, &block);
   if (LogMode) {
-    NormalSoftmaxBackward<
-        T,
-        AccT,
-        LogSoftmaxBackwardFunctor><<<grid, block, 0, dev_ctx.stream()>>>(
+    NormalSoftmaxBackward<T,
+                          AccT,
+                          LogSoftmaxBackwardFunctor,
+                          LogMode><<<grid, block, 0, dev_ctx.stream()>>>(
         input_grad_data,
         output_grad_data,
         output_data,
@@ -726,10 +766,10 @@ void LaunchNormalSoftmaxBackward(const GPUContext& dev_ctx,
         mid_dim,
         low_dim);
   } else {
-    NormalSoftmaxBackward<
-        T,
-        AccT,
-        SoftmaxBackwardFunctor><<<grid, block, 0, dev_ctx.stream()>>>(
+    NormalSoftmaxBackward<T,
+                          AccT,
+                          SoftmaxBackwardFunctor,
+                          LogMode><<<grid, block, 0, dev_ctx.stream()>>>(
         input_grad_data,
         output_grad_data,
         output_data,
@@ -739,6 +779,157 @@ void LaunchNormalSoftmaxBackward(const GPUContext& dev_ctx,
   }
 }
 
+static std::vector<int> GetSoftmaxTensorDims(const phi::DDim& dims,
+                                             const int axis) {
+  int dim = dims[axis];
+  int N = phi::funcs::SizeToAxis(axis, dims);
+  int D = phi::funcs::SizeOutAxis(axis, dims);
+  return {N, dim, D, 1};
+}
+
+template <typename T>
+void SoftmaxForwardCudnnKernel(const GPUContext& dev_ctx,
+                               const DenseTensor& x,
+                               const int axis,
+                               const bool log_mode,
+                               DenseTensor* out) {
+  auto* out_data = out->data<T>();
+
+  const int rank = x.dims().size();
+  std::vector<int> tensor_dims = GetSoftmaxTensorDims(x.dims(), axis);
+
+  auto handle = dev_ctx.cudnn_handle();
+  GPUDNNDataLayout layout = GPUDNNDataLayout::kNCHW;
+
+  ScopedTensorDescriptor scoped_desc;
+#ifdef PADDLE_WITH_HIP
+  miopenTensorDescriptor_t desc =
+      scoped_desc.descriptor<T>(layout, tensor_dims);
+  auto mode = axis == rank - 1 ? MIOPEN_SOFTMAX_MODE_INSTANCE
+                               : MIOPEN_SOFTMAX_MODE_CHANNEL;
+  auto algo = log_mode ? MIOPEN_SOFTMAX_LOG : MIOPEN_SOFTMAX_ACCURATE;
+  PADDLE_ENFORCE_GPU_SUCCESS(paddle::platform::dynload::miopenSoftmaxForward_V2(
+      handle,
+      paddle::platform::CudnnDataType<T>::kOne(),
+      desc,
+      x.data<T>(),
+      paddle::platform::CudnnDataType<T>::kZero(),
+      desc,
+      out_data,
+      algo,
+      mode));
+#else
+  cudnnTensorDescriptor_t desc = scoped_desc.descriptor<T>(layout, tensor_dims);
+  auto mode = axis == rank - 1 ? CUDNN_SOFTMAX_MODE_INSTANCE
+                               : CUDNN_SOFTMAX_MODE_CHANNEL;
+  auto algo = log_mode ? CUDNN_SOFTMAX_LOG : CUDNN_SOFTMAX_ACCURATE;
+  PADDLE_ENFORCE_GPU_SUCCESS(paddle::platform::dynload::cudnnSoftmaxForward(
+      handle,
+      algo,
+      mode,
+      paddle::platform::CudnnDataType<T>::kOne(),
+      desc,
+      x.data<T>(),
+      paddle::platform::CudnnDataType<T>::kZero(),
+      desc,
+      out_data));
+#endif
+}
+
+template <typename T>
+void SoftmaxBackwardCudnnKernel(const GPUContext& dev_ctx,
+                                const DenseTensor& out,
+                                const DenseTensor& dout,
+                                const int axis,
+                                const bool log_mode,
+                                DenseTensor* dx) {
+  auto* dx_data = dx->data<T>();
+
+  int rank = out.dims().size();
+  std::vector<int> tensor_dims = GetSoftmaxTensorDims(out.dims(), axis);
+
+  auto handle = dev_ctx.cudnn_handle();
+  GPUDNNDataLayout layout = GPUDNNDataLayout::kNCHW;
+
+  ScopedTensorDescriptor scoped_desc;
+#ifdef PADDLE_WITH_HIP
+  miopenTensorDescriptor_t desc =
+      scoped_desc.descriptor<T>(layout, tensor_dims);
+  auto mode = axis == rank - 1 ? MIOPEN_SOFTMAX_MODE_INSTANCE
+                               : MIOPEN_SOFTMAX_MODE_CHANNEL;
+  auto algo = log_mode ? MIOPEN_SOFTMAX_LOG : MIOPEN_SOFTMAX_ACCURATE;
+  PADDLE_ENFORCE_GPU_SUCCESS(
+      paddle::platform::dynload::miopenSoftmaxBackward_V2(
+          handle,
+          paddle::platform::CudnnDataType<T>::kOne(),
+          desc,
+          out.data<T>(),
+          desc,
+          dout.data<T>(),
+          paddle::platform::CudnnDataType<T>::kZero(),
+          desc,
+          dx_data,
+          algo,
+          mode));
+#else
+  cudnnTensorDescriptor_t desc = scoped_desc.descriptor<T>(layout, tensor_dims);
+  auto mode = axis == rank - 1 ? CUDNN_SOFTMAX_MODE_INSTANCE
+                               : CUDNN_SOFTMAX_MODE_CHANNEL;
+  auto algo = log_mode ? CUDNN_SOFTMAX_LOG : CUDNN_SOFTMAX_ACCURATE;
+  PADDLE_ENFORCE_GPU_SUCCESS(paddle::platform::dynload::cudnnSoftmaxBackward(
+      handle,
+      algo,
+      mode,
+      paddle::platform::CudnnDataType<T>::kOne(),
+      desc,
+      out.data<T>(),
+      desc,
+      dout.data<T>(),
+      paddle::platform::CudnnDataType<T>::kZero(),
+      desc,
+      dx_data));
+#endif
+}
+
+template <typename T>
+static bool CanUseCudnnSoftmax(const GPUContext& dev_ctx) {
+  if (dev_ctx.cudnn_handle() != nullptr) {
+    if (std::is_same<T, phi::dtype::bfloat16>::value) {
+#if CUDNN_VERSION < 8100
+      return false;
+#endif
+    }
+    return true;
+  }
+  return false;
+}
+
+#if CUDNN_VERSION < 8100
+template <>
+inline void SoftmaxForwardCudnnKernel<phi::dtype::bfloat16>(
+    const GPUContext& dev_ctx,
+    const DenseTensor& x,
+    const int axis,
+    const bool log_mode,
+    DenseTensor* out) {
+  PADDLE_THROW(errors::Unavailable(
+      "This kernel is not supported when the dtype is bf16 and CUDNN_VERSION < "
+      "8100."));
+}
+template <>
+inline void SoftmaxBackwardCudnnKernel<phi::dtype::bfloat16>(
+    const GPUContext& dev_ctx,
+    const DenseTensor& out,
+    const DenseTensor& dout,
+    const int axis,
+    const bool log_mode,
+    DenseTensor* dx) {
+  PADDLE_THROW(errors::Unavailable(
+      "This kernel is not supported when the dtype is bf16 and CUDNN_VERSION < "
+      "8100."));
+}
+#endif
+
 template <typename T, bool LogMode = false>
 void SoftmaxForwardCUDAKernelDriver(const GPUContext& dev_ctx,
                                     const DenseTensor& x,
@@ -746,29 +937,29 @@ void SoftmaxForwardCUDAKernelDriver(const GPUContext& dev_ctx,
                                     DenseTensor* out) {
   auto* out_data = out->data<T>();
 
-  auto dims = x.dims();
-  const int rank = dims.size();
-  const int axis = phi::funcs::CanonicalAxis(input_axis, rank);
-  const int dim = dims[axis];
-  const int N = phi::funcs::SizeToAxis(axis, dims);
-  const int D = phi::funcs::SizeOutAxis(axis, dims);
+  int rank = x.dims().size();
+  int axis = phi::funcs::CanonicalAxis(input_axis, rank);
+  std::vector<int> tensor_dims = GetSoftmaxTensorDims(x.dims(), axis);
+  int N = tensor_dims[0];
+  int dim = tensor_dims[1];
+  int D = tensor_dims[2];
 
   constexpr int max_dim = 512;
-  constexpr int warps_per_block = 4;
 
-  if (D == 1 && dim <= max_dim && sizeof(T) <= 4) {
-    const int kDimLog2 = static_cast<int>(log2_ceil(dim));
-    const int kDimCeil = 1 << kDimLog2;
-    int kWarpSize = (kDimCeil < 32) ? kDimCeil : 32;
-    int batches_per_warp = (kDimCeil <= 32) ? 2 : 1;
+  if (D == 1 &&
+      (!CanUseCudnnSoftmax<T>(dev_ctx) || (dim <= max_dim && sizeof(T) <= 4))) {
+    int dim_log2 = static_cast<int>(Log2Ceil(dim));
+    int dim_ceil = 1 << dim_log2;
+    int warp_size = (dim_ceil < 32) ? dim_ceil : 32;
+    int batches_per_warp = (dim_ceil <= 32) ? 2 : 1;
 
     // use 128 threads per block to maximimize gpu utilization
     constexpr int threads_per_block = 128;
 
-    int warps_per_block = (threads_per_block / kWarpSize);
+    int warps_per_block = (threads_per_block / warp_size);
     int batches_per_block = warps_per_block * batches_per_warp;
     int blocks = (N + batches_per_block - 1) / batches_per_block;
-    dim3 threads(kWarpSize, warps_per_block, 1);
+    dim3 threads(warp_size, warps_per_block, 1);
 
     // vectorization read/write
     using T4 = typename VecT4<T>::Type;
@@ -783,7 +974,7 @@ void SoftmaxForwardCUDAKernelDriver(const GPUContext& dev_ctx,
                                                N,
                                                dim,
                                                dim,
-                                               kDimLog2);
+                                               dim_log2);
     } else if (dim % 2 == 0) {
       SwitchWarpSoftmaxForward<T, T2, LogMode>(blocks,
                                                threads,
@@ -793,7 +984,7 @@ void SoftmaxForwardCUDAKernelDriver(const GPUContext& dev_ctx,
                                                N,
                                                dim,
                                                dim,
-                                               kDimLog2);
+                                               dim_log2);
     } else {
       SwitchWarpSoftmaxForward<T, T, LogMode>(blocks,
                                               threads,
@@ -803,78 +994,13 @@ void SoftmaxForwardCUDAKernelDriver(const GPUContext& dev_ctx,
                                               N,
                                               dim,
                                               dim,
-                                              kDimLog2);
+                                              dim_log2);
     }
   } else if (D > 1) {
     LaunchNormalSoftmaxForward<T, LogMode>(
         dev_ctx, out_data, x.data<T>(), N, dim, D);
   } else {
-    ScopedTensorDescriptor desc;
-    std::vector<int> tensor_dims = {N, dim, D, 1};
-    GPUDNNDataLayout layout = GPUDNNDataLayout::kNCHW;
-#ifdef PADDLE_WITH_HIP
-    miopenTensorDescriptor_t desc_ = desc.descriptor<T>(layout, tensor_dims);
-#else
-    cudnnTensorDescriptor_t desc_ = desc.descriptor<T>(layout, tensor_dims);
-#endif
-
-    auto handle = dev_ctx.cudnn_handle();
-
-#ifdef PADDLE_WITH_HIP
-    auto mode = axis == rank - 1 ? MIOPEN_SOFTMAX_MODE_INSTANCE
-                                 : MIOPEN_SOFTMAX_MODE_CHANNEL;
-    if (LogMode) {
-      PADDLE_ENFORCE_GPU_SUCCESS(
-          paddle::platform::dynload::miopenSoftmaxForward_V2(
-              handle,
-              paddle::platform::CudnnDataType<T>::kOne(),
-              desc_,
-              x.data<T>(),
-              paddle::platform::CudnnDataType<T>::kZero(),
-              desc_,
-              out_data,
-              MIOPEN_SOFTMAX_LOG,
-              mode));
-    } else {
-      PADDLE_ENFORCE_GPU_SUCCESS(
-          paddle::platform::dynload::miopenSoftmaxForward_V2(
-              handle,
-              paddle::platform::CudnnDataType<T>::kOne(),
-              desc_,
-              x.data<T>(),
-              paddle::platform::CudnnDataType<T>::kZero(),
-              desc_,
-              out_data,
-              MIOPEN_SOFTMAX_ACCURATE,
-              mode));
-    }
-#else
-    auto mode = axis == rank - 1 ? CUDNN_SOFTMAX_MODE_INSTANCE
-                                 : CUDNN_SOFTMAX_MODE_CHANNEL;
-    if (LogMode) {
-      PADDLE_ENFORCE_GPU_SUCCESS(paddle::platform::dynload::cudnnSoftmaxForward(
-          handle,
-          CUDNN_SOFTMAX_LOG,
-          mode,
-          paddle::platform::CudnnDataType<T>::kOne(),
-          desc_,
-          x.data<T>(),
-          paddle::platform::CudnnDataType<T>::kZero(),
-          desc_,
-          out_data));
-    } else {
-      PADDLE_ENFORCE_GPU_SUCCESS(paddle::platform::dynload::cudnnSoftmaxForward(
-          handle,
-          CUDNN_SOFTMAX_ACCURATE,
-          mode,
-          paddle::platform::CudnnDataType<T>::kOne(),
-          desc_,
-          x.data<T>(),
-          paddle::platform::CudnnDataType<T>::kZero(),
-          desc_,
-          out_data));
-    }
-#endif
+    SoftmaxForwardCudnnKernel<T>(dev_ctx, x, axis, LogMode, out);
   }
 }
 
@@ -886,27 +1012,28 @@ void SoftmaxBackwardCUDAKernelDriver(const GPUContext& dev_ctx,
                                      DenseTensor* dx) {
   auto* dx_data = dx->data<T>();
 
-  auto dims = out.dims();
-  const int rank = dims.size();
-  const int axis = phi::funcs::CanonicalAxis(input_axis, rank);
-  const int dim = dims[axis];
-  const int N = phi::funcs::SizeToAxis(axis, dims);
-  const int D = phi::funcs::SizeOutAxis(axis, dims);
+  int rank = out.dims().size();
+  int axis = phi::funcs::CanonicalAxis(input_axis, rank);
+  std::vector<int> tensor_dims = GetSoftmaxTensorDims(out.dims(), axis);
+  int N = tensor_dims[0];
+  int dim = tensor_dims[1];
+  int D = tensor_dims[2];
 
   constexpr int max_dim = 512;
-  constexpr int warps_per_block = 4;
 
-  if (D == 1 && dim <= max_dim && sizeof(T) <= 4) {
-    const int kDimLog2 = log2_ceil(dim);
-    const int kDimCeil = 1 << kDimLog2;
-    int kWarpSize = (kDimCeil < 32) ? kDimCeil : 32;
-    int batches_per_warp = (kDimCeil <= 128) ? 2 : 1;
+  if (D == 1 &&
+      (!CanUseCudnnSoftmax<T>(dev_ctx) || (dim <= max_dim && sizeof(T) <= 4))) {
+    int dim_log2 = Log2Ceil(dim);
+    int dim_ceil = 1 << dim_log2;
+    int warp_size = (dim_ceil < 32) ? dim_ceil : 32;
+    int batches_per_warp = (dim_ceil <= 128) ? 2 : 1;
+
     constexpr int threads_per_block = 128;
 
-    int warps_per_block = (threads_per_block / kWarpSize);
+    int warps_per_block = (threads_per_block / warp_size);
     int batches_per_block = warps_per_block * batches_per_warp;
     int blocks = (N + batches_per_block - 1) / batches_per_block;
-    dim3 threads(kWarpSize, warps_per_block, 1);
+    dim3 threads(warp_size, warps_per_block, 1);
 
     // vectorization read/write
     using T4 = typename VecT4<T>::Type;
@@ -921,7 +1048,7 @@ void SoftmaxBackwardCUDAKernelDriver(const GPUContext& dev_ctx,
                                                 N,
                                                 dim,
                                                 dim,
-                                                kDimLog2);
+                                                dim_log2);
     } else if (dim % 2 == 0) {
       SwitchWarpSoftmaxBackward<T, T2, LogMode>(blocks,
                                                 threads,
@@ -932,7 +1059,7 @@ void SoftmaxBackwardCUDAKernelDriver(const GPUContext& dev_ctx,
                                                 N,
                                                 dim,
                                                 dim,
-                                                kDimLog2);
+                                                dim_log2);
     } else {
       SwitchWarpSoftmaxBackward<T, T, LogMode>(blocks,
                                                threads,
@@ -943,88 +1070,13 @@ void SoftmaxBackwardCUDAKernelDriver(const GPUContext& dev_ctx,
                                                N,
                                                dim,
                                                dim,
-                                               kDimLog2);
+                                               dim_log2);
     }
   } else if (D > 1) {
     LaunchNormalSoftmaxBackward<T, LogMode>(
         dev_ctx, dx_data, dout.data<T>(), out.data<T>(), N, dim, D);
   } else {
-    ScopedTensorDescriptor desc;
-    std::vector<int> tensor_dims = {N, dim, D, 1};
-    GPUDNNDataLayout layout = GPUDNNDataLayout::kNCHW;
-#ifdef PADDLE_WITH_HIP
-    miopenTensorDescriptor_t desc_ = desc.descriptor<T>(layout, tensor_dims);
-#else
-    cudnnTensorDescriptor_t desc_ = desc.descriptor<T>(layout, tensor_dims);
-#endif
-
-    auto handle = dev_ctx.cudnn_handle();
-
-#ifdef PADDLE_WITH_HIP
-    auto mode = axis == rank - 1 ? MIOPEN_SOFTMAX_MODE_INSTANCE
-                                 : MIOPEN_SOFTMAX_MODE_CHANNEL;
-    if (LogMode) {
-      PADDLE_ENFORCE_GPU_SUCCESS(
-          paddle::platform::dynload::miopenSoftmaxBackward_V2(
-              handle,
-              paddle::platform::CudnnDataType<T>::kOne(),
-              desc_,
-              out.data<T>(),
-              desc_,
-              dout.data<T>(),
-              paddle::platform::CudnnDataType<T>::kZero(),
-              desc_,
-              dx_data,
-              MIOPEN_SOFTMAX_LOG,
-              mode));
-    } else {
-      PADDLE_ENFORCE_GPU_SUCCESS(
-          paddle::platform::dynload::miopenSoftmaxBackward_V2(
-              handle,
-              paddle::platform::CudnnDataType<T>::kOne(),
-              desc_,
-              out.data<T>(),
-              desc_,
-              dout.data<T>(),
-              paddle::platform::CudnnDataType<T>::kZero(),
-              desc_,
-              dx_data,
-              MIOPEN_SOFTMAX_ACCURATE,
-              mode));
-    }
-#else
-    auto mode = axis == rank - 1 ? CUDNN_SOFTMAX_MODE_INSTANCE
-                                 : CUDNN_SOFTMAX_MODE_CHANNEL;
-    if (LogMode) {
-      PADDLE_ENFORCE_GPU_SUCCESS(
-          paddle::platform::dynload::cudnnSoftmaxBackward(
-              handle,
-              CUDNN_SOFTMAX_LOG,
-              mode,
-              paddle::platform::CudnnDataType<T>::kOne(),
-              desc_,
-              out.data<T>(),
-              desc_,
-              dout.data<T>(),
-              paddle::platform::CudnnDataType<T>::kZero(),
-              desc_,
-              dx_data));
-    } else {
-      PADDLE_ENFORCE_GPU_SUCCESS(
-          paddle::platform::dynload::cudnnSoftmaxBackward(
-              handle,
-              CUDNN_SOFTMAX_ACCURATE,
-              mode,
-              paddle::platform::CudnnDataType<T>::kOne(),
-              desc_,
-              out.data<T>(),
-              desc_,
-              dout.data<T>(),
-              paddle::platform::CudnnDataType<T>::kZero(),
-              desc_,
-              dx_data));
-    }
-#endif
+    SoftmaxBackwardCudnnKernel<T>(dev_ctx, out, dout, axis, LogMode, dx);
   }
 }
 
diff --git a/paddle/phi/kernels/gpudnn/softmax_grad_kernel_gpudnn.cu b/paddle/phi/kernels/gpudnn/softmax_grad_kernel.cu
similarity index 100%
rename from paddle/phi/kernels/gpudnn/softmax_grad_kernel_gpudnn.cu
rename to paddle/phi/kernels/gpudnn/softmax_grad_kernel.cu
diff --git a/paddle/phi/kernels/gpudnn/softmax_kernel_gpudnn.cu b/paddle/phi/kernels/gpudnn/softmax_kernel.cu
similarity index 83%
rename from paddle/phi/kernels/gpudnn/softmax_kernel_gpudnn.cu
rename to paddle/phi/kernels/gpudnn/softmax_kernel.cu
index 7685c7dbb6894b4e640ea4b63010c4d22fc5e18f..37175c427ffe142c31b41c8356d160d203fd6d73 100644
--- a/paddle/phi/kernels/gpudnn/softmax_kernel_gpudnn.cu
+++ b/paddle/phi/kernels/gpudnn/softmax_kernel.cu
@@ -21,10 +21,10 @@ limitations under the License. */
 namespace phi {
 
 template <typename T, typename Context>
-void SoftmaxRawGPUDNNKernel(const Context& dev_ctx,
-                            const DenseTensor& x,
-                            int axis,
-                            DenseTensor* out) {
+void SoftmaxGPUDNNKernel(const Context& dev_ctx,
+                         const DenseTensor& x,
+                         int axis,
+                         DenseTensor* out) {
   dev_ctx.template Alloc<T>(out);
   SoftmaxForwardCUDAKernelDriver<T>(dev_ctx, x, axis, out);
 }
@@ -35,7 +35,7 @@ void SoftmaxRawGPUDNNKernel(const Context& dev_ctx,
 PD_REGISTER_KERNEL(softmax,
                    GPUDNN,
                    ALL_LAYOUT,
-                   phi::SoftmaxRawGPUDNNKernel,
+                   phi::SoftmaxGPUDNNKernel,
                    float,
                    phi::dtype::float16,
                    phi::dtype::bfloat16) {}
@@ -44,7 +44,7 @@ PD_REGISTER_KERNEL(softmax,
 PD_REGISTER_KERNEL(softmax,
                    GPUDNN,
                    ALL_LAYOUT,
-                   phi::SoftmaxRawGPUDNNKernel,
+                   phi::SoftmaxGPUDNNKernel,
                    float,
                    double,
                    phi::dtype::float16,
@@ -53,7 +53,7 @@ PD_REGISTER_KERNEL(softmax,
 PD_REGISTER_KERNEL(softmax,
                    GPUDNN,
                    ALL_LAYOUT,
-                   phi::SoftmaxRawGPUDNNKernel,
+                   phi::SoftmaxGPUDNNKernel,
                    float,
                    double,
                    phi::dtype::float16) {}
diff --git a/paddle/phi/kernels/graph_send_recv_grad_kernel.h b/paddle/phi/kernels/graph_send_recv_grad_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..d163e6e278a0755c63be3192ae749ddbac5262d4
--- /dev/null
+++ b/paddle/phi/kernels/graph_send_recv_grad_kernel.h
@@ -0,0 +1,33 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <string>
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/utils/optional.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void GraphSendRecvGradKernel(const Context& ctx,
+                             const DenseTensor& out_grad,
+                             paddle::optional<const DenseTensor&> x,
+                             paddle::optional<const DenseTensor&> out,
+                             const DenseTensor& src_index,
+                             const DenseTensor& dst_index,
+                             paddle::optional<const DenseTensor&> dst_count,
+                             const std::string& pool_type,
+                             DenseTensor* x_grad);
+}  // namespace phi
diff --git a/paddle/phi/kernels/graph_send_recv_kernel.h b/paddle/phi/kernels/graph_send_recv_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..95dbdc4443ad003c3b9e7e584976cbe5c0503e20
--- /dev/null
+++ b/paddle/phi/kernels/graph_send_recv_kernel.h
@@ -0,0 +1,31 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <string>
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void GraphSendRecvKernel(const Context& ctx,
+                         const DenseTensor& x,
+                         const DenseTensor& src_index,
+                         const DenseTensor& dst_index,
+                         const std::string& pool_type,
+                         DenseTensor* out,
+                         DenseTensor* dst_count);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/impl/activation_grad_impl.h b/paddle/phi/kernels/impl/activation_grad_impl.h
new file mode 100644
index 0000000000000000000000000000000000000000..a48a6226f23f8d9976dc86e59b051828b1d71b21
--- /dev/null
+++ b/paddle/phi/kernels/impl/activation_grad_impl.h
@@ -0,0 +1,205 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/kernels/funcs/activation_functor.h"
+
+#include "paddle/fluid/platform/device_context.h"
+
+namespace phi {
+
+template <typename T, typename Context, typename Functor>
+void ActivationGradImpl(const Context& dev_ctx,
+                        const DenseTensor* X,
+                        const DenseTensor* Out,
+                        const DenseTensor* dOut,
+                        DenseTensor* dX,
+                        const Functor& functor) {
+  if (static_cast<int>(Functor::FwdDeps()) &
+      static_cast<int>(funcs::ActBwdOpFwdDeps::kDepOut)) {
+    PADDLE_ENFORCE_NOT_NULL(
+        Out, errors::NotFound("The input DenseTensor Out can not be nullptr"));
+  }
+  PADDLE_ENFORCE_NOT_NULL(
+      dOut, errors::NotFound("The input DenseTensor dOut can not be nullptr"));
+  PADDLE_ENFORCE_NOT_NULL(
+      dX, errors::NotFound("The output DenseTensor dX can not be nullptr"));
+  if (!Out) {
+    Out = dOut;  // fake out
+  }
+  if (static_cast<int>(Functor::FwdDeps()) &
+      static_cast<int>(funcs::ActBwdOpFwdDeps::kDepX)) {
+    PADDLE_ENFORCE_NOT_NULL(
+        X, errors::NotFound("The input DenseTensor X can not be nullptr"));
+  } else {
+    VLOG(10) << "Inplace activation of Op Functor: " << typeid(Functor).name();
+    X = dX;
+  }
+
+  dev_ctx.template Alloc<T>(dX);
+  auto dout = phi::EigenVector<T>::Flatten(
+      GET_DATA_SAFELY(dOut, "Input", "Out@GRAD", "ActivationGrad"));
+  auto out = phi::EigenVector<T>::Flatten(
+      GET_DATA_SAFELY(Out, "Input", "Out", "ActivationGrad"));
+  auto dx = phi::EigenVector<T>::Flatten(
+      GET_DATA_SAFELY(dX, "Input", "X@GRAD", "ActivationGrad"));
+  auto x = phi::EigenVector<T>::Flatten(
+      GET_DATA_SAFELY(X, "Input", "X", "ActivationGrad"));
+  auto* place = dev_ctx.eigen_device();
+  // use 32bit index to speed up computation
+  bool use_32bit_index = out.size() < Eigen::NumTraits<int>::highest();
+  bool is_gpu_place = paddle::platform::is_gpu_place(dev_ctx.GetPlace());
+  if (use_32bit_index && is_gpu_place) {
+    functor(*place,
+            To32BitIndex(x),
+            To32BitIndex(out),
+            To32BitIndex(dout),
+            To32BitIndex(dx));
+  } else {
+    functor(*place, x, out, dout, dx);
+  }
+}
+
+template <typename T, typename Context, typename Functor>
+void ActivationDoubleGradImpl(const Context& dev_ctx,
+                              const DenseTensor* X,
+                              const DenseTensor* Out,
+                              const DenseTensor* ddX,
+                              DenseTensor* dX,
+                              DenseTensor* dOut,
+                              DenseTensor* ddOut,
+                              const Functor& functor) {
+  if (static_cast<int>(Functor::FwdDeps()) &
+      static_cast<int>(funcs::ActBwdOpFwdDeps::kDepX)) {
+    PADDLE_ENFORCE_NOT_NULL(
+        X, errors::NotFound("The input DenseTensor X can not be nullptr"));
+  } else {
+    VLOG(10) << "Inplace activation of Op Functor: " << typeid(Functor).name();
+    X = ddX;
+  }
+  if (static_cast<int>(Functor::FwdDeps()) &
+      static_cast<int>(funcs::ActBwdOpFwdDeps::kDepOut)) {
+    PADDLE_ENFORCE_NOT_NULL(
+        Out, errors::NotFound("The input DenseTensor Out can not be nullptr"));
+  } else {
+    VLOG(10) << "Inplace activation of Op Functor: " << typeid(Functor).name();
+    Out = ddX;
+  }
+
+  if (ddOut) {
+    dev_ctx.template Alloc<T>(ddOut);
+  }
+  if (dOut) {
+    dev_ctx.template Alloc<T>(dOut);
+  }
+  if (dX) {
+    dX->Resize(Out->dims());
+    dev_ctx.template Alloc<T>(dX);
+  }
+
+  functor(dev_ctx, X, Out, ddX, ddOut, dOut, dX);
+}
+
+template <typename T, typename Context>
+void ReluDoubleGradKernel(const Context& dev_ctx,
+                          const DenseTensor& out,
+                          const DenseTensor& ddx,
+                          DenseTensor* ddout) {
+  funcs::ReluGradGradFunctor<T> relu_double_grad_functor;
+  ActivationDoubleGradImpl<T, Context, funcs::ReluGradGradFunctor<T>>(
+      dev_ctx,
+      nullptr,
+      &out,
+      &ddx,
+      nullptr,
+      nullptr,
+      ddout,
+      relu_double_grad_functor);
+}
+
+template <typename T, typename Context>
+void LeakyReluDoubleGradKernel(const Context& dev_ctx,
+                               const DenseTensor& x,
+                               const DenseTensor& ddx,
+                               float alpha,
+                               DenseTensor* ddout) {
+  funcs::LeakyReluGradGradFunctor<T> leaky_relu_double_grad_functor;
+  leaky_relu_double_grad_functor.alpha = alpha;
+  ActivationDoubleGradImpl<T, Context, funcs::LeakyReluGradGradFunctor<T>>(
+      dev_ctx,
+      &x,
+      nullptr,
+      &ddx,
+      nullptr,
+      nullptr,
+      ddout,
+      leaky_relu_double_grad_functor);
+}
+
+template <typename T, typename Context>
+void TanhDoubleGradKernel(const Context& dev_ctx,
+                          const DenseTensor& out,
+                          const DenseTensor& ddx,
+                          const DenseTensor& dout,
+                          DenseTensor* dout_new,
+                          DenseTensor* ddout) {
+  if (dout_new) {
+    dout_new->Resize(out.dims());
+    dev_ctx.template Alloc<T>(dout_new);
+  }
+  if (ddout) {
+    ddout->Resize(out.dims());
+    dev_ctx.template Alloc<T>(ddout);
+  }
+  funcs::TanhGradGradFunctor<T> functor;
+  functor(dev_ctx, &out, &ddx, &dout, dout_new, ddout);
+}
+
+template <typename T, typename Context>
+void TanhTripleGradKernel(const Context& dev_ctx,
+                          const DenseTensor& out,
+                          const DenseTensor& ddx,
+                          const DenseTensor& dout,
+                          const DenseTensor& d_ddout,
+                          const DenseTensor& d_dout_new,
+                          DenseTensor* d_out_new,
+                          DenseTensor* d_dout,
+                          DenseTensor* d_ddx) {
+  if (d_dout) {
+    d_dout->Resize(out.dims());
+    dev_ctx.template Alloc<T>(d_dout);
+  }
+  if (d_out_new) {
+    d_dout->Resize(out.dims());
+    dev_ctx.template Alloc<T>(d_out_new);
+  }
+  if (d_ddx) {
+    d_dout->Resize(ddx.dims());
+    dev_ctx.template Alloc<T>(d_ddx);
+  }
+  funcs::TanhTripleGradFunctor<T> functor;
+  functor(dev_ctx,
+          &out,
+          &ddx,
+          &dout,
+          &d_ddout,
+          &d_dout_new,  // input
+          d_dout,
+          d_out_new,
+          d_ddx);  // output
+}
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/impl/activation_impl.h b/paddle/phi/kernels/impl/activation_impl.h
new file mode 100644
index 0000000000000000000000000000000000000000..ca3debd394a1e518b3a37d4d08e8a7f47a5670f2
--- /dev/null
+++ b/paddle/phi/kernels/impl/activation_impl.h
@@ -0,0 +1,50 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/kernels/funcs/activation_functor.h"
+#include "paddle/phi/kernels/funcs/blas/blas.h"
+
+#include "paddle/fluid/platform/device_context.h"
+
+namespace phi {
+
+#define ToString(x) #x
+
+template <typename T, typename Context, typename Functor>
+void ActivationImpl(const Context& dev_ctx,
+                    const DenseTensor& X,
+                    DenseTensor* Out,
+                    const Functor& functor) {
+  PADDLE_ENFORCE_NOT_NULL(Out,
+                          errors::NotFound("Output Out should not be nullptr"));
+  dev_ctx.template Alloc<T>(Out);
+  auto x = phi::EigenVector<T>::Flatten(
+      GET_DATA_SAFELY(&X, "Input", "X", "Activation"));
+  auto out = phi::EigenVector<T>::Flatten(
+      GET_DATA_SAFELY(Out, "Output", "Out", "Activation"));
+  auto* place = dev_ctx.eigen_device();
+  // use 32bit index to speed up computation
+  bool use_32bit_index = out.size() < Eigen::NumTraits<int>::highest();
+  bool is_gpu_place = paddle::platform::is_gpu_place(dev_ctx.GetPlace());
+  if (use_32bit_index && is_gpu_place) {
+    functor(*place, To32BitIndex(x), To32BitIndex(out));
+  } else {
+    functor(*place, x, out);
+  }
+}
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/impl/adadelta_kernel_impl.h b/paddle/phi/kernels/impl/adadelta_kernel_impl.h
new file mode 100644
index 0000000000000000000000000000000000000000..3fbdf435bab39fbded100656a591fc76ea2ca69b
--- /dev/null
+++ b/paddle/phi/kernels/impl/adadelta_kernel_impl.h
@@ -0,0 +1,65 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/kernels/adadelta_kernel.h"
+#include "paddle/phi/kernels/funcs/eigen/common.h"
+#include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void AdadeltaKernel(const Context& dev_ctx,
+                    const DenseTensor& param,
+                    const DenseTensor& grad,
+                    const DenseTensor& avg_squared_grad,
+                    const DenseTensor& avg_squared_update,
+                    float rho,
+                    float epsilon,
+                    DenseTensor* param_out,
+                    DenseTensor* avg_squared_grad_out,
+                    DenseTensor* avg_squared_update_out) {
+  dev_ctx.template Alloc<T>(param_out);
+  dev_ctx.template Alloc<T>(avg_squared_grad_out);
+  dev_ctx.template Alloc<T>(avg_squared_update_out);
+
+  T rho_ = static_cast<T>(rho);
+  T epsilon_ = static_cast<T>(epsilon);
+
+  auto eigen_param = EigenVector<T>::Flatten(param);
+  auto eigen_grad = EigenVector<T>::Flatten(grad);
+  // Squared gradient accumulator
+  auto eigen_avg_squared_grad = EigenVector<T>::Flatten(avg_squared_grad);
+  // Squared updates accumulator
+  auto eigen_avg_squared_update = EigenVector<T>::Flatten(avg_squared_update);
+  auto eigen_param_out = EigenVector<T>::Flatten(*param_out);
+  auto eigen_avg_squared_grad_out =
+      EigenVector<T>::Flatten(*avg_squared_grad_out);
+  auto eigen_avg_squared_update_out =
+      EigenVector<T>::Flatten(*avg_squared_update_out);
+  auto& place = *dev_ctx.eigen_device();
+
+  eigen_avg_squared_grad_out.device(place) =
+      rho_ * eigen_avg_squared_grad + (1 - rho_) * eigen_grad.square();
+  auto update = -((eigen_avg_squared_update + epsilon_) /
+                  (eigen_avg_squared_grad_out + epsilon_))
+                     .sqrt() *
+                eigen_grad;
+  eigen_avg_squared_update_out.device(place) =
+      rho_ * eigen_avg_squared_update + (1 - rho_) * update.square();
+  eigen_param_out.device(place) = eigen_param + update;
+}
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/impl/adamax_kernel_impl.h b/paddle/phi/kernels/impl/adamax_kernel_impl.h
new file mode 100644
index 0000000000000000000000000000000000000000..bff553319a2b98814d31a3f992b98541498de149
--- /dev/null
+++ b/paddle/phi/kernels/impl/adamax_kernel_impl.h
@@ -0,0 +1,69 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/kernels/adamax_kernel.h"
+#include "paddle/phi/kernels/funcs/eigen/common.h"
+#include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void AdamaxKernel(const Context& dev_ctx,
+                  const DenseTensor& param,
+                  const DenseTensor& grad,
+                  const DenseTensor& learning_rate,
+                  const DenseTensor& moment,
+                  const DenseTensor& inf_norm,
+                  const DenseTensor& beta1_pow,
+                  float beta1,
+                  float beta2,
+                  float epsilon,
+                  DenseTensor* param_out,
+                  DenseTensor* moment_out,
+                  DenseTensor* inf_norm_out) {
+  dev_ctx.template Alloc<T>(param_out);
+  dev_ctx.template Alloc<T>(moment_out);
+  dev_ctx.template Alloc<T>(inf_norm_out);
+
+  T beta1_ = static_cast<T>(beta1);
+  T beta2_ = static_cast<T>(beta2);
+  T epsilon_ = static_cast<T>(epsilon);
+
+  auto eigen_param = EigenVector<T>::Flatten(param);
+  auto eigen_grad = EigenVector<T>::Flatten(grad);
+  auto eigen_moment = EigenVector<T>::Flatten(moment);
+  auto eigen_inf_norm = EigenVector<T>::Flatten(inf_norm);
+  auto eigen_lr = EigenVector<T>::Flatten(learning_rate);
+  auto eigen_beta1_pow = EigenVector<T>::Flatten(beta1_pow);
+
+  auto eigen_param_out = EigenVector<T>::Flatten(*param_out);
+  auto eigen_moment_out = EigenVector<T>::Flatten(*moment_out);
+  auto eigen_inf_norm_out = EigenVector<T>::Flatten(*inf_norm_out);
+
+  auto& place = *dev_ctx.eigen_device();
+
+  eigen_moment_out.device(place) =
+      beta1_ * eigen_moment + (1 - beta1_) * eigen_grad;
+  eigen_inf_norm_out.device(place) =
+      eigen_grad.abs().cwiseMax((beta2_ * eigen_inf_norm) + epsilon_);
+  auto lr_t = eigen_lr / (1 - eigen_beta1_pow);
+  Eigen::DSizes<int, 1> m_dsize(moment_out->numel());
+  eigen_param_out.device(place) =
+      eigen_param -
+      lr_t.broadcast(m_dsize) * (eigen_moment_out / eigen_inf_norm_out);
+}
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/impl/broadcast_tensors_kernel_impl.h b/paddle/phi/kernels/impl/broadcast_tensors_kernel_impl.h
index eb01b83377cb62c7dc6147cd57edcd3c9c047f78..d7167704a4824b74572bc0e0dd53d7a5c3dbd8c7 100644
--- a/paddle/phi/kernels/impl/broadcast_tensors_kernel_impl.h
+++ b/paddle/phi/kernels/impl/broadcast_tensors_kernel_impl.h
@@ -23,10 +23,10 @@
 #include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
-#define SWITCH_OUT_RANK_CASE(n)                                         \
-  case n: {                                                             \
-    ApplyBroadcast<T, Context, n>(ctx, &in_tensors[i], out_tensors[i]); \
-    break;                                                              \
+#define SWITCH_OUT_RANK_CASE(n)                                        \
+  case n: {                                                            \
+    ApplyBroadcast<T, Context, n>(ctx, in_tensors[i], out_tensors[i]); \
+    break;                                                             \
   }
 
 namespace phi {
@@ -75,7 +75,7 @@ void ApplyBroadcast(const Context& ctx,
 
 template <typename T, typename Context>
 void BroadcastTensorsKernel(const Context& ctx,
-                            const std::vector<DenseTensor>& x,
+                            const std::vector<const DenseTensor*>& x,
                             std::vector<DenseTensor*> out) {
   const auto& in_tensors = x;
   auto out_tensors = out;
diff --git a/paddle/phi/kernels/impl/cholesky_solve_grad_kernel_impl.h b/paddle/phi/kernels/impl/cholesky_solve_grad_kernel_impl.h
new file mode 100644
index 0000000000000000000000000000000000000000..9f557e746378939e32a32955e758cdc5c510f229
--- /dev/null
+++ b/paddle/phi/kernels/impl/cholesky_solve_grad_kernel_impl.h
@@ -0,0 +1,134 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/kernels/cholesky_solve_grad_kernel.h"
+
+#include "paddle/phi/kernels/cholesky_solve_kernel.h"
+#include "paddle/phi/kernels/complex_kernel.h"
+#include "paddle/phi/kernels/copy_kernel.h"
+#include "paddle/phi/kernels/empty_kernel.h"
+#include "paddle/phi/kernels/expand_kernel.h"
+#include "paddle/phi/kernels/funcs/blas/blas.h"
+#include "paddle/phi/kernels/funcs/common_shape.h"
+#include "paddle/phi/kernels/funcs/complex_functors.h"
+#include "paddle/phi/kernels/funcs/matrix_reduce.h"
+#include "paddle/phi/kernels/math_kernel.h"
+#include "paddle/phi/kernels/transpose_kernel.h"
+
+// See Note [ Why still include the fluid headers? ]
+#include "paddle/fluid/operators/tril_triu_op.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void CholeskySolveGradKernel(const Context& dev_ctx,
+                             const DenseTensor& x,
+                             const DenseTensor& y,
+                             const DenseTensor& out,
+                             const DenseTensor& dout,
+                             bool upper,
+                             DenseTensor* dx,
+                             DenseTensor* dy) {
+  // get broadcast dim
+  std::vector<int64_t> x_bst_dims_vec;
+  std::vector<int64_t> y_bst_dims_vec;
+  std::tie(x_bst_dims_vec, y_bst_dims_vec) =
+      funcs::MatrixGetBroadcastDims(x, y);
+  ScalarArray x_bst_dims(x_bst_dims_vec);
+  ScalarArray y_bst_dims(y_bst_dims_vec);
+
+  // Tensor broadcast to temp 'y_bst'
+  DenseTensor y_bst = phi::Empty<T, Context>(dev_ctx, y_bst_dims);
+  ExpandKernel<T, Context>(dev_ctx, y, y_bst_dims, &y_bst);
+
+  // reuse forward to calculate dx_bst, which is broad_cast of dx
+  DenseTensor dx_bst = phi::Empty<T, Context>(dev_ctx, x_bst_dims);
+  CholeskySolveKernel<T, Context>(dev_ctx, dout, y_bst, upper, &dx_bst);
+
+  // get 'dx' according to 'dx_bst'
+  dx->Resize(x.dims());
+  dev_ctx.template Alloc<T>(dx);
+  if (dx_bst.dims() == x.dims()) {
+    Copy<Context>(dev_ctx, dx_bst, dev_ctx.GetPlace(), false, dx);
+  } else {
+    funcs::MatrixReduceSumFunctor<T, Context> functor;
+    functor(dev_ctx, dx_bst, dx);
+    dx->Resize(x.dims());
+  }
+
+  // calculate out's conjugate for complex
+  DenseTensor out_conj = Conj<T, Context>(dev_ctx, out);
+  out_conj = phi::TransposeLast2Dim<T>(dev_ctx, out_conj);
+
+  DenseTensor commonterm = phi::Empty<T, Context>(dev_ctx, y_bst_dims);
+  auto blas = phi::funcs::GetBlas<Context, T>(dev_ctx);
+  blas.MatMul(dx_bst,
+              phi::funcs::CreateMatrixDescriptor(dx_bst.dims(), 0, false),
+              out_conj,
+              phi::funcs::CreateMatrixDescriptor(out_conj.dims(), 0, false),
+              static_cast<T>(1),
+              &commonterm,
+              static_cast<T>(0));
+
+  // calculate commonterm's conjugate for complex
+  DenseTensor commonterm_conj = Conj<T, Context>(dev_ctx, commonterm);
+  commonterm_conj = phi::TransposeLast2Dim<T>(dev_ctx, commonterm_conj);
+
+  phi::AddRawKernel<T>(dev_ctx, commonterm, commonterm_conj, -1, &commonterm);
+
+  DenseTensor dy_bst = phi::Empty<T, Context>(dev_ctx, y_bst_dims);
+  if (upper) {
+    blas.MatMul(y_bst,
+                phi::funcs::CreateMatrixDescriptor(y_bst.dims(), 0, false),
+                commonterm,
+                phi::funcs::CreateMatrixDescriptor(commonterm.dims(), 0, false),
+                static_cast<T>(-1),
+                &dy_bst,
+                static_cast<T>(0));
+  } else {
+    blas.MatMul(commonterm,
+                phi::funcs::CreateMatrixDescriptor(commonterm.dims(), 0, false),
+                y_bst,
+                phi::funcs::CreateMatrixDescriptor(y_bst.dims(), 0, false),
+                static_cast<T>(-1),
+                &dy_bst,
+                static_cast<T>(0));
+  }
+
+  // get upper or lower of 'dy_bst'
+  DenseTensor dy_bst_upper = phi::Empty<T, Context>(dev_ctx, y_bst_dims);
+
+  int y_bst_ndim = y_bst_dims_vec.size();
+  const auto H = y_bst_dims_vec[y_bst_ndim - 2];
+  const auto W = y_bst_dims_vec[y_bst_ndim - 1];
+  phi::funcs::ForRange<Context> y_for_range(dev_ctx, dy_bst.numel());
+  paddle::operators::TrilTriuCompute<T> tril_triu_functor(
+      dy_bst.data<T>(), 0, !upper, H, W, dy_bst_upper.data<T>());
+  y_for_range(tril_triu_functor);
+
+  // get 'dy' according to 'dy_bst'
+  dy->Resize(y.dims());
+  dev_ctx.template Alloc<T>(dy);
+  if (dy_bst_upper.dims() == y.dims()) {
+    Copy<Context>(dev_ctx, dy_bst_upper, dev_ctx.GetPlace(), false, dy);
+  } else {
+    funcs::MatrixReduceSumFunctor<T, Context> functor;
+    functor(dev_ctx, dy_bst_upper, dy);
+    dy->Resize(y.dims());
+  }
+}
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/impl/cholesky_solve_kernel_impl.h b/paddle/phi/kernels/impl/cholesky_solve_kernel_impl.h
new file mode 100644
index 0000000000000000000000000000000000000000..16ceb776f1a98a0a7050d6cc9f3ec17be7bb617f
--- /dev/null
+++ b/paddle/phi/kernels/impl/cholesky_solve_kernel_impl.h
@@ -0,0 +1,104 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/kernels/cholesky_solve_kernel.h"
+
+#include "paddle/phi/kernels/complex_kernel.h"
+#include "paddle/phi/kernels/copy_kernel.h"
+#include "paddle/phi/kernels/empty_kernel.h"
+#include "paddle/phi/kernels/expand_kernel.h"
+#include "paddle/phi/kernels/funcs/common_shape.h"
+#include "paddle/phi/kernels/transpose_kernel.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+class CholeskySolveFunctor {
+ public:
+  void operator()(const Context& dev_ctx,
+                  bool upper,
+                  int M,
+                  int N,
+                  T* Adata,
+                  int lda,
+                  T* Bdata,
+                  int* devInfo);
+};
+
+template <typename T, typename Context>
+void CholeskySolveKernel(const Context& dev_ctx,
+                         const DenseTensor& x,
+                         const DenseTensor& y,
+                         bool upper,
+                         DenseTensor* out) {
+  // get broadcast dim
+  std::vector<int64_t> x_bst_dims_vec;
+  std::vector<int64_t> y_bst_dims_vec;
+  std::tie(x_bst_dims_vec, y_bst_dims_vec) =
+      funcs::MatrixGetBroadcastDims(x, y);
+  ScalarArray x_bst_dims(x_bst_dims_vec);
+  ScalarArray y_bst_dims(y_bst_dims_vec);
+
+  DenseTensor y_bst = phi::Empty<T, Context>(dev_ctx, y_bst_dims);
+  ExpandKernel<T, Context>(dev_ctx, y, y_bst_dims, &y_bst);
+
+  // Tensor broadcast to temp 'x_bst' and 'y_bst'
+  DenseTensor x_bst = phi::Empty<T, Context>(dev_ctx, x_bst_dims);
+  ExpandKernel<T, Context>(dev_ctx, x, x_bst_dims, &x_bst);
+
+  // calculate y_bst's conjugate for complex
+  DenseTensor y_bst_conj = Conj<T, Context>(dev_ctx, y_bst);
+  y_bst_conj = phi::TransposeLast2Dim<T>(dev_ctx, y_bst_conj);
+  T* y_bst_conj_data = y_bst_conj.data<T>();
+
+  // calculate x_bst's conjugate for complex
+  DenseTensor x_bst_conj = Conj<T, Context>(dev_ctx, x_bst);
+  x_bst_conj = phi::TransposeLast2Dim<T>(dev_ctx, x_bst_conj);
+
+  // copy x_bst's conjugate to 'result'
+  DenseTensor result;
+  Copy<Context>(dev_ctx, x_bst_conj, dev_ctx.GetPlace(), false, &result);
+  T* res_data = result.data<T>();
+
+  // CPU use lapack, GPU use cusolver
+  int x_bst_ndim = x_bst_dims_vec.size();
+  int M = static_cast<int>(x_bst_dims_vec[x_bst_ndim - 2]);
+  int N = static_cast<int>(x_bst_dims_vec[x_bst_ndim - 1]);
+  int batchsize = product(phi::slice_ddim(x_bst.dims(), 0, x_bst_ndim - 2));
+
+  DenseTensor info =
+      phi::Empty<int, Context>(dev_ctx, ScalarArray({batchsize}));
+  int* info_data = info.data<int>();
+
+  CholeskySolveFunctor<T, Context> functor;
+  for (int i = 0; i < batchsize; ++i) {
+    functor(dev_ctx,
+            upper,
+            M,
+            N,
+            y_bst_conj_data + i * M * M,
+            std::max(1, M),
+            res_data + i * M * N,
+            info_data + i);
+  }
+
+  // calculate out's conjugate for complex
+  result = phi::TransposeLast2Dim<T>(dev_ctx, result);
+  out->Resize(phi::make_ddim(x_bst_dims_vec));
+  ConjKernel<T, Context>(dev_ctx, result, out);
+}
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/impl/compare_kernel_impl.h b/paddle/phi/kernels/impl/compare_kernel_impl.h
new file mode 100644
index 0000000000000000000000000000000000000000..4390c1f8e661c2644b2f71d5bef1dbfea9af487f
--- /dev/null
+++ b/paddle/phi/kernels/impl/compare_kernel_impl.h
@@ -0,0 +1,81 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/kernels/compare_kernel.h"
+
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/kernels/funcs/compare_functors.h"
+
+namespace phi {
+
+template <typename T,
+          typename Context,
+          typename Functor,
+          typename InverseFunctor>
+inline void CompareKernelImpl(const Context& ctx,
+                              const DenseTensor& x,
+                              const DenseTensor& y,
+                              int axis,
+                              DenseTensor* out);
+
+template <typename T, typename Context, typename Functor>
+inline void CompareAllKernelImpl(const Context& ctx,
+                                 const DenseTensor& x,
+                                 const DenseTensor& y,
+                                 DenseTensor* out);
+
+#define DEFINE_COMPARE_KERNEL(compare_kernel, functor, inverse_functor) \
+  template <typename T, typename Context>                               \
+  void compare_kernel(const Context& ctx,                               \
+                      const DenseTensor& x,                             \
+                      const DenseTensor& y,                             \
+                      int axis,                                         \
+                      DenseTensor* out) {                               \
+    CompareKernelImpl<T, Context, functor<T>, inverse_functor<T>>(      \
+        ctx, x, y, axis, out);                                          \
+  }
+
+DEFINE_COMPARE_KERNEL(LessThanKernel,
+                      funcs::LessThanFunctor,
+                      funcs::GreaterThanFunctor)
+DEFINE_COMPARE_KERNEL(LessEqualKernel,
+                      funcs::LessEqualFunctor,
+                      funcs::GreaterEqualFunctor)
+DEFINE_COMPARE_KERNEL(GreaterThanKernel,
+                      funcs::GreaterThanFunctor,
+                      funcs::LessThanFunctor)
+DEFINE_COMPARE_KERNEL(GreaterEqualKernel,
+                      funcs::GreaterEqualFunctor,
+                      funcs::LessEqualFunctor)
+DEFINE_COMPARE_KERNEL(EqualKernel, funcs::EqualFunctor, funcs::EqualFunctor)
+DEFINE_COMPARE_KERNEL(NotEqualKernel,
+                      funcs::NotEqualFunctor,
+                      funcs::NotEqualFunctor)
+#undef DEFINE_COMPARE_KERNEL
+
+#define DEFINE_COMPARE_ALL_KERNEL(compare_all_kernel, functor)    \
+  template <typename T, typename Context>                         \
+  void compare_all_kernel(const Context& ctx,                     \
+                          const DenseTensor& x,                   \
+                          const DenseTensor& y,                   \
+                          DenseTensor* out) {                     \
+    CompareAllKernelImpl<T, Context, functor<T>>(ctx, x, y, out); \
+  }
+
+DEFINE_COMPARE_ALL_KERNEL(EqualAllKernel, funcs::EqualFunctor)
+#undef DEFINE_COMPARE_ALL_KERNEL
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/impl/conv_cudnn_impl.h b/paddle/phi/kernels/impl/conv_cudnn_impl.h
new file mode 100644
index 0000000000000000000000000000000000000000..93bc5b64adc170901aeffeadfa64d6b5d7ea8c60
--- /dev/null
+++ b/paddle/phi/kernels/impl/conv_cudnn_impl.h
@@ -0,0 +1,90 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+#include "paddle/fluid/framework/eigen.h"
+#ifdef PADDLE_WITH_HIP
+#include "paddle/fluid/operators/conv_miopen_helper.h"
+#else
+#include "paddle/fluid/operators/conv_cudnn_helper.h"
+#endif
+
+#include "paddle/fluid/platform/cudnn_workspace_helper.h"
+#include "paddle/fluid/platform/float16.h"
+#include "paddle/fluid/platform/profiler.h"
+#include "paddle/phi/kernels/funcs/padding.h"
+
+#include "paddle/fluid/platform/dynload/cudnn.h"
+#include "paddle/phi/kernels/cpu/conv_util.h"
+#include "paddle/phi/kernels/funcs/batch_norm_utils.h"
+
+DECLARE_bool(cudnn_deterministic);
+DECLARE_uint64(conv_workspace_size_limit);
+DECLARE_bool(cudnn_exhaustive_search);
+
+namespace phi {
+
+static inline bool IsVoltaOrLater(const phi::GPUContext& dev_ctx) {
+  return dev_ctx.GetComputeCapability() >= 70;
+}
+
+// inline cudnnTensorFormat_t GetCudnnTensorFormat(
+//     const phi::DataLayout& order) {  // Not use
+//   switch (order) {
+//     case phi::DataLayout::kNHWC:
+//       return CUDNN_TENSOR_NHWC;
+//     case phi::DataLayout::kNCHW:
+//       return CUDNN_TENSOR_NCHW;
+//     case phi::DataLayout::NCDHW:
+//       return CUDNN_TENSOR_NCHW;  // NOTE: cudnn treat NdTensor as the same
+//     case phi::DataLayout::NDHWC:
+//       return CUDNN_TENSOR_NHWC;  // add, liyamei
+//     default:
+//       PADDLE_THROW(phi::errors::Unimplemented(
+//           "CUDNN has no equivalent dataLayout for input order."));
+//   }
+//   return CUDNN_TENSOR_NCHW;
+// }
+
+static inline void GetNCDHW(const DDim& dims,
+                            const phi::DataLayout& layout,
+                            int* N,
+                            int* C,
+                            int* D,
+                            int* H,
+                            int* W) {
+  *N = dims[0];
+  *C = layout == phi::DataLayout::kNCHW ? dims[1] : dims[dims.size() - 1];
+  int i = layout == phi::DataLayout::kNCHW ? 0 : 1;
+  if (dims.size() == 5) {
+    *D = dims[2 - i];
+    *H = dims[3 - i];
+    *W = dims[4 - i];
+  } else {
+    *D = 1;
+    *H = dims[2 - i];
+    *W = dims[3 - i];
+  }
+}
+
+}  // namespace phi
+
+// PD_REGISTER_KERNEL(convdnn, GPU, ALL_LAYOUT, phi::ConvKernel, float, double
+// ) {}
diff --git a/paddle/phi/kernels/impl/conv_grad_grad_kernel_impl.h b/paddle/phi/kernels/impl/conv_grad_grad_kernel_impl.h
new file mode 100644
index 0000000000000000000000000000000000000000..fbcebf371a61bd3d652888b5eaad56185499726b
--- /dev/null
+++ b/paddle/phi/kernels/impl/conv_grad_grad_kernel_impl.h
@@ -0,0 +1,330 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/fluid/operators/math/im2col.h"
+#include "paddle/fluid/operators/math/vol2col.h"
+#include "paddle/phi/kernels/conv_kernel.h"
+#include "paddle/phi/kernels/cpu/conv_util.h"
+#include "paddle/phi/kernels/funcs/batch_norm_utils.h"
+#include "paddle/phi/kernels/funcs/blas/blas.h"
+#include "paddle/phi/kernels/funcs/math_function.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void ConvGradGradKernel(const Context& dev_ctx,
+                        paddle::optional<const DenseTensor&> input_grad_grad,
+                        paddle::optional<const DenseTensor&> filter_grad_grad,
+                        const DenseTensor& out_grad,
+                        const DenseTensor& input,
+                        const DenseTensor& filter,
+                        const std::vector<int>& strides_t,
+                        const std::vector<int>& paddings_t,
+                        const std::string& padding_algorithm,
+                        int groups,
+                        const std::vector<int>& dilations_t,
+                        const std::string& data_format,
+                        bool use_addto,
+                        int workspace_size_MB,
+                        bool exhaustive_search,
+                        DenseTensor* out_grad_grad,
+                        DenseTensor* input_grad,
+                        DenseTensor* filter_grad) {
+  const DenseTensor* X = &input;
+  const DenseTensor* dY = &out_grad;
+  const DenseTensor* ddX = input_grad_grad.get_ptr();
+  const DenseTensor* ddW_in = filter_grad_grad.get_ptr();
+
+  DenseTensor* ddY = out_grad_grad;
+  DenseTensor* dW = filter_grad;
+  DenseTensor* dX = input_grad;
+  DenseTensor W = filter;
+
+  if (!ddY && !dW && !dX) return;
+
+  const std::vector<int> strides = strides_t;
+  std::vector<int> paddings = paddings_t;
+  std::vector<int> dilations = dilations_t;
+
+  const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC");
+
+  // transform Tensor
+  DenseTensor transformed_X(X->type());
+  DenseTensor transformed_dY(dY->type());
+  DenseTensor transformed_ddX(X->type());
+
+  if (channel_last) {
+    ResizeToChannelFirst<Context, T>(dev_ctx, X, &transformed_X);
+    TransToChannelFirst<Context, T>(dev_ctx, X, &transformed_X);
+
+    ResizeToChannelFirst<Context, T>(dev_ctx, dY, &transformed_dY);
+    TransToChannelFirst<Context, T>(dev_ctx, dY, &transformed_dY);
+
+    if (ddX) {
+      ResizeToChannelFirst<Context, T>(dev_ctx, ddX, &transformed_ddX);
+      TransToChannelFirst<Context, T>(dev_ctx, ddX, &transformed_ddX);
+    }
+  } else {
+    transformed_X = *X;
+    transformed_dY = *dY;
+    if (ddX) {
+      transformed_ddX = *ddX;
+    }
+  }
+
+  // update padding and dilation
+  auto in_dims = transformed_X.dims();
+  auto filter_dims = W.dims();
+
+  DDim in_data_dims = slice_ddim(in_dims, 2, in_dims.size());
+  DDim filter_data_dims = slice_ddim(filter_dims, 2, filter_dims.size());
+  std::vector<int> ksize = vectorize<int>(filter_data_dims);
+  UpdatePaddingAndDilation(
+      &paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize);
+
+  const int batch_size = static_cast<int>(transformed_X.dims()[0]);
+  std::vector<int64_t> filter_shape_vec(vectorize(W.dims()));
+  std::vector<int64_t> output_shape_vec(vectorize(transformed_dY.dims()));
+
+  size_t data_dim = filter_shape_vec.size() - 2;
+  std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
+  // col_shape [in_channel/group, kh, kw, oh, ow]
+  col_shape_vec[0] = transformed_X.dims()[1] / groups;
+  for (size_t j = 0; j < data_dim; ++j) {
+    col_shape_vec[j + 1] = filter_shape_vec[j + 2];
+    col_shape_vec[j + data_dim + 1] = output_shape_vec[j + 2];
+  }
+  DDim col_shape(make_ddim(col_shape_vec));
+  // col_matrix_shape [in_channel/group * kh * kw, oh * ow]
+  DDim col_matrix_shape = flatten_to_2d(col_shape, data_dim + 1);
+  // input_shape [Cin, H, W]
+  DDim input_shape =
+      slice_ddim(transformed_X.dims(), 1, transformed_X.dims().size());
+  // filter_matrix_shape [Cout, Cin * kh * kw]
+  DDim filter_matrix_shape = {W.dims()[0], W.numel() / W.dims()[0]};
+
+  W.Resize(filter_matrix_shape);
+  DDim output_matrix_shape = {
+      transformed_dY.dims()[1],
+      transformed_dY.numel() /
+          (transformed_dY.dims()[0] * transformed_dY.dims()[1])};
+  int in_step = static_cast<int>(transformed_X.dims()[1]) / groups;
+  int out_step = static_cast<int>(transformed_dY.dims()[1]) / groups;
+
+  bool is_expand = IsExpand(filter_shape_vec, strides, paddings, dilations);
+  DenseTensor col;
+  DenseTensor col_matrix;
+  if (is_expand) {
+    col.Resize(col_shape);
+    col.mutable_data<T>(dev_ctx.GetPlace());
+    col_matrix.ShareDataWith(col);
+    col_matrix.Resize(col_matrix_shape);
+  }
+
+  phi::funcs::SetConstant<Context, T> set_zero;
+  auto blas = phi::funcs::GetBlas<Context, T>(dev_ctx);
+
+  // dx convolution double grad:  gemm + col2im(col2vol)
+  // dx = ddw * dy  ==> dx(N, Cin, H, W), ddw(Cout, Cin, kh, kw), dy(N, Cout,
+  // oH, oW)
+  if (dX && ddW_in) {
+    Tensor ddW;
+    ddW.ShareDataWith(*ddW_in).Resize(filter_matrix_shape);
+    dX->mutable_data<T>(dev_ctx.GetPlace());
+
+    DenseTensor transformed_dX(dX->type());
+
+    if (channel_last) {
+      ResizeToChannelFirst<Context, T>(dev_ctx, dX, &transformed_dX);
+
+    } else {
+      transformed_dX = *dX;
+    }
+    // if is_expand is false, the operation of set_zero is unnecessary
+    // because math::matmul will reset dx
+    if (is_expand) {
+      set_zero(dev_ctx, &transformed_dX, static_cast<T>(0));
+    }
+    paddle::operators::math::Col2VolFunctor<Context, T> col2vol;
+    paddle::operators::math::
+        Col2ImFunctor<paddle::operators::math::ColFormat::kCFO, Context, T>
+            col2im;
+
+    for (int i = 0; i < batch_size; i++) {
+      DenseTensor dy_batch =
+          transformed_dY.Slice(i, i + 1).Resize(output_matrix_shape);
+      DenseTensor dx_batch = transformed_dX.Slice(i, i + 1).Resize(input_shape);
+      for (int g = 0; g < groups; g++) {
+        // gemm
+        DenseTensor dy_slice = dy_batch.Slice(g * out_step, (g + 1) * out_step);
+        DenseTensor ddw_slice = ddW.Slice(g * out_step, (g + 1) * out_step);
+        DenseTensor dx_slice = dx_batch.Slice(g * in_step, (g + 1) * in_step);
+        if (!is_expand) {
+          col_matrix.ShareDataWith(dx_slice);
+          col_matrix.Resize(col_matrix_shape);
+        }
+        blas.MatMul(
+            ddw_slice, true, dy_slice, false, T(1.0), &col_matrix, T(0.0));
+
+        if (is_expand && data_dim == 2U) {
+          col2im(dev_ctx,
+                 col,
+                 dilations,
+                 strides,
+                 std::vector<int>{
+                     paddings[0], paddings[2], paddings[1], paddings[3]},
+                 &dx_slice);
+        } else if (is_expand && data_dim == 3U) {
+          col2vol(dev_ctx, col, dilations, strides, paddings, &dx_slice);
+        }
+      }
+    }
+    if (channel_last) {
+      TransToChannelLast<Context, T>(dev_ctx, &transformed_dX, dX);
+    }
+  }
+
+  // dw = ddx * dy  ==> dw(Cout, Cin, kh, kw), ddx(N, Cin, H, W), dy(N, Cout,
+  // oH, oW)
+  // dw convolution double grad:  im2col(vol2col) + gemm
+  if (dW && ddX) {
+    dW->mutable_data<T>(dev_ctx.GetPlace());
+    set_zero(dev_ctx, dW, static_cast<T>(0));
+    DenseTensor dW_arr = *dW;
+    dW_arr.Resize(filter_matrix_shape);
+    paddle::operators::math::
+        Im2ColFunctor<paddle::operators::math::ColFormat::kCFO, Context, T>
+            im2col;
+    paddle::operators::math::Vol2ColFunctor<Context, T> vol2col;
+    for (int i = 0; i < batch_size; ++i) {
+      DenseTensor dy_batch =
+          transformed_dY.Slice(i, i + 1).Resize(output_matrix_shape);
+      Tensor ddx_batch = transformed_ddX.Slice(i, i + 1).Resize(input_shape);
+      for (int g = 0; g < groups; ++g) {
+        // im2col
+        DenseTensor dy_slice = dy_batch.Slice(g * out_step, (g + 1) * out_step);
+        DenseTensor ddx_slice = ddx_batch.Slice(g * in_step, (g + 1) * in_step);
+        if (!is_expand) {
+          col.ShareDataWith(ddx_slice);
+          col_matrix.ShareDataWith(col);
+          col_matrix.Resize(col_matrix_shape);
+        } else if (data_dim == 2U) {
+          im2col(dev_ctx,
+                 ddx_slice,
+                 dilations,
+                 strides,
+                 std::vector<int>{
+                     paddings[0], paddings[2], paddings[1], paddings[3]},
+                 &col);
+        } else if (data_dim == 3U) {
+          vol2col(dev_ctx, ddx_slice, dilations, strides, paddings, &col);
+        }
+
+        DenseTensor dw_slice = dW_arr.Slice(g * out_step, (g + 1) * out_step);
+        blas.MatMul(
+            dy_slice, false, col_matrix, true, T(1.0), &dw_slice, T(1.0));
+      }
+    }
+  }
+
+  // ddy = w * ddx + x * ddw ==> ddy(N, Cout, oH, oW), x/ddx(N, Cin, H, W),
+  // w/ddw(Cout, Cin, kh, kw)
+  // ddy convolution double grad: im2col(vol2col) + gemm
+  if (ddY) {
+    ddY->mutable_data<T>(dev_ctx.GetPlace());
+
+    DenseTensor transformed_ddY(ddY->type());
+    if (channel_last) {
+      ResizeToChannelFirst<Context, T>(dev_ctx, ddY, &transformed_ddY);
+    } else {
+      transformed_ddY = *ddY;
+    }
+
+    set_zero(dev_ctx, &transformed_ddY, static_cast<T>(0));
+    paddle::operators::math::
+        Im2ColFunctor<paddle::operators::math::ColFormat::kCFO, Context, T>
+            im2col;
+    paddle::operators::math::Vol2ColFunctor<Context, T> vol2col;
+    for (int i = 0; i < batch_size; ++i) {
+      DenseTensor ddy_batch =
+          transformed_ddY.Slice(i, i + 1).Resize(output_matrix_shape);
+      for (int g = 0; g < groups; ++g) {
+        // gemm
+        DenseTensor ddy_slice =
+            ddy_batch.Slice(g * out_step, (g + 1) * out_step);
+
+        if (ddX) {
+          DenseTensor ddx_batch =
+              transformed_ddX.Slice(i, i + 1).Resize(input_shape);
+          DenseTensor ddx_slice =
+              ddx_batch.Slice(g * in_step, (g + 1) * in_step);
+          if (!is_expand) {
+            col.ShareDataWith(ddx_slice);
+            col_matrix.ShareDataWith(col);
+            col_matrix.Resize(col_matrix_shape);
+          } else if (data_dim == 2U) {
+            im2col(dev_ctx,
+                   ddx_slice,
+                   dilations,
+                   strides,
+                   std::vector<int>{
+                       paddings[0], paddings[2], paddings[1], paddings[3]},
+                   &col);
+          } else if (data_dim == 3U) {
+            vol2col(dev_ctx, ddx_slice, dilations, strides, paddings, &col);
+          }
+          DenseTensor w_slice = W.Slice(g * out_step, (g + 1) * out_step);
+          blas.MatMul(
+              w_slice, false, col_matrix, false, T(1.0), &ddy_slice, T(0.0));
+        }
+
+        if (ddW_in) {
+          DenseTensor x_batch =
+              transformed_X.Slice(i, i + 1).Resize(input_shape);
+          DenseTensor x_slice = x_batch.Slice(g * in_step, (g + 1) * in_step);
+
+          DenseTensor ddW;
+          ddW.ShareDataWith(*ddW_in).Resize(filter_matrix_shape);
+          if (!is_expand) {
+            col.ShareDataWith(x_slice);
+            col_matrix.ShareDataWith(col);
+            col_matrix.Resize(col_matrix_shape);
+          } else if (data_dim == 2U) {
+            im2col(dev_ctx,
+                   x_slice,
+                   dilations,
+                   strides,
+                   std::vector<int>{
+                       paddings[0], paddings[2], paddings[1], paddings[3]},
+                   &col);
+          } else if (data_dim == 3U) {
+            vol2col(dev_ctx, x_slice, dilations, strides, paddings, &col);
+          }
+
+          // gemm
+          DenseTensor ddw_slice = ddW.Slice(g * out_step, (g + 1) * out_step);
+          blas.MatMul(
+              ddw_slice, false, col_matrix, false, T(1.0), &ddy_slice, T(1.0));
+        }
+      }
+    }
+    if (channel_last) {
+      TransToChannelLast<Context, T>(dev_ctx, &transformed_ddY, ddY);
+    }
+  }
+}
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/impl/conv_grad_kernel_impl.h b/paddle/phi/kernels/impl/conv_grad_kernel_impl.h
new file mode 100644
index 0000000000000000000000000000000000000000..f1971aca800b59171a2e741dbebce6d8adaf7899
--- /dev/null
+++ b/paddle/phi/kernels/impl/conv_grad_kernel_impl.h
@@ -0,0 +1,257 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/fluid/operators/math/im2col.h"
+#include "paddle/fluid/operators/math/vol2col.h"
+#include "paddle/phi/kernels/conv_grad_kernel.h"
+#include "paddle/phi/kernels/cpu/conv_util.h"
+#include "paddle/phi/kernels/funcs/batch_norm_utils.h"
+#include "paddle/phi/kernels/funcs/blas/blas.h"
+#include "paddle/phi/kernels/funcs/math_function.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void ConvGradKernel(const Context& dev_ctx,
+                    const DenseTensor& output_grad,
+                    const DenseTensor& input,
+                    const DenseTensor& filter_t,
+                    const std::vector<int>& strides,
+                    const std::vector<int>& paddings_t,
+                    const std::string& padding_algorithm,
+                    int groups,
+                    const std::vector<int>& dilations_t,
+                    const std::string& data_format,
+                    bool use_addto,
+                    int workspace_size_MB,
+                    bool exhaustive_search,
+                    DenseTensor* input_grad,
+                    DenseTensor* filter_grad) {
+  // The filter and filter_grad will be reshaped in the calculations,
+  // so here use an assignment operation,
+  // that avoids modifying the variable in the Scope.
+
+  if (!input_grad && !filter_grad) return;
+  std::vector<int> paddings = paddings_t;
+  std::vector<int> dilations = dilations_t;
+
+  DenseTensor filter = filter_t;
+  const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC");
+
+  DenseTensor transformed_input(input.type());
+  DenseTensor transformed_output_grad(output_grad.type());
+
+  if (channel_last) {
+    ResizeToChannelFirst<Context, T>(dev_ctx, &input, &transformed_input);
+    TransToChannelFirst<Context, T>(dev_ctx, &input, &transformed_input);
+
+    ResizeToChannelFirst<Context, T>(
+        dev_ctx, &output_grad, &transformed_output_grad);
+    TransToChannelFirst<Context, T>(
+        dev_ctx, &output_grad, &transformed_output_grad);
+  } else {
+    transformed_input = input;
+    transformed_output_grad = output_grad;
+  }
+
+  // update padding and dilation
+  auto in_dims = transformed_input.dims();
+  auto filter_dims = filter.dims();
+  DDim in_data_dims = slice_ddim(in_dims, 2, in_dims.size());
+  DDim filter_data_dims = slice_ddim(filter_dims, 2, filter_dims.size());
+  std::vector<int> ksize = vectorize<int>(filter_data_dims);
+  UpdatePaddingAndDilation<int>(
+      &paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize);
+
+  const int batch_size = static_cast<int>(transformed_input.dims()[0]);
+
+  // filter_shape_vec: {k_o, k_i, k_h, k_w} or {k_o, k_i, k_d, k_h, k_w}
+  std::vector<int64_t> filter_shape_vec(vectorize(filter.dims()));
+  // output_shape_vec: {o_n, o_c, o_h, o_w} or {o_n, o_c, o_d, o_h, o_w}
+  std::vector<int64_t> output_shape_vec(
+      vectorize(transformed_output_grad.dims()));
+
+  // use col_shape in the im2col calculation
+  // col_shape_vec: {i_c/g, k_h, k_w, o_h, o_w} or {i_c/g, k_d, k_h, k_w, o_d,
+  // o_h, o_w}
+  size_t data_dim = filter_shape_vec.size() - 2;
+  std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
+  col_shape_vec[0] = transformed_input.dims()[1] / groups;
+  for (size_t j = 0; j < data_dim; ++j) {
+    col_shape_vec[j + 1] = filter_shape_vec[j + 2];
+    col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2];
+  }
+  DDim col_shape(make_ddim(col_shape_vec));
+
+  // use col_matrix_shape in the gemm calculation
+  // size: (i_c/g * k_h * k_w, o_h * o_w)
+  // or
+  // (i_c/g * k_d * k_h * k_w, o_d * o_h * o_w)
+  DDim col_matrix_shape = flatten_to_2d(col_shape, data_dim + 1);
+
+  DDim input_shape =
+      slice_ddim(transformed_input.dims(), 1, transformed_input.dims().size());
+
+  DDim filter_matrix_shape = {filter.dims()[0],
+                              filter.numel() / filter.dims()[0]};
+  filter.Resize(filter_matrix_shape);
+
+  DDim output_matrix_shape = {
+      transformed_output_grad.dims()[1],
+      transformed_output_grad.numel() / (transformed_output_grad.dims()[0] *
+                                         transformed_output_grad.dims()[1])};
+
+  // convolution backward input operator:  gemm + col2im(or col2vol)
+  // convolution backward weight operator: im2col(or vol2col) + gemm
+  int in_step = static_cast<int>(transformed_input.dims()[1]) / groups;
+  int out_step = static_cast<int>(transformed_output_grad.dims()[1]) / groups;
+
+  bool is_expand = IsExpand(filter_shape_vec, strides, paddings, dilations);
+
+  DenseTensor col;
+  // col_matrix shares the same piece of data with col,
+  // but will be reshaped into a two-dimensional matrix shape
+  // to call the matrix multiplication interface.
+  DenseTensor col_matrix;
+  if (is_expand) {
+    col.Resize(col_shape);
+    col.mutable_data<T>(dev_ctx.GetPlace());
+    col_matrix.ShareDataWith(col);
+    col_matrix.Resize(col_matrix_shape);
+  }
+
+  phi::funcs::SetConstant<Context, T> set_zero;
+  auto blas = phi::funcs::GetBlas<Context, T>(dev_ctx);
+
+  if (input_grad) {
+    input_grad->mutable_data<T>(dev_ctx.GetPlace());
+    DenseTensor transformed_input_grad(input_grad->type());
+    if (channel_last) {
+      ResizeToChannelFirst<Context, T>(
+          dev_ctx, input_grad, &transformed_input_grad);
+
+    } else {
+      transformed_input_grad = *input_grad;
+    }
+    // if is_expand is false, the operation of set_zero is unnecessary,
+    // because math::matmul will reset input_grad.
+    if (is_expand) {
+      set_zero(dev_ctx, &transformed_input_grad, static_cast<T>(0));
+    }
+    paddle::operators::math::Col2VolFunctor<Context, T> col2vol;
+    paddle::operators::math::
+        Col2ImFunctor<paddle::operators::math::ColFormat::kCFO, Context, T>
+            col2im;
+
+    for (int i = 0; i < batch_size; i++) {
+      DenseTensor out_grad_batch =
+          transformed_output_grad.Slice(i, i + 1).Resize(output_matrix_shape);
+      DenseTensor in_grad_batch =
+          transformed_input_grad.Slice(i, i + 1).Resize(input_shape);
+      for (int g = 0; g < groups; g++) {
+        // gemm
+        DenseTensor out_grad_slice =
+            out_grad_batch.Slice(g * out_step, (g + 1) * out_step);
+        DenseTensor filter_slice =
+            filter.Slice(g * out_step, (g + 1) * out_step);
+
+        DenseTensor in_grad_slice =
+            in_grad_batch.Slice(g * in_step, (g + 1) * in_step);
+
+        if (!is_expand) {
+          col_matrix.ShareDataWith(in_grad_slice);
+          col_matrix.Resize(col_matrix_shape);
+        }
+        blas.MatMul(filter_slice,
+                    true,
+                    out_grad_slice,
+                    false,
+                    T(1.0),
+                    &col_matrix,
+                    T(0.0));
+
+        if (is_expand && data_dim == 2U) {
+          col2im(dev_ctx,
+                 col,
+                 dilations,
+                 strides,
+                 std::vector<int>{
+                     paddings[0], paddings[2], paddings[1], paddings[3]},
+                 &in_grad_slice);
+        } else if (is_expand && data_dim == 3U) {
+          col2vol(dev_ctx, col, dilations, strides, paddings, &in_grad_slice);
+        }
+      }
+    }
+    if (channel_last) {
+      TransToChannelLast<Context, T>(
+          dev_ctx, &transformed_input_grad, input_grad);
+    }
+  }
+
+  if (filter_grad) {
+    filter_grad->mutable_data<T>(dev_ctx.GetPlace());
+    Tensor filter_grad_ = *filter_grad;
+    filter_grad_.Resize(filter_matrix_shape);
+    set_zero(dev_ctx, filter_grad, static_cast<T>(0));
+    paddle::operators::math::
+        Im2ColFunctor<paddle::operators::math::ColFormat::kCFO, Context, T>
+            im2col;
+    paddle::operators::math::Vol2ColFunctor<Context, T> vol2col;
+    for (int i = 0; i < batch_size; i++) {
+      DenseTensor out_grad_batch =
+          transformed_output_grad.Slice(i, i + 1).Resize(output_matrix_shape);
+      DenseTensor in_batch =
+          transformed_input.Slice(i, i + 1).Resize(input_shape);
+      for (int g = 0; g < groups; g++) {
+        // im2col
+        DenseTensor out_grad_slice =
+            out_grad_batch.Slice(g * out_step, (g + 1) * out_step);
+        DenseTensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
+
+        if (!is_expand) {
+          col.ShareDataWith(in_slice);
+          col_matrix.ShareDataWith(col);
+          col_matrix.Resize(col_matrix_shape);
+        } else if (data_dim == 2U) {
+          im2col(dev_ctx,
+                 in_slice,
+                 dilations,
+                 strides,
+                 std::vector<int>{
+                     paddings[0], paddings[2], paddings[1], paddings[3]},
+                 &col);
+
+        } else if (data_dim == 3U) {
+          vol2col(dev_ctx, in_slice, dilations, strides, paddings, &col);
+        }
+
+        // gemm
+        DenseTensor filter_grad_slice =
+            filter_grad_.Slice(g * out_step, (g + 1) * out_step);
+        blas.MatMul(out_grad_slice,
+                    false,
+                    col_matrix,
+                    true,
+                    T(1.0),
+                    &filter_grad_slice,
+                    T(1.0));
+      }
+    }
+  }
+}
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/impl/conv_kernel_impl.h b/paddle/phi/kernels/impl/conv_kernel_impl.h
new file mode 100644
index 0000000000000000000000000000000000000000..1945468f02551b8e348687ae578c9f23a038b8ca
--- /dev/null
+++ b/paddle/phi/kernels/impl/conv_kernel_impl.h
@@ -0,0 +1,183 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/fluid/operators/math/im2col.h"
+#include "paddle/fluid/operators/math/vol2col.h"
+#include "paddle/phi/kernels/conv_kernel.h"
+#include "paddle/phi/kernels/cpu/conv_util.h"
+#include "paddle/phi/kernels/funcs/batch_norm_utils.h"
+#include "paddle/phi/kernels/funcs/blas/blas.h"
+#include "paddle/phi/kernels/funcs/math_function.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void ConvKernel(const Context& dev_ctx,
+                const DenseTensor& input,
+                const DenseTensor& filter_t,
+                const std::vector<int>& strides,
+                const std::vector<int>& paddings_t,
+                const std::string& padding_algorithm,
+                int groups,
+                const std::vector<int>& dilations_t,
+                const std::string& data_format,
+                bool use_addto,
+                int workspace_size_MB,
+                bool exhaustive_search,
+                DenseTensor* output) {
+  std::vector<int> paddings = paddings_t;
+  std::vector<int> dilations = dilations_t;
+  DenseTensor filter = filter_t;
+  // The filter will be reshaped in the calculations,
+  // so here use an assignment operation,
+  // that avoids modifying the variable in the Scope.
+  output->mutable_data<T>(dev_ctx.GetPlace());
+
+  const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC");
+
+  DenseTensor transformed_input(input.type());
+  DenseTensor transformed_output(output->type());
+
+  if (channel_last) {
+    ResizeToChannelFirst<Context, T>(dev_ctx, &input, &transformed_input);
+    TransToChannelFirst<Context, T>(dev_ctx, &input, &transformed_input);
+
+    ResizeToChannelFirst<Context, T>(dev_ctx, output, &transformed_output);
+
+  } else {
+    transformed_input = input;
+    transformed_output = *output;
+  }
+
+  // update padding and dilation
+  auto trans_in_dims = transformed_input.dims();
+  auto filter_dims = filter.dims();
+
+  DDim in_data_dims = slice_ddim(trans_in_dims, 2, trans_in_dims.size());
+  DDim filter_data_dims = slice_ddim(filter_dims, 2, filter_dims.size());
+
+  std::vector<int> ksize = vectorize<int>(filter_data_dims);
+  UpdatePaddingAndDilation(
+      &paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize);
+
+  const int batch_size = static_cast<int>(transformed_input.dims()[0]);
+
+  // filter_shape_vec:
+  // {k_o, k_i, k_h, k_w} or {k_o, k_i, k_d, k_h, k_w}
+  std::vector<int64_t> filter_shape_vec(vectorize(filter.dims()));
+
+  // output_shape_vec:
+  // {o_n, o_c, o_h, o_w} or {o_n, o_c, o_d, o_h, o_w}
+  std::vector<int64_t> output_shape_vec(vectorize(transformed_output.dims()));
+
+  // use col_shape in the im2col calculation
+  // col_shape_vec:
+  // {i_c/g, k_h, k_w, o_h, o_w} or {i_c/g, k_d, k_h, k_w,
+  // o_d,o_h, o_w}
+  size_t data_dim = filter_shape_vec.size() - 2;
+
+  std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
+  col_shape_vec[0] = trans_in_dims[1] / groups;
+  for (size_t j = 0; j < data_dim; ++j) {
+    col_shape_vec[j + 1] = filter_shape_vec[j + 2];
+    col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2];
+  }
+
+  DDim col_shape(make_ddim(col_shape_vec));
+
+  // use col_matrix_shape in the gemm calculation
+  // size:
+  // (i_c/g * k_h * k_w, o_h * o_w) or (i_c/g * k_d * k_h * k_w, o_d * o_h *
+  // o_w)
+
+  DDim col_matrix_shape = flatten_to_2d(col_shape, data_dim);
+
+  bool is_expand = IsExpand(filter_shape_vec, strides, paddings, dilations);
+
+  DenseTensor col;
+  // col_matrix shares the same piece of data with col,
+  // but will be reshaped into a two-dimensional matrix shape
+  // to call the matrix multiplication interface.
+  DenseTensor col_matrix;
+  if (is_expand) {
+    // col = context.AllocateTmpTensor<T, DeviceContext>(col_shape, dev_ctx);
+    col.Resize(col_shape);
+    col.mutable_data<T>(dev_ctx.GetPlace());
+    col_matrix.ShareDataWith(col);
+    col_matrix.Resize(col_matrix_shape);
+  }
+
+  DDim in_matrix_shape =
+      slice_ddim(transformed_input.dims(), 1, transformed_input.dims().size());
+
+  DDim filter_matrix_shape = {filter.dims()[0],
+                              filter.numel() / filter.dims()[0]};
+  filter.Resize(filter_matrix_shape);
+
+  DDim output_matrix_shape = {
+      transformed_output.dims()[1],
+      transformed_output.numel() /
+          (transformed_output.dims()[0] * transformed_output.dims()[1])};
+
+  // convolution operator: im2col(or vol2col) + gemm
+  int in_step = static_cast<int>(transformed_input.dims()[1]) / groups;
+  int out_step = static_cast<int>(transformed_output.dims()[1]) / groups;
+
+  paddle::operators::math::Vol2ColFunctor<Context, T> vol2col;
+  paddle::operators::math::
+      Im2ColFunctor<paddle::operators::math::ColFormat::kCFO, Context, T>
+          im2col;
+
+  auto blas = phi::funcs::GetBlas<Context, T>(dev_ctx);
+  for (int i = 0; i < batch_size; i++) {
+    DenseTensor in_batch =
+        transformed_input.Slice(i, i + 1).Resize(in_matrix_shape);
+    DenseTensor out_batch =
+        transformed_output.Slice(i, i + 1).Resize(output_matrix_shape);
+
+    for (int g = 0; g < groups; g++) {
+      DenseTensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
+
+      if (!is_expand) {
+        col.ShareDataWith(in_slice);
+        col_matrix.ShareDataWith(col);
+        col_matrix.Resize(col_matrix_shape);
+      } else if (data_dim == 2U) {
+        im2col(dev_ctx,
+               in_slice,
+               dilations,
+               strides,
+               std::vector<int>{
+                   paddings[0], paddings[2], paddings[1], paddings[3]},
+               &col);
+
+      } else if (data_dim == 3U) {
+        vol2col(dev_ctx, in_slice, dilations, strides, paddings, &col);
+      }
+
+      // gemm
+      DenseTensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
+      DenseTensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
+      blas.MatMul(
+          filter_slice, false, col_matrix, false, T(1.0), &out_slice, T(0.0));
+    }
+  }
+  if (channel_last) {
+    TransToChannelLast<Context, T>(dev_ctx, &transformed_output, output);
+  }
+}
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/impl/digamma_grad_kernel_impl.h b/paddle/phi/kernels/impl/digamma_grad_kernel_impl.h
index 74ded1569eb5804950898bc1b824367b56480cda..92550de1800e11481cc7d7d6323e1fd2d2c28818 100644
--- a/paddle/phi/kernels/impl/digamma_grad_kernel_impl.h
+++ b/paddle/phi/kernels/impl/digamma_grad_kernel_impl.h
@@ -38,8 +38,8 @@ struct DigammaGradFunctor {
 
 template <typename T, typename Context>
 void DigammaGradKernel(const Context& ctx,
-                       const DenseTensor& out_grad,
                        const DenseTensor& x,
+                       const DenseTensor& out_grad,
                        DenseTensor* x_grad) {
   x_grad->mutable_data<T>(ctx.GetPlace());
 
diff --git a/paddle/phi/kernels/impl/dist_grad_kernel_impl.h b/paddle/phi/kernels/impl/dist_grad_kernel_impl.h
new file mode 100644
index 0000000000000000000000000000000000000000..fc118a832dc9f20ea4dba1b08bf845e463bd4135
--- /dev/null
+++ b/paddle/phi/kernels/impl/dist_grad_kernel_impl.h
@@ -0,0 +1,223 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/kernels/funcs/eigen/common.h"
+#include "paddle/phi/kernels/funcs/math_function.h"
+
+namespace phi {
+
+template <typename T,
+          size_t D,
+          int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using ETensor = phi::EigenTensor<T, D, MajorType, IndexType>;
+
+template <int Rank>
+static void GetBraodcastDims(const phi::DDim& x_dims,
+                             const phi::DDim& y_dims,
+                             Eigen::DSizes<int, Rank>* x_bcast_dims,
+                             Eigen::DSizes<int, Rank>* y_bcast_dims) {
+  int bcast_dims_remainder = 0;
+  for (int i = 0; i < x_dims.size(); ++i) {
+    if (x_dims[i] >= y_dims[i]) {
+      (*x_bcast_dims)[i] = 1;
+      (*y_bcast_dims)[i] = x_dims[i] / y_dims[i];
+      bcast_dims_remainder += x_dims[i] % y_dims[i];
+    } else {
+      (*y_bcast_dims)[i] = 1;
+      (*x_bcast_dims)[i] = y_dims[i] / x_dims[i];
+      bcast_dims_remainder += y_dims[i] % x_dims[i];
+    }
+  }
+  PADDLE_ENFORCE_EQ(bcast_dims_remainder,
+                    0,
+                    phi::errors::PreconditionNotMet(
+                        "The input tensor of Op(dist) could not be broadcast, "
+                        "X's shape is [%s], Y's shape is [%s].",
+                        x_dims,
+                        y_dims));
+}
+
+static phi::DDim GetNewDims(const phi::DDim& in_dims, int rank) {
+  std::vector<int64_t> new_dims_vec(rank);
+  if (in_dims.size() < rank) {
+    for (int i = 0; i < rank - in_dims.size(); ++i) {
+      new_dims_vec[i] = 1;
+    }
+    for (int i = 0; i < in_dims.size(); ++i) {
+      new_dims_vec[i + rank - in_dims.size()] = in_dims[i];
+    }
+  } else {
+    new_dims_vec = vectorize(in_dims);
+  }
+  return phi::make_ddim(new_dims_vec);
+}
+
+template <typename Context, typename T, int Rank>
+static void DistGradFunction(const Context& dev_ctx,
+                             const DenseTensor& x,
+                             const DenseTensor& y,
+                             const DenseTensor& out,
+                             const DenseTensor& out_grad,
+                             float p,
+                             DenseTensor* x_grad,
+                             DenseTensor* y_grad) {
+  auto x_dims = x.dims();
+  auto y_dims = y.dims();
+  auto out_dims = out.dims();
+
+  phi::DDim x_new_dims = GetNewDims(x_dims, Rank);
+  phi::DDim y_new_dims = GetNewDims(y_dims, Rank);
+  phi::DDim out_new_dims = GetNewDims(out_dims, Rank);
+  auto x_t = ETensor<T, Rank>::From(x, x_new_dims);
+  auto y_t = ETensor<T, Rank>::From(y, y_new_dims);
+  auto out_t = ETensor<T, Rank>::From(out, out_new_dims);
+
+  Eigen::DSizes<int, Rank> x_bcast_dims;
+  Eigen::DSizes<int, Rank> y_bcast_dims;
+  Eigen::DSizes<int, Rank> out_bcast_dims;
+
+  GetBraodcastDims<Rank>(x_new_dims, y_new_dims, &x_bcast_dims, &y_bcast_dims);
+  std::vector<int64_t> new_dims_vec(Rank);
+  for (int i = 0; i < Rank; ++i) {
+    new_dims_vec[i] = std::max(x_new_dims[i], y_new_dims[i]);
+    out_bcast_dims[i] = new_dims_vec[i];
+  }
+  phi::DDim new_dims = phi::make_ddim(new_dims_vec);
+
+  auto& place = *dev_ctx.eigen_device();
+  auto out_grad_t = ETensor<T, Rank>::From(out_grad, out_new_dims);
+  DenseTensor grad;
+  grad.Resize(new_dims);
+  dev_ctx.template Alloc<T>(&grad);
+  auto grad_t = ETensor<T, Rank>::From(grad);
+
+  auto x_minux_y = x_t.broadcast(x_bcast_dims) - y_t.broadcast(y_bcast_dims);
+  auto x_minux_y_abs = x_minux_y.abs();
+  auto sign =
+      (x_minux_y > static_cast<T>(0)).template cast<T>() * static_cast<T>(1.0) +
+      (x_minux_y < static_cast<T>(0)).template cast<T>() * static_cast<T>(-1.0);
+  T epsilon = static_cast<T>(1.0e-10f);
+
+  // 1: Lp-norm(z), z = x-y, compute dz
+  if (p == 0) {
+    phi::funcs::SetConstant<Context, T> set_zero;
+    set_zero(dev_ctx, &grad, static_cast<T>(0));
+  } else if (p == INFINITY || p == -INFINITY) {
+    // p=inf or -inf, Lp-norm = |z_i|, the j-th element of dz tends to 0 if
+    // j!=i, or equals to sign(z_i) * dout if j=i.
+    if (paddle::platform::is_cpu_place(dev_ctx.GetPlace())) {
+      grad_t.device(place) = (x_minux_y_abs == out_t.broadcast(out_bcast_dims))
+                                 .template cast<T>() *
+                             sign.eval() * out_grad_t.broadcast(out_bcast_dims);
+    } else {
+      grad_t.device(place) = (x_minux_y_abs == out_t.broadcast(out_bcast_dims))
+                                 .template cast<T>() *
+                             sign * out_grad_t.broadcast(out_bcast_dims);
+    }
+  } else {
+    // dz = pow(abs(x-y)/out, p-1) * sign(x-y) * dout
+    if (paddle::platform::is_cpu_place(dev_ctx.GetPlace())) {
+      grad_t.device(place) =
+          (x_minux_y_abs / (out_t + epsilon).broadcast(out_bcast_dims))
+              .pow(p - 1) *
+          sign.eval() * out_grad_t.broadcast(out_bcast_dims);
+    } else {
+      grad_t.device(place) =
+          (x_minux_y_abs / (out_t + epsilon).broadcast(out_bcast_dims))
+              .pow(p - 1) *
+          sign * out_grad_t.broadcast(out_bcast_dims);
+    }
+  }
+
+  Eigen::DSizes<int, Rank * 2> x_reshape_dims;
+  Eigen::DSizes<int, Rank * 2> y_reshape_dims;
+  Eigen::DSizes<int, Rank> reduce_dims;
+  for (int i = 0; i < x_new_dims.size(); ++i) {
+    x_reshape_dims[2 * i] = x_bcast_dims[i];
+    x_reshape_dims[2 * i + 1] = x_new_dims[i];
+    y_reshape_dims[2 * i] = y_bcast_dims[i];
+    y_reshape_dims[2 * i + 1] = y_new_dims[i];
+    reduce_dims[i] = 2 * i;
+  }
+
+  // 2: if x or y is broadcasted in forward function,
+  // the grad need to be sum along the broadcasted dimensions
+  if (x_grad) {
+    dev_ctx.template Alloc<T>(x_grad);
+    auto x_grad_t = ETensor<T, Rank>::From(*x_grad, x_new_dims);
+    x_grad_t.device(place) = grad_t.reshape(x_reshape_dims)
+                                 .sum(reduce_dims)
+                                 .reshape(x_grad_t.dimensions());
+  }
+  if (y_grad) {
+    dev_ctx.template Alloc<T>(y_grad);
+    auto y_grad_t = ETensor<T, Rank>::From(*y_grad, y_new_dims);
+    y_grad_t.device(place) = -grad_t.reshape(y_reshape_dims)
+                                  .sum(reduce_dims)
+                                  .reshape(y_grad_t.dimensions());
+  }
+}
+
+template <typename T, typename Context>
+void DistGradKernel(const Context& dev_ctx,
+                    const DenseTensor& x,
+                    const DenseTensor& y,
+                    const DenseTensor& out,
+                    const DenseTensor& out_grad,
+                    float p,
+                    DenseTensor* x_grad,
+                    DenseTensor* y_grad) {
+  auto x_rank = x.dims().size();
+  auto y_rank = y.dims().size();
+  auto rank = std::max(x_rank, y_rank);
+  PADDLE_ENFORCE_LE(rank,
+                    6,
+                    phi::errors::Unimplemented(
+                        "Op(dist) only support tensors with no more than 6 "
+                        "dimensions, but X's rank is %d, Y's rank is %d.",
+                        x_rank,
+                        y_rank));
+  switch (rank) {
+    case 1:
+      DistGradFunction<Context, T, 1>(
+          dev_ctx, x, y, out, out_grad, p, x_grad, y_grad);
+      break;
+    case 2:
+      DistGradFunction<Context, T, 2>(
+          dev_ctx, x, y, out, out_grad, p, x_grad, y_grad);
+      break;
+    case 3:
+      DistGradFunction<Context, T, 3>(
+          dev_ctx, x, y, out, out_grad, p, x_grad, y_grad);
+      break;
+    case 4:
+      DistGradFunction<Context, T, 4>(
+          dev_ctx, x, y, out, out_grad, p, x_grad, y_grad);
+      break;
+    case 5:
+      DistGradFunction<Context, T, 5>(
+          dev_ctx, x, y, out, out_grad, p, x_grad, y_grad);
+      break;
+    case 6:
+      DistGradFunction<Context, T, 6>(
+          dev_ctx, x, y, out, out_grad, p, x_grad, y_grad);
+      break;
+  }
+}
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/impl/dist_kernel_impl.h b/paddle/phi/kernels/impl/dist_kernel_impl.h
new file mode 100644
index 0000000000000000000000000000000000000000..397fc1b9224339b55304620ea9cf0dded570655f
--- /dev/null
+++ b/paddle/phi/kernels/impl/dist_kernel_impl.h
@@ -0,0 +1,164 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <math.h>
+#include <algorithm>
+#include <vector>
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/kernels/funcs/eigen/common.h"
+#include "paddle/phi/kernels/funcs/math_function.h"
+
+namespace phi {
+
+template <typename T,
+          size_t D,
+          int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using ETensor = phi::EigenTensor<T, D, MajorType, IndexType>;
+
+template <int Rank>
+static void GetBraodcastDims(const phi::DDim& x_dims,
+                             const phi::DDim& y_dims,
+                             Eigen::DSizes<int, Rank>* x_bcast_dims,
+                             Eigen::DSizes<int, Rank>* y_bcast_dims) {
+  int bcast_dims_remainder = 0;
+  for (int i = 0; i < x_dims.size(); ++i) {
+    if (x_dims[i] >= y_dims[i]) {
+      (*x_bcast_dims)[i] = 1;
+      (*y_bcast_dims)[i] = x_dims[i] / y_dims[i];
+      bcast_dims_remainder += x_dims[i] % y_dims[i];
+    } else {
+      (*y_bcast_dims)[i] = 1;
+      (*x_bcast_dims)[i] = y_dims[i] / x_dims[i];
+      bcast_dims_remainder += y_dims[i] % x_dims[i];
+    }
+  }
+  PADDLE_ENFORCE_EQ(bcast_dims_remainder,
+                    0,
+                    phi::errors::PreconditionNotMet(
+                        "The input tensor of Op(dist) could not be broadcast, "
+                        "X's shape is [%s], Y's shape is [%s].",
+                        x_dims,
+                        y_dims));
+}
+
+static phi::DDim GetNewDims(const phi::DDim& in_dims, int rank) {
+  std::vector<int64_t> new_dims_vec(rank);
+  if (in_dims.size() < rank) {
+    for (int i = 0; i < rank - in_dims.size(); ++i) {
+      new_dims_vec[i] = 1;
+    }
+    for (int i = 0; i < in_dims.size(); ++i) {
+      new_dims_vec[i + rank - in_dims.size()] = in_dims[i];
+    }
+  } else {
+    new_dims_vec = vectorize(in_dims);
+  }
+  return phi::make_ddim(new_dims_vec);
+}
+
+template <typename Context, typename T, int Rank>
+static void DistFunction(const Context& dev_ctx,
+                         const DenseTensor& x,
+                         const DenseTensor& y,
+                         float p,
+                         DenseTensor* out) {
+  if (out) {
+    dev_ctx.template Alloc<T>(out);
+  }
+  auto x_dims = x.dims();
+  auto y_dims = y.dims();
+
+  // new dims with same size as rank, e.g. (rank=3, (4, 3) => (1, 4, 3))
+  phi::DDim x_new_dims = GetNewDims(x_dims, Rank);
+  phi::DDim y_new_dims = GetNewDims(y_dims, Rank);
+
+  auto x_t = ETensor<T, Rank>::From(x, x_new_dims);
+  auto y_t = ETensor<T, Rank>::From(y, y_new_dims);
+  auto out_t = ETensor<T, 1>::From(*out);
+  auto& place = *dev_ctx.eigen_device();
+
+  Eigen::DSizes<int, Rank> x_bcast_dims;
+  Eigen::DSizes<int, Rank> y_bcast_dims;
+  GetBraodcastDims<Rank>(x_new_dims, y_new_dims, &x_bcast_dims, &y_bcast_dims);
+  // p=0 means number of non-zero elements of (x-y)
+  // p=inf means the maximum of |x-y|
+  // p=-inf means the minimum of |x-y|
+  // otherwise, Lp-norm = pow(sum(pow(|x-y|, p)), 1/p)
+  if (p == 0) {
+    out_t.device(place) =
+        (x_t.broadcast(x_bcast_dims) != y_t.broadcast(y_bcast_dims))
+            .template cast<T>()
+            .sum();
+  } else if (p == INFINITY) {
+    out_t.device(place) =
+        (x_t.broadcast(x_bcast_dims) - y_t.broadcast(y_bcast_dims))
+            .abs()
+            .maximum();
+  } else if (p == -INFINITY) {
+    out_t.device(place) =
+        (x_t.broadcast(x_bcast_dims) - y_t.broadcast(y_bcast_dims))
+            .abs()
+            .minimum();
+  } else {
+    out_t.device(place) =
+        (x_t.broadcast(x_bcast_dims) - y_t.broadcast(y_bcast_dims))
+            .abs()
+            .pow(p)
+            .sum()
+            .pow(1.0 / p);
+  }
+}
+
+template <typename T, typename Context>
+void DistKernel(const Context& dev_ctx,
+                const DenseTensor& x,
+                const DenseTensor& y,
+                float p,
+                DenseTensor* out) {
+  auto x_rank = x.dims().size();
+  auto y_rank = y.dims().size();
+  auto rank = std::max(x_rank, y_rank);
+  PADDLE_ENFORCE_LE(rank,
+                    6,
+                    phi::errors::Unimplemented(
+                        "Op(dist) only support tensors with no more than 6 "
+                        "dimensions, but X's rank is %d, Y's rank is %d.",
+                        x_rank,
+                        y_rank));
+  switch (rank) {
+    case 1:
+      DistFunction<Context, T, 1>(dev_ctx, x, y, p, out);
+      break;
+    case 2:
+      DistFunction<Context, T, 2>(dev_ctx, x, y, p, out);
+      break;
+    case 3:
+      DistFunction<Context, T, 3>(dev_ctx, x, y, p, out);
+      break;
+    case 4:
+      DistFunction<Context, T, 4>(dev_ctx, x, y, p, out);
+      break;
+    case 5:
+      DistFunction<Context, T, 5>(dev_ctx, x, y, p, out);
+      break;
+    case 6:
+      DistFunction<Context, T, 6>(dev_ctx, x, y, p, out);
+      break;
+  }
+}
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/impl/eigh_grad_kernel_impl.h b/paddle/phi/kernels/impl/eigh_grad_kernel_impl.h
new file mode 100644
index 0000000000000000000000000000000000000000..5b71fd7fa3a5ecd1c864c155df2586d293d3d2e6
--- /dev/null
+++ b/paddle/phi/kernels/impl/eigh_grad_kernel_impl.h
@@ -0,0 +1,73 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/kernels/complex_kernel.h"
+#include "paddle/phi/kernels/funcs/diag_functor.h"
+#include "paddle/phi/kernels/funcs/eigen/common.h"
+#include "paddle/phi/kernels/funcs/math_function.h"
+#include "paddle/phi/kernels/funcs/unsqueeze.h"
+#include "paddle/phi/kernels/math_kernel.h"
+#include "paddle/phi/kernels/matmul_kernel.h"
+#include "paddle/phi/kernels/transpose_kernel.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void EighGradKernel(const Context& dev_ctx,
+                    const DenseTensor& out_w,
+                    const DenseTensor& out_v,
+                    const DenseTensor& dout_w,
+                    const DenseTensor& dout_v,
+                    DenseTensor* dx) {
+  dev_ctx.template Alloc<T>(dx);
+  auto& dims = out_v.dims();
+  const int m = dims[dims.size() - 1];
+  DenseTensor tV =
+      phi::TransposeLast2Dim<T>(dev_ctx, phi::Conj<T>(dev_ctx, out_v));
+  DenseTensor W =
+      phi::Subtract<phi::dtype::Real<T>>(dev_ctx,
+                                         phi::funcs::Unsqueeze(out_w, -2),
+                                         phi::funcs::Unsqueeze(out_w, -1));
+  DenseTensor result = phi::Matmul<T>(dev_ctx, tV, dout_v);
+  result.Resize(dims);
+  dev_ctx.template Alloc<T>(&result);
+
+  std::vector<int> out_shape = phi::vectorize<int>(dims);
+  DenseTensor constant;
+  constant.Resize(phi::make_ddim(out_shape));
+  dev_ctx.template Alloc<T>(&constant);
+  phi::funcs::SetConstant<Context, T>()(dev_ctx, &constant, T(0.5));
+  result = phi::Subtract<T>(
+      dev_ctx,
+      result,
+      phi::Conj<T>(dev_ctx, phi::TransposeLast2Dim<T>(dev_ctx, result)));
+  result = phi::Multiply<T>(dev_ctx, result, constant);
+  if (result.type() != W.type()) {
+    auto x_vector = EigenVector<T>::Flatten(result);
+    auto y_vector = EigenVector<phi::dtype::Real<T>>::Flatten(W);
+    auto out_vector = EigenVector<T>::Flatten(result);
+    auto& place = *dev_ctx.eigen_device();
+    out_vector.device(place) = x_vector / y_vector;
+  } else {
+    result = phi::Divide<T>(dev_ctx, result, W);
+  }
+  result = phi::funcs::DiagFill<T, phi::dtype::Real<T>>(
+      dev_ctx, m, m, m, 0, dout_w, result);
+  *dx = phi::Matmul<T>(dev_ctx, out_v, phi::Matmul<T>(dev_ctx, result, tV));
+}
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h b/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h
index ac7d6fd1a0e9ca18ef528399581a30bfe0be5597..0b7a5d3bcb26a360eb5f7f664ead7932f428cc64 100644
--- a/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h
+++ b/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h
@@ -14,8 +14,11 @@ limitations under the License. */
 
 #pragma once
 
+#include "paddle/phi/common/complex.h"
 #include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/kernels/copy_kernel.h"
 #include "paddle/phi/kernels/funcs/broadcast_function.h"
+#include "paddle/phi/kernels/funcs/eigen/common.h"
 #include "paddle/phi/kernels/funcs/elementwise_functor.h"
 
 namespace phi {
@@ -103,4 +106,526 @@ void SubtractDoubleGradImpl(const Context& dev_ctx,
   }
 }
 
+/*
+******************************
+    Divide Grad
+******************************
+*/
+
+template <typename T>
+struct DivGradDX {
+  HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout / y; }
+};
+
+template <typename T>
+struct DivGradDX<phi::dtype::complex<T>> {
+  HOSTDEVICE phi::dtype::complex<T> operator()(
+      phi::dtype::complex<T> x,
+      phi::dtype::complex<T> y,
+      phi::dtype::complex<T> out,
+      phi::dtype::complex<T> dout) const {
+    phi::dtype::complex<T> y_conj(y.real, -y.imag);
+    return dout / y_conj;
+  }
+};
+
+template <typename T>
+struct DivGradDY {
+  HOSTDEVICE T operator()(T x, T y, T out, T dout) const {
+    return -dout * out / y;
+  }
+};
+
+template <typename T>
+struct DivGradDY<paddle::platform::complex<T>> {
+  HOSTDEVICE phi::dtype::complex<T> operator()(
+      phi::dtype::complex<T> x,
+      phi::dtype::complex<T> y,
+      phi::dtype::complex<T> out,
+      phi::dtype::complex<T> dout) const {
+    phi::dtype::complex<T> out_div_y_conj((out / y).real, -(out / y).imag);
+    return -dout * out_div_y_conj;
+  }
+};
+
+template <typename T>
+struct DivDoubleDY {
+  HOSTDEVICE T operator()(T x, T y, T out, T dout) const {
+    return y * out * dout - x * dout;
+  }
+};
+
+template <typename T, typename Context>
+void DivideDoubleGradKernel(const Context& dev_ctx,
+                            const DenseTensor& y,
+                            const DenseTensor& out,
+                            const DenseTensor& dx,
+                            paddle::optional<const DenseTensor&> ddx,
+                            paddle::optional<const DenseTensor&> ddy,
+                            int axis,
+                            DenseTensor* dy,
+                            DenseTensor* dout,
+                            DenseTensor* ddout) {
+  if (dy) {
+    dy->Resize(y.dims());
+    dev_ctx.template Alloc<T>(dy);
+  }
+  if (dout) {
+    dout->Resize(out.dims());
+    dev_ctx.template Alloc<T>(dout);
+  }
+  if (ddout) {
+    ddout->Resize(out.dims());
+    dev_ctx.template Alloc<T>(ddout);
+  }
+  // ddX_safe == null ? 0 : ddX
+  // ddY_safe == null ? 0 : ddY
+  DenseTensor ddX_safe, ddY_safe;
+  phi::funcs::GetDoubleGradSafeTensor<Context, T>(
+      dev_ctx, dx, ddx.get_ptr(), &ddX_safe);
+  phi::funcs::GetDoubleGradSafeTensor<Context, T>(
+      dev_ctx, y, ddy.get_ptr(), &ddY_safe);
+
+  // ddOut = ddX / Y - Out * ddY / Y = (ddX - Out * ddY) / Y
+  // dY = Out * dX * ddY / Y - dX * ddX / Y
+  // dOut = - dX * ddY
+  // To save memory, (1) dout can be used as 'tmp' tensor, (2) ddout can
+  // inplace ddx
+  DenseTensor tmp;
+  if (dout) {
+    tmp = *dout;
+  } else {
+    tmp.Resize(out.dims());
+    dev_ctx.template Alloc<T>(&tmp);
+  }
+  if (dy) {
+    // dX_div_Y = dX / Y;
+    DenseTensor dX_div_Y = tmp;
+    funcs::DefaultElementwiseOperator<Context,
+                                      T,
+                                      funcs::DivideFunctor<T>,
+                                      funcs::InverseDivideFunctor<T>>(
+        dev_ctx, dx, y, &dX_div_Y, axis);
+
+    // NOTE(dengkaipeng): in the following ElemwiseGradCompute, for the
+    // first output tensor is nullptr, the branch to calculate first
+    // output tensor will not be activated, DivGradDx function will not
+    // be called and can be ignored, the first branch has little effect
+    // on running speed.
+
+    // dY = Out * dX * ddY / Y - dX * ddX / Y
+    phi::funcs::ElemwiseGradCompute<Context, T, DivGradDX<T>, DivDoubleDY<T>>(
+        dev_ctx,
+        ddX_safe,
+        ddY_safe,
+        out,
+        dX_div_Y,
+        axis,
+        nullptr,
+        dy,
+        DivGradDX<T>(),
+        DivDoubleDY<T>());
+  }
+
+  if (ddout) {
+    // ddOut = ddX / Y - Out * ddY / Y = (ddX - Out * ddY) / Y
+    funcs::DefaultElementwiseOperator<Context,
+                                      T,
+                                      funcs::MultiplyFunctor<T>,
+                                      funcs::InverseMultiplyFunctor<T>>(
+        dev_ctx, out, ddY_safe, &tmp, axis);
+    funcs::DefaultElementwiseOperator<Context,
+                                      T,
+                                      funcs::SubtractFunctor<T>,
+                                      funcs::InverseSubtractFunctor<T>>(
+        dev_ctx, ddX_safe, tmp, &tmp, axis);
+    funcs::DefaultElementwiseOperator<Context,
+                                      T,
+                                      funcs::DivideFunctor<T>,
+                                      funcs::InverseDivideFunctor<T>>(
+        dev_ctx, tmp, y, ddout, axis);
+  }
+
+  if (dout) {
+    // dOut = - dX * ddY
+    funcs::DefaultElementwiseOperator<Context,
+                                      T,
+                                      funcs::MultiplyFunctor<T>,
+                                      funcs::InverseMultiplyFunctor<T>>(
+        dev_ctx, dx, ddY_safe, dout, axis);
+    auto& place = *dev_ctx.eigen_device();
+    auto dout_result = phi::EigenVector<T>::Flatten(*dout);
+    dout_result.device(place) = static_cast<T>(-1) * dout_result;
+  }
+}
+template <typename T, typename Context>
+void ElementwiseFMaxGradKernel(const Context& dev_ctx,
+                               const DenseTensor& x,
+                               const DenseTensor& y,
+                               const DenseTensor& out_grad,
+                               int axis,
+                               DenseTensor* x_grad,
+                               DenseTensor* y_grad) {
+  funcs::ElementwiseGradPreProcess(out_grad, x_grad);
+
+  auto out = out_grad;  // Fake out, not used
+  auto x_dim = x.dims();
+  auto y_dim = y.dims();
+  if (x.dims() == y.dims()) {
+    funcs::ElemwiseGradComputeNoBroadcast<Context,
+                                          T,
+                                          funcs::FMaxGradDx<T>,
+                                          funcs::FMaxGradDy<T>>(
+        dev_ctx,
+        x_dim,
+        y_dim,
+        x,
+        y,
+        out,
+        out_grad,
+        axis,
+        x_grad,
+        y_grad,
+        funcs::FMaxGradDx<T>(),
+        funcs::FMaxGradDy<T>());
+  } else {
+    funcs::ElemwiseGradComputeWithBroadcast<T,
+                                            funcs::FMaxGradDx<T>,
+                                            funcs::FMaxGradDy<T>>(
+        dev_ctx,
+        x_dim,
+        y_dim,
+        x,
+        y,
+        out,
+        out_grad,
+        axis,
+        x_grad,
+        y_grad,
+        funcs::FMaxGradDx<T>(),
+        funcs::FMaxGradDy<T>());
+  }
+}
+
+template <typename T, typename Context>
+void ElementwiseFMinGradKernel(const Context& dev_ctx,
+                               const DenseTensor& x,
+                               const DenseTensor& y,
+                               const DenseTensor& out_grad,
+                               int axis,
+                               DenseTensor* x_grad,
+                               DenseTensor* y_grad) {
+  funcs::ElementwiseGradPreProcess(out_grad, x_grad);
+  auto out = out_grad;  // Fake out, not used
+  auto x_dim = x.dims();
+  auto y_dim = y.dims();
+  if (x.dims() == y.dims()) {
+    funcs::ElemwiseGradComputeNoBroadcast<Context,
+                                          T,
+                                          funcs::FMinGradDx<T>,
+                                          funcs::FMinGradDy<T>>(
+        dev_ctx,
+        x_dim,
+        y_dim,
+        x,
+        y,
+        out,
+        out_grad,
+        axis,
+        x_grad,
+        y_grad,
+        funcs::FMinGradDx<T>(),
+        funcs::FMinGradDy<T>());
+  } else {
+    funcs::ElemwiseGradComputeWithBroadcast<T,
+                                            funcs::FMinGradDx<T>,
+                                            funcs::FMinGradDy<T>>(
+        dev_ctx,
+        x_dim,
+        y_dim,
+        x,
+        y,
+        out,
+        out_grad,
+        axis,
+        x_grad,
+        y_grad,
+        funcs::FMinGradDx<T>(),
+        funcs::FMinGradDy<T>());
+  }
+}
+
+template <typename T>
+struct MulGradDX {
+  HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout * y; }
+};
+
+template <typename T>
+struct MulGradDX<phi::dtype::complex<T>> {
+  HOSTDEVICE phi::dtype::complex<T> operator()(
+      phi::dtype::complex<T> x,
+      phi::dtype::complex<T> y,
+      phi::dtype::complex<T> out,
+      phi::dtype::complex<T> dout) const {
+    phi::dtype::complex<T> y_conj(y.real, -y.imag);
+    return dout * y_conj;
+  }
+};
+
+/*
+******************************
+    Multiply Grad
+******************************
+*/
+
+template <typename T>
+struct MulGradDY {
+  HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout * x; }
+};
+
+template <typename T>
+struct MulGradDY<phi::dtype::complex<T>> {
+  HOSTDEVICE phi::dtype::complex<T> operator()(
+      phi::dtype::complex<T> x,
+      phi::dtype::complex<T> y,
+      phi::dtype::complex<T> out,
+      phi::dtype::complex<T> dout) const {
+    phi::dtype::complex<T> x_conj(x.real, -x.imag);
+    return dout * x_conj;
+  }
+};
+
+template <typename T, typename Context>
+void MultiplyDoubleGradKernel(const Context& dev_ctx,
+                              const DenseTensor& x,
+                              const DenseTensor& y,
+                              const DenseTensor& dout,
+                              paddle::optional<const DenseTensor&> ddx,
+                              paddle::optional<const DenseTensor&> ddy,
+                              int axis,
+                              DenseTensor* dx,
+                              DenseTensor* dy,
+                              DenseTensor* ddout) {
+  if (ddout) dev_ctx.template Alloc<T>(ddout);
+
+  DenseTensor ddx_safe, ddy_safe;
+  funcs::GetDoubleGradSafeTensor<Context, T>(
+      dev_ctx, x, ddx.get_ptr(), &ddx_safe);
+  funcs::GetDoubleGradSafeTensor<Context, T>(
+      dev_ctx, y, ddy.get_ptr(), &ddy_safe);
+
+  // dx = dout * ddy
+  // dy = dout * ddx
+  // ddout = ddx * y + x * ddy
+  // change computation sequence to save memory, so ddout can inplace ddx and
+  // dx can be used as 'tmp' tensor
+  // (1) dx = x * ddy
+  // (2) dy = dout * ddx
+  // (3) ddout = ddx * y
+  // (4) ddout = ddout + dx
+  // (5) dx = dout * ddy
+  if (ddout) {
+    auto& place = *dev_ctx.eigen_device();
+    // size(ddout) > size(ddx), ddout can't use memory of ddx using inplace
+    if (ddout->numel() > ddx.get_ptr()->numel()) {
+      phi::funcs::ElemwiseGradCompute<Context, T, MulGradDX<T>, MulGradDY<T>>(
+          dev_ctx,
+          ddx_safe,
+          ddy_safe,
+          dout,
+          dout,
+          axis,
+          dx,
+          dy,
+          MulGradDX<T>(),
+          MulGradDY<T>());
+
+      DenseTensor ddout_tmp;
+      ddout_tmp.Resize(ddout->dims());
+      dev_ctx.template Alloc<T>(&ddout_tmp);
+
+      funcs::DefaultElementwiseOperator<Context,
+                                        T,
+                                        funcs::MultiplyFunctor<T>,
+                                        funcs::InverseMultiplyFunctor<T>>(
+          dev_ctx, y, ddx_safe, ddout, axis);
+      funcs::DefaultElementwiseOperator<Context,
+                                        T,
+                                        funcs::MultiplyFunctor<T>,
+                                        funcs::InverseMultiplyFunctor<T>>(
+          dev_ctx, ddy_safe, x, &ddout_tmp, axis);
+
+      auto ddout_t = phi::EigenVector<T>::Flatten(*ddout);
+      auto ddout_tmp_t = phi::EigenVector<T>::Flatten(ddout_tmp);
+      ddout_t.device(place) = ddout_t + ddout_tmp_t;
+    } else {
+      // use dx to save memory, other than alloc tmp tensor
+      DenseTensor* ddout_tmp = dx;
+      funcs::DefaultElementwiseOperator<Context,
+                                        T,
+                                        funcs::MultiplyFunctor<T>,
+                                        funcs::InverseMultiplyFunctor<T>>(
+          dev_ctx, x, ddy_safe, ddout_tmp, axis);
+      // NOTE: in the following ElemwiseGradCompute, for the
+      // first output tensor is nullptr, the branch to calculate first
+      // output tensor will not be activated, DivGradDx function will not
+      // be called and can be ignored, the first branch has little effect
+      // on running speed.
+      phi::funcs::ElemwiseGradCompute<Context, T, MulGradDX<T>, MulGradDY<T>>(
+          dev_ctx,
+          ddx_safe,
+          ddy_safe,
+          dout,
+          dout,
+          axis,
+          nullptr,
+          dy,
+          MulGradDX<T>(),
+          MulGradDY<T>());
+      funcs::DefaultElementwiseOperator<Context,
+                                        T,
+                                        funcs::MultiplyFunctor<T>,
+                                        funcs::InverseMultiplyFunctor<T>>(
+          dev_ctx, ddx_safe, y, ddout, axis);
+
+      auto ddout_t = phi::EigenVector<T>::Flatten(*ddout);
+      auto ddout_tmp_t = phi::EigenVector<T>::Flatten(*ddout_tmp);
+      ddout_t.device(place) = ddout_t + ddout_tmp_t;
+      funcs::DefaultElementwiseOperator<Context,
+                                        T,
+                                        funcs::MultiplyFunctor<T>,
+                                        funcs::InverseMultiplyFunctor<T>>(
+          dev_ctx, dout, ddy_safe, dx, axis);
+    }
+  }
+}
+
+template <typename T, typename Context>
+void MultiplyTripleGradKernel(const Context& dev_ctx,
+                              const DenseTensor& x,
+                              const DenseTensor& y,
+                              const DenseTensor& dout,
+                              paddle::optional<const DenseTensor&> ddx,
+                              paddle::optional<const DenseTensor&> ddy,
+                              const DenseTensor& d_dx,
+                              const DenseTensor& d_dy,
+                              paddle::optional<const DenseTensor&> d_ddout,
+                              int axis,
+                              DenseTensor* d_x,
+                              DenseTensor* d_y,
+                              DenseTensor* d_dout,
+                              DenseTensor* d_ddx,
+                              DenseTensor* d_ddy) {
+  if (d_x) {
+    d_x->Resize(x.dims());
+    dev_ctx.template Alloc<T>(d_x);
+  }
+  if (d_y) {
+    d_y->Resize(y.dims());
+    dev_ctx.template Alloc<T>(d_y);
+  }
+  if (d_dout) {
+    d_dout->Resize(dout.dims());
+    dev_ctx.template Alloc<T>(d_dout);
+  }
+  if (d_ddx) {
+    d_ddx->Resize(x.dims());
+    dev_ctx.template Alloc<T>(d_ddx);
+  }
+  if (d_ddy) {
+    d_ddy->Resize(y.dims());
+    dev_ctx.template Alloc<T>(d_ddy);
+  }
+
+  auto& place = *dev_ctx.eigen_device();
+
+  DenseTensor ddx_safe, ddy_safe;
+  funcs::GetDoubleGradSafeTensor<Context, T>(
+      dev_ctx, x, ddx.get_ptr(), &ddx_safe);
+  funcs::GetDoubleGradSafeTensor<Context, T>(
+      dev_ctx, y, ddy.get_ptr(), &ddy_safe);
+
+  if (d_ddout.get_ptr()) {
+    if (d_x) {
+      // d_x = ddy * d_ddout
+      funcs::DefaultElementwiseOperator<Context,
+                                        T,
+                                        funcs::MultiplyFunctor<T>,
+                                        funcs::InverseMultiplyFunctor<T>>(
+          dev_ctx, ddy_safe, *(d_ddout.get_ptr()), d_x, axis);
+    }
+    if (d_y) {
+      // d_y = ddx * d_ddout
+      funcs::DefaultElementwiseOperator<Context,
+                                        T,
+                                        funcs::MultiplyFunctor<T>,
+                                        funcs::InverseMultiplyFunctor<T>>(
+          dev_ctx, ddx_safe, *(d_ddout.get_ptr()), d_y, axis);
+    }
+  }
+
+  if (d_dout) {
+    // get d_dout
+    // d_dout = ddy * d_dx + d_dy * ddx
+    DenseTensor d_dout_tmp;
+    d_dout_tmp.Resize(dout.dims());
+    dev_ctx.template Alloc<T>(&d_dout_tmp);
+    funcs::DefaultElementwiseOperator<Context,
+                                      T,
+                                      funcs::MultiplyFunctor<T>,
+                                      funcs::InverseMultiplyFunctor<T>>(
+        dev_ctx, d_dy, ddx_safe, d_dout, axis);
+    funcs::DefaultElementwiseOperator<Context,
+                                      T,
+                                      funcs::MultiplyFunctor<T>,
+                                      funcs::InverseMultiplyFunctor<T>>(
+        dev_ctx, ddy_safe, d_dx, &d_dout_tmp, axis);
+    auto d_dout_t = phi::EigenVector<T>::Flatten(*d_dout);
+    auto d_dout_tmp_t = phi::EigenVector<T>::Flatten(d_dout_tmp);
+    d_dout_t.device(place) = d_dout_t + d_dout_tmp_t;
+  }
+
+  if (d_ddx) {
+    // get d_ddx
+    // d_ddx = dout * d_dy + y * d_ddout
+    DenseTensor d_ddx_tmp;
+    d_ddx_tmp.Resize(ddx->dims());
+    dev_ctx.template Alloc<T>(&d_ddx_tmp);
+    funcs::DefaultElementwiseOperator<Context,
+                                      T,
+                                      funcs::MultiplyFunctor<T>,
+                                      funcs::InverseMultiplyFunctor<T>>(
+        dev_ctx, dout, d_dy, d_ddx, axis);
+    funcs::DefaultElementwiseOperator<Context,
+                                      T,
+                                      funcs::MultiplyFunctor<T>,
+                                      funcs::InverseMultiplyFunctor<T>>(
+        dev_ctx, y, *(d_ddout.get_ptr()), &d_ddx_tmp, axis);
+    auto d_ddx_t = phi::EigenVector<T>::Flatten(*d_ddx);
+    auto d_ddx_tmp_t = phi::EigenVector<T>::Flatten(d_ddx_tmp);
+    d_ddx_t.device(place) = d_ddx_t + d_ddx_tmp_t;
+  }
+
+  if (d_ddy) {
+    // get d_ddy
+    // d_ddy = dout * d_dx + x * d_ddout
+    DenseTensor d_ddy_tmp;
+    d_ddy_tmp.Resize(ddy->dims());
+    dev_ctx.template Alloc<T>(&d_ddy_tmp);
+    funcs::DefaultElementwiseOperator<Context,
+                                      T,
+                                      funcs::MultiplyFunctor<T>,
+                                      funcs::InverseMultiplyFunctor<T>>(
+        dev_ctx, dout, d_dx, d_ddy, axis);
+    funcs::DefaultElementwiseOperator<Context,
+                                      T,
+                                      funcs::MultiplyFunctor<T>,
+                                      funcs::InverseMultiplyFunctor<T>>(
+        dev_ctx, x, *(d_ddout.get_ptr()), &d_ddy_tmp, axis);
+    auto d_ddy_t = phi::EigenVector<T>::Flatten(*d_ddy);
+    auto d_ddy_tmp_t = phi::EigenVector<T>::Flatten(d_ddy_tmp);
+    d_ddy_t.device(place) = d_ddy_t + d_ddy_tmp_t;
+  }
+}
+
 }  // namespace phi
diff --git a/paddle/phi/kernels/impl/elementwise_kernel_impl.h b/paddle/phi/kernels/impl/elementwise_kernel_impl.h
new file mode 100644
index 0000000000000000000000000000000000000000..775a91bf026d298a61315a7e2d7ebfbe92efb0b5
--- /dev/null
+++ b/paddle/phi/kernels/impl/elementwise_kernel_impl.h
@@ -0,0 +1,47 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/kernels/elementwise_kernel.h"
+#include "paddle/phi/kernels/funcs/elementwise_base.h"
+#include "paddle/phi/kernels/funcs/elementwise_functor.h"
+#if defined(__NVCC__) || defined(__HIPCC__)
+#include "paddle/phi/kernels/funcs/broadcast_function.h"
+#endif
+
+namespace phi {
+template <typename T, typename Context>
+void ElementwiseFMaxKernel(const Context& dev_ctx,
+                           const DenseTensor& x,
+                           const DenseTensor& y,
+                           int axis,
+                           DenseTensor* out) {
+  dev_ctx.template Alloc<T>(out);
+  funcs::ElementwiseCompute<funcs::FMaxFunctor<T>, T, T>(
+      dev_ctx, x, y, axis, funcs::FMaxFunctor<T>(), out);
+}
+
+template <typename T, typename Context>
+void ElementwiseFMinKernel(const Context& dev_ctx,
+                           const DenseTensor& x,
+                           const DenseTensor& y,
+                           int axis,
+                           DenseTensor* out) {
+  dev_ctx.template Alloc<T>(out);
+  funcs::ElementwiseCompute<funcs::FMinFunctor<T>, T, T>(
+      dev_ctx, x, y, axis, funcs::FMinFunctor<T>(), out);
+}
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/impl/erf_grad_kernel_impl.h b/paddle/phi/kernels/impl/erf_grad_kernel_impl.h
new file mode 100644
index 0000000000000000000000000000000000000000..5908d9d7dcb50d617d6796f26e2e8793d06b8409
--- /dev/null
+++ b/paddle/phi/kernels/impl/erf_grad_kernel_impl.h
@@ -0,0 +1,40 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/phi/backends/all_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/erf_grad_kernel.h"
+#include "paddle/phi/kernels/funcs/eigen/common.h"
+#include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void ErfGradKernel(const Context& dev_ctx,
+                   const DenseTensor& x,
+                   const DenseTensor& out_grad,
+                   DenseTensor* x_grad) {
+  dev_ctx.template Alloc<T>(x_grad);
+
+  auto eigen_x = EigenVector<T>::Flatten(x);
+  auto eigen_dout = EigenVector<T>::Flatten(out_grad);
+  auto eigen_dx = EigenVector<T>::Flatten(*x_grad);
+  auto& place = *dev_ctx.eigen_device();
+  phi::funcs::EigenErfGrad<std::decay_t<decltype(place)>, T>::Eval(
+      place, eigen_dx, eigen_x, eigen_dout);
+}
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/impl/erf_kernel_impl.h b/paddle/phi/kernels/impl/erf_kernel_impl.h
new file mode 100644
index 0000000000000000000000000000000000000000..aa1f4d349ab71ee0b19a3a2d3ad3d3a47553464c
--- /dev/null
+++ b/paddle/phi/kernels/impl/erf_kernel_impl.h
@@ -0,0 +1,36 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/phi/backends/all_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/erf_kernel.h"
+#include "paddle/phi/kernels/funcs/eigen/common.h"
+#include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void ErfKernel(const Context& dev_ctx, const DenseTensor& x, DenseTensor* out) {
+  dev_ctx.template Alloc<T>(out);
+
+  auto eigen_out = EigenVector<T>::Flatten(*out);
+  auto eigen_in = EigenVector<T>::Flatten(x);
+  auto& place = *dev_ctx.eigen_device();
+  phi::funcs::EigenErf<std::decay_t<decltype(place)>, T>::Eval(
+      place, eigen_out, eigen_in);
+}
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/impl/expand_as_grad_kernel_impl.h b/paddle/phi/kernels/impl/expand_as_grad_kernel_impl.h
new file mode 100644
index 0000000000000000000000000000000000000000..6ef282d470333e8099b668e5dd7d2e4c68beff3e
--- /dev/null
+++ b/paddle/phi/kernels/impl/expand_as_grad_kernel_impl.h
@@ -0,0 +1,129 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/kernels/copy_kernel.h"
+#include "paddle/phi/kernels/impl/expand_as_kernel_impl.h"
+
+namespace phi {
+template <typename Context, typename T, int Dims>
+void ExpandAsBackward(const Context& ctx,
+                      const DenseTensor& out_grad,
+                      const std::vector<int>& reshape_dims_vec,
+                      const std::vector<int>& reduce_dims_vec,
+                      DenseTensor* in_grad) {
+  size_t reshape_size = reshape_dims_vec.size();
+  size_t reduce_size = reduce_dims_vec.size();
+  ctx.template Alloc<T>(in_grad);
+  auto x_grad = EigenVector<T>::Flatten(*in_grad);
+  Eigen::DSizes<Eigen::DenseIndex, Dims * 2> reshape_dims;
+  for (size_t i = 0; i < reshape_size; ++i) {
+    reshape_dims[i] = reshape_dims_vec[i];
+  }
+  Eigen::DSizes<Eigen::DenseIndex, Dims> reduce_dims;
+  for (size_t i = 0; i < reduce_size; ++i) {
+    reduce_dims[i] = reduce_dims_vec[i];
+  }
+  auto out_grad0 = EigenVector<T>::Flatten(out_grad);
+  auto& place = *ctx.eigen_device();
+  funcs::EigenBroadcastGrad<std::decay_t<decltype(place)>, T, Dims>::Eval(
+      place, x_grad, out_grad0, reduce_dims, reshape_dims);
+}
+
+template <typename T, typename Context>
+void ExpandAsGradKernel(const Context& context,
+                        const DenseTensor& x,
+                        const DenseTensor& out_grad,
+                        const std::vector<int>& target_shape,
+                        DenseTensor* in_grad) {
+  auto x_dims = x.dims();
+  auto vec_in_dims = phi::vectorize<int>(x_dims);
+  auto diff = target_shape.size() - vec_in_dims.size();
+  vec_in_dims.insert(vec_in_dims.begin(), diff, 1);
+  std::vector<int> repeat_times(vec_in_dims.size());
+  for (size_t i = 0; i < vec_in_dims.size(); ++i) {
+    repeat_times[i] = target_shape[i] / vec_in_dims[i];
+  }
+  std::vector<int> reshape_dims_vec;
+  std::vector<int> reduce_dims_vec;
+  for (size_t i = 0; i < repeat_times.size(); ++i) {
+    reduce_dims_vec.push_back(reshape_dims_vec.size());
+    reshape_dims_vec.push_back(repeat_times[i]);
+    reshape_dims_vec.push_back(vec_in_dims[i]);
+  }
+
+  int dims = reduce_dims_vec.size();
+  bool just_copy = true;
+  for (size_t i = 0; i < repeat_times.size(); i++) {
+    if (repeat_times[i] != 1) {
+      just_copy = false;
+      break;
+    }
+  }
+  // no need reduce, just copy
+  if (just_copy) {
+    context.template Alloc<T>(in_grad);
+    phi::Copy(context, out_grad, context.GetPlace(), false, in_grad);
+  } else {
+    PADDLE_ENFORCE_GE(
+        dims,
+        1,
+        errors::InvalidArgument("The rank of the input 'Out@GRAD' for "
+                                "expand_as_v2_grad op must be greater than or "
+                                "equal to 1, but the value received is %d.",
+                                dims));
+    PADDLE_ENFORCE_LE(dims,
+                      MAX_RANK_SUPPORTED,
+                      errors::InvalidArgument(
+                          "The rank of the input 'Out@GRAD' for "
+                          "expand_as_v2_grad op must be less than or equal "
+                          "to %d, but the value received is %d.",
+                          MAX_RANK_SUPPORTED,
+                          dims));
+    switch (dims) {
+      case 1:
+        ExpandAsBackward<Context, T, 1>(
+            context, out_grad, reshape_dims_vec, reduce_dims_vec, in_grad);
+        break;
+      case 2:
+        ExpandAsBackward<Context, T, 2>(
+            context, out_grad, reshape_dims_vec, reduce_dims_vec, in_grad);
+        break;
+      case 3:
+        ExpandAsBackward<Context, T, 3>(
+            context, out_grad, reshape_dims_vec, reduce_dims_vec, in_grad);
+        break;
+      case 4:
+        ExpandAsBackward<Context, T, 4>(
+            context, out_grad, reshape_dims_vec, reduce_dims_vec, in_grad);
+        break;
+      case 5:
+        ExpandAsBackward<Context, T, 5>(
+            context, out_grad, reshape_dims_vec, reduce_dims_vec, in_grad);
+        break;
+      case 6:
+        ExpandAsBackward<Context, T, 6>(
+            context, out_grad, reshape_dims_vec, reduce_dims_vec, in_grad);
+        break;
+      default:
+        PADDLE_THROW(errors::InvalidArgument(
+            "Only support tensor with rank being between 1 and 6. But "
+            "received tensor's rank = %d.",
+            dims));
+    }
+  }
+}
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/impl/expand_as_kernel_impl.h b/paddle/phi/kernels/impl/expand_as_kernel_impl.h
new file mode 100644
index 0000000000000000000000000000000000000000..e5138e4e12c05403ceb47371e03e9cbe207bf9a4
--- /dev/null
+++ b/paddle/phi/kernels/impl/expand_as_kernel_impl.h
@@ -0,0 +1,145 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <algorithm>
+#include <vector>
+
+#include "paddle/phi/kernels/funcs/eigen/common.h"
+#include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
+
+#define MAX_RANK_SUPPORTED 6
+
+namespace phi {
+
+template <typename Context, typename T, int Rank>
+void ExpandAs(const Context& context,
+              const DenseTensor& x,
+              const std::vector<int>& target_shape,
+              DenseTensor* out) {
+  auto in_dims = x.dims();
+  auto vec_in_dims = phi::vectorize<int>(in_dims);
+  auto diff = target_shape.size() - vec_in_dims.size();
+  vec_in_dims.insert(vec_in_dims.begin(), diff, 1);
+  std::vector<int> repeat_times(vec_in_dims.size());
+  for (size_t i = 0; i < vec_in_dims.size(); ++i) {
+    PADDLE_ENFORCE_NE(
+        target_shape[i],
+        0,
+        errors::InvalidArgument("The value of target shape cannot be zero."));
+    if (i < diff) {
+      PADDLE_ENFORCE_GT(
+          target_shape[i],
+          0,
+          errors::InvalidArgument(
+              "The expanded size (%d) for non-existing dimensions must be "
+              "positive for expand_as_v2 op.",
+              target_shape[i]));
+      repeat_times[i] = target_shape[i];
+    } else if (target_shape[i] > 0) {
+      if (vec_in_dims[i] != 1) {
+        PADDLE_ENFORCE_EQ(
+            vec_in_dims[i],
+            target_shape[i],
+            errors::InvalidArgument(
+                "The value (%d) of the non-singleton dimension does not match"
+                " the corresponding value (%d) in shape for expand_as_v2 op.",
+                vec_in_dims[i],
+                target_shape[i]));
+        repeat_times[i] = 1;
+      } else {
+        repeat_times[i] = target_shape[i];
+      }
+    } else {
+      PADDLE_ENFORCE_EQ(
+          target_shape[i],
+          -1,
+          errors::InvalidArgument(
+              "When the value in shape is negative for expand_as_v2 op, "
+              "only -1 is supported, but the value received is %d.",
+              target_shape[i]));
+      repeat_times[i] = 1;
+    }
+  }
+  Eigen::DSizes<Eigen::DenseIndex, Rank> bcast_dims;
+  for (size_t i = 0; i < repeat_times.size(); ++i) {
+    bcast_dims[i] = repeat_times[i];
+  }
+
+  phi::DDim new_in_dims = phi::make_ddim(vec_in_dims);
+  phi::DDim out_dims = phi::make_ddim(target_shape);
+
+  out->Resize(out_dims);
+  context.template Alloc<T>(out);
+  auto x0 = EigenTensor<T, Rank>::From(x, new_in_dims);
+  auto y = EigenTensor<T, Rank>::From(*out, out_dims);
+  auto& place = *context.eigen_device();
+  funcs::EigenBroadcast<std::decay_t<decltype(place)>, T, Rank>::Eval(
+      place, y, x0, bcast_dims);
+}
+
+template <typename T, typename Context>
+void ExpandAsKernel(const Context& ctx,
+                    const DenseTensor& x,
+                    paddle::optional<const DenseTensor&> y,
+                    const std::vector<int>& target_shape,
+                    DenseTensor* out) {
+  auto rank = x.dims().size();
+  auto target_rank = target_shape.size();
+  PADDLE_ENFORCE_GE(target_rank,
+                    rank,
+                    errors::InvalidArgument(
+                        "The rank (%d) of the input 'target_tensor' for "
+                        "expand_as_v2 op must be greater than or equal to "
+                        "the rank (%d) of the input 'x'.",
+                        target_rank,
+                        rank));
+  PADDLE_ENFORCE_GE(
+      rank,
+      1,
+      errors::InvalidArgument("The rank (%d) of the input 'x' for "
+                              "expand_as_v2 op must be positive.",
+                              rank));
+  PADDLE_ENFORCE_LE(target_rank,
+                    MAX_RANK_SUPPORTED,
+                    errors::InvalidArgument(
+                        "The rank (%d) of the input 'target_tensor' for "
+                        "expand_as_v2 op must be less than or equal to %d.",
+                        target_rank,
+                        MAX_RANK_SUPPORTED));
+
+  switch (target_rank) {
+    case 1:
+      ExpandAs<Context, T, 1>(ctx, x, target_shape, out);
+      break;
+    case 2:
+      ExpandAs<Context, T, 2>(ctx, x, target_shape, out);
+      break;
+    case 3:
+      ExpandAs<Context, T, 3>(ctx, x, target_shape, out);
+      break;
+    case 4:
+      ExpandAs<Context, T, 4>(ctx, x, target_shape, out);
+      break;
+    case 5:
+      ExpandAs<Context, T, 5>(ctx, x, target_shape, out);
+      break;
+    case 6:
+      ExpandAs<Context, T, 6>(ctx, x, target_shape, out);
+      break;
+  }
+}
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/impl/eye_kernel_impl.h b/paddle/phi/kernels/impl/eye_kernel_impl.h
index 453652273a25b2140376712a673713b2f9fbe12b..f4041f921fd352e63688b24d77987eabd526d4a2 100644
--- a/paddle/phi/kernels/impl/eye_kernel_impl.h
+++ b/paddle/phi/kernels/impl/eye_kernel_impl.h
@@ -36,7 +36,7 @@ template <typename T, typename Context>
 void EyeKernel(const Context& ctx,
                int64_t num_rows,
                int64_t num_columns,
-               int dtype,
+               DataType dtype,
                DenseTensor* out) {
   auto num = num_columns;
   if (num == -1) {
diff --git a/paddle/phi/kernels/impl/isclose_kernel_impl.h b/paddle/phi/kernels/impl/isclose_kernel_impl.h
new file mode 100644
index 0000000000000000000000000000000000000000..25247ceaff6c0a5f52a639176ce04c0589cbbd87
--- /dev/null
+++ b/paddle/phi/kernels/impl/isclose_kernel_impl.h
@@ -0,0 +1,176 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <cmath>
+#include <string>
+
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/common/data_type.h"
+#include "paddle/phi/common/place.h"
+#include "paddle/phi/core/dense_tensor.h"
+
+// TODO(xiongkun): remove the header when decouple the memcpy function in phi.
+#include "paddle/fluid/memory/memcpy.h"
+
+namespace phi {
+using Tensor = DenseTensor;
+template <typename DeviceContext, typename T>
+struct GetTensorValue {
+  T operator()(const DeviceContext& ctx, const DenseTensor& tensor) const;
+};
+
+template <typename DeviceContext, typename T>
+struct IscloseFunctor {
+  void operator()(const DeviceContext& ctx,
+                  const DenseTensor& in,
+                  const DenseTensor& other,
+                  const float rtol,
+                  const float atol,
+                  bool equal_nan,
+                  DenseTensor* output);
+};
+
+template <typename T>
+struct GetTensorValue<phi::CPUContext, T> {
+  T operator()(const phi::CPUContext& dev_ctx,
+               const DenseTensor& tensor) const {
+    return *(tensor.data<T>());
+  }
+};
+
+template <typename T>
+struct GetTensorValue<phi::GPUContext, T> {
+  T operator()(const phi::GPUContext& dev_ctx,
+               const DenseTensor& tensor) const {
+    const T* data = tensor.data<T>();
+    T value;
+    const auto gpu_place = dev_ctx.GetPlace();
+    paddle::memory::Copy(
+        phi::CPUPlace(), &value, gpu_place, data, sizeof(T), dev_ctx.stream());
+    return value;
+  }
+};
+
+template <typename T>
+struct IscloseFunctor<phi::CPUContext, T> {
+  void operator()(const phi::CPUContext& ctx,
+                  const DenseTensor& in,
+                  const DenseTensor& other,
+                  const double rtol,
+                  const double atol,
+                  bool equal_nan,
+                  DenseTensor* output) {
+    auto* in_a = in.data<T>();
+    auto* in_b = other.data<T>();
+    auto* out_data = ctx.template Alloc<bool>(output);
+    auto num = in.numel();
+    // *out_data = true;
+    for (int i = 0; i < num; i++) {
+      out_data[i] = true;
+    }
+    for (int i = 0; i < num; i++) {
+      const T a = in_a[i], b = in_b[i];
+      bool val;
+      if (std::isnan(a) || std::isnan(b)) {
+        val = equal_nan && std::isnan(a) == std::isnan(b);
+      } else {
+        T left = (a > b ? a - b : b - a);
+        T right = atol + (b > 0 ? rtol * b : (-rtol) * b);
+        T diff = (left > right ? left - right : right - left);
+        val = a == b || left <= right || diff <= 1e-15;
+      }
+      // *out_data &= val;
+      out_data[i] = val;
+    }
+  }
+};
+
+#if defined(__NVCC__) || defined(__HIPCC__)
+template <typename T>
+__global__ void IscloseCUDAKernel(const T* in_data,
+                                  const T* other_data,
+                                  const double rtol,
+                                  const double atol,
+                                  bool equal_nan,
+                                  int num,
+                                  bool* out_data) {
+  unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x;
+  bool val;
+  for (int i = idx; i < num; i += blockDim.x * gridDim.x) {
+    const T a = in_data[i], b = other_data[i];
+    if (isnan(a) || isnan(b)) {
+      val = equal_nan && isnan(a) == isnan(b);
+    } else {
+      T left = (a > b ? a - b : b - a);
+      T right = atol + (b > 0 ? rtol * b : (-rtol) * b);
+      T diff = (left > right ? left - right : right - left);
+      val = a == b || left <= right || diff <= 1e-15;
+    }
+    out_data[i] = val;
+    // if (!val) *out_data = false;
+  }
+}
+
+template <typename T>
+struct IscloseFunctor<phi::GPUContext, T> {
+  void operator()(const phi::GPUContext& dev_ctx,
+                  const DenseTensor& in,
+                  const DenseTensor& other,
+                  const double rtol,
+                  const double atol,
+                  bool equal_nan,
+                  DenseTensor* output) {
+    int num = in.numel();
+    const T* in_data = in.data<T>();
+    const T* other_data = other.data<T>();
+    bool* out_data = dev_ctx.template Alloc<bool>(output);
+    int block = 1024;
+    int grid = (block - 1 + num) / block;
+    grid = (grid > block) ? block : grid;
+#ifdef PADDLE_WITH_HIP
+    hipMemset(out_data, true, num * sizeof(bool));
+#else
+    cudaMemset(out_data, true, num * sizeof(bool));
+#endif
+    IscloseCUDAKernel<T><<<grid, block, 0, dev_ctx.stream()>>>(
+        in_data, other_data, rtol, atol, equal_nan, num, out_data);
+  }
+};
+#endif
+
+template <typename T, typename Context>
+void IscloseKernel(const Context& dev_ctx,
+                   const DenseTensor& x,
+                   const DenseTensor& y,
+                   const Scalar& rtol,
+                   const Scalar& atol,
+                   bool equal_nan,
+                   DenseTensor* out) {
+  PADDLE_ENFORCE_EQ(
+      atol.dtype(),
+      DataType::FLOAT64,
+      phi::errors::InvalidArgument("Input(Atol) type must be double"));
+
+  PADDLE_ENFORCE_EQ(
+      rtol.dtype(),
+      DataType::FLOAT64,
+      phi::errors::InvalidArgument("Input(Rtol) type must be double"));
+
+  IscloseFunctor<Context, T>()(
+      dev_ctx, x, y, rtol.to<double>(), atol.to<double>(), equal_nan, out);
+}
+}  // namespace phi
diff --git a/paddle/phi/kernels/impl/isfinite_kernel_impl.h b/paddle/phi/kernels/impl/isfinite_kernel_impl.h
new file mode 100644
index 0000000000000000000000000000000000000000..affa85f8a2d28e58335b4dbf6d42114cf47d9594
--- /dev/null
+++ b/paddle/phi/kernels/impl/isfinite_kernel_impl.h
@@ -0,0 +1,39 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/kernels/funcs/isfinite_functor.h"
+#include "paddle/phi/kernels/isfinite_kernel.h"
+
+namespace phi {
+
+template <typename T, typename Context, typename Functor>
+inline void IsfiniteKernelImpl(const Context& ctx,
+                               const DenseTensor& x,
+                               DenseTensor* out);
+
+#define DEFINE_ISFINITE_KERNEL(isfinite_kernel, functor)            \
+  template <typename T, typename Context>                           \
+  void isfinite_kernel(                                             \
+      const Context& ctx, const DenseTensor& x, DenseTensor* out) { \
+    IsfiniteKernelImpl<T, Context, functor>(ctx, x, out);           \
+  }
+
+DEFINE_ISFINITE_KERNEL(IsinfKernel, funcs::InfinityV2Functor)
+DEFINE_ISFINITE_KERNEL(IsnanKernel, funcs::NANV2Functor)
+DEFINE_ISFINITE_KERNEL(IsfiniteKernel, funcs::IsfiniteV2Functor)
+#undef DEFINE_ISFINITE_KERNEL
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/impl/kldiv_loss_grad_kernel_impl.h b/paddle/phi/kernels/impl/kldiv_loss_grad_kernel_impl.h
new file mode 100644
index 0000000000000000000000000000000000000000..1ae90960ef4455deff5f9b7e08ed9eb7ff62cba3
--- /dev/null
+++ b/paddle/phi/kernels/impl/kldiv_loss_grad_kernel_impl.h
@@ -0,0 +1,70 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <string>
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/core/hostdevice.h"
+#include "paddle/phi/kernels/funcs/eigen/common.h"
+
+namespace phi {
+using Array1 = Eigen::DSizes<int64_t, 1>;
+template <typename T>
+struct KLDivLossBackward {
+  HOSTDEVICE KLDivLossBackward() {}
+
+  HOSTDEVICE T operator()(const T& target, const T& grad) const {
+    if (target <= 0) {
+      return 0;
+    } else {
+      return static_cast<T>(-1.) * grad;
+    }
+  }
+};
+
+template <typename T, typename Context>
+void KLDivLossGradKernel(const Context& dev_ctx,
+                         const DenseTensor& x,
+                         const DenseTensor& label,
+                         const DenseTensor& d_out,
+                         const std::string& reduction,
+                         DenseTensor* d_x) {
+  auto& place = *dev_ctx.eigen_device();
+  auto* target = &label;
+  auto* input_grad = d_x;
+  auto* loss_grad = &d_out;
+
+  const int n = input_grad->dims()[0];
+  const int numel = input_grad->numel();
+  const int expand = numel / loss_grad->numel();
+
+  dev_ctx.template Alloc<T>(input_grad);
+
+  auto target_t = phi::EigenVector<T>::Flatten(*target);
+
+  auto input_grad_t = phi::EigenVector<T>::Flatten(*input_grad);
+  auto loss_grad_t = phi::EigenVector<T>::Flatten(*loss_grad);
+
+  auto loss_grad_expand = loss_grad_t.broadcast(Array1(expand));
+  auto grad_t = target_t * loss_grad_expand;
+  input_grad_t.device(place) =
+      target_t.binaryExpr(grad_t, KLDivLossBackward<T>());
+
+  if ("mean" == reduction) {
+    input_grad_t.device(place) = input_grad_t / static_cast<T>(numel);
+  } else if ("batchmean" == reduction) {
+    input_grad_t.device(place) = input_grad_t / static_cast<T>(n);
+  }
+}
+}  // namespace phi
diff --git a/paddle/phi/kernels/impl/kldiv_loss_kernel_impl.h b/paddle/phi/kernels/impl/kldiv_loss_kernel_impl.h
new file mode 100644
index 0000000000000000000000000000000000000000..ecd23bbfc1c4598c953555a1f0d21dd0f6a989c8
--- /dev/null
+++ b/paddle/phi/kernels/impl/kldiv_loss_kernel_impl.h
@@ -0,0 +1,69 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <string>
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/core/hostdevice.h"
+#include "paddle/phi/kernels/funcs/eigen/common.h"
+
+namespace phi {
+using Array1 = Eigen::DSizes<int64_t, 1>;
+template <typename T>
+struct KLDivLossForward {
+  HOSTDEVICE KLDivLossForward() {}
+
+  HOSTDEVICE T operator()(const T& target, const T& input) const {
+    if (target <= 0) {
+      return 0;
+    } else {
+      return target * (std::log(target) - input);
+    }
+  }
+};
+template <typename T, typename Context>
+void KLDivLossKernel(const Context& dev_ctx,
+                     const DenseTensor& x,
+                     const DenseTensor& label,
+                     const std::string& reduction,
+                     DenseTensor* out) {
+  auto& place = *(dev_ctx.eigen_device());
+  auto* input = &x;
+  auto* target = &label;
+  auto* loss = out;
+
+  const int n = input->dims()[0];
+  dev_ctx.template Alloc<T>(loss);
+
+  auto input_t = phi::EigenVector<T>::Flatten(*input);
+  auto target_t = phi::EigenVector<T>::Flatten(*target);
+  auto loss_t = phi::EigenVector<T>::Flatten(*loss);
+  auto output = target_t.binaryExpr(input_t, KLDivLossForward<T>());
+  if ("none" == reduction) {
+    loss_t.device(place) = output;
+  } else if ("batchmean" == reduction) {
+    auto output_sum = output.sum();
+    if (n > 0) {
+      loss_t.device(place) = output_sum / output_sum.constant(n);
+    } else {
+      loss_t.device(place) = output_sum;
+    }
+  } else if ("mean" == reduction) {
+    loss_t.device(place) = output.mean();
+  } else if ("sum" == reduction) {
+    loss_t.device(place) = output.sum();
+  }
+}
+}  // namespace phi
diff --git a/paddle/phi/kernels/impl/kron_grad_kernel_impl.h b/paddle/phi/kernels/impl/kron_grad_kernel_impl.h
new file mode 100644
index 0000000000000000000000000000000000000000..30297b53eabb99c4fcccc5c3c7faa04f86d4bb93
--- /dev/null
+++ b/paddle/phi/kernels/impl/kron_grad_kernel_impl.h
@@ -0,0 +1,295 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/kernels/impl/kron_kernel_impl.h"
+
+namespace phi {
+
+template <typename T>
+struct KronGradElemFunctor {
+  KronGradElemFunctor(const T* dout,
+                      const T* A,
+                      const T* B,
+                      T* dout_a,
+                      T* dout_b,
+                      const int64_t* stride_dout,
+                      const int64_t* stride_a,
+                      const int64_t* stride_b,
+                      const int64_t* shape_b,
+                      const int64_t numel_a,
+                      const int64_t numel_b,
+                      const int ndims)
+      : dout_(dout),
+        A_(A),
+        B_(B),
+        dout_a_(dout_a),
+        dout_b_(dout_b),
+        stride_dout_(stride_dout),
+        stride_a_(stride_a),
+        stride_b_(stride_b),
+        shape_b_(shape_b),
+        numel_a_(numel_a),
+        numel_b_(numel_b),
+        ndims_(ndims) {}
+
+  HOSTDEVICE void operator()(int64_t idx) {
+    int64_t index = idx;
+    int64_t index_a = 0;
+    int64_t index_b = 0;
+    for (int i = 0; i < ndims_; i++) {
+      auto pos_i = index / stride_dout_[i];
+      index = index % stride_dout_[i];
+      auto pos_ai = pos_i / shape_b_[i];
+      auto pos_bi = pos_i % shape_b_[i];
+      index_a += stride_a_[i] * pos_ai;
+      index_b += stride_b_[i] * pos_bi;
+    }
+
+    if (dout_a_) {
+      size_t index_out_a = index_a * numel_b_ + index_b;
+      dout_a_[index_out_a] = dout_[idx] * B_[index_b];
+    }
+    if (dout_b_) {
+      size_t index_out_b = index_b * numel_a_ + index_a;
+      dout_b_[index_out_b] = dout_[idx] * A_[index_a];
+    }
+  }
+
+ private:
+  const T* dout_;
+  const T* A_;
+  const T* B_;
+  T* dout_a_;
+  T* dout_b_;
+  const int64_t* stride_dout_;
+  const int64_t* stride_a_;
+  const int64_t* stride_b_;
+  const int64_t* shape_b_;
+  const int64_t numel_a_;
+  const int64_t numel_b_;
+  const int ndims_;
+};
+
+template <typename T>
+struct KronGradElemFunctor<dtype::complex<T>> {
+  KronGradElemFunctor(const dtype::complex<T>* dout,
+                      const dtype::complex<T>* A,
+                      const dtype::complex<T>* B,
+                      dtype::complex<T>* dout_a,
+                      dtype::complex<T>* dout_b,
+                      const int64_t* stride_dout,
+                      const int64_t* stride_a,
+                      const int64_t* stride_b,
+                      const int64_t* shape_b,
+                      const int64_t numel_a,
+                      const int64_t numel_b,
+                      const int ndims)
+      : dout_(dout),
+        A_(A),
+        B_(B),
+        dout_a_(dout_a),
+        dout_b_(dout_b),
+        stride_dout_(stride_dout),
+        stride_a_(stride_a),
+        stride_b_(stride_b),
+        shape_b_(shape_b),
+        numel_a_(numel_a),
+        numel_b_(numel_b),
+        ndims_(ndims) {}
+
+  HOSTDEVICE void operator()(int64_t idx) {
+    int64_t index = idx;
+    int64_t index_a = 0;
+    int64_t index_b = 0;
+    for (int i = 0; i < ndims_; i++) {
+      auto pos_i = index / stride_dout_[i];
+      index = index % stride_dout_[i];
+      auto pos_ai = pos_i / shape_b_[i];
+      auto pos_bi = pos_i % shape_b_[i];
+      index_a += stride_a_[i] * pos_ai;
+      index_b += stride_b_[i] * pos_bi;
+    }
+
+    if (dout_a_) {
+      size_t index_out_a = index_a * numel_b_ + index_b;
+      dout_a_[index_out_a] =
+          dout_[idx] * dtype::complex<T>(B_[index_b].real, -B_[index_b].imag);
+    }
+    if (dout_b_) {
+      size_t index_out_b = index_b * numel_a_ + index_a;
+      dout_b_[index_out_b] =
+          dout_[idx] * dtype::complex<T>(A_[index_a].real, -A_[index_a].imag);
+    }
+  }
+
+ private:
+  const dtype::complex<T>* dout_;
+  const dtype::complex<T>* A_;
+  const dtype::complex<T>* B_;
+  dtype::complex<T>* dout_a_;
+  dtype::complex<T>* dout_b_;
+  const int64_t* stride_dout_;
+  const int64_t* stride_a_;
+  const int64_t* stride_b_;
+  const int64_t* shape_b_;
+  const int64_t numel_a_;
+  const int64_t numel_b_;
+  const int ndims_;
+};
+
+template <typename Context, typename T>
+struct KronGradOpFunctor {
+  void operator()(const Context& dev_ctx,
+                  const DenseTensor& dout,
+                  const DenseTensor& x,
+                  const DenseTensor& y,
+                  DenseTensor* dx,
+                  DenseTensor* dy) {
+    int ndims = dout.dims().size();
+    int64_t numel = dout.numel();
+    int64_t numel_x = x.numel();
+    int64_t numel_y = y.numel();
+
+    const phi::DDim& dim_x = x.dims();
+    const phi::DDim& dim_y = y.dims();
+    const phi::DDim& dim_dout = dout.dims();
+
+    const phi::DDim stride_x = phi::stride(dim_x);
+    const phi::DDim stride_y = phi::stride(dim_y);
+    const phi::DDim stride_dout = phi::stride(dim_dout);
+
+    const int64_t* p_stride_x = nullptr;
+    const int64_t* p_stride_y = nullptr;
+    const int64_t* p_stride_dout = nullptr;
+    const int64_t* p_shape_y = nullptr;
+#if defined(__NVCC__) || defined(__HIPCC__)
+    thrust::device_vector<int64_t> d_stride_x(ndims);
+    thrust::device_vector<int64_t> d_stride_y(ndims);
+    thrust::device_vector<int64_t> d_stride_dout(ndims);
+    thrust::device_vector<int64_t> d_shape_y(ndims);
+    thrust::copy(stride_x.Get(), stride_x.Get() + ndims, d_stride_x.begin());
+    thrust::copy(stride_y.Get(), stride_y.Get() + ndims, d_stride_y.begin());
+    thrust::copy(
+        stride_dout.Get(), stride_dout.Get() + ndims, d_stride_dout.begin());
+    thrust::copy(dim_y.Get(), dim_y.Get() + ndims, d_shape_y.begin());
+
+    p_stride_x = thrust::raw_pointer_cast(d_stride_x.data());
+    p_stride_y = thrust::raw_pointer_cast(d_stride_y.data());
+    p_stride_dout = thrust::raw_pointer_cast(d_stride_dout.data());
+    p_shape_y = thrust::raw_pointer_cast(d_shape_y.data());
+#else
+    p_stride_x = stride_x.Get();
+    p_stride_y = stride_y.Get();
+    p_stride_dout = stride_dout.Get();
+    p_shape_y = dim_y.Get();
+#endif
+    // dout_x: dout * kron(ones(X), Y) re-aranged in shape (numel_x, numel_y)
+    // dout_y: dout * kron(X, ones(Y)) re-aranged in shaoe (numel_y, numel_x)
+    DenseTensor dout_x;
+    T* p_dout_x = nullptr;
+    if (dx) {
+      dout_x.Resize({numel_x, numel_y});
+      dev_ctx.template Alloc<T>(&dout_x);
+      p_dout_x = dout_x.data<T>();
+    }
+    DenseTensor dout_y;
+    T* p_dout_y = nullptr;
+    if (dy) {
+      dout_y.Resize({numel_y, numel_x});
+      dev_ctx.template Alloc<T>(&dout_y);
+      p_dout_y = dout_y.data<T>();
+    }
+
+    funcs::ForRange<Context> for_range(dev_ctx, numel);
+    KronGradElemFunctor<T> func(dout.data<T>(),
+                                x.data<T>(),
+                                y.data<T>(),
+                                p_dout_x,
+                                p_dout_y,
+                                p_stride_dout,
+                                p_stride_x,
+                                p_stride_y,
+                                p_shape_y,
+                                numel_x,
+                                numel_y,
+                                ndims);
+    for_range(func);
+
+// reduce_sum along aixs 1
+#if defined(__NVCC__) || defined(__HIPCC__)
+    auto stream = dev_ctx.stream();  // it is a cuda device_context
+    if (dx) {
+      funcs::ReduceKernel<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
+          dev_ctx, dout_x, dx, kps::IdentityFunctor<T>(), {1});
+    }
+    if (dy) {
+      funcs::ReduceKernel<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
+          dev_ctx, dout_y, dy, kps::IdentityFunctor<T>(), {1});
+    }
+#else
+    auto* place = dev_ctx.eigen_device();
+    Eigen::array<int, 1> reduce_dim = {1};
+    if (dx) {
+      auto eigen_dout_x = EigenMatrix<T>::Reshape(dout_x, 1);
+      auto eigen_vec_dx = EigenVector<T>::Flatten(*dx);
+      eigen_vec_dx.device(*place) = eigen_dout_x.sum(reduce_dim);
+    }
+    if (dy) {
+      auto eigen_dout_y = EigenMatrix<T>::Reshape(dout_y, 1);
+      auto eigen_vec_dy = EigenVector<T>::Flatten(*dy);
+      eigen_vec_dy.device(*place) = eigen_dout_y.sum(reduce_dim);
+    }
+#endif
+  }
+};
+
+template <typename T, typename Context>
+void KronGradKernel(const Context& ctx,
+                    const DenseTensor& x,
+                    const DenseTensor& y,
+                    const DenseTensor& out_grad,
+                    DenseTensor* x_grad,
+                    DenseTensor* y_grad) {
+  if (x_grad) {
+    ctx.template Alloc<T>(x_grad);
+  }
+  if (y_grad) {
+    ctx.template Alloc<T>(y_grad);
+  }
+
+  int ndims = out_grad.dims().size();
+  DenseTensor xx = UnsqueezeTo(x, ndims);
+  DenseTensor yy = UnsqueezeTo(y, ndims);
+
+  DenseTensor* pdxx = nullptr;
+  DenseTensor* pdyy = nullptr;
+  DenseTensor dxx;
+  DenseTensor dyy;
+  if (x_grad) {
+    dxx = UnsqueezeTo(*x_grad, ndims);
+    pdxx = &dxx;
+  }
+
+  if (y_grad) {
+    dyy = UnsqueezeTo(*y_grad, ndims);
+    pdyy = &dyy;
+  }
+
+  KronGradOpFunctor<Context, T> func;
+  func(ctx, out_grad, xx, yy, pdxx, pdyy);
+}
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/impl/kron_kernel_impl.h b/paddle/phi/kernels/impl/kron_kernel_impl.h
new file mode 100644
index 0000000000000000000000000000000000000000..47c76f59df23bfee68a2660b76a09df747048378
--- /dev/null
+++ b/paddle/phi/kernels/impl/kron_kernel_impl.h
@@ -0,0 +1,167 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <algorithm>
+#include <vector>
+
+#include "paddle/phi/kernels/funcs/eigen/common.h"
+#include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
+#include "paddle/phi/kernels/funcs/for_range.h"
+#if defined(__NVCC__) || defined(__HIPCC__)
+#include "paddle/phi/kernels/funcs/reduce_function.h"
+#include "thrust/device_vector.h"
+#endif
+
+namespace phi {
+
+inline DenseTensor UnsqueezeTo(const DenseTensor& src, int ndims) {
+  const phi::DDim& shape = src.dims();
+  int rank = shape.size();
+  DenseTensor res;
+  res.ShareDataWith(src);
+  PADDLE_ENFORCE_LE(
+      rank,
+      ndims,
+      errors::InvalidArgument(
+          "The input Tensor's rank should be less than or equal to ndims"
+          "Received input Tensor's rank = %d, ndims = %d",
+          rank,
+          ndims));
+  if (rank < ndims) {
+    std::vector<int64_t> new_dim(ndims, 1);
+    for (int i = ndims - rank; i < ndims; i++) {
+      new_dim[i] = shape[i - ndims + rank];
+    }
+    res.Resize(phi::make_ddim(new_dim));
+  }
+  return res;
+}
+
+template <typename T>
+struct KronElemFunctor {
+  KronElemFunctor(const T* a,
+                  const T* b,
+                  T* out,
+                  const int64_t* shape_b,
+                  const int64_t* stride_a,
+                  const int64_t* stride_b,
+                  const int64_t* stride_out,
+                  int ndims)
+      : a_(a),
+        b_(b),
+        out_(out),
+        shape_b_(shape_b),
+        stride_a_(stride_a),
+        stride_b_(stride_b),
+        stride_out_(stride_out),
+        ndims_(ndims) {}
+
+  HOSTDEVICE void operator()(int64_t idx) const {
+    // it computes 1 element in the output
+    int64_t index = idx;
+    int64_t index_a = 0;
+    int64_t index_b = 0;
+    for (int i = 0; i < ndims_; i++) {
+      auto pos_i = index / stride_out_[i];
+      index = index % stride_out_[i];
+      auto pos_ai = pos_i / shape_b_[i];
+      auto pos_bi = pos_i % shape_b_[i];
+      index_a += stride_a_[i] * pos_ai;
+      index_b += stride_b_[i] * pos_bi;
+    }
+    out_[idx] = a_[index_a] * b_[index_b];
+  }
+
+ private:
+  const T* a_;
+  const T* b_;
+  T* out_;
+  const int64_t* shape_b_;
+  const int64_t* stride_a_;
+  const int64_t* stride_b_;
+  const int64_t* stride_out_;
+  const int ndims_;
+};
+
+template <typename Context, typename T>
+struct KronOpFunctor {
+  void operator()(const Context& dev_ctx,
+                  const DenseTensor& x,
+                  const DenseTensor& y,
+                  DenseTensor* out) {
+    int ndims = out->dims().size();
+    int64_t numel = out->numel();
+
+    const phi::DDim& dim_x = x.dims();
+    const phi::DDim& dim_y = y.dims();
+    const phi::DDim& dim_out = out->dims();
+    const phi::DDim stride_x = phi::stride(dim_x);
+    const phi::DDim stride_y = phi::stride(dim_y);
+    const phi::DDim stride_out = phi::stride(dim_out);
+
+    const int64_t *p_stride_x = nullptr, *p_stride_y = nullptr,
+                  *p_stride_out = nullptr, *p_shape_y = nullptr;
+#if defined(__NVCC__) || defined(__HIPCC__)
+    thrust::device_vector<int64_t> d_stride_x(ndims);
+    thrust::device_vector<int64_t> d_stride_y(ndims);
+    thrust::device_vector<int64_t> d_stride_out(ndims);
+    thrust::device_vector<int64_t> d_shape_y(ndims);
+    thrust::copy(stride_x.Get(), stride_x.Get() + ndims, d_stride_x.begin());
+    thrust::copy(stride_y.Get(), stride_y.Get() + ndims, d_stride_y.begin());
+    thrust::copy(
+        stride_out.Get(), stride_out.Get() + ndims, d_stride_out.begin());
+    thrust::copy(dim_y.Get(), dim_y.Get() + ndims, d_shape_y.begin());
+
+    p_stride_x = thrust::raw_pointer_cast(d_stride_x.data());
+    p_stride_y = thrust::raw_pointer_cast(d_stride_y.data());
+    p_stride_out = thrust::raw_pointer_cast(d_stride_out.data());
+    p_shape_y = thrust::raw_pointer_cast(d_shape_y.data());
+#else
+    p_stride_x = stride_x.Get();
+    p_stride_y = stride_y.Get();
+    p_stride_out = stride_out.Get();
+    p_shape_y = dim_y.Get();
+#endif
+
+    funcs::ForRange<Context> for_range(dev_ctx, numel);
+    KronElemFunctor<T> functor(x.data<T>(),
+                               y.data<T>(),
+                               out->data<T>(),
+                               p_shape_y,
+                               p_stride_x,
+                               p_stride_y,
+                               p_stride_out,
+                               ndims);
+    for_range(functor);
+  }
+};
+
+template <typename T, typename Context>
+void KronKernel(const Context& ctx,
+                const DenseTensor& x,
+                const DenseTensor& y,
+                DenseTensor* out) {
+  ctx.template Alloc<T>(out);
+
+  int ndims = out->dims().size();
+  DenseTensor xx = UnsqueezeTo(x, ndims);
+  DenseTensor yy = UnsqueezeTo(y, ndims);
+
+  KronOpFunctor<Context, T> func;
+  func(ctx, xx, yy, out);
+}
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/impl/lgamma_grad_kernel_impl.h b/paddle/phi/kernels/impl/lgamma_grad_kernel_impl.h
new file mode 100644
index 0000000000000000000000000000000000000000..a1b33f5a331ba8add6159d0089ddfa602888bcdf
--- /dev/null
+++ b/paddle/phi/kernels/impl/lgamma_grad_kernel_impl.h
@@ -0,0 +1,47 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include "paddle/phi/kernels/funcs/for_range.h"
+namespace phi {
+template <typename T>
+struct LgammaGradFunctor {
+  LgammaGradFunctor(const T* dout, const T* x, T* output, int64_t numel)
+      : dout_(dout), x_(x), output_(output), numel_(numel) {}
+
+  HOSTDEVICE void operator()(int64_t idx) const {
+    output_[idx] = dout_[idx] * Eigen::numext::digamma(x_[idx]);
+  }
+
+ private:
+  const T* dout_;
+  const T* x_;
+  T* output_;
+  int64_t numel_;
+};
+template <typename T, typename Context>
+void LgammaGradKernel(const Context& dev_ctx,
+                      const DenseTensor& d_out,
+                      const DenseTensor& x,
+                      DenseTensor* d_x) {
+  auto numel = d_out.numel();
+  auto* dout_data = d_out.data<T>();
+  auto* x_data = x.data<T>();
+  auto* dx_data =
+      dev_ctx.template Alloc<T>(d_x, static_cast<size_t>(numel * sizeof(T)));
+  phi::funcs::ForRange<Context> for_range(dev_ctx, numel);
+  LgammaGradFunctor<T> functor(dout_data, x_data, dx_data, numel);
+  for_range(functor);
+}
+}  // namespace phi
diff --git a/paddle/phi/kernels/impl/log_loss_grad_kernel_impl.h b/paddle/phi/kernels/impl/log_loss_grad_kernel_impl.h
new file mode 100644
index 0000000000000000000000000000000000000000..6f84133d5f483c56d6ba83805b2d2b1499f76712
--- /dev/null
+++ b/paddle/phi/kernels/impl/log_loss_grad_kernel_impl.h
@@ -0,0 +1,43 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/kernels/funcs/eigen/common.h"
+#include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void LogLossGradKernel(const Context& dev_ctx,
+                       const DenseTensor& input,
+                       const DenseTensor& label,
+                       const DenseTensor& out_grad,
+                       float epsilon,
+                       DenseTensor* in_grad) {
+  auto prediction = EigenVector<T>::Flatten(input);
+  auto label_out = EigenVector<T>::Flatten(label);
+
+  auto dl = EigenVector<T>::Flatten(out_grad);
+  auto& place = *dev_ctx.eigen_device();
+
+  if (in_grad) {
+    dev_ctx.template Alloc<T>(in_grad);
+    auto dx = EigenVector<T>::Flatten(*in_grad);
+    phi::funcs::EigenLogLossGrad<std::decay_t<decltype(place)>, T>::Eval(
+        place, dx, dl, prediction, label_out, epsilon);
+  }
+}
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/impl/log_loss_kernel_impl.h b/paddle/phi/kernels/impl/log_loss_kernel_impl.h
new file mode 100644
index 0000000000000000000000000000000000000000..d49144c83549ece418f2b2014e27586b17419d28
--- /dev/null
+++ b/paddle/phi/kernels/impl/log_loss_kernel_impl.h
@@ -0,0 +1,40 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/kernels/funcs/eigen/common.h"
+#include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void LogLossKernel(const Context& dev_ctx,
+                   const DenseTensor& input,
+                   const DenseTensor& label,
+                   float epsilon,
+                   DenseTensor* out) {
+  dev_ctx.template Alloc<T>(out);
+
+  auto prediction = EigenVector<T>::Flatten(input);
+  auto label_out = EigenVector<T>::Flatten(label);
+
+  auto loss = EigenVector<T>::Flatten(*out);
+  auto& place = *dev_ctx.eigen_device();
+
+  phi::funcs::EigenLogLoss<std::decay_t<decltype(place)>, T>::Eval(
+      place, loss, prediction, label_out, epsilon);
+}
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/impl/matmul_grad_kernel_impl.h b/paddle/phi/kernels/impl/matmul_grad_kernel_impl.h
index f2549c171dda00ecab0baf8b6a7cdfb26ddea4d0..495b93f2a4ef0f790d53605e4531af7040c6b2ad 100644
--- a/paddle/phi/kernels/impl/matmul_grad_kernel_impl.h
+++ b/paddle/phi/kernels/impl/matmul_grad_kernel_impl.h
@@ -59,9 +59,8 @@ struct ReduceSumForMatmulGrad<GPUContext, T> {
                   const DenseTensor& input,
                   DenseTensor* output,
                   const std::vector<int>& reduce_dims) {
-    auto stream = dev_ctx.stream();
-    kernels::TensorReduceImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
-        dev_ctx, input, output, kps::IdentityFunctor<T>(), reduce_dims, stream);
+    funcs::ReduceKernel<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
+        dev_ctx, input, output, kps::IdentityFunctor<T>(), reduce_dims);
   }
 };
 #endif
@@ -329,8 +328,8 @@ void MatmulGradKernel(const Context& dev_ctx,
     x_conj = Conj<T>(dev_ctx, x);
     y_conj = Conj<T>(dev_ctx, y);
 
-    DenseTensor dx_help = Empty<T, Context>(dev_ctx);
-    DenseTensor dy_help = Empty<T, Context>(dev_ctx);
+    DenseTensor dx_help;
+    DenseTensor dy_help;
 
     if (transpose_x) {
       if (transpose_y) {
@@ -686,8 +685,8 @@ void MatmulDoubleGradKernel(const Context& dev_ctx,
       y_conj = Conj<T>(dev_ctx, y);
     }
 
-    DenseTensor dx_help = Empty<T>(dev_ctx);
-    DenseTensor dy_help = Empty<T>(dev_ctx);
+    DenseTensor dx_help;
+    DenseTensor dy_help;
 
     if (transpose_x) {
       if (transpose_y) {
@@ -1373,10 +1372,10 @@ void MatmulTripleGradKernel(const Context& dev_ctx,
     VLOG(3) << "It need cost much time to reduce sum for the broadcast and "
                "wastes the memory. So we should avoid the case in reality";
 
-    DenseTensor out_dx_help = Empty<T>(dev_ctx);
-    DenseTensor out_dy_help = Empty<T>(dev_ctx);
-    DenseTensor out_d_ddx_help = Empty<T>(dev_ctx);
-    DenseTensor out_d_ddy_help = Empty<T>(dev_ctx);
+    DenseTensor out_dx_help;
+    DenseTensor out_dy_help;
+    DenseTensor out_d_ddx_help;
+    DenseTensor out_d_ddy_help;
 
     if (out_d_dout) {
       ddx_conj = Conj<T>(dev_ctx, ddx);
diff --git a/paddle/phi/kernels/impl/matrix_power_grad_kernel_impl.h b/paddle/phi/kernels/impl/matrix_power_grad_kernel_impl.h
new file mode 100644
index 0000000000000000000000000000000000000000..e797b27071cacae200e3746032b4086bb1c3ae45
--- /dev/null
+++ b/paddle/phi/kernels/impl/matrix_power_grad_kernel_impl.h
@@ -0,0 +1,200 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/kernels/funcs/blas/blas.h"
+#include "paddle/phi/kernels/funcs/matrix_inverse.h"
+
+namespace phi {
+
+template <typename Context, typename T>
+void MatrixPowerGradFunction(const DenseTensor* X,
+                             const DenseTensor* Out,
+                             const DenseTensor* dOut,
+                             const int n,
+                             DenseTensor* dX,
+                             const Context& ctx) {
+  ctx.template Alloc<T>(dX);
+  const auto& x_dims = X->dims();
+
+  auto blas = phi::funcs::GetBlas<Context, T>(ctx);
+
+  if (n == 0) {
+    // \nabla X = O
+    phi::funcs::SetConstant<Context, T> zero;
+    zero(ctx, dX, static_cast<T>(0));
+    return;
+  } else if (n == 1) {
+    // \nabla X = \nabla Out
+    paddle::framework::TensorCopy(*dOut, ctx.GetPlace(), ctx, dX);
+    return;
+  }
+
+  auto trans_desc = phi::funcs::CreateMatrixDescriptor(x_dims, 0, true);
+  auto no_trans_desc = phi::funcs::CreateMatrixDescriptor(x_dims, 0, false);
+
+  if (n == -1) {
+    // \nabla X = Out^{T} * \nabla Out * Out^{T}
+    DenseTensor temp_dx;
+    temp_dx.Resize(X->dims());
+    ctx.template Alloc<T>(&temp_dx);
+    blas.MatMul(*Out,
+                trans_desc,
+                *dOut,
+                no_trans_desc,
+                static_cast<T>(-1),
+                &temp_dx,
+                static_cast<T>(0));
+    blas.MatMul(temp_dx,
+                no_trans_desc,
+                *Out,
+                trans_desc,
+                static_cast<T>(1),
+                dX,
+                static_cast<T>(0));
+    return;
+  }
+
+  DenseTensor new_x;
+  new_x.Resize(X->dims());
+  ctx.template Alloc<T>(&new_x);
+  int new_n = n;
+  if (n > 0) {
+    // newX = X
+    paddle::framework::TensorCopy(*X, ctx.GetPlace(), ctx, &new_x);
+  } else {
+    // newX = X^{-1}, n = -n
+    phi::funcs::MatrixInverseFunctor<Context, T> mat_inv;
+    mat_inv(ctx, *X, &new_x);
+    new_n = -n;
+  }
+
+  // Use chain rule blow to compute \nabla newX^{n}
+  // First, Get newX^{0}, newX^{1}, ..., newX^{n - 1},
+  // Note that newX^{0} can be omitted
+  std::vector<std::shared_ptr<DenseTensor>> tensor_list(new_n - 1);
+  tensor_list[0] = std::make_shared<DenseTensor>(new_x);
+  int index = 1;
+  while (index < new_n - 1) {
+    DenseTensor tensor_list_index;
+    tensor_list_index.Resize(X->dims());
+    ctx.template Alloc<T>(&tensor_list_index);
+    tensor_list[index] = std::make_shared<DenseTensor>(tensor_list_index);
+
+    blas.MatMul(*tensor_list[index - 1],
+                no_trans_desc,
+                new_x,
+                no_trans_desc,
+                static_cast<T>(1),
+                tensor_list[index].get(),
+                static_cast<T>(0));
+    index++;
+  }
+
+  // Second, \nabla newX = \sum_{i = 0}^{n - 1} (newX^{T}^{i}
+  //                      * \nabla Out
+  //                      * (newX^{T}^{n - i - 1})
+  DenseTensor dx_new;
+  dx_new.Resize(X->dims());
+  ctx.template Alloc<T>(&dx_new);
+  blas.MatMul(*tensor_list[new_n - 2],
+              trans_desc,
+              *dOut,
+              no_trans_desc,
+              static_cast<T>(1),
+              &dx_new,
+              static_cast<T>(0));
+  DenseTensor da_an_minus1;
+  da_an_minus1.Resize(X->dims());
+  ctx.template Alloc<T>(&da_an_minus1);
+  blas.MatMul(*dOut,
+              no_trans_desc,
+              *tensor_list[new_n - 2],
+              trans_desc,
+              static_cast<T>(1),
+              &da_an_minus1,
+              static_cast<T>(0));
+  blas.AXPY(
+      X->numel(), static_cast<T>(1), da_an_minus1.data<T>(), dx_new.data<T>());
+  int start = 0;
+  while (start < new_n - 2) {
+    DenseTensor a_da;
+    a_da.Resize(X->dims());
+    ctx.template Alloc<T>(&a_da);
+    DenseTensor a_da_a;
+    a_da_a.Resize(X->dims());
+    ctx.template Alloc<T>(&a_da_a);
+    blas.MatMul(*tensor_list[start],
+                trans_desc,
+                *dOut,
+                no_trans_desc,
+                static_cast<T>(1),
+                &a_da,
+                static_cast<T>(0));
+    blas.MatMul(a_da,
+                no_trans_desc,
+                *tensor_list[new_n - 3 - start],
+                trans_desc,
+                static_cast<T>(1),
+                &a_da_a,
+                static_cast<T>(0));
+    blas.AXPY(
+        X->numel(), static_cast<T>(1), a_da_a.data<T>(), dx_new.data<T>());
+    start++;
+  }
+
+  if (n > 0) {
+    // \nabla X = \nabla newX
+    paddle::framework::TensorCopy(dx_new, ctx.GetPlace(), ctx, dX);
+  } else {
+    // \nabla X = newX^{T} * \nabla newX * newX^{T}
+    DenseTensor temp_dx;
+    temp_dx.Resize(X->dims());
+    ctx.template Alloc<T>(&temp_dx);
+    blas.MatMul(new_x,
+                trans_desc,
+                dx_new,
+                no_trans_desc,
+                static_cast<T>(-1),
+                &temp_dx,
+                static_cast<T>(0));
+    blas.MatMul(temp_dx,
+                no_trans_desc,
+                new_x,
+                trans_desc,
+                static_cast<T>(1),
+                dX,
+                static_cast<T>(0));
+  }
+  return;
+}
+
+template <typename T, typename Context>
+void MatrixPowerGradKernel(const Context& ctx,
+                           const DenseTensor& x,
+                           const DenseTensor& out,
+                           const DenseTensor& out_grad,
+                           int n,
+                           DenseTensor* x_grad) {
+  auto X = &x;
+  auto Out = &out;
+  auto dOut = &out_grad;
+  auto dX = x_grad;
+
+  MatrixPowerGradFunction<Context, T>(X, Out, dOut, n, dX, ctx);
+}
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/impl/matrix_power_kernel_impl.h b/paddle/phi/kernels/impl/matrix_power_kernel_impl.h
new file mode 100644
index 0000000000000000000000000000000000000000..ccc5e8757e87662d398006b931ea307c6328991e
--- /dev/null
+++ b/paddle/phi/kernels/impl/matrix_power_kernel_impl.h
@@ -0,0 +1,203 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/kernels/funcs/blas/blas.h"
+#include "paddle/phi/kernels/funcs/for_range.h"
+#include "paddle/phi/kernels/funcs/matrix_inverse.h"
+
+namespace phi {
+
+template <typename T>
+struct IdentityMatrixFunctor {
+  IdentityMatrixFunctor(const int m, T* output) : m_(m), output_(output) {}
+
+  HOSTDEVICE void operator()(size_t index) const {
+    const int row = index / m_ % m_;
+    const int col = index % m_;
+    output_[index] = col == row ? static_cast<T>(1) : static_cast<T>(0);
+  }
+
+  const int m_;
+  T* output_;
+};
+
+template <typename Context, typename T>
+void MatrixPowerFunction(const DenseTensor* X,
+                         const int n,
+                         DenseTensor* Out,
+                         const Context& ctx) {
+  const auto& x_dims = X->dims();
+  const int x_ndim = x_dims.size();
+  T* out_data = ctx.template Alloc<T>(Out);
+
+  phi::funcs::ForRange<Context> for_range(ctx, X->numel());
+
+  if (n == 0) {
+    // Out = Identity Matrix
+    IdentityMatrixFunctor<T> functor(x_dims[x_ndim - 1], out_data);
+    for_range(functor);
+    return;
+  }
+
+  auto blas = phi::funcs::GetBlas<Context, T>(ctx);
+
+  DenseTensor new_x;
+  new_x.Resize(X->dims());
+  ctx.template Alloc<T>(&new_x);
+  int new_n = n;
+  if (n > 0) {
+    // newX = X
+    paddle::framework::TensorCopy(*X, ctx.GetPlace(), ctx, &new_x);
+  } else {
+    // newX = X^{-1}, n = -n
+    phi::funcs::MatrixInverseFunctor<Context, T> mat_inv;
+    mat_inv(ctx, *X, &new_x);
+    new_n = -n;
+  }
+
+  if (new_n == 1) {
+    paddle::framework::TensorCopy(new_x, ctx.GetPlace(), ctx, Out);
+    return;
+  }
+
+  auto no_trans_desc = phi::funcs::CreateMatrixDescriptor(x_dims, 0, false);
+
+  if (new_n == 2) {
+    // Out = newX * newX
+    ctx.template Alloc<T>(Out);
+    blas.MatMul(new_x,
+                no_trans_desc,
+                new_x,
+                no_trans_desc,
+                static_cast<T>(1),
+                Out,
+                static_cast<T>(0));
+    return;
+  } else if (new_n == 3) {
+    // Out = (newX * newX) * newX
+    // Note: C[i] matrices in MatMul must not overlap, i.e. the individual
+    // gemm operations must be computable independently; otherwise,
+    // undefined behavior is expected.
+    DenseTensor temp;
+    temp.Resize(X->dims());
+    ctx.template Alloc<T>(&temp);
+    blas.MatMul(new_x,
+                no_trans_desc,
+                new_x,
+                no_trans_desc,
+                static_cast<T>(1),
+                &temp,
+                static_cast<T>(0));
+    blas.MatMul(temp,
+                no_trans_desc,
+                new_x,
+                no_trans_desc,
+                static_cast<T>(1),
+                Out,
+                static_cast<T>(0));
+    return;
+  } else if (new_n == 4) {
+    // Out = (newX * newX) * (newX * newX)
+    DenseTensor temp;
+    temp.Resize(X->dims());
+    ctx.template Alloc<T>(&temp);
+    blas.MatMul(new_x,
+                no_trans_desc,
+                new_x,
+                no_trans_desc,
+                static_cast<T>(1),
+                &temp,
+                static_cast<T>(0));
+    blas.MatMul(temp,
+                no_trans_desc,
+                temp,
+                no_trans_desc,
+                static_cast<T>(1),
+                Out,
+                static_cast<T>(0));
+    return;
+  }
+
+  // Calculate Out = newX^{n} for abs(n) > 4 with time complexity as O(logN)
+  int bit = 0;
+  DenseTensor z = DenseTensor(X->dtype());
+  bool out_inited = false;
+  DenseTensor temp_out;
+  temp_out.Resize(X->dims());
+  ctx.template Alloc<T>(&temp_out);
+  DenseTensor temp_z;
+  temp_z.Resize(X->dims());
+  ctx.template Alloc<T>(&temp_z);
+  while (new_n > 0) {
+    bit = new_n & 0x1;
+    new_n >>= 1;
+    if (z.IsInitialized()) {
+      blas.MatMul(z,
+                  no_trans_desc,
+                  z,
+                  no_trans_desc,
+                  static_cast<T>(1),
+                  &temp_z,
+                  static_cast<T>(0));
+      paddle::framework::TensorCopy(temp_z, ctx.GetPlace(), ctx, &z);
+    } else {
+      z.Resize(X->dims());
+      ctx.template Alloc<T>(&z);
+      paddle::framework::TensorCopy(new_x, ctx.GetPlace(), ctx, &z);
+    }
+    if (bit == 1) {
+      if (out_inited == true) {
+        blas.MatMul(*Out,
+                    no_trans_desc,
+                    z,
+                    no_trans_desc,
+                    static_cast<T>(1),
+                    &temp_out,
+                    static_cast<T>(0));
+        paddle::framework::TensorCopy(temp_out, ctx.GetPlace(), ctx, Out);
+      } else {
+        paddle::framework::TensorCopy(z, ctx.GetPlace(), ctx, Out);
+        out_inited = true;
+      }
+    }
+  }
+  return;
+}
+
+template <typename T, typename Context>
+void MatrixPowerKernel(const Context& ctx,
+                       const DenseTensor& x,
+                       int n,
+                       DenseTensor* out) {
+  const DenseTensor* X = &x;
+  auto Out = out;
+
+  const auto& x_dims = X->dims();
+  const int x_ndim = x_dims.size();
+  PADDLE_ENFORCE_EQ(
+      x_dims[x_ndim - 2],
+      x_dims[x_ndim - 1],
+      errors::InvalidArgument(
+          "The inner-most 2 dimensions of Input(X) should be equal."
+          "X's shape[-2] = %d and shape[-1] = %d.",
+          x_dims[x_ndim - 2],
+          x_dims[x_ndim - 1]));
+
+  MatrixPowerFunction<Context, T>(X, n, Out, ctx);
+}
+
+}  // namespace phi
diff --git a/paddle/fluid/operators/matrix_rank_op.h b/paddle/phi/kernels/impl/matrix_rank_kernel_impl.h
similarity index 72%
rename from paddle/fluid/operators/matrix_rank_op.h
rename to paddle/phi/kernels/impl/matrix_rank_kernel_impl.h
index 80774aa916920dd5c828498f4345bd85ea4f33f8..b0dd76a17eeb363d53f29ba3e6cb3e5bf209edfc 100644
--- a/paddle/fluid/operators/matrix_rank_op.h
+++ b/paddle/phi/kernels/impl/matrix_rank_kernel_impl.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -13,15 +13,11 @@
 // limitations under the License.
 
 #pragma once
-#include <vector>
-#include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/operators/controlflow/compare_op.h"
-#include "paddle/phi/core/ddim.h"
 
-namespace paddle {
-namespace operators {
-using Tensor = framework::Tensor;
-using DDim = framework::DDim;
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/kernels/matrix_rank_kernel.h"
+
+namespace phi {
 
 namespace detail {
 static DDim GetEigenvalueDim(const DDim& dim, int k) {
@@ -45,6 +41,18 @@ static DDim RemoveLastDim(const DDim& dim) {
   vec.erase(vec.end() - 1, vec.end());
   return phi::make_ddim(vec);
 }
+
+static DDim GetUDDim(const DDim& x_dim, int k) {
+  auto x_vec = phi::vectorize(x_dim);
+  x_vec[x_vec.size() - 1] = k;
+  return phi::make_ddim(x_vec);
+}
+
+static DDim GetVHDDim(const DDim& x_dim, int k) {
+  auto x_vec = phi::vectorize(x_dim);
+  x_vec[x_vec.size() - 2] = k;
+  return phi::make_ddim(x_vec);
+}
 }  // namespace detail
 
 template <typename T>
@@ -58,5 +66,4 @@ struct GreaterElementFunctor {
   }
 };
 
-}  // namespace operators
-}  // namespace paddle
+}  // namespace phi
diff --git a/paddle/phi/kernels/impl/maxout_grad_kernel_impl.h b/paddle/phi/kernels/impl/maxout_grad_kernel_impl.h
new file mode 100644
index 0000000000000000000000000000000000000000..546ea74674281b0524b970ea83cbe5e62afd779e
--- /dev/null
+++ b/paddle/phi/kernels/impl/maxout_grad_kernel_impl.h
@@ -0,0 +1,45 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/kernels/maxout_grad_kernel.h"
+
+#include "paddle/fluid/operators/math/maxouting.h"
+#include "paddle/phi/kernels/funcs/math_function.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void MaxOutGradKernel(const Context& dev_ctx,
+                      const DenseTensor& x,
+                      const DenseTensor& out,
+                      const DenseTensor& out_grad,
+                      int groups,
+                      int axis,
+                      DenseTensor* x_grad) {
+  if (axis < 0) {
+    axis += x.dims().size();
+  }
+
+  phi::funcs::SetConstant<Context, T> zero;
+  if (x_grad) {
+    dev_ctx.template Alloc<T>(x_grad);
+    zero(dev_ctx, x_grad, static_cast<T>(0.0));
+    paddle::operators::math::MaxOutGradFunctor<Context, T> maxout_backward;
+    maxout_backward(dev_ctx, x, x_grad, out, out_grad, groups, axis);
+  }
+}
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/impl/maxout_kernel_impl.h b/paddle/phi/kernels/impl/maxout_kernel_impl.h
new file mode 100644
index 0000000000000000000000000000000000000000..da8c259ebf21716aed1fab336e94f5fafb2c69b6
--- /dev/null
+++ b/paddle/phi/kernels/impl/maxout_kernel_impl.h
@@ -0,0 +1,37 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/kernels/maxout_kernel.h"
+
+#include "paddle/fluid/operators/math/maxouting.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void MaxOutKernel(const Context& dev_ctx,
+                  const DenseTensor& x,
+                  int groups,
+                  int axis,
+                  DenseTensor* out) {
+  if (axis < 0) {
+    axis += x.dims().size();
+  }
+
+  paddle::operators::math::MaxOutFunctor<Context, T> maxout_forward;
+  maxout_forward(dev_ctx, x, out, groups, axis);
+}
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/impl/multi_dot_kernel_impl.h b/paddle/phi/kernels/impl/multi_dot_kernel_impl.h
new file mode 100644
index 0000000000000000000000000000000000000000..0833e94fe2c189f00e6dadff3f108753d0f66221
--- /dev/null
+++ b/paddle/phi/kernels/impl/multi_dot_kernel_impl.h
@@ -0,0 +1,456 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/kernels/funcs/blas/blas.h"
+
+namespace phi {
+
+template <typename Context, typename T>
+inline DenseTensor MatMul(const Context& ctx,
+                          const DenseTensor& matrix_a,
+                          const DenseTensor& matrix_b,
+                          const phi::DDim& a_dim,
+                          const phi::DDim& b_dim) {
+  auto blas = phi::funcs::GetBlas<Context, T>(ctx);
+
+  DenseTensor matrix_c;
+  phi::DDim c_dim = phi::make_ddim({a_dim[0], b_dim[1]});
+  matrix_c.Resize(c_dim);
+  ctx.template Alloc<T>(&matrix_c);
+
+  auto mat_dim_a = phi::funcs::CreateMatrixDescriptor(a_dim, 0, false);
+  auto mat_dim_b = phi::funcs::CreateMatrixDescriptor(b_dim, 0, false);
+  const T alpha = static_cast<T>(1.0);
+  blas.MatMul(matrix_a.data<T>(),
+              mat_dim_a,
+              matrix_b.data<T>(),
+              mat_dim_b,
+              alpha,
+              matrix_c.data<T>(),
+              T(0));
+  return matrix_c;
+}
+
+/**
+ * @brief Recursively calculate matrix multiplication according to the optimal
+ * order
+ * Let k = order[i,j], then ins[i...j] = ins[i...k] * ins[k+1 ...j]
+ *
+ * @param
+ * ins: the input tensors
+ * ins_dims: the shape of ins after reshape
+ * order: the optimal order
+ * i: the left of sub chain
+ * j: the righe of sub chain
+ * save_result: set true by backward
+ * results: save the intermediate result during backward
+ */
+template <typename Context, typename T>
+inline DenseTensor MatChainMul(const Context& ctx,
+                               const std::vector<const DenseTensor*>& ins,
+                               const std::vector<phi::DDim>& ins_dims,
+                               const std::vector<uint64_t>& order,
+                               const uint64_t i,
+                               const uint64_t j,
+                               const bool save_result,
+                               std::vector<DenseTensor>* results) {
+  if (i == j) {
+    return *ins[i];
+  }
+
+  const auto A = MatChainMul<Context, T>(ctx,
+                                         ins,
+                                         ins_dims,
+                                         order,
+                                         i,
+                                         order[i * ins.size() + j],
+                                         save_result,
+                                         results);
+  phi::DDim a_dim = A.dims();
+  if (i == order[i * ins.size() + j]) {
+    a_dim = ins_dims[i];
+  }
+
+  const auto B = MatChainMul<Context, T>(ctx,
+                                         ins,
+                                         ins_dims,
+                                         order,
+                                         order[i * ins.size() + j] + 1,
+                                         j,
+                                         save_result,
+                                         results);
+  phi::DDim b_dim = B.dims();
+  if (j == order[i * ins.size() + j] + 1) {
+    b_dim = ins_dims[j];
+  }
+
+  auto result = MatMul<Context, T>(ctx, A, B, a_dim, b_dim);
+  if (save_result) {
+    (*results)[i * ins.size() + j] = result;
+  }
+  return result;
+}
+
+/**
+ * @brief get the optimal order
+ */
+template <typename Context, typename T>
+std::vector<uint64_t> GetOrder(const std::vector<const DenseTensor*>& ins,
+                               const std::vector<phi::DDim>& ins_dims) {
+  auto n = ins.size();
+  // p: save the ins shape, the ins[i] shape is (p[i], p[i+1])
+  std::vector<uint64_t> p(n + 1);
+  for (uint64_t i = 0; i < n; i++) {
+    p[i] = ins_dims[i][0];
+  }
+  p[n] = ins_dims[n - 1][1];
+
+  // m[i, j]: save the lowest cost for multiplying ins[i...j]
+  std::vector<uint64_t> m(n * n, 0);
+  // define ins[i...j] means multiplying matrices from ins[i] to ins[j]
+  // order[i, j] = k, this means that ins[i...k] and ins[k...j] fist and then
+  // multiply the resulting matrices is the optimal order for ins[i...j]
+  std::vector<uint64_t> order(n * n);
+  for (uint64_t l = 1; l < n; l++) {
+    for (uint64_t i = 0; i < n - l; i++) {
+      auto j = i + l;
+      m[i * n + j] = 0xffffffff;
+      for (uint64_t k = i; k < j; k++) {
+        uint64_t q =
+            m[i * n + k] + m[(k + 1) * n + j] + p[i] * p[k + 1] * p[j + 1];
+        if (q < m[i * n + j]) {
+          m[i * n + j] = q;
+          order[i * n + j] = k;
+        }
+      }
+    }
+  }
+  return order;
+}
+
+template <typename Context, typename T>
+static inline DenseTensor MultiDotMatChainOrder(
+    const Context& ctx,
+    const std::vector<const DenseTensor*>& ins,
+    const std::vector<phi::DDim>& ins_dims,
+    const bool save_result,
+    std::vector<DenseTensor>* results) {
+  auto order = GetOrder<Context, T>(ins, ins_dims);
+  return MatChainMul<Context, T>(
+      ctx, ins, ins_dims, order, 0, ins.size() - 1, save_result, results);
+}
+
+template <typename Context, typename T>
+inline void GetDims(const std::vector<const DenseTensor*>& ins,
+                    std::vector<phi::DDim>* ins_dims) {
+  const auto n = ins.size();
+  for (size_t i = 0; i < n; i++) {
+    (*ins_dims)[i] = ins[i]->dims();
+    if (i == 0 && (*ins_dims)[i].size() == 1) {
+      (*ins_dims)[i] = phi::make_ddim({1, (*ins_dims)[i][0]});
+    } else if (i == n - 1 && (*ins_dims)[i].size() == 1) {
+      (*ins_dims)[i] = phi::make_ddim({(*ins_dims)[i][0], 1});
+    }
+  }
+}
+
+template <typename T, typename Context>
+void MultiDotKernel(const Context& ctx,
+                    const std::vector<const DenseTensor*>& x,
+                    DenseTensor* out) {
+  auto ins = x;
+  ctx.template Alloc<T>(out);
+
+  auto blas = phi::funcs::GetBlas<Context, T>(ctx);
+
+  auto n = ins.size();
+  std::vector<phi::DDim> ins_dims(n);
+  GetDims<Context, T>(ins, &ins_dims);
+
+  const T scale = static_cast<T>(1.0);
+  if (n == 2) {
+    auto mat_dim_a = phi::funcs::CreateMatrixDescriptor(ins_dims[0], 0, false);
+    auto mat_dim_b = phi::funcs::CreateMatrixDescriptor(ins_dims[1], 0, false);
+    blas.MatMul(*ins[0], mat_dim_a, *ins[1], mat_dim_b, scale, out, T(0));
+  } else if (n == 3) {
+    const auto Ma = ins_dims[0][0];
+    const auto Ka = ins_dims[0][1];
+    const auto Nb = ins_dims[1][1];
+    const auto Nc = ins_dims[2][1];
+    const uint64_t cost1 = Ma * Nb * (Ka + Nc);
+    const uint64_t cost2 = Ka * Nc * (Nb + Ma);
+    auto mat_dim_a = phi::funcs::CreateMatrixDescriptor(ins_dims[0], 0, false);
+    auto mat_dim_b = phi::funcs::CreateMatrixDescriptor(ins_dims[1], 0, false);
+    auto mat_dim_c = phi::funcs::CreateMatrixDescriptor(ins_dims[2], 0, false);
+    if (cost1 < cost2) {
+      DenseTensor tmp_out;
+      phi::DDim tmp_dim = phi::make_ddim({Ma, Nb});
+      tmp_out.Resize(tmp_dim);
+      ctx.template Alloc<T>(&tmp_out);
+      blas.MatMul(
+          *ins[0], mat_dim_a, *ins[1], mat_dim_b, scale, &tmp_out, T(0));
+      auto mat_dim_tmp = phi::funcs::CreateMatrixDescriptor(tmp_dim, 0, false);
+      blas.MatMul(tmp_out, mat_dim_tmp, *ins[2], mat_dim_c, scale, out, T(0));
+    } else {
+      DenseTensor tmp_out;
+      phi::DDim tmp_dim = phi::make_ddim({Ka, Nc});
+      tmp_out.Resize(tmp_dim);
+      ctx.template Alloc<T>(&tmp_out);
+      std::cout << tmp_out << std::endl;
+      blas.MatMul(
+          *ins[1], mat_dim_b, *ins[2], mat_dim_c, scale, &tmp_out, T(0));
+      auto mat_dim_tmp = phi::funcs::CreateMatrixDescriptor(tmp_dim, 0, false);
+      blas.MatMul(*ins[0], mat_dim_a, tmp_out, mat_dim_tmp, scale, out, T(0));
+    }
+  } else {
+    std::vector<DenseTensor> results;
+    const auto tmp =
+        MultiDotMatChainOrder<Context, T>(ctx, ins, ins_dims, false, &results);
+    auto out_dim = out->dims();
+    *out = tmp;
+    out->Resize(out_dim);
+  }
+}
+
+/**
+ * @brief calculate dA and dB
+ * dA = dout * transpose(B)
+ * dB = transpose(A) * dout
+ */
+template <typename Context, typename T>
+void CalcGrad(const Context& ctx,
+              const DenseTensor& dout,
+              const DenseTensor& A,
+              const DenseTensor& B,
+              const phi::DDim& dout_dim,
+              const phi::DDim& a_dim,
+              const phi::DDim& b_dim,
+              DenseTensor* dA,
+              DenseTensor* dB) {
+  auto mat_dim_dout = phi::funcs::CreateMatrixDescriptor(dout_dim, 0, false);
+  auto mat_dim_a = phi::funcs::CreateMatrixDescriptor(a_dim, 0, true);
+  auto mat_dim_b = phi::funcs::CreateMatrixDescriptor(b_dim, 0, true);
+  T alpha = static_cast<T>(1.0);
+  auto blas = phi::funcs::GetBlas<Context, T>(ctx);
+  blas.MatMul(A, mat_dim_a, dout, mat_dim_dout, alpha, dB, T(0));
+  blas.MatMul(dout, mat_dim_dout, B, mat_dim_b, alpha, dA, T(0));
+}
+
+/**
+ * @brief calculate multi matrix multiplication grad by a chain order
+ * @param
+ * dout: the grad of multi matrix multiplication out
+ * dx: the out grad of inputs
+ * ins: the input tensors
+ * ins_dims: the shape of ins after reshape
+ * order: the optimal order
+ * i: the left of sub chain
+ * j: the righe of sub chain
+ * results: the intermediate result of farward
+ */
+template <typename Context, typename T>
+void MatChainMulGrad(const Context& ctx,
+                     const DenseTensor& dout,
+                     std::vector<DenseTensor*>* dx,
+                     const std::vector<const DenseTensor*>& ins,
+                     const phi::DDim& dout_dim,
+                     const std::vector<phi::DDim>& ins_dims,
+                     const std::vector<uint64_t>& order,
+                     const uint64_t i,
+                     const uint64_t j,
+                     const std::vector<DenseTensor>& results) {
+  if (i == j) {
+    *((*dx)[i]) = dout;
+    return;
+  }
+
+  const auto n = ins.size();
+  const auto right = order[i * n + j];
+  const auto left = order[i * n + j] + 1;
+  // get the multi result of left sub chain
+  const auto* A = &results[i * n + right];
+  phi::DDim a_dim = A->dims();
+  if (i == right) {
+    A = ins[i];
+    a_dim = ins_dims[i];
+  }
+  // get the multi result of right sub chain
+  const auto* B = &results[left * n + j];
+  phi::DDim b_dim = B->dims();
+  if (left == j) {
+    B = ins[j];
+    b_dim = ins_dims[j];
+  }
+  DenseTensor dA, dB;
+  dA.Resize({dout_dim[0], b_dim[0]});
+  dB.Resize({a_dim[1], dout_dim[1]});
+  ctx.template Alloc<T>(&dA);
+  ctx.template Alloc<T>(&dB);
+
+  CalcGrad<Context, T>(ctx, dout, *A, *B, dout_dim, a_dim, b_dim, &dA, &dB);
+  MatChainMulGrad<Context, T>(
+      ctx, dA, dx, ins, dA.dims(), ins_dims, order, i, right, results);
+  MatChainMulGrad<Context, T>(
+      ctx, dB, dx, ins, dB.dims(), ins_dims, order, left, j, results);
+}
+
+template <typename Context, typename T>
+void MultiDotGradMatChainOrder(const Context& ctx,
+                               const DenseTensor& dout,
+                               const std::vector<const DenseTensor*>& ins,
+                               const phi::DDim& dout_dim,
+                               const std::vector<phi::DDim>& ins_dims,
+                               std::vector<DenseTensor*>* dx) {
+  auto order = GetOrder<Context, T>(ins, ins_dims);
+  auto n = ins.size();
+  std::vector<DenseTensor> results(n * n);
+  MatChainMul<Context, T>(ctx, ins, ins_dims, order, 0, n - 1, true, &results);
+  MatChainMulGrad<Context, T>(
+      ctx, dout, dx, ins, dout_dim, ins_dims, order, 0, n - 1, results);
+}
+
+template <typename T, typename Context>
+void MultiDotGradKernel(const Context& ctx,
+                        const DenseTensor& out_grad,
+                        const std::vector<const DenseTensor*>& x,
+                        std::vector<DenseTensor*> x_grad) {
+  auto ins = x;
+  auto dout = out_grad;
+  auto dx = x_grad;
+
+  auto blas = phi::funcs::GetBlas<Context, T>(ctx);
+
+  const auto n = ins.size();
+  for (size_t i = 0; i < n; i++) {
+    ctx.template Alloc<T>(dx[i]);
+  }
+
+  std::vector<phi::DDim> ins_dims(n);
+  GetDims<Context, T>(ins, &ins_dims);
+
+  phi::DDim dout_dim = dout.dims();
+  if (ins[0]->dims().size() == 1 && ins[n - 1]->dims().size() == 1) {
+    dout_dim = phi::make_ddim({1, 1});
+  } else if (ins[0]->dims().size() == 1) {
+    if (dout_dim.size() == 1) {
+      dout_dim = phi::make_ddim({1, dout_dim[0]});
+    }
+  } else if (ins[n - 1]->dims().size() == 1) {
+    if (dout_dim.size() == 1) {
+      dout_dim = phi::make_ddim({dout_dim[0], 1});
+    }
+  }
+
+  T alpha = static_cast<T>(1);
+  auto mat_dim_dout = phi::funcs::CreateMatrixDescriptor(dout_dim, 0, false);
+  if (n == 2) {
+    CalcGrad<Context, T>(ctx,
+                         dout,
+                         *ins[0],
+                         *ins[1],
+                         dout_dim,
+                         ins_dims[0],
+                         ins_dims[1],
+                         dx[0],
+                         dx[1]);
+  } else if (n == 3) {
+    const auto Ma = ins_dims[0][0];
+    const auto Ka = ins_dims[0][1];
+    const auto Nb = ins_dims[1][1];
+    const auto Nc = ins_dims[2][1];
+    const uint64_t cost1 = Ma * Nb * (Ka + Nc);
+    const uint64_t cost2 = Ka * Nc * (Nb + Ma);
+    auto mat_dim_a = phi::funcs::CreateMatrixDescriptor(ins_dims[0], 0, false);
+    auto mat_dim_b = phi::funcs::CreateMatrixDescriptor(ins_dims[1], 0, false);
+    auto mat_dim_c = phi::funcs::CreateMatrixDescriptor(ins_dims[2], 0, false);
+    if (cost1 < cost2) {
+      DenseTensor tmp_out, tmp_dout;
+      tmp_out.Resize({Ma, Nb});
+      ctx.template Alloc<T>(&tmp_out);
+      tmp_dout.Resize({mat_dim_dout.height_, Nb});
+      ctx.template Alloc<T>(&tmp_dout);
+      blas.MatMul(
+          *ins[0], mat_dim_a, *ins[1], mat_dim_b, alpha, &tmp_out, T(0));
+      CalcGrad<Context, T>(ctx,
+                           dout,
+                           tmp_out,
+                           *ins[2],
+                           dout_dim,
+                           tmp_out.dims(),
+                           ins_dims[2],
+                           &tmp_dout,
+                           dx[2]);
+      CalcGrad<Context, T>(ctx,
+                           tmp_dout,
+                           *ins[0],
+                           *ins[1],
+                           tmp_dout.dims(),
+                           ins_dims[0],
+                           ins_dims[1],
+                           dx[0],
+                           dx[1]);
+    } else {
+      DenseTensor tmp_out, tmp_dout;
+      tmp_out.Resize({Ka, Nc});
+      ctx.template Alloc<T>(&tmp_out);
+      tmp_dout.Resize({Ka, mat_dim_dout.width_});
+      ctx.template Alloc<T>(&tmp_dout);
+      blas.MatMul(
+          *ins[1], mat_dim_b, *ins[2], mat_dim_c, alpha, &tmp_out, T(0));
+      CalcGrad<Context, T>(ctx,
+                           dout,
+                           *ins[0],
+                           tmp_out,
+                           dout_dim,
+                           ins_dims[0],
+                           tmp_dout.dims(),
+                           dx[0],
+                           &tmp_dout);
+      CalcGrad<Context, T>(ctx,
+                           tmp_dout,
+                           *ins[1],
+                           *ins[2],
+                           tmp_dout.dims(),
+                           ins_dims[1],
+                           ins_dims[2],
+                           dx[1],
+                           dx[2]);
+    }
+  } else {
+    MultiDotGradMatChainOrder<Context, T>(
+        ctx, dout, ins, dout_dim, ins_dims, &dx);
+    if (ins[n - 1]->dims().size() == 1) {
+      dx[n - 1]->Resize({dx[n - 1]->dims()[0]});
+    }
+  }
+}
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/impl/pad_grad_kernel_impl.h b/paddle/phi/kernels/impl/pad_grad_kernel_impl.h
new file mode 100644
index 0000000000000000000000000000000000000000..91f198f9fb681e4fabf7029fcc22343bb81953fd
--- /dev/null
+++ b/paddle/phi/kernels/impl/pad_grad_kernel_impl.h
@@ -0,0 +1,33 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/kernels/funcs/padding.h"
+namespace phi {
+template <typename T, typename Context>
+void PadGradKernel(const Context& dev_ctx,
+                   const DenseTensor& d_out,
+                   const std::vector<int>& paddings,
+                   float pad_value,
+                   DenseTensor* d_x) {
+  if (d_x == nullptr) {
+    return;
+  }
+  dev_ctx.template Alloc<T>(d_x);
+  int rank = d_out.dims().size();
+  phi::funcs::PaddingGradFunctor<Context, T>(
+      rank, dev_ctx, paddings, d_out, d_x);
+}
+}  // namespace phi
diff --git a/paddle/phi/kernels/impl/pad_kernel_impl.h b/paddle/phi/kernels/impl/pad_kernel_impl.h
new file mode 100644
index 0000000000000000000000000000000000000000..8e3ebb0dfe03b2f13e2a321bb813f7d10e306b7a
--- /dev/null
+++ b/paddle/phi/kernels/impl/pad_kernel_impl.h
@@ -0,0 +1,32 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <utility>
+#include <vector>
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/kernels/funcs/padding.h"
+namespace phi {
+template <typename T, typename Context>
+void PadKernel(const Context& dev_ctx,
+               const DenseTensor& x,
+               const std::vector<int>& paddings,
+               float pad_value,
+               DenseTensor* out) {
+  dev_ctx.template Alloc<T>(out);
+  int rank = x.dims().size();
+  funcs::PaddingFunctor<Context, T>(
+      rank, dev_ctx, paddings, static_cast<T>(pad_value), x, out);
+}
+}  // namespace phi
diff --git a/paddle/phi/kernels/impl/pool_grad_kernel_impl.h b/paddle/phi/kernels/impl/pool_grad_kernel_impl.h
new file mode 100644
index 0000000000000000000000000000000000000000..7fe89ce34c8b5a33df12c1931caeddb37de5aea2
--- /dev/null
+++ b/paddle/phi/kernels/impl/pool_grad_kernel_impl.h
@@ -0,0 +1,332 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/phi/kernels/pool_grad_kernel.h"
+
+#include "paddle/phi/core/ddim.h"
+#include "paddle/phi/kernels/funcs/math_function.h"
+#include "paddle/phi/kernels/funcs/pooling.h"
+#include "paddle/phi/kernels/pool_kernel.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void PoolGradRawKernel(const Context& ctx,
+                       const DenseTensor& x,
+                       const DenseTensor& out,
+                       const DenseTensor& dout,
+                       const std::vector<int>& kernel_size,
+                       const std::vector<int>& strides,
+                       const std::vector<int>& paddings,
+                       bool exclusive,
+                       const std::string& data_format,
+                       const std::string& pooling_type,
+                       bool global_pooling,
+                       bool adaptive,
+                       const std::string& padding_algorithm,
+                       DenseTensor* dx) {
+  const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC");
+  std::vector<int> paddings_ = paddings;
+  std::vector<int> kernel_size_ = kernel_size;
+
+  // update paddings
+  auto x_dims = x.dims();
+  DDim data_dims;
+  if (channel_last) {
+    data_dims = slice_ddim(x_dims, 1, x_dims.size() - 1);
+  } else {
+    data_dims = slice_ddim(x_dims, 2, x_dims.size());
+  }
+  funcs::UpdatePadding(&paddings_,
+                       global_pooling,
+                       adaptive,
+                       padding_algorithm,
+                       data_dims,
+                       strides,
+                       kernel_size_);
+  if (data_dims.size() * 2 == static_cast<int>(paddings_.size())) {
+    for (int i = 0; i < data_dims.size(); ++i) {
+      paddings_.erase(paddings_.begin() + i + 1);
+    }
+  }
+
+  if (global_pooling) {
+    funcs::UpdateKernelSize(&kernel_size_, data_dims);
+  }
+
+  if (dx) {
+    ctx.template Alloc<T>(dx);
+    funcs::SetConstant<Context, T> set_constant;
+    set_constant(ctx, dx, static_cast<T>(0.0));
+
+    switch (kernel_size_.size()) {
+      case 2: {
+        if (pooling_type == "max") {
+          funcs::MaxPool2dGradFunctor<Context, T> pool2d_backward;
+          pool2d_backward(ctx,
+                          x,
+                          out,
+                          dout,
+                          kernel_size_,
+                          strides,
+                          paddings_,
+                          data_format,
+                          dx);
+        } else if (pooling_type == "avg") {
+          funcs::Pool2dGradFunctor<Context, funcs::AvgPoolGrad<T>, T>
+              pool2d_backward;
+          funcs::AvgPoolGrad<T> pool_process;
+          pool2d_backward(ctx,
+                          x,
+                          out,
+                          dout,
+                          kernel_size_,
+                          strides,
+                          paddings_,
+                          data_format,
+                          exclusive,
+                          adaptive,
+                          dx,
+                          pool_process);
+        }
+      } break;
+      case 3: {
+        if (pooling_type == "max") {
+          funcs::MaxPool3dGradFunctor<Context, T> pool3d_backward;
+          pool3d_backward(ctx,
+                          x,
+                          out,
+                          dout,
+                          kernel_size_,
+                          strides,
+                          paddings_,
+                          data_format,
+                          dx);
+        } else if (pooling_type == "avg") {
+          funcs::Pool3dGradFunctor<Context, funcs::AvgPoolGrad<T>, T>
+              pool3d_backward;
+          funcs::AvgPoolGrad<T> pool_process;
+          pool3d_backward(ctx,
+                          x,
+                          out,
+                          dout,
+                          kernel_size_,
+                          strides,
+                          paddings_,
+                          data_format,
+                          exclusive,
+                          adaptive,
+                          dx,
+                          pool_process);
+        }
+      } break;
+      default: {
+        PADDLE_THROW(
+            errors::InvalidArgument("Pool op only supports 2D and 3D input."));
+      }
+    }
+  }
+}
+
+template <typename Context, typename T1, typename T2 = int>
+void MaxPoolWithIndexGradRawKernel(const Context& ctx,
+                                   const DenseTensor& x,
+                                   const DenseTensor& mask,
+                                   const DenseTensor& dout,
+                                   const std::vector<int>& kernel_size,
+                                   const std::vector<int>& strides,
+                                   const std::vector<int>& paddings,
+                                   bool global_pooling,
+                                   bool adaptive,
+                                   DenseTensor* dx) {
+  std::vector<int> paddings_ = paddings;
+  std::vector<int> kernel_size_ = kernel_size;
+
+  if (global_pooling) {
+    for (size_t i = 0; i < kernel_size_.size(); ++i) {
+      paddings_[i] = 0;
+      kernel_size_[i] = static_cast<int>(dx->dims()[i + 2]);
+    }
+  }
+
+  if (dx) {
+    ctx.template Alloc<T1>(dx);
+    funcs::set_constant(ctx, dx, 0);
+
+    switch (kernel_size_.size()) {
+      case 2: {
+        funcs::MaxPool2dWithIndexGradFunctor<Context, T1, T2> pool2d_backward;
+        pool2d_backward(
+            ctx, dout, mask, kernel_size_, strides, paddings_, adaptive, dx);
+      } break;
+      case 3: {
+        funcs::MaxPool3dWithIndexGradFunctor<Context, T1, T2> pool3d_backward;
+        pool3d_backward(
+            ctx, dout, mask, kernel_size_, strides, paddings_, adaptive, dx);
+      } break;
+      default: {
+        PADDLE_THROW(
+            errors::InvalidArgument("Pool op only supports 2D and 3D input."));
+      }
+    }
+  }
+}
+
+template <typename T, typename Context>
+void Pool2dGradKernel(const Context& ctx,
+                      const DenseTensor& x,
+                      const DenseTensor& out,
+                      const DenseTensor& dout,
+                      const std::vector<int>& kernel_size,
+                      const std::vector<int>& strides,
+                      const std::vector<int>& paddings,
+                      bool ceil_mode,
+                      bool exclusive,
+                      const std::string& data_format,
+                      const std::string& pooling_type,
+                      bool global_pooling,
+                      bool adaptive,
+                      const std::string& padding_algorithm,
+                      DenseTensor* dx) {
+  PoolGradRawKernel<T, Context>(ctx,
+                                x,
+                                out,
+                                dout,
+                                kernel_size,
+                                strides,
+                                paddings,
+                                exclusive,
+                                data_format,
+                                pooling_type,
+                                global_pooling,
+                                adaptive,
+                                padding_algorithm,
+                                dx);
+}
+
+template <typename T, typename Context>
+void Pool2dDoubleGradKernel(const Context& ctx,
+                            const DenseTensor& x,
+                            const std::vector<int>& kernel_size,
+                            const std::vector<int>& strides,
+                            const std::vector<int>& paddings,
+                            bool ceil_mode,
+                            bool exclusive,
+                            const std::string& data_format,
+                            const std::string& pooling_type,
+                            bool global_pooling,
+                            bool adaptive,
+                            const std::string& padding_algorithm,
+                            DenseTensor* out) {
+  if (pooling_type == "max") {
+    PADDLE_THROW(
+        errors::InvalidArgument("Pool op grad grad only supports avgpool."));
+  } else {
+    Pool2dKernel<T, Context>(ctx,
+                             x,
+                             kernel_size,
+                             strides,
+                             paddings,
+                             ceil_mode,
+                             exclusive,
+                             data_format,
+                             pooling_type,
+                             global_pooling,
+                             adaptive,
+                             padding_algorithm,
+                             out);
+  }
+}
+
+template <typename T, typename Context>
+void MaxPool2dWithIndexGradKernel(const Context& ctx,
+                                  const DenseTensor& x,
+                                  const DenseTensor& mask,
+                                  const DenseTensor& dout,
+                                  const std::vector<int>& kernel_size,
+                                  const std::vector<int>& strides,
+                                  const std::vector<int>& paddings,
+                                  bool global_pooling,
+                                  bool adaptive,
+                                  DenseTensor* dx) {
+  MaxPoolWithIndexGradRawKernel<Context, T>(ctx,
+                                            x,
+                                            mask,
+                                            dout,
+                                            kernel_size,
+                                            strides,
+                                            paddings,
+                                            global_pooling,
+                                            adaptive,
+                                            dx);
+}
+
+template <typename T, typename Context>
+void Pool3dGradKernel(const Context& ctx,
+                      const DenseTensor& x,
+                      const DenseTensor& out,
+                      const DenseTensor& dout,
+                      const std::vector<int>& kernel_size,
+                      const std::vector<int>& strides,
+                      const std::vector<int>& paddings,
+                      bool ceil_mode,
+                      bool exclusive,
+                      const std::string& data_format,
+                      const std::string& pooling_type,
+                      bool global_pooling,
+                      bool adaptive,
+                      const std::string& padding_algorithm,
+                      DenseTensor* dx) {
+  PoolGradRawKernel<T, Context>(ctx,
+                                x,
+                                out,
+                                dout,
+                                kernel_size,
+                                strides,
+                                paddings,
+                                exclusive,
+                                data_format,
+                                pooling_type,
+                                global_pooling,
+                                adaptive,
+                                padding_algorithm,
+                                dx);
+}
+
+template <typename T, typename Context>
+void MaxPool3dWithIndexGradKernel(const Context& ctx,
+                                  const DenseTensor& x,
+                                  const DenseTensor& mask,
+                                  const DenseTensor& dout,
+                                  const std::vector<int>& kernel_size,
+                                  const std::vector<int>& strides,
+                                  const std::vector<int>& paddings,
+                                  bool global_pooling,
+                                  bool adaptive,
+                                  DenseTensor* dx) {
+  MaxPoolWithIndexGradRawKernel<Context, T>(ctx,
+                                            x,
+                                            mask,
+                                            dout,
+                                            kernel_size,
+                                            strides,
+                                            paddings,
+                                            global_pooling,
+                                            adaptive,
+                                            dx);
+}
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/impl/pool_kernel_impl.h b/paddle/phi/kernels/impl/pool_kernel_impl.h
new file mode 100644
index 0000000000000000000000000000000000000000..665d02fd0173e0b4dec7de7bfbf89cfa13d92f3f
--- /dev/null
+++ b/paddle/phi/kernels/impl/pool_kernel_impl.h
@@ -0,0 +1,321 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/phi/kernels/pool_kernel.h"
+
+#include <algorithm>
+#include "paddle/phi/core/ddim.h"
+#include "paddle/phi/kernels/funcs/pooling.h"
+
+#if defined(__HIPCC__) || defined(__NVCC__)
+#include "paddle/phi/kernels/funcs/reduce_function.h"
+#include "paddle/phi/kernels/primitive/functor_primitives.h"
+#endif
+
+namespace phi {
+
+inline int GetReduceNum(const DenseTensor& input,
+                        const DenseTensor* output,
+                        const std::string data_format,
+                        std::vector<int>* reduce_dim) {
+  // data_format only can be NCHW
+  bool channel_last = (data_format == "NHWC");
+  if (channel_last) {
+    return 0;
+  }
+  int reduce_num = 0;
+  const int output_height = output->dims()[2];
+  const int output_width = output->dims()[3];
+  if ((output_height == 1) && (output_width == 1)) {
+    reduce_dim->push_back(2);
+    reduce_dim->push_back(3);
+    reduce_num = input.dims()[2] * input.dims()[3];
+  }
+  return reduce_num;
+}
+
+template <typename T, typename Context>
+void PoolRawKernel(const Context& ctx,
+                   const DenseTensor& x,
+                   const std::vector<int>& kernel_size,
+                   const std::vector<int>& strides,
+                   const std::vector<int>& paddings,
+                   bool exclusive,
+                   const std::string& data_format,
+                   const std::string& pooling_type,
+                   bool global_pooling,
+                   bool adaptive,
+                   const std::string& padding_algorithm,
+                   DenseTensor* out) {
+  const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC");
+  std::vector<int> paddings_ = paddings;
+  std::vector<int> kernel_size_ = kernel_size;
+
+  // update paddings
+  auto x_dims = x.dims();
+  DDim data_dims;
+  if (channel_last) {
+    data_dims = slice_ddim(x_dims, 1, x_dims.size() - 1);
+  } else {
+    data_dims = slice_ddim(x_dims, 2, x_dims.size());
+  }
+
+  funcs::UpdatePadding(&paddings_,
+                       global_pooling,
+                       adaptive,
+                       padding_algorithm,
+                       data_dims,
+                       strides,
+                       kernel_size_);
+
+  if (data_dims.size() * 2 == static_cast<int>(paddings_.size())) {
+    for (int i = 0; i < data_dims.size(); ++i) {
+      paddings_.erase(paddings_.begin() + i + 1);
+    }
+  }
+
+  if (global_pooling) {
+    funcs::UpdateKernelSize(&kernel_size_, data_dims);
+  }
+
+  switch (kernel_size_.size()) {
+    case 2: {
+      if (pooling_type == "max") {
+        funcs::Pool2dFunctor<Context, funcs::MaxPool<T>, T> pool2d_forward;
+        funcs::MaxPool<T> pool_process;
+        pool2d_forward(ctx,
+                       x,
+                       kernel_size_,
+                       strides,
+                       paddings_,
+                       data_format,
+                       true,
+                       false,
+                       out,
+                       pool_process);
+
+      } else if (pooling_type == "avg") {
+        std::vector<int> reduce_dim;
+        int reduce_num = GetReduceNum(x, out, data_format, &reduce_dim);
+        if (reduce_num > 0 &&
+            adaptive) {  // for adaptive_avg_pool2d && output_size == 1
+#if defined(__HIPCC__) || defined(__NVCC__)
+          auto stream = ctx.stream();
+          funcs::ReduceKernel<T, T, kps::AddFunctor, kps::DivideFunctor<T>>(
+              ctx, x, out, kps::DivideFunctor<T>(reduce_num), reduce_dim);
+#else  // for cpu
+          funcs::Pool2dFunctor<Context, funcs::AvgPool<T>, T> pool2d_forward;
+          funcs::AvgPool<T> pool_process;
+          pool2d_forward(ctx,
+                         x,
+                         kernel_size_,
+                         strides,
+                         paddings_,
+                         data_format,
+                         exclusive,
+                         adaptive,
+                         out,
+                         pool_process);
+#endif
+        } else {  // avgpool_2d or  adaptive_avg_pool2d && output_size != 1
+          funcs::Pool2dFunctor<Context, funcs::AvgPool<T>, T> pool2d_forward;
+          funcs::AvgPool<T> pool_process;
+          pool2d_forward(ctx,
+                         x,
+                         kernel_size_,
+                         strides,
+                         paddings_,
+                         data_format,
+                         exclusive,
+                         adaptive,
+                         out,
+                         pool_process);
+        }
+      }
+    } break;
+    case 3: {
+      if (pooling_type == "max") {
+        funcs::Pool3dFunctor<Context, funcs::MaxPool<T>, T> pool3d_forward;
+        funcs::MaxPool<T> pool_process;
+        pool3d_forward(ctx,
+                       x,
+                       kernel_size_,
+                       strides,
+                       paddings_,
+                       data_format,
+                       true,
+                       false,
+                       out,
+                       pool_process);
+      } else if (pooling_type == "avg") {
+        funcs::Pool3dFunctor<Context, funcs::AvgPool<T>, T> pool3d_forward;
+        funcs::AvgPool<T> pool_process;
+        pool3d_forward(ctx,
+                       x,
+                       kernel_size_,
+                       strides,
+                       paddings_,
+                       data_format,
+                       exclusive,
+                       adaptive,
+                       out,
+                       pool_process);
+      }
+    } break;
+    default: {
+      PADDLE_THROW(
+          errors::InvalidArgument("Pool op only supports 2D and 3D input."));
+    }
+  }
+}
+
+template <typename Context, typename T1, typename T2 = int>
+void MaxPoolWithIndexRawKernel(const Context& ctx,
+                               const DenseTensor& x,
+                               const std::vector<int>& kernel_size,
+                               const std::vector<int>& strides,
+                               const std::vector<int>& paddings,
+                               bool global_pooling,
+                               bool adaptive,
+                               DenseTensor* out,
+                               DenseTensor* mask) {
+  std::vector<int> paddings_ = paddings;
+  std::vector<int> kernel_size_ = kernel_size;
+
+  if (global_pooling) {
+    for (size_t i = 0; i < kernel_size_.size(); ++i) {
+      paddings_[i] = 0;
+      kernel_size_[i] = static_cast<int>(x.dims()[i + 2]);
+    }
+  }
+
+  switch (kernel_size_.size()) {
+    case 2: {
+      funcs::MaxPool2dWithIndexFunctor<Context, T1, T2> pool2d_forward;
+      pool2d_forward(
+          ctx, x, kernel_size_, strides, paddings_, adaptive, out, mask);
+    } break;
+    case 3: {
+      funcs::MaxPool3dWithIndexFunctor<Context, T1, T2> pool3d_forward;
+      pool3d_forward(
+          ctx, x, kernel_size_, strides, paddings_, adaptive, out, mask);
+    } break;
+    default: {
+      PADDLE_THROW(
+          errors::InvalidArgument("Pool op only supports 2D and 3D input."));
+    }
+  }
+}
+
+template <typename T, typename Context>
+void Pool2dKernel(const Context& ctx,
+                  const DenseTensor& x,
+                  const std::vector<int>& kernel_size,
+                  const std::vector<int>& strides,
+                  const std::vector<int>& paddings,
+                  bool ceil_mode,
+                  bool exclusive,
+                  const std::string& data_format,
+                  const std::string& pooling_type,
+                  bool global_pooling,
+                  bool adaptive,
+                  const std::string& padding_algorithm,
+                  DenseTensor* out) {
+  PoolRawKernel<T, Context>(ctx,
+                            x,
+                            kernel_size,
+                            strides,
+                            paddings,
+                            exclusive,
+                            data_format,
+                            pooling_type,
+                            global_pooling,
+                            adaptive,
+                            padding_algorithm,
+                            out);
+}
+
+template <typename T, typename Context>
+void MaxPool2dWithIndexKernel(const Context& ctx,
+                              const DenseTensor& x,
+                              const std::vector<int>& kernel_size,
+                              const std::vector<int>& strides,
+                              const std::vector<int>& paddings,
+                              bool global_pooling,
+                              bool adaptive,
+                              DenseTensor* out,
+                              DenseTensor* mask) {
+  MaxPoolWithIndexRawKernel<Context, T>(ctx,
+                                        x,
+                                        kernel_size,
+                                        strides,
+                                        paddings,
+                                        global_pooling,
+                                        adaptive,
+                                        out,
+                                        mask);
+}
+
+template <typename T, typename Context>
+void Pool3dKernel(const Context& ctx,
+                  const DenseTensor& x,
+                  const std::vector<int>& kernel_size,
+                  const std::vector<int>& strides,
+                  const std::vector<int>& paddings,
+                  bool ceil_mode,
+                  bool exclusive,
+                  const std::string& data_format,
+                  const std::string& pooling_type,
+                  bool global_pooling,
+                  bool adaptive,
+                  const std::string& padding_algorithm,
+                  DenseTensor* out) {
+  PoolRawKernel<T, Context>(ctx,
+                            x,
+                            kernel_size,
+                            strides,
+                            paddings,
+                            exclusive,
+                            data_format,
+                            pooling_type,
+                            global_pooling,
+                            adaptive,
+                            padding_algorithm,
+                            out);
+}
+
+template <typename T, typename Context>
+void MaxPool3dWithIndexKernel(const Context& ctx,
+                              const DenseTensor& x,
+                              const std::vector<int>& kernel_size,
+                              const std::vector<int>& strides,
+                              const std::vector<int>& paddings,
+                              bool global_pooling,
+                              bool adaptive,
+                              DenseTensor* out,
+                              DenseTensor* mask) {
+  MaxPoolWithIndexRawKernel<Context, T>(ctx,
+                                        x,
+                                        kernel_size,
+                                        strides,
+                                        paddings,
+                                        global_pooling,
+                                        adaptive,
+                                        out,
+                                        mask);
+}
+
+}  // namespace phi
diff --git a/paddle/fluid/operators/searchsorted_op.h b/paddle/phi/kernels/impl/searchsorted_kernel_impl.h
similarity index 58%
rename from paddle/fluid/operators/searchsorted_op.h
rename to paddle/phi/kernels/impl/searchsorted_kernel_impl.h
index 6aa38a815813230c2921f3d3816881966df6bf98..82bd9fba2a66d7a4601b5aab360b9bbf80ff04d9 100644
--- a/paddle/fluid/operators/searchsorted_op.h
+++ b/paddle/phi/kernels/impl/searchsorted_kernel_impl.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -16,16 +16,11 @@
 
 #include <math.h>
 
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/platform/enforce.h"
-#include "paddle/fluid/platform/for_range.h"
 #include "paddle/phi/core/ddim.h"
 #include "paddle/phi/kernels/funcs/algorithm.h"
+#include "paddle/phi/kernels/funcs/for_range.h"
 
-namespace paddle {
-namespace operators {
-using Tensor = framework::Tensor;
+namespace phi {
 
 template <typename T1, typename T2, typename OutType>
 class GpuAndCpuSearchSortedCompute {
@@ -65,9 +60,11 @@ class GpuAndCpuSearchSortedCompute {
   static HOSTDEVICE bool IsInf(int64_t x) { return false; }
 
   HOSTDEVICE GpuAndCpuSearchSortedCompute(const T1* sequence_data,
-                                          const T2* value_data, bool right,
+                                          const T2* value_data,
+                                          bool right,
                                           bool is_1d_boundaries,
-                                          int64_t val_size, int64_t seq_size,
+                                          int64_t val_size,
+                                          int64_t seq_size,
                                           OutType* out_data)
       : sequence_data_(sequence_data),
         value_data_(value_data),
@@ -104,12 +101,13 @@ class GpuAndCpuSearchSortedCompute {
   OutType* out_data_;
 };
 
-template <typename DeviceContext, typename T1, typename OutType>
+template <typename Context, typename T1, typename OutType>
 class SearchSortedFunctor {
  public:
-  SearchSortedFunctor(const framework::ExecutionContext& context,
-                      const framework::Tensor* sorted_sequence,
-                      const framework::Tensor* value, bool right,
+  SearchSortedFunctor(const Context& context,
+                      const DenseTensor* sorted_sequence,
+                      const DenseTensor* value,
+                      bool right,
                       OutType* out_data)
       : context_(context),
         sorted_sequence_(sorted_sequence),
@@ -121,74 +119,73 @@ class SearchSortedFunctor {
   void apply() {
     const T1* sequence_data = sorted_sequence_->data<T1>();
     const T2* value_data = value_->data<T2>();
-    const framework::DDim& seq_dims = sorted_sequence_->dims();
-    const framework::DDim& val_dims = value_->dims();
+    const phi::DDim& seq_dims = sorted_sequence_->dims();
+    const phi::DDim& val_dims = value_->dims();
 
     bool is_1d_boundaries = seq_dims.size() == 1;
     int64_t val_size = val_dims[val_dims.size() - 1];
     int64_t seq_size = seq_dims[seq_dims.size() - 1];
 
-    auto& dev_ctx = context_.template device_context<DeviceContext>();
-    platform::ForRange<DeviceContext> for_range(dev_ctx, value_->numel());
+    funcs::ForRange<Context> for_range(context_, value_->numel());
     GpuAndCpuSearchSortedCompute<T1, T2, OutType>
-        gpu_and_cpu_search_sorted_compute(sequence_data, value_data, right_,
-                                          is_1d_boundaries, val_size, seq_size,
+        gpu_and_cpu_search_sorted_compute(sequence_data,
+                                          value_data,
+                                          right_,
+                                          is_1d_boundaries,
+                                          val_size,
+                                          seq_size,
                                           out_data_);
     for_range(gpu_and_cpu_search_sorted_compute);
   }
 
  private:
-  const framework::ExecutionContext& context_;
-  const framework::Tensor* sorted_sequence_;
-  const framework::Tensor* value_;
+  const Context& context_;
+  const DenseTensor* sorted_sequence_;
+  const DenseTensor* value_;
   bool right_;
   OutType* out_data_;
 };
 
 template <typename Visitor>
-static void VisitDataType(framework::proto::VarType::Type type,
-                          Visitor visitor) {
-  if (type == framework::proto::VarType::FP32) {
+static void VisitDataType(DataType type, Visitor visitor) {
+  if (type == DataType::FLOAT32) {
     visitor.template apply<float>();
-  } else if (type == framework::proto::VarType::FP64) {
+  } else if (type == DataType::FLOAT64) {
     visitor.template apply<double>();
-  } else if (type == framework::proto::VarType::INT32) {
+  } else if (type == DataType::INT32) {
     visitor.template apply<int>();
-  } else if (type == framework::proto::VarType::INT64) {
+  } else if (type == DataType::INT64) {
     visitor.template apply<int64_t>();
   } else {
-    PADDLE_THROW(platform::errors::InvalidArgument(
+    PADDLE_THROW(errors::InvalidArgument(
         "The recieved values data type %s can not meet input requirements. "
         "Because the given values data type of searchsorted operators must be "
         "float32, float64, int32 or int64. Please input appropriate "
         "sorted_sequence again! ",
-        framework::DataTypeToString(type)));
+        type));
   }
 }
 
-template <typename DeviceContext, typename T>
-class SearchSortedKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* sorted_sequence = context.Input<Tensor>("SortedSequence");
-    auto* value = context.Input<Tensor>("Values");
-    bool out_int32 = context.Attr<bool>("out_int32");
-    bool right = context.Attr<bool>("right");
-    auto* out = context.Output<Tensor>("Out");
-
-    if (out_int32) {
-      int* out_data = out->mutable_data<int>(context.GetPlace());
-      SearchSortedFunctor<DeviceContext, T, int> functor(
-          context, sorted_sequence, value, right, out_data);
-      VisitDataType(framework::TransToProtoVarType(value->dtype()), functor);
-    } else {
-      int64_t* out_data = out->mutable_data<int64_t>(context.GetPlace());
-      SearchSortedFunctor<DeviceContext, T, int64_t> functor(
-          context, sorted_sequence, value, right, out_data);
-      VisitDataType(framework::TransToProtoVarType(value->dtype()), functor);
-    }
+template <typename T, typename Context>
+void SearchsortedKernel(const Context& ctx,
+                        const DenseTensor& sorted_sequence,
+                        const DenseTensor& value,
+                        bool out_int32,
+                        bool right,
+                        DenseTensor* out) {
+  if (out_int32) {
+    ctx.template Alloc<int>(out);
+    int* out_data = out->data<int>();
+    SearchSortedFunctor<Context, T, int> functor(
+        ctx, &sorted_sequence, &value, right, out_data);
+    VisitDataType(value.dtype(), functor);
+  } else {
+    ctx.template Alloc<int64_t>(out);
+    int64_t* out_data = out->data<int64_t>();
+    SearchSortedFunctor<Context, T, int64_t> functor(
+        ctx, &sorted_sequence, &value, right, out_data);
+    VisitDataType(value.dtype(), functor);
   }
-};
+}
 
-}  // namespace operators
-}  // namespace paddle
+}  // namespace phi
diff --git a/paddle/phi/kernels/impl/segment_pool_grad_kernel_impl.h b/paddle/phi/kernels/impl/segment_pool_grad_kernel_impl.h
new file mode 100644
index 0000000000000000000000000000000000000000..4ba1a0c6b6c0fe37c6bea0a39016311d9b7cd5ff
--- /dev/null
+++ b/paddle/phi/kernels/impl/segment_pool_grad_kernel_impl.h
@@ -0,0 +1,51 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <string>
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/phi/common/place.h"
+#include "paddle/phi/kernels/funcs/math_function.h"
+#include "paddle/phi/kernels/funcs/segment_pooling.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void SegmentPoolGradKernel(const Context& dev_ctx,
+                           const DenseTensor& x,
+                           const DenseTensor& segment_ids,
+                           const DenseTensor& out,
+                           paddle::optional<const DenseTensor&> summed_ids,
+                           const DenseTensor& out_grad,
+                           const std::string& pooltype,
+                           DenseTensor* x_grad) {
+  dev_ctx.template Alloc<T>(x_grad);
+  phi::funcs::SetConstant<Context, T> set_zero;
+  set_zero(dev_ctx, x_grad, static_cast<T>(0));
+
+  auto index_type = segment_ids.type();
+  if (index_type == DataType::INT32) {
+    phi::funcs::SegmentPoolGradFunctor<Context, T, int> pool;
+    pool(dev_ctx, x, out, out_grad, segment_ids, x_grad, summed_ids, pooltype);
+  } else if (index_type == DataType::INT64) {
+    phi::funcs::SegmentPoolGradFunctor<Context, T, int64_t> pool;
+    pool(dev_ctx, x, out, out_grad, segment_ids, x_grad, summed_ids, pooltype);
+  } else {
+    PADDLE_THROW(phi::errors::InvalidArgument(
+        "Unsupported index type, Expected int, int64, but got %s.",
+        index_type));
+  }
+}
+}  // namespace phi
diff --git a/paddle/phi/kernels/impl/segment_pool_kernel_impl.h b/paddle/phi/kernels/impl/segment_pool_kernel_impl.h
new file mode 100644
index 0000000000000000000000000000000000000000..8a6df37ab3e35cacb4313f242ebb82e8d9061c41
--- /dev/null
+++ b/paddle/phi/kernels/impl/segment_pool_kernel_impl.h
@@ -0,0 +1,142 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <string>
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/phi/common/place.h"
+#include "paddle/phi/kernels/funcs/math_function.h"
+#include "paddle/phi/kernels/funcs/segment_pooling.h"
+
+namespace phi {
+
+template <typename Context, typename T, typename IndexT>
+void SegmentKernelLaunchHelper(const Context& dev_ctx,
+                               const DenseTensor& x,
+                               const DenseTensor& segment_ids,
+                               const std::string& pooltype,
+                               DenseTensor* out,
+                               DenseTensor* summed_ids) {
+  int64_t num_indices = segment_ids.numel();
+  PADDLE_ENFORCE_EQ(
+      num_indices,
+      x.dims()[0],
+      phi::errors::InvalidArgument(
+          "Segment_ids should be the same size as dimension 0 of input X."));
+  PADDLE_ENFORCE_EQ(num_indices,
+                    segment_ids.dims()[0],
+                    phi::errors::InvalidArgument(
+                        "Segment_ids should be 1-D tensor, or it's other "
+                        "dimension size is 1. Segment_ids's shape is: [%s].",
+                        segment_ids.dims()));
+
+  if (x.numel() == 0 || segment_ids.numel() == 0) {
+    return;
+  }
+
+  bool cpu_place = dev_ctx.GetPlace().GetType() == phi::AllocationType::CPU;
+  if (cpu_place) {
+    auto dims = x.dims();
+    auto* segment_ids_ptr = segment_ids.data<IndexT>();
+    dims[0] =
+        static_cast<int64_t>(segment_ids_ptr[segment_ids.numel() - 1] + 1);
+    PADDLE_ENFORCE_GT(
+        dims[0],
+        0,
+        phi::errors::InvalidArgument(
+            "Segment ids must be >= 0, but got last id %d", dims[0]));
+
+    out->Resize({dims});
+    dev_ctx.template Alloc<T>(out);
+
+    phi::funcs::SetConstant<Context, T> set_zero;
+    set_zero(dev_ctx, out, static_cast<T>(0));
+  }
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+  if (!cpu_place) {
+    DenseTensor length;
+    length.Resize(phi::make_ddim({1}));
+    IndexT* length_data = dev_ctx.template HostAlloc<IndexT>(&length);
+
+    const IndexT* segment_ids_ptr = segment_ids.data<IndexT>();
+
+#ifdef PADDLE_WITH_HIP
+    PADDLE_ENFORCE_GPU_SUCCESS(hipMemcpy(length_data,
+                                         segment_ids_ptr + num_indices - 1,
+                                         sizeof(IndexT),
+                                         hipMemcpyDeviceToHost));
+#else
+    PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpy(length_data,
+                                          segment_ids_ptr + num_indices - 1,
+                                          sizeof(IndexT),
+                                          cudaMemcpyDeviceToHost));
+#endif
+
+    IndexT length_host = length_data[0];
+    length_host++;
+    PADDLE_ENFORCE_GT(
+        length_host,
+        0,
+        phi::errors::InvalidArgument(
+            "Segment ids must be >= 0, but got last id %d", length_data[0]));
+    auto dims = x.dims();
+    dims[0] = static_cast<int64_t>(length_host);
+    out->Resize({dims});
+    dev_ctx.template Alloc<T>(out);
+
+    T init_value = 0;
+    if (pooltype == "MAX") {
+      init_value = static_cast<T>(-FLT_MAX);
+    } else if (pooltype == "MIN") {
+      init_value = static_cast<T>(FLT_MAX);
+    }
+    phi::funcs::SetConstant<Context, T> setconst;
+    setconst(dev_ctx, out, static_cast<T>(init_value));
+    // the gpu kernel of mean pool record the counts of segment_ids
+    if (pooltype == "MEAN") {
+      summed_ids->Resize({dims[0], 1});
+      dev_ctx.template Alloc<T>(summed_ids);
+      setconst(dev_ctx, summed_ids, static_cast<T>(1e-12));
+    }
+  }
+#endif
+
+  phi::funcs::SegmentPoolFunctor<Context, T, IndexT> pool;
+
+  pool(dev_ctx, x, segment_ids, out, summed_ids, pooltype);
+}
+
+template <typename T, typename Context>
+void SegmentPoolKernel(const Context& dev_ctx,
+                       const DenseTensor& x,
+                       const DenseTensor& segment_ids,
+                       const std::string& pooltype,
+                       DenseTensor* out,
+                       DenseTensor* summed_ids) {
+  auto index_type = segment_ids.dtype();
+  if (index_type == DataType::INT32) {
+    SegmentKernelLaunchHelper<Context, T, int>(
+        dev_ctx, x, segment_ids, pooltype, out, summed_ids);
+  } else if (index_type == DataType::INT64) {
+    SegmentKernelLaunchHelper<Context, T, int64_t>(
+        dev_ctx, x, segment_ids, pooltype, out, summed_ids);
+  } else {
+    PADDLE_THROW(phi::errors::InvalidArgument(
+        "Unsupported index type, Expected int, int64, but got %s.",
+        index_type));
+  }
+}
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/impl/set_value_grad_kernel_impl.h b/paddle/phi/kernels/impl/set_value_grad_kernel_impl.h
new file mode 100644
index 0000000000000000000000000000000000000000..4947170088cba9701ad1065098451b97139bfc95
--- /dev/null
+++ b/paddle/phi/kernels/impl/set_value_grad_kernel_impl.h
@@ -0,0 +1,344 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/common/scalar_array.h"
+#include "paddle/phi/core/dense_tensor.h"
+
+#include "paddle/phi/kernels/copy_kernel.h"
+#include "paddle/phi/kernels/full_kernel.h"
+#include "paddle/phi/kernels/funcs/eigen/common.h"
+#include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
+#include "paddle/phi/kernels/funcs/math_function.h"
+
+#include "paddle/fluid/operators/strided_slice_op.h"
+
+namespace phi {
+
+inline void GetOffsets(const DDim& big_dim,
+                       const DDim& small_dim,
+                       DDim start_offset,
+                       int cur_dim,
+                       std::vector<DDim>* offsets) {
+  if (cur_dim == big_dim.size()) {
+    offsets->push_back(start_offset);
+    return;
+  }
+  if (small_dim[cur_dim] == big_dim[cur_dim]) {
+    GetOffsets(big_dim, small_dim, start_offset, cur_dim + 1, offsets);
+  } else {
+    for (int i = 0; i < big_dim[cur_dim]; i++) {
+      GetOffsets(big_dim, small_dim, start_offset, cur_dim + 1, offsets);
+      start_offset[cur_dim] += 1;
+    }
+  }
+}
+
+template <typename T, typename Context, size_t RANK>
+void SetValueGradImpl(const Context& dev_ctx,
+                      const DenseTensor& out_grad,
+                      const ScalarArray& starts,
+                      const ScalarArray& ends,
+                      const ScalarArray& steps,
+                      const std::vector<int64_t>& axes,
+                      const std::vector<int64_t>& decrease_axes,
+                      const std::vector<int64_t>& none_axes,
+                      DenseTensor* x_grad,
+                      DenseTensor* value_grad) {
+  PADDLE_ENFORCE_EQ(
+      out_grad.IsInitialized(),
+      true,
+      errors::PermissionDenied(
+          "The input of `set_value_grad`(out_grad) has not been initialized"));
+
+  auto in_dims = out_grad.dims();
+
+  std::vector<int> decrease_axis_int32(decrease_axes.begin(),
+                                       decrease_axes.end());
+  std::vector<int> axes_int32(axes.begin(), axes.end());
+  std::vector<int> infer_flags(axes.size(), 1);
+  std::vector<int64_t> out_dims_vector(in_dims.size(), -1);
+  std::vector<int64_t> starts_local = starts.GetData();
+  std::vector<int64_t> ends_local = ends.GetData();
+  std::vector<int64_t> steps_local = steps.GetData();
+  paddle::operators::StridedSliceOutDims(starts_local,
+                                         ends_local,
+                                         steps_local,
+                                         axes_int32,
+                                         infer_flags,
+                                         in_dims,
+                                         decrease_axis_int32,
+                                         out_dims_vector.data(),
+                                         axes.size(),
+                                         false);
+
+  DDim out_dims(phi::make_ddim(out_dims_vector));
+
+  std::vector<int> reverse_vector(starts_local.size(), 0);
+  paddle::operators::StridedSliceFunctor(starts_local.data(),
+                                         ends_local.data(),
+                                         steps_local.data(),
+                                         axes_int32.data(),
+                                         reverse_vector.data(),
+                                         in_dims,
+                                         infer_flags,
+                                         decrease_axis_int32,
+                                         starts_local.size());
+
+  auto starts_indices = Eigen::DSizes<Eigen::DenseIndex, RANK>();
+  auto ends_indices = Eigen::DSizes<Eigen::DenseIndex, RANK>();
+  auto steps_indices = Eigen::DSizes<Eigen::DenseIndex, RANK>();
+  auto reverse_axis = Eigen::array<bool, RANK>();
+
+  for (size_t axis = 0; axis < RANK; axis++) {
+    starts_indices[axis] = 0;
+    ends_indices[axis] = out_dims[axis];
+    steps_indices[axis] = 1;
+    reverse_axis[axis] = false;
+  }
+
+  for (size_t axis = 0; axis < axes.size(); axis++) {
+    int axis_index = axes[axis];
+    starts_indices[axis_index] = starts_local[axis];
+    ends_indices[axis_index] = ends_local[axis];
+    steps_indices[axis_index] = steps_local[axis];
+    reverse_axis[axis_index] = (reverse_vector[axis] == 1) ? true : false;
+  }
+
+  bool need_reverse = false;
+  for (size_t axis = 0; axis < axes.size(); axis++) {
+    if (reverse_vector[axis] == 1) {
+      need_reverse = true;
+      break;
+    }
+  }
+
+  auto& place = *dev_ctx.eigen_device();
+  phi::funcs::SetConstant<Context, T> set_zero;
+
+  if (x_grad) {
+    // Set gradient of `Input`
+    Copy(dev_ctx, out_grad, dev_ctx.GetPlace(), false, x_grad);
+
+    auto x_grad_t =
+        EigenTensor<T, RANK, Eigen::RowMajor, Eigen::DenseIndex>::From(*x_grad);
+
+    DenseTensor tmp = Full<T>(dev_ctx, out_dims_vector, static_cast<T>(0));
+    auto tmp_t =
+        EigenTensor<T, RANK, Eigen::RowMajor, Eigen::DenseIndex>::From(tmp);
+
+    x_grad_t.stridedSlice(starts_indices, ends_indices, steps_indices)
+        .device(place) = tmp_t;
+  }
+  if (value_grad) {
+    dev_ctx.template Alloc<T>(value_grad);
+    set_zero(dev_ctx, value_grad, static_cast<T>(0));
+
+    auto in_t = EigenTensor<T, RANK, Eigen::RowMajor, Eigen::DenseIndex>::From(
+        out_grad);
+
+    if (value_grad->dims() == out_dims) {
+      auto value_grad_t =
+          EigenTensor<T, RANK, Eigen::RowMajor, Eigen::DenseIndex>::From(
+              *value_grad);
+      if (need_reverse) {
+        DenseTensor tmp = Full<T>(dev_ctx, out_dims_vector, static_cast<T>(0));
+        auto tmp_t =
+            EigenTensor<T, RANK, Eigen::RowMajor, Eigen::DenseIndex>::From(tmp);
+
+        tmp_t.device(place) =
+            in_t.stridedSlice(starts_indices, ends_indices, steps_indices);
+        value_grad_t.device(place) = tmp_t.reverse(reverse_axis);
+      } else {
+        value_grad_t.device(place) =
+            in_t.stridedSlice(starts_indices, ends_indices, steps_indices);
+      }
+    } else {
+      int out_dims_size = out_dims.size();
+      auto value_grad_dims = value_grad->dims();
+      auto fake_value_grad_dims = out_dims;
+
+      // Create an extented shape according to the rules of broadcast.
+      auto value_grad_dims_size = value_grad_dims.size();
+
+      int num_decrease = 0;
+
+      int decrease_axis_size = decrease_axes.size();
+      for (int i = 0; i < out_dims_size; i++) {
+        if (decrease_axes.end() !=
+            std::find(decrease_axes.begin(), decrease_axes.end(), i)) {
+          fake_value_grad_dims[i] = 1;
+          num_decrease++;
+        } else if (i < out_dims_size - (value_grad_dims_size +
+                                        decrease_axis_size - num_decrease)) {
+          fake_value_grad_dims[i] = 1;
+        } else {
+          auto index_grad =
+              i - (out_dims_size -
+                   (value_grad_dims_size + decrease_axis_size - num_decrease));
+          fake_value_grad_dims[i] = value_grad_dims[index_grad];
+
+          PADDLE_ENFORCE_EQ((out_dims[i] == value_grad_dims[index_grad]) ||
+                                (value_grad_dims[index_grad] == 1),
+                            true,
+                            errors::InvalidArgument(
+                                "An error occurred while calculating %s: "
+                                "[%s] can not be accumulated into [%s].",
+                                paddle::framework::GradVarName("ValueTensor"),
+                                out_dims,
+                                value_grad_dims));
+        }
+      }
+
+      VLOG(3) << "Dimensions of "
+              << paddle::framework::GradVarName("ValueTensor") << "(["
+              << value_grad_dims << "])is broadcasted into ["
+              << fake_value_grad_dims << "].";
+
+      auto extent = Eigen::DSizes<Eigen::DenseIndex, RANK>();
+      auto offset = out_dims;
+      for (int i = 0; i < out_dims_size; i++) {
+        offset[i] = 0;
+        extent[i] = fake_value_grad_dims[i];
+      }
+      std::vector<DDim> offsets;
+      GetOffsets(out_dims, fake_value_grad_dims, offset, 0, &offsets);
+
+      auto value_grad_t =
+          EigenTensor<T, RANK, Eigen::RowMajor, Eigen::DenseIndex>::From(
+              *value_grad, fake_value_grad_dims);
+
+      DenseTensor tmp = Full<T>(dev_ctx, out_dims_vector, static_cast<T>(0));
+      auto tmp_t =
+          EigenTensor<T, RANK, Eigen::RowMajor, Eigen::DenseIndex>::From(tmp);
+
+      tmp_t.device(place) =
+          in_t.stridedSlice(starts_indices, ends_indices, steps_indices);
+
+      // accumulate gradient
+      for (auto offset : offsets) {
+        value_grad_t.device(place) =
+            value_grad_t + tmp_t.slice(EigenDim<RANK>::From(offset), extent);
+      }
+      if (need_reverse) {
+        DenseTensor tmp_value =
+            Full<T>(dev_ctx,
+                    {fake_value_grad_dims.Get(), fake_value_grad_dims.size()},
+                    static_cast<T>(0));
+        auto tmp_value_t =
+            EigenTensor<T, RANK, Eigen::RowMajor, Eigen::DenseIndex>::From(
+                tmp_value);
+        tmp_value_t.device(place) = value_grad_t.reverse(reverse_axis);
+        value_grad_t.device(place) = tmp_value_t;
+      }
+    }
+  }
+}
+
+template <typename T, typename Context>
+void SetValueGradKernel(const Context& dev_ctx,
+                        const DenseTensor& out_grad,
+                        const ScalarArray& starts,
+                        const ScalarArray& ends,
+                        const ScalarArray& steps,
+                        const std::vector<int64_t>& axes,
+                        const std::vector<int64_t>& decrease_axes,
+                        const std::vector<int64_t>& none_axes,
+                        DenseTensor* x_grad,
+                        DenseTensor* value_grad) {
+  const int rank = out_grad.dims().size();
+
+  switch (rank) {
+    case 1:
+      SetValueGradImpl<T, Context, 1>(dev_ctx,
+                                      out_grad,
+                                      starts,
+                                      ends,
+                                      steps,
+                                      axes,
+                                      decrease_axes,
+                                      none_axes,
+                                      x_grad,
+                                      value_grad);
+      break;
+    case 2:
+      SetValueGradImpl<T, Context, 2>(dev_ctx,
+                                      out_grad,
+                                      starts,
+                                      ends,
+                                      steps,
+                                      axes,
+                                      decrease_axes,
+                                      none_axes,
+                                      x_grad,
+                                      value_grad);
+      break;
+    case 3:
+      SetValueGradImpl<T, Context, 3>(dev_ctx,
+                                      out_grad,
+                                      starts,
+                                      ends,
+                                      steps,
+                                      axes,
+                                      decrease_axes,
+                                      none_axes,
+                                      x_grad,
+                                      value_grad);
+      break;
+    case 4:
+      SetValueGradImpl<T, Context, 4>(dev_ctx,
+                                      out_grad,
+                                      starts,
+                                      ends,
+                                      steps,
+                                      axes,
+                                      decrease_axes,
+                                      none_axes,
+                                      x_grad,
+                                      value_grad);
+      break;
+    case 5:
+      SetValueGradImpl<T, Context, 5>(dev_ctx,
+                                      out_grad,
+                                      starts,
+                                      ends,
+                                      steps,
+                                      axes,
+                                      decrease_axes,
+                                      none_axes,
+                                      x_grad,
+                                      value_grad);
+      break;
+    case 6:
+      SetValueGradImpl<T, Context, 6>(dev_ctx,
+                                      out_grad,
+                                      starts,
+                                      ends,
+                                      steps,
+                                      axes,
+                                      decrease_axes,
+                                      none_axes,
+                                      x_grad,
+                                      value_grad);
+      break;
+    default:
+      PADDLE_THROW(phi::errors::InvalidArgument(
+          "The rank of set_value_grad's input should be less than 7, but "
+          "received %d.",
+          rank));
+  }
+}
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/impl/set_value_kernel_impl.h b/paddle/phi/kernels/impl/set_value_kernel_impl.h
new file mode 100644
index 0000000000000000000000000000000000000000..99db559f3b8166258a80814859c3296933634db8
--- /dev/null
+++ b/paddle/phi/kernels/impl/set_value_kernel_impl.h
@@ -0,0 +1,336 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/common/scalar.h"
+#include "paddle/phi/common/scalar_array.h"
+#include "paddle/phi/core/dense_tensor.h"
+
+#include "paddle/phi/kernels/copy_kernel.h"
+#include "paddle/phi/kernels/empty_kernel.h"
+#include "paddle/phi/kernels/funcs/broadcast_function.h"
+#include "paddle/phi/kernels/funcs/eigen/common.h"
+#include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
+#include "paddle/phi/kernels/funcs/elementwise_functor.h"
+
+#include "paddle/fluid/operators/slice_utils.h"
+
+namespace phi {
+
+// check whether the tensor with dimension of second can assign to the
+// tensor with dimension of first
+inline void CheckIsDimsMatch(const DDim& first, const DDim& second) {
+  int ignore_axis1 = 0, ignore_axis2 = 0;
+  for (; ignore_axis1 < first.size(); ++ignore_axis1) {
+    if (first[ignore_axis1] != 1) {
+      break;
+    }
+  }
+  for (; ignore_axis2 < second.size(); ++ignore_axis2) {
+    if (second[ignore_axis2] != 1) {
+      break;
+    }
+  }
+
+  if (second.size() == ignore_axis2) {
+    // second tensor has only one value
+    return;
+  }
+
+  if (first.size() - ignore_axis1 >= second.size() - ignore_axis2) {
+    auto idx1 = first.size() - 1;
+    auto idx2 = second.size() - 1;
+    bool is_match = true;
+    for (; idx2 >= ignore_axis2; idx2--) {
+      if (first[idx1--] != second[idx2] && second[idx2] != 1) {
+        is_match = false;
+        break;
+      }
+    }
+    if (is_match) {
+      return;
+    }
+  }
+  PADDLE_THROW(errors::InvalidArgument(
+      "The shape of tensor assigned value must match the shape "
+      "of target shape: %d, but now shape is %d.",
+      second.to_str(),
+      first.to_str()));
+}
+
+template <typename T, typename Context, size_t RANK>
+void SetValueImpl(const Context& dev_ctx,
+                  const DenseTensor& in,
+                  const DenseTensor& value,
+                  const ScalarArray& starts,
+                  const ScalarArray& ends,
+                  const ScalarArray& steps,
+                  const std::vector<int64_t>& axes,
+                  const std::vector<int64_t>& decrease_axes,
+                  const std::vector<int64_t>& none_axes,
+                  DenseTensor* out) {
+  auto in_dims = in.dims();
+  std::vector<int64_t> starts_local = starts.GetData();
+  std::vector<int64_t> ends_local = ends.GetData();
+  std::vector<int64_t> steps_local = steps.GetData();
+  paddle::operators::CheckAndUpdateSliceAttrs(
+      in_dims, axes, &starts_local, &ends_local, &steps_local);
+  auto slice_dims = paddle::operators::GetSliceDims(
+      in_dims, axes, starts_local, ends_local, &steps_local);
+  auto decrease_slice_dims =
+      paddle::operators::GetDecreasedDims(slice_dims, decrease_axes);
+
+  auto slice_dims_for_assign = decrease_slice_dims;
+  if (!none_axes.empty()) {
+    std::vector<int64_t> slice_dims_with_none;
+
+    size_t none_axes_cur = 0, decrease_axes_cur = 0;
+    for (int i = 0; i < slice_dims.size(); ++i) {
+      while (none_axes_cur < none_axes.size() &&
+             none_axes[none_axes_cur] <= i) {
+        slice_dims_with_none.push_back(1);
+        none_axes_cur++;
+      }
+      if (decrease_axes_cur < decrease_axes.size() &&
+          decrease_axes[decrease_axes_cur] == i) {
+        decrease_axes_cur++;
+      } else {
+        slice_dims_with_none.push_back(slice_dims[i]);
+      }
+    }
+    while (none_axes_cur < none_axes.size()) {
+      slice_dims_with_none.push_back(1);
+      none_axes_cur++;
+    }
+
+    slice_dims_for_assign = phi::make_ddim(slice_dims_with_none);
+  }
+
+  auto place = dev_ctx.GetPlace();
+  auto& eigen_place = *dev_ctx.eigen_device();
+
+  // Here copy data from input to avoid data loss at PE and Graph level.
+  // TODO(liym27): Speed up in the future version.
+  // - Q: Why don't call ShareDataWith to speed up?
+  // - A: Because it's not supported to ShareDataWith on OP's input and output
+  // https://github.com/PaddlePaddle/Paddle/wiki/ShareDataWith-and-ShareBufferWith-are-prohibited-in-OP
+  // - Q: Why don't delete Input, after all, the input and output are the same
+  // Tensor at program level?
+  // - A: If deleting Input, the graph will be complex, such as there will
+  // be two ops points to the output in graph: op1 -> output <- set_value.
+  // In this case, we have to find a way to handle the running order of
+  // set_value is what we want.
+  Copy(dev_ctx, in, place, false, out);
+
+  DenseTensor slice_tensor =
+      Empty<T>(dev_ctx, ScalarArray{slice_dims.Get(), slice_dims.size()});
+  DenseTensor pad_tensor =
+      Empty<T>(dev_ctx, ScalarArray{in_dims.Get(), in_dims.size()});
+
+  auto pad_e = EigenTensor<T, RANK>::From(pad_tensor, in_dims);
+  auto out_e = EigenTensor<T, RANK>::From(*out);
+  auto slice_e = EigenTensor<T, RANK>::From(slice_tensor, slice_dims);
+
+  // Step 1: Set the value of out at `_index` to zero
+  slice_e.device(eigen_place) = slice_e.constant(T(0));
+
+  auto starts_indices = Eigen::DSizes<Eigen::DenseIndex, RANK>();
+  auto ends_indices = Eigen::DSizes<Eigen::DenseIndex, RANK>();
+  auto strides_indices = Eigen::DSizes<Eigen::DenseIndex, RANK>();
+
+  for (size_t i = 0; i < RANK; ++i) {
+    starts_indices[i] = 0;
+    ends_indices[i] = slice_dims[i];
+    strides_indices[i] = 1;
+  }
+  for (size_t i = 0; i < axes.size(); i++) {
+    int axis_index = axes[i];
+    starts_indices[axis_index] = starts_local[i];
+    ends_indices[axis_index] = ends_local[i];
+    strides_indices[axis_index] = steps_local[i];
+    if (starts_local[i] ==
+        ends_local[i]) {  // slice is empty, data will not be changed
+      return;
+    }
+  }
+
+  out_e.stridedSlice(starts_indices, ends_indices, strides_indices)
+      .device(eigen_place) = slice_e;
+
+  // Step 2: Set a tensor with the same shape as out tensor. And its data at
+  // '_index' is the same as value, and data out of '_index' to zero
+
+  // - Step 2.1 Set slice tensor with value
+
+  // NOTE(liym27): [ Why resize slice_tensor here? ]
+  // A: When do broadcasting on slice_tensor and value, the shape of
+  // slice_tensor should be decreased dims.
+  // e.g.
+  //  x[:,0] = value
+  // x's shape = [3, 4], value's shape = [3]
+  // We get slice_dims = [3, 1],  decrease_slice_dims = [3]
+  // If do broadcasting on Tensor with shape [3, 1] and [3], the result's
+  // shape is [3, 3], which cross the border;
+  // If do broadcasting on Tensor with shape [3] and [3], the result's shape
+  // is [3], which is right.
+
+  slice_tensor.Resize(slice_dims_for_assign);
+  CheckIsDimsMatch(slice_dims_for_assign, value.dims());
+  // ElementwiseComputeEx can do broadcasting
+  funcs::ElementwiseCompute<funcs::SubtractFunctor<T>, T>(
+      dev_ctx,
+      slice_tensor,
+      value,
+      -1,
+      funcs::SubtractFunctor<T>(),
+      &slice_tensor);
+
+  slice_tensor.Resize(slice_dims);
+
+  // - Step 2.2 Pad slice tensor with 0
+  pad_e.device(eigen_place) = pad_e.constant(T(0));
+  pad_e.stridedSlice(starts_indices, ends_indices, strides_indices)
+      .device(eigen_place) = slice_e;
+
+  // Step 3: Set out tensor with value
+  out_e.device(eigen_place) = out_e - pad_e;
+}
+
+template <typename T, typename Context>
+void SetTensorValueKernel(const Context& dev_ctx,
+                          const DenseTensor& x,
+                          const DenseTensor& value,
+                          const ScalarArray& starts,
+                          const ScalarArray& ends,
+                          const ScalarArray& steps,
+                          const std::vector<int64_t>& axes,
+                          const std::vector<int64_t>& decrease_axes,
+                          const std::vector<int64_t>& none_axes,
+                          DenseTensor* out) {
+  const int rank = x.dims().size();
+
+  switch (rank) {
+    case 1:
+      SetValueImpl<T, Context, 1>(dev_ctx,
+                                  x,
+                                  value,
+                                  starts,
+                                  ends,
+                                  steps,
+                                  axes,
+                                  decrease_axes,
+                                  none_axes,
+                                  out);
+      break;
+    case 2:
+      SetValueImpl<T, Context, 2>(dev_ctx,
+                                  x,
+                                  value,
+                                  starts,
+                                  ends,
+                                  steps,
+                                  axes,
+                                  decrease_axes,
+                                  none_axes,
+                                  out);
+      break;
+    case 3:
+      SetValueImpl<T, Context, 3>(dev_ctx,
+                                  x,
+                                  value,
+                                  starts,
+                                  ends,
+                                  steps,
+                                  axes,
+                                  decrease_axes,
+                                  none_axes,
+                                  out);
+      break;
+    case 4:
+      SetValueImpl<T, Context, 4>(dev_ctx,
+                                  x,
+                                  value,
+                                  starts,
+                                  ends,
+                                  steps,
+                                  axes,
+                                  decrease_axes,
+                                  none_axes,
+                                  out);
+      break;
+    case 5:
+      SetValueImpl<T, Context, 5>(dev_ctx,
+                                  x,
+                                  value,
+                                  starts,
+                                  ends,
+                                  steps,
+                                  axes,
+                                  decrease_axes,
+                                  none_axes,
+                                  out);
+      break;
+    case 6:
+      SetValueImpl<T, Context, 6>(dev_ctx,
+                                  x,
+                                  value,
+                                  starts,
+                                  ends,
+                                  steps,
+                                  axes,
+                                  decrease_axes,
+                                  none_axes,
+                                  out);
+      break;
+    default:
+      PADDLE_THROW(errors::InvalidArgument(
+          "The rank of input should be less than 7, but received %d.", rank));
+  }
+}
+
+template <typename T, typename Context>
+void SetValueKernel(const Context& dev_ctx,
+                    const DenseTensor& x,
+                    const ScalarArray& starts,
+                    const ScalarArray& ends,
+                    const ScalarArray& steps,
+                    const std::vector<int64_t>& axes,
+                    const std::vector<int64_t>& decrease_axes,
+                    const std::vector<int64_t>& none_axes,
+                    const std::vector<int64_t>& shape,
+                    const std::vector<Scalar>& values,
+                    DenseTensor* out) {
+  std::vector<T> assgin_values;
+  assgin_values.reserve(values.size());
+  for (const auto& val : values) {
+    assgin_values.push_back(val.to<T>());
+  }
+  DenseTensor value_tensor = Empty<T>(dev_ctx, shape);
+  paddle::framework::TensorFromVector(assgin_values, dev_ctx, &value_tensor);
+  value_tensor.Resize(phi::make_ddim(shape));
+
+  SetTensorValueKernel<T, Context>(dev_ctx,
+                                   x,
+                                   value_tensor,
+                                   starts,
+                                   ends,
+                                   steps,
+                                   axes,
+                                   decrease_axes,
+                                   none_axes,
+                                   out);
+}
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/impl/shape_kernel_impl.h b/paddle/phi/kernels/impl/shape_kernel_impl.h
new file mode 100644
index 0000000000000000000000000000000000000000..982cfb33f6b14fc14c7c58ff8c4548a4cdbd3b3b
--- /dev/null
+++ b/paddle/phi/kernels/impl/shape_kernel_impl.h
@@ -0,0 +1,36 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void ShapeKernel(const Context& ctx,
+                 const DenseTensor& input,
+                 DenseTensor* out) {
+  auto in_var = &input;
+  phi::DDim in_dims;
+  in_dims = in_var->dims();
+  auto out_t = out;
+  out_t->Resize({in_dims.size()});
+  auto out_data = ctx.template HostAlloc<int32_t>(out_t);
+  for (int i = 0; i < in_dims.size(); ++i) {
+    out_data[i] = in_dims[i];
+  }
+}
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/impl/softmax_kernel_impl.h b/paddle/phi/kernels/impl/softmax_kernel_impl.h
index 6552f6ed581f45008f01c02fad3c007bf3664942..7aa43fdb7f27056d5cb4c2947e2764bd8868ff02 100644
--- a/paddle/phi/kernels/impl/softmax_kernel_impl.h
+++ b/paddle/phi/kernels/impl/softmax_kernel_impl.h
@@ -22,10 +22,10 @@ limitations under the License. */
 namespace phi {
 
 template <typename T, typename Context>
-void SoftmaxRawKernel(const Context& dev_ctx,
-                      const DenseTensor& x,
-                      int axis,
-                      DenseTensor* out) {
+void SoftmaxKernel(const Context& dev_ctx,
+                   const DenseTensor& x,
+                   int axis,
+                   DenseTensor* out) {
   const int rank = x.dims().size();
   const int calc_axis = phi::funcs::CanonicalAxis(axis, rank);
   int axis_dim = x.dims()[calc_axis];
diff --git a/paddle/phi/kernels/impl/tile_grad_kernel_impl.h b/paddle/phi/kernels/impl/tile_grad_kernel_impl.h
new file mode 100644
index 0000000000000000000000000000000000000000..a2c2720244fe8a714e0e866dd6178a1a238728b7
--- /dev/null
+++ b/paddle/phi/kernels/impl/tile_grad_kernel_impl.h
@@ -0,0 +1,147 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <type_traits>
+#include <vector>
+
+#include "paddle/fluid/framework/tensor_util.h"
+#include "paddle/phi/kernels/funcs/eigen/common.h"
+#include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
+#include "paddle/phi/kernels/tile_grad_kernel.h"
+
+namespace phi {
+
+template <typename Context, typename T, int Dims>
+void TileBackward(const Context& dev_ctx,
+                  const DenseTensor& out_grad,
+                  const std::vector<int>& reshape_dims_vec,
+                  const std::vector<int>& reduce_dims_vec,
+                  DenseTensor* x_grad) {
+  size_t reshape_size = reshape_dims_vec.size();
+  size_t reduce_size = reduce_dims_vec.size();
+  dev_ctx.template Alloc<T>(x_grad);
+
+  auto eigen_x_grad = EigenVector<T>::Flatten(*x_grad);
+  Eigen::DSizes<Eigen::DenseIndex, Dims * 2> reshape_dims;
+  for (size_t i = 0; i < reshape_size; ++i) {
+    reshape_dims[i] = reshape_dims_vec[i];
+  }
+  Eigen::DSizes<Eigen::DenseIndex, Dims> reduce_dims;
+  for (size_t i = 0; i < reduce_size; ++i) {
+    reduce_dims[i] = reduce_dims_vec[i];
+  }
+
+  auto eigen_out_grad = EigenVector<T>::Flatten(out_grad);
+  auto& place = *dev_ctx.eigen_device();
+  funcs::EigenBroadcastGrad<std::decay_t<decltype(place)>, T, Dims>::Eval(
+      place, eigen_x_grad, eigen_out_grad, reduce_dims, reshape_dims);
+}
+
+template <typename T, typename Context>
+void TileGradKernel(const Context& dev_ctx,
+                    const DenseTensor& x,
+                    const DenseTensor& out_grad,
+                    const ScalarArray& repeat_times,
+                    DenseTensor* x_grad) {
+  auto x_dims = x.dims();
+  auto vec_x_dims = phi::vectorize<int>(x_dims);
+  auto repeat_times_data = repeat_times.GetData();
+  if (repeat_times_data.size() < vec_x_dims.size()) {
+    int diff = vec_x_dims.size() - repeat_times_data.size();
+    repeat_times_data.insert(repeat_times_data.begin(), diff, 1);
+  } else {
+    int diff = repeat_times_data.size() - vec_x_dims.size();
+    vec_x_dims.insert(vec_x_dims.begin(), diff, 1);
+  }
+  // 1. reshape_dims_vec is the broadcast parameter.
+  // 2. reduce_dims_vec is the dimension parameter to compute gradients. For
+  //    each dimension expanded, the gradients should be summed to original
+  //    size.
+  std::vector<int> reshape_dims_vec;
+  std::vector<int> reduce_dims_vec;
+  for (size_t i = 0; i < repeat_times_data.size(); ++i) {
+    reduce_dims_vec.push_back(reshape_dims_vec.size());
+    reshape_dims_vec.push_back(repeat_times_data[i]);
+    reshape_dims_vec.push_back(vec_x_dims[i]);
+  }
+
+  int dims = reduce_dims_vec.size();
+
+  bool just_copy = true;
+  for (size_t i = 0; i < repeat_times_data.size(); i++) {
+    if (repeat_times_data[i] != 1) {
+      just_copy = false;
+      break;
+    }
+  }
+  // no need reduce, just copy
+  if (just_copy) {
+    dev_ctx.template Alloc<T>(x_grad);
+
+    paddle::framework::TensorCopy(
+        out_grad, dev_ctx.GetPlace(), dev_ctx, x_grad);
+    // TensorCopy may change the dims of dx
+    x_grad->Resize(x_dims);
+  } else {
+    PADDLE_ENFORCE_GE(dims,
+                      1,
+                      errors::InvalidArgument(
+                          "Th rank of the input 'Out@GRAD' for tile_grad op "
+                          " must be greater than or equal to 1, but "
+                          "the value received is %d.",
+                          dims));
+    PADDLE_ENFORCE_LE(dims,
+                      MAX_RANK_SUPPORTED,
+                      errors::InvalidArgument(
+                          "The rank of the input 'Out@GRAD' for tile_grad op "
+                          "must be less than or equal "
+                          "to %d, but the value received is %d.",
+                          MAX_RANK_SUPPORTED,
+                          dims));
+    switch (dims) {
+      case 1:
+        TileBackward<Context, T, 1>(
+            dev_ctx, out_grad, reshape_dims_vec, reduce_dims_vec, x_grad);
+        break;
+      case 2:
+        TileBackward<Context, T, 2>(
+            dev_ctx, out_grad, reshape_dims_vec, reduce_dims_vec, x_grad);
+        break;
+      case 3:
+        TileBackward<Context, T, 3>(
+            dev_ctx, out_grad, reshape_dims_vec, reduce_dims_vec, x_grad);
+        break;
+      case 4:
+        TileBackward<Context, T, 4>(
+            dev_ctx, out_grad, reshape_dims_vec, reduce_dims_vec, x_grad);
+        break;
+      case 5:
+        TileBackward<Context, T, 5>(
+            dev_ctx, out_grad, reshape_dims_vec, reduce_dims_vec, x_grad);
+        break;
+      case 6:
+        TileBackward<Context, T, 6>(
+            dev_ctx, out_grad, reshape_dims_vec, reduce_dims_vec, x_grad);
+        break;
+      default:
+        PADDLE_THROW(errors::InvalidArgument(
+            "Only support tensor with rank being between 1 and 6. But "
+            "received tensor's rank = %d.",
+            dims));
+    }
+  }
+}
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/impl/tile_kernel_impl.h b/paddle/phi/kernels/impl/tile_kernel_impl.h
new file mode 100644
index 0000000000000000000000000000000000000000..bafbbde4e680daa04a62ba9ab6b03778b4477107
--- /dev/null
+++ b/paddle/phi/kernels/impl/tile_kernel_impl.h
@@ -0,0 +1,117 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <type_traits>
+#include <vector>
+
+#include "paddle/phi/kernels/funcs/eigen/common.h"
+#include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
+#include "paddle/phi/kernels/tile_kernel.h"
+
+namespace phi {
+
+template <typename Context, typename T, int Rank>
+void Tile(const Context& dev_ctx,
+          const DenseTensor& x,
+          std::vector<int64_t> repeat_times,
+          DenseTensor* out) {
+  auto x_dims = x.dims();
+  for (size_t i = 0; i < repeat_times.size(); ++i) {
+    PADDLE_ENFORCE_GT(
+        repeat_times[i],
+        0,
+        errors::InvalidArgument(
+            "All elements of the input 'repeat_times' for tile op must "
+            "be positive integers, but the value received is %d.",
+            repeat_times[i]));
+  }
+  auto vec_x_dims = phi::vectorize<int>(x_dims);
+  if (repeat_times.size() < vec_x_dims.size()) {
+    int diff = vec_x_dims.size() - repeat_times.size();
+    repeat_times.insert(repeat_times.begin(), diff, 1);
+  } else {
+    int diff = repeat_times.size() - vec_x_dims.size();
+    vec_x_dims.insert(vec_x_dims.begin(), diff, 1);
+  }
+  PADDLE_ENFORCE_EQ(
+      repeat_times.size(),
+      vec_x_dims.size(),
+      errors::InvalidArgument(
+          "The rank (%d) of the input 'x' and the rank (%d) of the input "
+          "'repeat_times' for tile op must match after promotion.",
+          vec_x_dims.size(),
+          repeat_times.size()));
+
+  Eigen::DSizes<Eigen::DenseIndex, Rank> bcast_dims;
+  for (size_t i = 0; i < repeat_times.size(); ++i) {
+    bcast_dims[i] = repeat_times[i];
+  }
+
+  DDim new_x_dims = make_ddim(vec_x_dims);
+  DDim out_dims(new_x_dims);
+  for (size_t i = 0; i < repeat_times.size(); ++i) {
+    out_dims[i] *= repeat_times[i];
+  }
+
+  out->Resize(out_dims);
+  auto eigen_x = EigenTensor<T, Rank>::From(x, new_x_dims);
+  dev_ctx.template Alloc<T>(out);
+
+  auto eigen_out = EigenTensor<T, Rank>::From(*out, out_dims);
+  auto& place = *dev_ctx.eigen_device();
+  // use 32-bit index to speed up
+  bool use_32bit_index = eigen_out.size() < Eigen::NumTraits<int>::highest();
+  if (use_32bit_index) {
+    funcs::EigenBroadcast<std::decay_t<decltype(place)>, T, Rank>::Eval(
+        place, To32BitIndex(eigen_out), To32BitIndex(eigen_x), bcast_dims);
+  } else {
+    funcs::EigenBroadcast<std::decay_t<decltype(place)>, T, Rank>::Eval(
+        place, eigen_out, eigen_x, bcast_dims);
+  }
+}
+
+template <typename T, typename Context>
+void TileKernel(const Context& dev_ctx,
+                const DenseTensor& x,
+                const ScalarArray& repeat_times,
+                DenseTensor* out) {
+  auto rank = x.dims().size();
+  auto& repeat_times_data = repeat_times.GetData();
+  int repeat_times_size = repeat_times_data.size();
+  rank = std::max(rank, repeat_times_size);
+
+  switch (rank) {
+    case 1:
+      Tile<Context, T, 1>(dev_ctx, x, repeat_times_data, out);
+      break;
+    case 2:
+      Tile<Context, T, 2>(dev_ctx, x, repeat_times_data, out);
+      break;
+    case 3:
+      Tile<Context, T, 3>(dev_ctx, x, repeat_times_data, out);
+      break;
+    case 4:
+      Tile<Context, T, 4>(dev_ctx, x, repeat_times_data, out);
+      break;
+    case 5:
+      Tile<Context, T, 5>(dev_ctx, x, repeat_times_data, out);
+      break;
+    case 6:
+      Tile<Context, T, 6>(dev_ctx, x, repeat_times_data, out);
+      break;
+  }
+}
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/impl/triangular_solve_grad_kernel_impl.h b/paddle/phi/kernels/impl/triangular_solve_grad_kernel_impl.h
new file mode 100644
index 0000000000000000000000000000000000000000..9b1e4b1d3a65d5c0da831a36152cff85a3353fa3
--- /dev/null
+++ b/paddle/phi/kernels/impl/triangular_solve_grad_kernel_impl.h
@@ -0,0 +1,138 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/kernels/triangular_solve_grad_kernel.h"
+
+#include "paddle/phi/kernels/copy_kernel.h"
+#include "paddle/phi/kernels/empty_kernel.h"
+#include "paddle/phi/kernels/funcs/blas/blas.h"
+#include "paddle/phi/kernels/funcs/common_shape.h"
+#include "paddle/phi/kernels/funcs/complex_functors.h"
+#include "paddle/phi/kernels/funcs/matrix_reduce.h"
+#include "paddle/phi/kernels/triangular_solve_kernel.h"
+
+// See Note [ Why still include the fluid headers? ]
+#include "paddle/fluid/operators/tril_triu_op.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void TriangularSolveGradKernel(const Context& dev_ctx,
+                               const DenseTensor& x,
+                               const DenseTensor& y,
+                               const DenseTensor& out,
+                               const DenseTensor& dout,
+                               bool upper,
+                               bool transpose,
+                               bool unitriangular,
+                               DenseTensor* dx,
+                               DenseTensor* dy) {
+  std::vector<int64_t> x_bst_dims_vec;
+  std::vector<int64_t> y_bst_dims_vec;
+  std::tie(x_bst_dims_vec, y_bst_dims_vec) =
+      funcs::MatrixGetBroadcastDims(x, y);
+
+  ScalarArray y_bst_dims_array(y_bst_dims_vec);
+  DenseTensor dy_bst = phi::Empty<T, Context>(dev_ctx, y_bst_dims_array);
+  if (dy) {
+    // calculate x's conjugate for complex
+    DenseTensor x_conj;
+    x_conj.Resize(x.dims());
+
+    phi::funcs::ForRange<Context> x_for_range(dev_ctx, x.numel());
+    phi::funcs::ConjFunctor<T> x_functor(
+        x.data<T>(), x.numel(), dev_ctx.template Alloc<T>(&x_conj));
+    x_for_range(x_functor);
+
+    // reuse forward to get dy_bst, and the result has been broadcated already.
+    TriangularSolveKernel<T, Context>(
+        dev_ctx, x_conj, dout, upper, !transpose, unitriangular, &dy_bst);
+
+    dy->Resize(y.dims());
+    dev_ctx.template Alloc<T>(dy);
+    if (dy_bst.dims() == y.dims()) {
+      Copy<Context>(dev_ctx, dy_bst, dev_ctx.GetPlace(), false, dy);
+    } else {
+      funcs::MatrixReduceSumFunctor<T, Context> functor;
+      functor(dev_ctx, dy_bst, dy);
+      dy->Resize(y.dims());
+    }
+  }
+
+  ScalarArray x_bst_dims_array(x_bst_dims_vec);
+  DenseTensor dx_bst = phi::Empty<T, Context>(dev_ctx, x_bst_dims_array);
+  if (dx) {
+    // calculate x's conjugate for complex
+    DenseTensor out_conj;
+    out_conj.Resize(out.dims());
+
+    phi::funcs::ForRange<Context> out_for_range(dev_ctx, out.numel());
+    phi::funcs::ConjFunctor<T> out_functor(
+        out.data<T>(), out.numel(), dev_ctx.template Alloc<T>(&out_conj));
+    out_for_range(out_functor);
+
+    auto blas = phi::funcs::GetBlas<Context, T>(dev_ctx);
+    if (transpose) {
+      auto mat_dim_a =
+          phi::funcs::CreateMatrixDescriptor(out_conj.dims(), 0, false);
+      auto mat_dim_b =
+          phi::funcs::CreateMatrixDescriptor(dy_bst.dims(), 0, true);
+      blas.MatMul(out_conj,
+                  mat_dim_a,
+                  dy_bst,
+                  mat_dim_b,
+                  static_cast<T>(-1),
+                  &dx_bst,
+                  static_cast<T>(0));
+    } else {
+      auto mat_dim_a =
+          phi::funcs::CreateMatrixDescriptor(dy_bst.dims(), 0, false);
+      auto mat_dim_b =
+          phi::funcs::CreateMatrixDescriptor(out_conj.dims(), 0, true);
+      blas.MatMul(dy_bst,
+                  mat_dim_a,
+                  out_conj,
+                  mat_dim_b,
+                  static_cast<T>(-1),
+                  &dx_bst,
+                  static_cast<T>(0));
+    }
+
+    // get upper or lower triangular
+    DenseTensor dx_bst_upper =
+        phi::Empty<T, Context>(dev_ctx, x_bst_dims_array);
+
+    const auto& dims = dx_bst.dims();
+    const auto H = dims[dims.size() - 2];
+    const auto W = dims[dims.size() - 1];
+    phi::funcs::ForRange<Context> x_for_range(dev_ctx, dx_bst.numel());
+    paddle::operators::TrilTriuCompute<T> tril_triu_functor(
+        dx_bst.data<T>(), unitriangular, !upper, H, W, dx_bst_upper.data<T>());
+    x_for_range(tril_triu_functor);
+
+    dx->Resize(x.dims());
+    dev_ctx.template Alloc<T>(dx);
+    if (dx_bst.dims() == x.dims()) {
+      Copy<Context>(dev_ctx, dx_bst_upper, dev_ctx.GetPlace(), false, dx);
+    } else {
+      funcs::MatrixReduceSumFunctor<T, Context> functor;
+      functor(dev_ctx, dx_bst_upper, dx);
+      dx->Resize(x.dims());
+    }
+  }
+}
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/is_empty_kernel.cc b/paddle/phi/kernels/is_empty_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..26c2f978005f292ed6960017dbcd5a2f68d4c9b1
--- /dev/null
+++ b/paddle/phi/kernels/is_empty_kernel.cc
@@ -0,0 +1,53 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/is_empty_kernel.h"
+
+#include "paddle/phi/backends/all_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void IsEmptyKernel(const Context& dev_ctx,
+                   const DenseTensor& x,
+                   DenseTensor* out) {
+  // Note: is_empty is always executed on CPU and the output data should
+  // always be allocated for CPUPlace. We reigister CUDA kernel for this op to
+  // avoid the unnecessary data transform.
+  bool* out_data = dev_ctx.template HostAlloc<bool>(out);
+  out_data[0] = phi::product(x.dims()) == 0;
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(is_empty,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::IsEmptyKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t) {}
+
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+PD_REGISTER_KERNEL(is_empty,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::IsEmptyKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t) {}
+#endif
diff --git a/paddle/infrt/kernel/phi/allocator_kernels.h b/paddle/phi/kernels/is_empty_kernel.h
similarity index 79%
rename from paddle/infrt/kernel/phi/allocator_kernels.h
rename to paddle/phi/kernels/is_empty_kernel.h
index d10382f5e6014c2b04dab65c8439d99e4563aaef..3bcf6f9054ed50a81f9b237e754d7d8aed51353a 100644
--- a/paddle/infrt/kernel/phi/allocator_kernels.h
+++ b/paddle/phi/kernels/is_empty_kernel.h
@@ -14,15 +14,11 @@
 
 #pragma once
 
-#include "paddle/infrt/backends/host/phi_allocator.h"
 #include "paddle/phi/core/dense_tensor.h"
 
-namespace infrt {
-namespace kernel {
 namespace phi {
 
-backends::CpuPhiAllocator CreateCpuAllocator();
+template <typename T, typename Context>
+void IsEmptyKernel(const Context& ctx, const DenseTensor& x, DenseTensor* out);
 
 }  // namespace phi
-}  // namespace kernel
-}  // namespace infrt
diff --git a/paddle/phi/kernels/isclose_kernel.h b/paddle/phi/kernels/isclose_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..8c468da055082005568002952f695ebedda31c3c
--- /dev/null
+++ b/paddle/phi/kernels/isclose_kernel.h
@@ -0,0 +1,30 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/common/scalar.h"
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void IscloseKernel(const Context& dev_ctx,
+                   const DenseTensor& x,
+                   const DenseTensor& y,
+                   const Scalar& rtol,
+                   const Scalar& atol,
+                   bool equal_nan,
+                   DenseTensor* out);
+}  // namespace phi
diff --git a/paddle/fluid/operators/matrix_power_op.cu b/paddle/phi/kernels/isfinite_kernel.h
similarity index 54%
rename from paddle/fluid/operators/matrix_power_op.cu
rename to paddle/phi/kernels/isfinite_kernel.h
index d972e9499dc88444e2addc9c9082d9e6fd496e08..e695a8e0742235a1da3fa829f82fd96c05fada34 100644
--- a/paddle/fluid/operators/matrix_power_op.cu
+++ b/paddle/phi/kernels/isfinite_kernel.h
@@ -12,16 +12,20 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/matrix_power_op.h"
+#pragma once
 
-namespace ops = paddle::operators;
-namespace plf = paddle::platform;
+#include "paddle/phi/core/dense_tensor.h"
 
-REGISTER_OP_CUDA_KERNEL(matrix_power,
-                        ops::MatrixPowerKernel<plf::CUDADeviceContext, float>,
-                        ops::MatrixPowerKernel<plf::CUDADeviceContext, double>);
+namespace phi {
 
-REGISTER_OP_CUDA_KERNEL(
-    matrix_power_grad,
-    ops::MatrixPowerGradKernel<plf::CUDADeviceContext, float>,
-    ops::MatrixPowerGradKernel<plf::CUDADeviceContext, double>);
+#define DEFINE_ISFINITE_KERNEL(isfinite_kernel) \
+  template <typename T, typename Context>       \
+  void isfinite_kernel(                         \
+      const Context& ctx, const DenseTensor& x, DenseTensor* out);
+
+DEFINE_ISFINITE_KERNEL(IsinfKernel)
+DEFINE_ISFINITE_KERNEL(IsnanKernel)
+DEFINE_ISFINITE_KERNEL(IsfiniteKernel)
+#undef DEFINE_ISFINITE_KERNEL
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/kldiv_loss_grad_kernel.h b/paddle/phi/kernels/kldiv_loss_grad_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..8f53898fa6816d198d1bc96bcf99f20752a70551
--- /dev/null
+++ b/paddle/phi/kernels/kldiv_loss_grad_kernel.h
@@ -0,0 +1,29 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+// XKTODO (change name)
+void KLDivLossGradKernel(const Context& dev_ctx,
+                         const DenseTensor& x,
+                         const DenseTensor& label,
+                         const DenseTensor& d_out,
+                         const std::string& reduction,
+                         DenseTensor* d_x);
+}  // namespace phi
diff --git a/paddle/phi/kernels/kldiv_loss_kernel.h b/paddle/phi/kernels/kldiv_loss_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..103780ab747282f9171d2a409db9fd4e5269ba4d
--- /dev/null
+++ b/paddle/phi/kernels/kldiv_loss_kernel.h
@@ -0,0 +1,29 @@
+
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <string>
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void KLDivLossKernel(const Context& dev_ctx,
+                     const DenseTensor& x,
+                     const DenseTensor& label,
+                     const std::string& reduction,
+                     DenseTensor* out);
+}  // namespace phi
diff --git a/paddle/phi/kernels/kron_grad_kernel.h b/paddle/phi/kernels/kron_grad_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..3daa9dcfba9f0d89bd8dec88905f0ddb321f630a
--- /dev/null
+++ b/paddle/phi/kernels/kron_grad_kernel.h
@@ -0,0 +1,29 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void KronGradKernel(const Context& ctx,
+                    const DenseTensor& x,
+                    const DenseTensor& y,
+                    const DenseTensor& out_grad,
+                    DenseTensor* x_grad,
+                    DenseTensor* y_grad);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/kron_kernel.h b/paddle/phi/kernels/kron_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..4451ac757a9534f4a48db97da81acc2047c26be2
--- /dev/null
+++ b/paddle/phi/kernels/kron_kernel.h
@@ -0,0 +1,27 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void KronKernel(const Context& ctx,
+                const DenseTensor& x,
+                const DenseTensor& y,
+                DenseTensor* out);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/lgamma_grad_kernel.h b/paddle/phi/kernels/lgamma_grad_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..94173cc29c7a7b7c71b6c781fbe06f9c991a4197
--- /dev/null
+++ b/paddle/phi/kernels/lgamma_grad_kernel.h
@@ -0,0 +1,27 @@
+
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void LgammaGradKernel(const Context& dev_ctx,
+                      const DenseTensor& d_out,
+                      const DenseTensor& x,
+                      DenseTensor* d_x);
+}  // namespace phi
diff --git a/paddle/phi/kernels/lgamma_kernel.h b/paddle/phi/kernels/lgamma_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..f61b3a1ce859eeee77d49ecb38ee0e96c3a9f0ee
--- /dev/null
+++ b/paddle/phi/kernels/lgamma_kernel.h
@@ -0,0 +1,26 @@
+
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void LgammaKernel(const Context& dev_ctx,
+                  const DenseTensor& x,
+                  DenseTensor* out);
+}  // namespace phi
diff --git a/paddle/phi/kernels/linspace_kernel.h b/paddle/phi/kernels/linspace_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..ca2b940aef965b6772f113a6a96f84d64fb3d1a2
--- /dev/null
+++ b/paddle/phi/kernels/linspace_kernel.h
@@ -0,0 +1,26 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void LinspaceKernel(const Context& ctx,
+                    const DenseTensor& start,
+                    const DenseTensor& stop,
+                    const DenseTensor& number,
+                    DataType dtype,
+                    DenseTensor* out);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/log_loss_grad_kernel.h b/paddle/phi/kernels/log_loss_grad_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..6853140b19b907e0dc427e14b88f77726157612f
--- /dev/null
+++ b/paddle/phi/kernels/log_loss_grad_kernel.h
@@ -0,0 +1,29 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void LogLossGradKernel(const Context& dev_ctx,
+                       const DenseTensor& input,
+                       const DenseTensor& label,
+                       const DenseTensor& out_grad,
+                       float epsilon,
+                       DenseTensor* in_grad);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/log_loss_kernel.h b/paddle/phi/kernels/log_loss_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..cd16c0f2c7ce2cb4711250c44fd3081e3b05e867
--- /dev/null
+++ b/paddle/phi/kernels/log_loss_kernel.h
@@ -0,0 +1,28 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void LogLossKernel(const Context& dev_ctx,
+                   const DenseTensor& input,
+                   const DenseTensor& label,
+                   float epsilon,
+                   DenseTensor* out);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/math_kernel.cc b/paddle/phi/kernels/math_kernel.cc
index 480eb56c8b05c12c36337d4649a17b3b03146fdf..a5d3f51e5447fa41447c4b59c3beb8c917f8a0e5 100644
--- a/paddle/phi/kernels/math_kernel.cc
+++ b/paddle/phi/kernels/math_kernel.cc
@@ -197,7 +197,8 @@ PD_REGISTER_KERNEL(subtract,
                    int64_t,
                    phi::dtype::float16,
                    complex64,
-                   complex128) {}
+                   complex128,
+                   phi::dtype::bfloat16) {}
 PD_REGISTER_KERNEL(divide,
                    GPU,
                    ALL_LAYOUT,
@@ -207,6 +208,7 @@ PD_REGISTER_KERNEL(divide,
                    int,
                    int64_t,
                    phi::dtype::float16,
+                   phi::dtype::bfloat16,
                    complex64,
                    complex128) {}
 PD_REGISTER_KERNEL(multiply,
diff --git a/paddle/phi/kernels/math_kernel.h b/paddle/phi/kernels/math_kernel.h
index 342393d79bd4d3729afdae45b423b50827a37d61..7569cbcff087d796313c24d46ff7b7fd9cf7e2eb 100644
--- a/paddle/phi/kernels/math_kernel.h
+++ b/paddle/phi/kernels/math_kernel.h
@@ -109,7 +109,7 @@ template <typename T, typename Context>
 DenseTensor Add(const Context& dev_ctx,
                 const DenseTensor& x,
                 const DenseTensor& y) {
-  auto dense_out = phi::Empty<T, Context>(dev_ctx);
+  DenseTensor dense_out;
   MetaTensor meta_out(&dense_out);
   ElementwiseInferMeta(x, y, &meta_out);
   AddKernel<T, Context>(dev_ctx, x, y, &dense_out);
@@ -120,7 +120,7 @@ template <typename T, typename Context>
 DenseTensor Subtract(const Context& dev_ctx,
                      const DenseTensor& x,
                      const DenseTensor& y) {
-  auto dense_out = phi::Empty<T, Context>(dev_ctx);
+  DenseTensor dense_out;
   MetaTensor meta_out(&dense_out);
   ElementwiseInferMeta(x, y, &meta_out);
   SubtractKernel<T, Context>(dev_ctx, x, y, &dense_out);
@@ -131,7 +131,7 @@ template <typename T, typename Context>
 DenseTensor Divide(const Context& dev_ctx,
                    const DenseTensor& x,
                    const DenseTensor& y) {
-  auto dense_out = phi::Empty<T, Context>(dev_ctx);
+  DenseTensor dense_out;
   MetaTensor meta_out(&dense_out);
   ElementwiseInferMeta(x, y, &meta_out);
   DivideKernel<T, Context>(dev_ctx, x, y, &dense_out);
@@ -142,7 +142,7 @@ template <typename T, typename Context>
 DenseTensor Multiply(const Context& dev_ctx,
                      const DenseTensor& x,
                      const DenseTensor& y) {
-  auto dense_out = phi::Empty<T, Context>(dev_ctx);
+  DenseTensor dense_out;
   MetaTensor meta_out(&dense_out);
   ElementwiseInferMeta(x, y, &meta_out);
   MultiplyKernel<T, Context>(dev_ctx, x, y, &dense_out);
@@ -154,9 +154,9 @@ DenseTensor Mean(const Context& dev_ctx,
                  const DenseTensor& x,
                  const std::vector<int64_t>& axis,
                  bool keep_dim) {
-  auto dense_out = phi::Empty<T, Context>(dev_ctx);
+  DenseTensor dense_out;
   MetaTensor meta_out(&dense_out);
-  ReduceInferMetaBase(x, axis, keep_dim, false, x.dtype(), &meta_out);
+  SumRawInferMeta(x, axis, keep_dim, false, x.dtype(), &meta_out);
   MeanKernel<T, Context>(dev_ctx, x, axis, keep_dim, &dense_out);
   return dense_out;
 }
@@ -167,7 +167,7 @@ DenseTensor Sum(const Context& dev_ctx,
                 const std::vector<int64_t>& axis,
                 DataType dtype,
                 bool keep_dim) {
-  auto dense_out = phi::Empty<T, Context>(dev_ctx);
+  DenseTensor dense_out;
   MetaTensor meta_out(&dense_out);
   SumInferMeta(x, axis, dtype, keep_dim, &meta_out);
   SumKernel<T, Context>(dev_ctx, x, axis, dtype, keep_dim, &dense_out);
diff --git a/paddle/phi/kernels/matmul_kernel.h b/paddle/phi/kernels/matmul_kernel.h
index 8fc060d2e3dbcba93658021f5bfe295da58a3eb6..b524b9e5863dcbcacaea11df9a96b71570312213 100644
--- a/paddle/phi/kernels/matmul_kernel.h
+++ b/paddle/phi/kernels/matmul_kernel.h
@@ -33,9 +33,9 @@ template <typename T, typename Context>
 DenseTensor Matmul(const Context& dev_ctx,
                    const DenseTensor& x,
                    const DenseTensor& y,
-                   bool transpose_x,
-                   bool transpose_y) {
-  auto dense_out = Empty<T, Context>(dev_ctx);
+                   bool transpose_x = false,
+                   bool transpose_y = false) {
+  DenseTensor dense_out;
   MetaTensor meta_out(&dense_out);
   MatmulInferMeta(x, y, transpose_x, transpose_y, &meta_out);
   MatmulKernel<T, Context>(dev_ctx, x, y, transpose_x, transpose_y, &dense_out);
diff --git a/paddle/phi/kernels/matrix_power_grad_kernel.h b/paddle/phi/kernels/matrix_power_grad_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..4f70cf6e34d491bacf00c575bc28c1d7a39f859a
--- /dev/null
+++ b/paddle/phi/kernels/matrix_power_grad_kernel.h
@@ -0,0 +1,29 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void MatrixPowerGradKernel(const Context& ctx,
+                           const DenseTensor& x,
+                           const DenseTensor& out,
+                           const DenseTensor& out_grad,
+                           int n,
+                           DenseTensor* x_grad);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/matrix_power_kernel.h b/paddle/phi/kernels/matrix_power_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..39a1bc85e3fe77e79ef8172f1bc4a22029b83d9d
--- /dev/null
+++ b/paddle/phi/kernels/matrix_power_kernel.h
@@ -0,0 +1,27 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void MatrixPowerKernel(const Context& ctx,
+                       const DenseTensor& x,
+                       int n,
+                       DenseTensor* out);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/matrix_rank_kernel.h b/paddle/phi/kernels/matrix_rank_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..6edea2723e589340f2c6dc3cfb0be6f895bf08bb
--- /dev/null
+++ b/paddle/phi/kernels/matrix_rank_kernel.h
@@ -0,0 +1,29 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void MatrixRankKernel(const Context& dev_ctx,
+                      const DenseTensor& x,
+                      float tol,
+                      bool use_default_tol,
+                      bool hermitian,
+                      DenseTensor* out);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/matrix_rank_tol_kernel.h b/paddle/phi/kernels/matrix_rank_tol_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..351358dfa04aa7ad2091ad0e01bc63e50046eda0
--- /dev/null
+++ b/paddle/phi/kernels/matrix_rank_tol_kernel.h
@@ -0,0 +1,29 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void MatrixRankTolKernel(const Context& dev_ctx,
+                         const DenseTensor& x,
+                         const DenseTensor& atol_tensor,
+                         bool use_default_tol,
+                         bool hermitian,
+                         DenseTensor* out);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/maxout_grad_kernel.h b/paddle/phi/kernels/maxout_grad_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..1ee4e8cc89676f49fa84e3a6e481e8e5d687614e
--- /dev/null
+++ b/paddle/phi/kernels/maxout_grad_kernel.h
@@ -0,0 +1,30 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void MaxOutGradKernel(const Context& dev_ctx,
+                      const DenseTensor& x,
+                      const DenseTensor& out,
+                      const DenseTensor& out_grad,
+                      int groups,
+                      int axis,
+                      DenseTensor* x_grad);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/maxout_kernel.h b/paddle/phi/kernels/maxout_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..e582575678d4d17d3f9d55183e58043fc6444225
--- /dev/null
+++ b/paddle/phi/kernels/maxout_kernel.h
@@ -0,0 +1,28 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void MaxOutKernel(const Context& dev_ctx,
+                  const DenseTensor& x,
+                  int groups,
+                  int axis,
+                  DenseTensor* out);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/multi_dot_grad_kernel.h b/paddle/phi/kernels/multi_dot_grad_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..e6d8ecd744e12a9b17bcd954eaf19093d8c694df
--- /dev/null
+++ b/paddle/phi/kernels/multi_dot_grad_kernel.h
@@ -0,0 +1,27 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void MultiDotGradKernel(const Context& ctx,
+                        const DenseTensor& out_grad,
+                        const std::vector<const DenseTensor*>& x,
+                        std::vector<DenseTensor*> x_grad);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/multi_dot_kernel.h b/paddle/phi/kernels/multi_dot_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..09866e8dde5eaa8adc773a481c98178da8ffed9e
--- /dev/null
+++ b/paddle/phi/kernels/multi_dot_kernel.h
@@ -0,0 +1,26 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void MultiDotKernel(const Context& ctx,
+                    const std::vector<const DenseTensor*>& x,
+                    DenseTensor* out);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/nll_loss_grad_kernel.h b/paddle/phi/kernels/nll_loss_grad_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..127dc2f961f101e32a11ebea6bdfe4556546b8aa
--- /dev/null
+++ b/paddle/phi/kernels/nll_loss_grad_kernel.h
@@ -0,0 +1,31 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void NllLossGradKernel(const Context& dev_ctx,
+                       const DenseTensor& x,
+                       const DenseTensor& label,
+                       const DenseTensor& total_weight,
+                       paddle::optional<const DenseTensor&> weight,
+                       const DenseTensor& d_out,
+                       int64_t ignore_index,
+                       const std::string& reduction,
+                       DenseTensor* d_x);
+}  // namespace phi
diff --git a/paddle/phi/kernels/nll_loss_kernel.cc b/paddle/phi/kernels/nll_loss_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b271f0f4d06a05abd2c4b60556a1fb1755fef1bf
--- /dev/null
+++ b/paddle/phi/kernels/nll_loss_kernel.cc
@@ -0,0 +1,41 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/nll_loss_kernel.h"
+
+namespace phi {
+template <typename T, typename Context>
+void NllLossKernel(const Context& dev_ctx,
+                   const DenseTensor& input,
+                   const DenseTensor& label,
+                   paddle::optional<const DenseTensor&> weight,
+                   int64_t ignore_index,
+                   const std::string& reduction,
+                   DenseTensor* out) {
+  DenseTensor total_weight;
+  total_weight.set_meta(
+      DenseTensorMeta(paddle::experimental::CppTypeToDataType<T>::Type(), {1}));
+  dev_ctx.template Alloc<T>(total_weight);
+  NllLossRawKernel(dev_ctx,
+                   input,
+                   label,
+                   weight,
+                   ignore_index,
+                   reduction,
+                   out,
+                   &total_weight);
+}
+}  // namespace phi
+
+// TODO(xiongkun): add the non-raw kernel register here.
diff --git a/paddle/phi/kernels/nll_loss_kernel.h b/paddle/phi/kernels/nll_loss_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..90083e1d6840d32e96870c7d5402c2fc99cd6bad
--- /dev/null
+++ b/paddle/phi/kernels/nll_loss_kernel.h
@@ -0,0 +1,33 @@
+
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/utils/optional.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void NllLossRawKernel(const Context& dev_ctx,
+                      const DenseTensor& input,
+                      const DenseTensor& label,
+                      paddle::optional<const DenseTensor&> weight,
+                      int64_t ignore_index,
+                      const std::string& reduction,
+                      DenseTensor* out,
+                      DenseTensor* total_weight);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/norm_grad_kernel.h b/paddle/phi/kernels/norm_grad_kernel.h
index 7b09d6463d08b086653cb049f4ace9218439b4f8..55714b8a4a091f6d64cbb9a03eb9043d4c2dbf22 100644
--- a/paddle/phi/kernels/norm_grad_kernel.h
+++ b/paddle/phi/kernels/norm_grad_kernel.h
@@ -20,9 +20,9 @@ namespace phi {
 
 template <typename T, typename Context>
 void NormGradKernel(const Context& ctx,
-                    const DenseTensor& out_grad,
                     const DenseTensor& x,
                     const DenseTensor& out,
+                    const DenseTensor& out_grad,
                     int axis,
                     float epsilon,
                     bool is_test,
diff --git a/paddle/phi/kernels/one_hot_kernel.cc b/paddle/phi/kernels/one_hot_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..633f48cbb62ace9e3f7f21502bd61f8c305fb542
--- /dev/null
+++ b/paddle/phi/kernels/one_hot_kernel.cc
@@ -0,0 +1,38 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/one_hot_kernel.h"
+
+#include "paddle/phi/backends/all_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void OneHotKernel(const Context& dev_ctx,
+                  const DenseTensor& x,
+                  const Scalar& num_classes_s,
+                  DenseTensor* out) {
+  int num_classes = num_classes_s.to<int>();
+  OneHotRawKernel<T>(
+      dev_ctx, x, num_classes, phi::DataType::FLOAT32, false, out);
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(one_hot, CPU, ALL_LAYOUT, phi::OneHotKernel, int, int64_t) {}
+
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+PD_REGISTER_KERNEL(one_hot, GPU, ALL_LAYOUT, phi::OneHotKernel, int, int64_t) {}
+#endif
diff --git a/paddle/phi/kernels/one_hot_kernel.h b/paddle/phi/kernels/one_hot_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..9f89609ea63365b0e7831201ca003d6c7320c5d7
--- /dev/null
+++ b/paddle/phi/kernels/one_hot_kernel.h
@@ -0,0 +1,36 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/common/scalar.h"
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void OneHotKernel(const Context& dev_ctx,
+                  const DenseTensor& x,
+                  const Scalar& num_classes,
+                  DenseTensor* out);
+
+template <typename T, typename Context>
+void OneHotRawKernel(const Context& dev_ctx,
+                     const DenseTensor& x,
+                     int32_t depth,
+                     DataType dtype,
+                     bool allow_out_of_range,
+                     DenseTensor* out);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/pad_grad_kernel.h b/paddle/phi/kernels/pad_grad_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..f39d87e5c0ef6503d772e5f9ee95e307a13eda13
--- /dev/null
+++ b/paddle/phi/kernels/pad_grad_kernel.h
@@ -0,0 +1,28 @@
+
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void PadGradKernel(const Context& dev_ctx,
+                   const DenseTensor& d_out,
+                   const std::vector<int>& paddings,
+                   float pad_value,
+                   DenseTensor* d_x);
+}  // namespace phi
diff --git a/paddle/phi/kernels/pad_kernel.h b/paddle/phi/kernels/pad_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..511e8cf73df97ffb250b1106aa98155de33a97d1
--- /dev/null
+++ b/paddle/phi/kernels/pad_kernel.h
@@ -0,0 +1,28 @@
+
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void PadKernel(const Context& dev_ctx,
+               const DenseTensor& x,
+               const std::vector<int>& paddings,
+               float pad_value,
+               DenseTensor* out);
+}  // namespace phi
diff --git a/paddle/phi/kernels/pool_grad_kernel.h b/paddle/phi/kernels/pool_grad_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..0658dc22c823bf7ae162fb2e392f256cfb051496
--- /dev/null
+++ b/paddle/phi/kernels/pool_grad_kernel.h
@@ -0,0 +1,145 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <string>
+#include <vector>
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void Pool2dGradKernel(const Context& ctx,
+                      const DenseTensor& x,
+                      const DenseTensor& out,
+                      const DenseTensor& dout,
+                      const std::vector<int>& kernel_size,
+                      const std::vector<int>& strides,
+                      const std::vector<int>& paddings,
+                      bool ceil_mode,
+                      bool exclusive,
+                      const std::string& data_format,
+                      const std::string& pooling_type,
+                      bool global_pooling,
+                      bool adaptive,
+                      const std::string& padding_algorithm,
+                      DenseTensor* dx);
+
+template <typename T, typename Context>
+void Pool2dGradGPUDNNKernel(const Context& ctx,
+                            const DenseTensor& x,
+                            const DenseTensor& out,
+                            const DenseTensor& dout,
+                            const std::vector<int>& kernel_size,
+                            const std::vector<int>& strides,
+                            const std::vector<int>& paddings,
+                            bool ceil_mode,
+                            bool exclusive,
+                            const std::string& data_format,
+                            const std::string& pooling_type,
+                            bool global_pooling,
+                            bool adaptive,
+                            const std::string& padding_algorithm,
+                            DenseTensor* dx);
+
+template <typename T, typename Context>
+void Pool2dDoubleGradKernel(const Context& ctx,
+                            const DenseTensor& x,
+                            const std::vector<int>& kernel_size,
+                            const std::vector<int>& strides,
+                            const std::vector<int>& paddings,
+                            bool ceil_mode,
+                            bool exclusive,
+                            const std::string& data_format,
+                            const std::string& pooling_type,
+                            bool global_pooling,
+                            bool adaptive,
+                            const std::string& padding_algorithm,
+                            DenseTensor* out);
+
+template <typename T, typename Context>
+void Pool2dDoubleGradGPUDNNKernel(const Context& ctx,
+                                  const DenseTensor& x,
+                                  const std::vector<int>& kernel_size,
+                                  const std::vector<int>& strides,
+                                  const std::vector<int>& paddings,
+                                  bool ceil_mode,
+                                  bool exclusive,
+                                  const std::string& data_format,
+                                  const std::string& pooling_type,
+                                  bool global_pooling,
+                                  bool adaptive,
+                                  const std::string& padding_algorithm,
+                                  DenseTensor* out);
+
+template <typename T, typename Context>
+void MaxPool2dWithIndexGradKernel(const Context& ctx,
+                                  const DenseTensor& x,
+                                  const DenseTensor& mask,
+                                  const DenseTensor& dout,
+                                  const std::vector<int>& kernel_size,
+                                  const std::vector<int>& strides,
+                                  const std::vector<int>& paddings,
+                                  bool global_pooling,
+                                  bool adaptive,
+                                  DenseTensor* dx);
+
+template <typename T, typename Context>
+void Pool3dGradKernel(const Context& ctx,
+                      const DenseTensor& x,
+                      const DenseTensor& out,
+                      const DenseTensor& dout,
+                      const std::vector<int>& kernel_size,
+                      const std::vector<int>& strides,
+                      const std::vector<int>& paddings,
+                      bool ceil_mode,
+                      bool exclusive,
+                      const std::string& data_format,
+                      const std::string& pooling_type,
+                      bool global_pooling,
+                      bool adaptive,
+                      const std::string& padding_algorithm,
+                      DenseTensor* dx);
+
+template <typename T, typename Context>
+void Pool3dGradGPUDNNKernel(const Context& ctx,
+                            const DenseTensor& x,
+                            const DenseTensor& out,
+                            const DenseTensor& dout,
+                            const std::vector<int>& kernel_size,
+                            const std::vector<int>& strides,
+                            const std::vector<int>& paddings,
+                            bool ceil_mode,
+                            bool exclusive,
+                            const std::string& data_format,
+                            const std::string& pooling_type,
+                            bool global_pooling,
+                            bool adaptive,
+                            const std::string& padding_algorithm,
+                            DenseTensor* dx);
+
+template <typename T, typename Context>
+void MaxPool3dWithIndexGradKernel(const Context& ctx,
+                                  const DenseTensor& x,
+                                  const DenseTensor& mask,
+                                  const DenseTensor& dout,
+                                  const std::vector<int>& kernel_size,
+                                  const std::vector<int>& strides,
+                                  const std::vector<int>& paddings,
+                                  bool global_pooling,
+                                  bool adaptive,
+                                  DenseTensor* dx);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/pool_kernel.h b/paddle/phi/kernels/pool_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..348af021815175ca2c6c94b9721fec33fbaf864c
--- /dev/null
+++ b/paddle/phi/kernels/pool_kernel.h
@@ -0,0 +1,105 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <string>
+#include <vector>
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void Pool2dKernel(const Context& ctx,
+                  const DenseTensor& x,
+                  const std::vector<int>& kernel_size,
+                  const std::vector<int>& strides,
+                  const std::vector<int>& paddings,
+                  bool ceil_mode,
+                  bool exclusive,
+                  const std::string& data_format,
+                  const std::string& pooling_type,
+                  bool global_pooling,
+                  bool adaptive,
+                  const std::string& padding_algorithm,
+                  DenseTensor* out);
+
+template <typename T, typename Context>
+void Pool2dGPUDNNKernel(const Context& ctx,
+                        const DenseTensor& x,
+                        const std::vector<int>& kernel_size,
+                        const std::vector<int>& strides,
+                        const std::vector<int>& paddings,
+                        bool ceil_mode,
+                        bool exclusive,
+                        const std::string& data_format,
+                        const std::string& pooling_type,
+                        bool global_pooling,
+                        bool adaptive,
+                        const std::string& padding_algorithm,
+                        DenseTensor* out);
+
+template <typename T, typename Context>
+void MaxPool2dWithIndexKernel(const Context& ctx,
+                              const DenseTensor& x,
+                              const std::vector<int>& kernel_size,
+                              const std::vector<int>& strides,
+                              const std::vector<int>& paddings,
+                              bool global_pooling,
+                              bool adaptive,
+                              DenseTensor* out,
+                              DenseTensor* mask);
+
+template <typename T, typename Context>
+void Pool3dKernel(const Context& ctx,
+                  const DenseTensor& x,
+                  const std::vector<int>& kernel_size,
+                  const std::vector<int>& strides,
+                  const std::vector<int>& paddings,
+                  bool ceil_mode,
+                  bool exclusive,
+                  const std::string& data_format,
+                  const std::string& pooling_type,
+                  bool global_pooling,
+                  bool adaptive,
+                  const std::string& padding_algorithm,
+                  DenseTensor* out);
+
+template <typename T, typename Context>
+void Pool3dGPUDNNKernel(const Context& ctx,
+                        const DenseTensor& x,
+                        const std::vector<int>& kernel_size,
+                        const std::vector<int>& strides,
+                        const std::vector<int>& paddings,
+                        bool ceil_mode,
+                        bool exclusive,
+                        const std::string& data_format,
+                        const std::string& pooling_type,
+                        bool global_pooling,
+                        bool adaptive,
+                        const std::string& padding_algorithm,
+                        DenseTensor* out);
+
+template <typename T, typename Context>
+void MaxPool3dWithIndexKernel(const Context& ctx,
+                              const DenseTensor& x,
+                              const std::vector<int>& kernel_size,
+                              const std::vector<int>& strides,
+                              const std::vector<int>& paddings,
+                              bool global_pooling,
+                              bool adaptive,
+                              DenseTensor* out,
+                              DenseTensor* mask);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/primitive/compute_primitives.h b/paddle/phi/kernels/primitive/compute_primitives.h
index 19427551fb3f0e55f1ad4302bdb687bdf54e920d..632ad00f6d06ed8a02b2d9677ff665c677cf8cb9 100644
--- a/paddle/phi/kernels/primitive/compute_primitives.h
+++ b/paddle/phi/kernels/primitive/compute_primitives.h
@@ -22,6 +22,7 @@
 #endif
 
 #include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
+// #include "paddle/phi/common/bfloat16.h"
 #include "paddle/phi/common/float16.h"
 
 namespace phi {
diff --git a/paddle/phi/kernels/primitive/kernel_primitives.h b/paddle/phi/kernels/primitive/kernel_primitives.h
index 830bc1972c49fe8c447e9a13f874841d36a12f2d..b5a1e88acc32b1b101a6f81b750be1c669236a1a 100644
--- a/paddle/phi/kernels/primitive/kernel_primitives.h
+++ b/paddle/phi/kernels/primitive/kernel_primitives.h
@@ -13,7 +13,10 @@
 // limitations under the License.
 
 #pragma once
+
 #include "paddle/phi/kernels/primitive/helper_primitives.h"
+
+// macro
 #ifdef PADDLE_WITH_XPU_KP
 
 #define KPStream XPUStream
@@ -22,11 +25,6 @@
 #define __forceinline__ __inline__
 #define __restrict__
 
-#include "paddle/phi/backends/xpu/xpu_context.h"
-#include "paddle/phi/kernels/primitive/compute_primitives_xpu2.h"
-#include "paddle/phi/kernels/primitive/datamover_primitives_xpu2.h"
-#include "paddle/phi/kernels/primitive/functor_primitives_xpu2.h"
-
 #define THREAD_ID_X core_id()
 #define THREAD_ID_Y 0
 #define THREAD_ID_Z 0
@@ -42,11 +40,8 @@
 #define GRID_NUM_X cluster_num()
 #define GRID_NUM_Y 0
 #define GRID_NUM_Z 0
+
 #else
-#include "paddle/phi/backends/gpu/gpu_context.h"
-#include "paddle/phi/kernels/primitive/compute_primitives.h"
-#include "paddle/phi/kernels/primitive/datamover_primitives.h"
-#include "paddle/phi/kernels/primitive/functor_primitives.h"
 
 #define KPStream gpuStream_t
 #define KPDevice phi::GPUContext
@@ -67,4 +62,22 @@
 #define GRID_NUM_X gridDim.x
 #define GRID_NUM_Y gridDim.y
 #define GRID_NUM_Z gridDim.z
+
+#endif
+
+// include file
+#ifdef PADDLE_WITH_XPU_KP
+
+#include "paddle/phi/backends/xpu/xpu_context.h"
+#include "paddle/phi/kernels/primitive/compute_primitives_xpu2.h"
+#include "paddle/phi/kernels/primitive/datamover_primitives_xpu2.h"
+#include "paddle/phi/kernels/primitive/functor_primitives_xpu2.h"
+
+#else
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/kernels/primitive/compute_primitives.h"
+#include "paddle/phi/kernels/primitive/datamover_primitives.h"
+#include "paddle/phi/kernels/primitive/functor_primitives.h"
+
 #endif
diff --git a/paddle/phi/kernels/psroi_pool_grad_kernel.h b/paddle/phi/kernels/psroi_pool_grad_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..87163eb8e079ffd580d6f937179e24a8506376e9
--- /dev/null
+++ b/paddle/phi/kernels/psroi_pool_grad_kernel.h
@@ -0,0 +1,34 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/utils/optional.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void PsroiPoolGradKernel(const Context& ctx,
+                         const DenseTensor& x,
+                         const DenseTensor& rois,
+                         paddle::optional<const DenseTensor&> rois_num,
+                         const DenseTensor& dout,
+                         int pooled_height,
+                         int pooled_width,
+                         int output_channels,
+                         float spatial_scale,
+                         DenseTensor* dx);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/psroi_pool_kernel.h b/paddle/phi/kernels/psroi_pool_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..341037af2caeca28e211da8862e3c8d6089b9bac
--- /dev/null
+++ b/paddle/phi/kernels/psroi_pool_kernel.h
@@ -0,0 +1,33 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/utils/optional.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void PsroiPoolKernel(const Context& ctx,
+                     const DenseTensor& x,
+                     const DenseTensor& rois,
+                     paddle::optional<const DenseTensor&> rois_num,
+                     int pooled_height,
+                     int pooled_width,
+                     int output_channels,
+                     float spatial_scale,
+                     DenseTensor* out);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/put_along_axis_grad_kernel.h b/paddle/phi/kernels/put_along_axis_grad_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..2141443da7ab17776a72572fd7634bbf822a7f66
--- /dev/null
+++ b/paddle/phi/kernels/put_along_axis_grad_kernel.h
@@ -0,0 +1,33 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <string>
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void PutAlongAxisGradKernel(const Context& dev_ctx,
+                            const DenseTensor& x,
+                            const DenseTensor& index,
+                            const DenseTensor& out_grad,
+                            int axis,
+                            const std::string& reduce,
+                            DenseTensor* x_grad,
+                            DenseTensor* value_grad);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/put_along_axis_kernel.h b/paddle/phi/kernels/put_along_axis_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..797d0e364b48d46372992590f7807b4c6af5f242
--- /dev/null
+++ b/paddle/phi/kernels/put_along_axis_kernel.h
@@ -0,0 +1,32 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <string>
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void PutAlongAxisKernel(const Context& dev_ctx,
+                        const DenseTensor& x,
+                        const DenseTensor& index,
+                        const DenseTensor& value,
+                        int axis,
+                        const std::string& reduce,
+                        DenseTensor* out);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/reduce_all_kernel.cc b/paddle/phi/kernels/reduce_all_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3cbd0976ad8d238be7462f20165c919df01a80ea
--- /dev/null
+++ b/paddle/phi/kernels/reduce_all_kernel.cc
@@ -0,0 +1,37 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/reduce_all_kernel.h"
+
+#include "paddle/phi/backends/all_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void AllKernel(const Context& dev_ctx,
+               const DenseTensor& x,
+               const std::vector<int64_t>& dims,
+               bool keep_dim,
+               DenseTensor* out) {
+  bool reduce_all = false;
+  AllRawKernel<T>(dev_ctx, x, dims, keep_dim, reduce_all, out);
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(all, CPU, ALL_LAYOUT, phi::AllKernel, bool) {}
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+PD_REGISTER_KERNEL(all, GPU, ALL_LAYOUT, phi::AllKernel, bool) {}
+#endif
diff --git a/paddle/phi/kernels/reduce_all_kernel.h b/paddle/phi/kernels/reduce_all_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..8d7a9ab3faf39c49dd70213ec3edfa98b6e4e406
--- /dev/null
+++ b/paddle/phi/kernels/reduce_all_kernel.h
@@ -0,0 +1,35 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void AllRawKernel(const Context& dev_ctx,
+                  const DenseTensor& x,
+                  const std::vector<int64_t>& dims,
+                  bool keep_dim,
+                  bool reduce_all,
+                  DenseTensor* out);
+
+template <typename T, typename Context>
+void AllKernel(const Context& dev_ctx,
+               const DenseTensor& x,
+               const std::vector<int64_t>& dims,
+               bool keep_dim,
+               DenseTensor* out);
+}  // namespace phi
diff --git a/paddle/phi/kernels/reduce_any_kernel.cc b/paddle/phi/kernels/reduce_any_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..371dd972129cc8fcf5f0e390f18749f8c5ad7f75
--- /dev/null
+++ b/paddle/phi/kernels/reduce_any_kernel.cc
@@ -0,0 +1,37 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/reduce_any_kernel.h"
+
+#include "paddle/phi/backends/all_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void AnyKernel(const Context& dev_ctx,
+               const DenseTensor& x,
+               const std::vector<int64_t>& dims,
+               bool keep_dim,
+               DenseTensor* out) {
+  bool reduce_all = false;
+  AnyRawKernel<T>(dev_ctx, x, dims, keep_dim, reduce_all, out);
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(any, CPU, ALL_LAYOUT, phi::AnyKernel, bool) {}
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+PD_REGISTER_KERNEL(any, GPU, ALL_LAYOUT, phi::AnyKernel, bool) {}
+#endif
diff --git a/paddle/phi/kernels/reduce_any_kernel.h b/paddle/phi/kernels/reduce_any_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..0f505817084e792a45c626430eb4e3d7d5a485aa
--- /dev/null
+++ b/paddle/phi/kernels/reduce_any_kernel.h
@@ -0,0 +1,35 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void AnyRawKernel(const Context& dev_ctx,
+                  const DenseTensor& x,
+                  const std::vector<int64_t>& dims,
+                  bool keep_dim,
+                  bool reduce_all,
+                  DenseTensor* out);
+
+template <typename T, typename Context>
+void AnyKernel(const Context& dev_ctx,
+               const DenseTensor& x,
+               const std::vector<int64_t>& dims,
+               bool keep_dim,
+               DenseTensor* out);
+}  // namespace phi
diff --git a/paddle/phi/kernels/reduce_max_kernel.cc b/paddle/phi/kernels/reduce_max_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..de172a12d72884fb018acbb42c077efc825508ce
--- /dev/null
+++ b/paddle/phi/kernels/reduce_max_kernel.cc
@@ -0,0 +1,39 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/reduce_max_kernel.h"
+
+#include "paddle/phi/backends/all_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void MaxKernel(const Context& dev_ctx,
+               const DenseTensor& x,
+               const std::vector<int64_t>& dims,
+               bool keep_dim,
+               DenseTensor* out) {
+  bool reduce_all = false;
+  MaxRawKernel<T>(dev_ctx, x, dims, keep_dim, reduce_all, out);
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(
+    max, CPU, ALL_LAYOUT, phi::MaxKernel, float, double, int, int64_t) {}
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+PD_REGISTER_KERNEL(
+    max, GPU, ALL_LAYOUT, phi::MaxKernel, float, double, int, int64_t) {}
+#endif
diff --git a/paddle/phi/kernels/reduce_max_kernel.h b/paddle/phi/kernels/reduce_max_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..49a350519c506b15a54d41b969dc65b679cc4d06
--- /dev/null
+++ b/paddle/phi/kernels/reduce_max_kernel.h
@@ -0,0 +1,35 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void MaxRawKernel(const Context& dev_ctx,
+                  const DenseTensor& x,
+                  const std::vector<int64_t>& dims,
+                  bool keep_dim,
+                  bool reduce_all,
+                  DenseTensor* out);
+
+template <typename T, typename Context>
+void MaxKernel(const Context& dev_ctx,
+               const DenseTensor& x,
+               const std::vector<int64_t>& dims,
+               bool keep_dim,
+               DenseTensor* out);
+}  // namespace phi
diff --git a/paddle/phi/kernels/reduce_min_kernel.cc b/paddle/phi/kernels/reduce_min_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c8ec6b3678c58d38d19853c04128283d979f50de
--- /dev/null
+++ b/paddle/phi/kernels/reduce_min_kernel.cc
@@ -0,0 +1,39 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/reduce_min_kernel.h"
+
+#include "paddle/phi/backends/all_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void MinKernel(const Context& dev_ctx,
+               const DenseTensor& x,
+               const std::vector<int64_t>& dims,
+               bool keep_dim,
+               DenseTensor* out) {
+  bool reduce_all = false;
+  MinRawKernel<T>(dev_ctx, x, dims, keep_dim, reduce_all, out);
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(
+    min, CPU, ALL_LAYOUT, phi::MinKernel, float, double, int, int64_t) {}
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+PD_REGISTER_KERNEL(
+    min, GPU, ALL_LAYOUT, phi::MinKernel, float, double, int, int64_t) {}
+#endif
diff --git a/paddle/phi/kernels/reduce_min_kernel.h b/paddle/phi/kernels/reduce_min_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..3227ec00e649e520e455fc3b2122cb88b51fc13e
--- /dev/null
+++ b/paddle/phi/kernels/reduce_min_kernel.h
@@ -0,0 +1,35 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void MinRawKernel(const Context& dev_ctx,
+                  const DenseTensor& x,
+                  const std::vector<int64_t>& dims,
+                  bool keep_dim,
+                  bool reduce_all,
+                  DenseTensor* out);
+
+template <typename T, typename Context>
+void MinKernel(const Context& dev_ctx,
+               const DenseTensor& x,
+               const std::vector<int64_t>& dims,
+               bool keep_dim,
+               DenseTensor* out);
+}  // namespace phi
diff --git a/paddle/phi/kernels/reduce_sum_grad_kernel.h b/paddle/phi/kernels/reduce_sum_grad_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..ab4d63297efffc70710e496efa08f4b9c7e5f7ce
--- /dev/null
+++ b/paddle/phi/kernels/reduce_sum_grad_kernel.h
@@ -0,0 +1,32 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/common/data_type.h"
+#include "paddle/phi/core/dense_tensor.h"
+namespace phi {
+
+template <typename T, typename Context>
+void ReduceSumGradKernel(const Context& dev_ctx,
+                         const DenseTensor& x,
+                         const DenseTensor& out_grad,
+                         const std::vector<int64_t>& dims,
+                         bool keep_dim,
+                         bool reduce_all,
+                         DataType in_dtype,
+                         DataType out_dtype,
+                         DenseTensor* x_grad);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/reshape_kernel.h b/paddle/phi/kernels/reshape_kernel.h
index 1a3d0db8a8a3b8668431ced5b95480f5d4758566..848f162a2a881ddc4d4ea136313216fd569accfd 100644
--- a/paddle/phi/kernels/reshape_kernel.h
+++ b/paddle/phi/kernels/reshape_kernel.h
@@ -38,7 +38,7 @@ template <typename T, typename Context>
 DenseTensor Reshape(const Context& dev_ctx,
                     const DenseTensor& x,
                     const std::vector<int64_t>& shape) {
-  auto dense_out = Empty<T, Context>(dev_ctx);
+  DenseTensor dense_out;
   MetaTensor meta_out(&dense_out);
   InferMetaFromVecValue(x, shape, &meta_out);
   ReshapeKernel<Context>(dev_ctx, x, ScalarArray(shape), &dense_out);
diff --git a/paddle/phi/kernels/roi_align_kernel.h b/paddle/phi/kernels/roi_align_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..16b52c563a592f0cc23ddca94f554f5dc49e8ccf
--- /dev/null
+++ b/paddle/phi/kernels/roi_align_kernel.h
@@ -0,0 +1,34 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/utils/optional.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void ROIAlignKernel(const Context& dev_ctx,
+                    const DenseTensor& x,
+                    const DenseTensor& boxes,
+                    paddle::optional<const DenseTensor&> boxes_num,
+                    int pooled_height,
+                    int pooled_width,
+                    float spatial_scale,
+                    int sampling_ratio,
+                    bool aligned,
+                    DenseTensor* out);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/scale_kernel.h b/paddle/phi/kernels/scale_kernel.h
index b395834369851e7491ecb592895dd5b9ad13aabd..7537dc1130b83ea3963ae128bf8ce3859411c199 100644
--- a/paddle/phi/kernels/scale_kernel.h
+++ b/paddle/phi/kernels/scale_kernel.h
@@ -16,7 +16,6 @@ limitations under the License. */
 
 #include "paddle/phi/common/scalar.h"
 #include "paddle/phi/core/dense_tensor.h"
-#include "paddle/phi/core/selected_rows.h"
 #include "paddle/phi/infermeta/unary.h"
 #include "paddle/phi/kernels/empty_kernel.h"
 namespace phi {
@@ -29,21 +28,13 @@ void ScaleKernel(const Context& dev_ctx,
                  bool bias_after_scale,
                  DenseTensor* out);
 
-template <typename T, typename Context>
-void ScaleSR(const Context& dev_ctx,
-             const SelectedRows& x,
-             const Scalar& scale,
-             float bias,
-             bool bias_after_scale,
-             SelectedRows* out);
-
 template <typename T, typename Context>
 DenseTensor Scale(const Context& dev_ctx,
                   const DenseTensor& x,
                   const Scalar& scale,
                   float bias,
                   bool bias_after_scale) {
-  auto dense_out = phi::Empty<T, Context>(dev_ctx);
+  DenseTensor dense_out;
   MetaTensor meta_out(&dense_out);
   UnchangedInferMeta(x, &meta_out);
   ScaleKernel<T, Context>(
diff --git a/paddle/phi/kernels/scatter_grad_kernel.h b/paddle/phi/kernels/scatter_grad_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..cf1482fca7f667eae530e09a95df0020186aef77
--- /dev/null
+++ b/paddle/phi/kernels/scatter_grad_kernel.h
@@ -0,0 +1,29 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void ScatterGradKernel(const Context &ctx,
+                       const DenseTensor &index,
+                       const DenseTensor &updates,
+                       const DenseTensor &out_grad,
+                       bool overwrite,
+                       DenseTensor *x_grad,
+                       DenseTensor *updates_grad);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/scatter_kernel.h b/paddle/phi/kernels/scatter_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..5191d6bce45f26c69d5a2fe61c23f71039c2433c
--- /dev/null
+++ b/paddle/phi/kernels/scatter_kernel.h
@@ -0,0 +1,29 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void ScatterKernel(const Context &ctx,
+                   const DenseTensor &x,
+                   const DenseTensor &index,
+                   const DenseTensor &updates,
+                   bool overwrite,
+                   DenseTensor *out);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/scatter_nd_add_grad_kernel.h b/paddle/phi/kernels/scatter_nd_add_grad_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..bcfdb2cdb2f09e53ed9da1bc3995587737f9492c
--- /dev/null
+++ b/paddle/phi/kernels/scatter_nd_add_grad_kernel.h
@@ -0,0 +1,29 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void ScatterNdAddGradKernel(const Context &ctx,
+                            const DenseTensor &index,
+                            const DenseTensor &updates,
+                            const DenseTensor &out_grad,
+                            DenseTensor *x_grad,
+                            DenseTensor *updates_grad);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/scatter_nd_add_kernel.h b/paddle/phi/kernels/scatter_nd_add_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..c20709dccc08c16650dd1fc00ec10a91c333e13f
--- /dev/null
+++ b/paddle/phi/kernels/scatter_nd_add_kernel.h
@@ -0,0 +1,28 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void ScatterNdAddKernel(const Context &ctx,
+                        const DenseTensor &x,
+                        const DenseTensor &index,
+                        const DenseTensor &updates,
+                        DenseTensor *out);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/searchsorted_kernel.h b/paddle/phi/kernels/searchsorted_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..e425c7fd7955544cc429fb0a071e8f8038b47063
--- /dev/null
+++ b/paddle/phi/kernels/searchsorted_kernel.h
@@ -0,0 +1,29 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void SearchsortedKernel(const Context& ctx,
+                        const DenseTensor& sorted_sequence,
+                        const DenseTensor& value,
+                        bool out_int32,
+                        bool right,
+                        DenseTensor* out);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/segment_pool_grad_kernel.h b/paddle/phi/kernels/segment_pool_grad_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..e773eed16e8c838d9f70587dbcc6dee50c057424
--- /dev/null
+++ b/paddle/phi/kernels/segment_pool_grad_kernel.h
@@ -0,0 +1,31 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void SegmentPoolGradKernel(const Context& dev_ctx,
+                           const DenseTensor& x,
+                           const DenseTensor& segment_ids,
+                           const DenseTensor& out,
+                           paddle::optional<const DenseTensor&> summed_ids,
+                           const DenseTensor& out_grad,
+                           const std::string& pooltype,
+                           DenseTensor* x_grad);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/segment_pool_kernel.h b/paddle/phi/kernels/segment_pool_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..8f7b30c2e8603d1b59c8217ef3e9e21d072bf1d0
--- /dev/null
+++ b/paddle/phi/kernels/segment_pool_kernel.h
@@ -0,0 +1,29 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void SegmentPoolKernel(const Context& dev_ctx,
+                       const DenseTensor& x,
+                       const DenseTensor& segment_ids,
+                       const std::string& pooltype,
+                       DenseTensor* out,
+                       DenseTensor* summed_ids);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/selected_rows/full_kernel.cc b/paddle/phi/kernels/selected_rows/full_kernel.cc
index 02231867fdd35cc8db4359fa3cc31d6236229afc..39fd009cd6586bd7413e9d215607603f5d0d2b0a 100644
--- a/paddle/phi/kernels/selected_rows/full_kernel.cc
+++ b/paddle/phi/kernels/selected_rows/full_kernel.cc
@@ -12,34 +12,37 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/phi/kernels/full_kernel.h"
+#include "paddle/phi/kernels/selected_rows/full_kernel.h"
 
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #endif
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/full_kernel.h"
 
 #include "paddle/phi/common/bfloat16.h"
 #include "paddle/phi/common/complex.h"
 
 namespace phi {
+namespace sr {
 
 template <typename T, typename Context>
-void FullSR(const Context& dev_ctx,
-            const ScalarArray& shape,
-            const Scalar& val,
-            DataType dtype,
-            SelectedRows* out) {
+void FullKernel(const Context& dev_ctx,
+                const ScalarArray& shape,
+                const Scalar& val,
+                DataType dtype,
+                SelectedRows* out) {
   phi::FullKernel<T>(dev_ctx, shape, val, dtype, out->mutable_value());
 }
 
+}  // namespace sr
 }  // namespace phi
 
 PD_REGISTER_KERNEL(full_sr,
                    CPU,
                    ALL_LAYOUT,
-                   phi::FullSR,
+                   phi::sr::FullKernel,
                    float,
                    double,
                    uint8_t,
@@ -56,7 +59,7 @@ PD_REGISTER_KERNEL(full_sr,
 PD_REGISTER_KERNEL(full_sr,
                    GPU,
                    ALL_LAYOUT,
-                   phi::FullSR,
+                   phi::sr::FullKernel,
                    float,
                    double,
                    uint8_t,
diff --git a/paddle/phi/kernels/selected_rows/full_kernel.h b/paddle/phi/kernels/selected_rows/full_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..d84ddcc0d3f63bdc95db97a933f53076baa8a02c
--- /dev/null
+++ b/paddle/phi/kernels/selected_rows/full_kernel.h
@@ -0,0 +1,32 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/common/scalar.h"
+#include "paddle/phi/common/scalar_array.h"
+#include "paddle/phi/core/selected_rows.h"
+
+namespace phi {
+namespace sr {
+
+template <typename T, typename Context>
+void FullKernel(const Context& dev_ctx,
+                const ScalarArray& shape,
+                const Scalar& val,
+                DataType dtype,
+                SelectedRows* out);
+
+}  // namespace sr
+}  // namespace phi
diff --git a/paddle/phi/kernels/selected_rows/isfinite_kernel.cc b/paddle/phi/kernels/selected_rows/isfinite_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a507cdd0d866c3677d74956d1146139e6f2f92c2
--- /dev/null
+++ b/paddle/phi/kernels/selected_rows/isfinite_kernel.cc
@@ -0,0 +1,96 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/selected_rows/isfinite_kernel.h"
+
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#endif
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/selected_rows/isfinite_kernel_impl.h"
+
+namespace phi {
+
+template <typename T, typename Context, typename Functor>
+inline void IsfiniteSRImpl(const Context& dev_ctx,
+                           const SelectedRows& x,
+                           SelectedRows* out) {
+  dev_ctx.template Alloc<T>(out);
+  Functor functor;
+  functor(x.value(), out->mutable_value());
+}
+}  // namespace phi
+
+PD_REGISTER_KERNEL(isinf_sr,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::IsinfSR,
+                   float,
+                   double,
+                   phi::dtype::float16,
+                   int,
+                   int64_t) {}
+
+PD_REGISTER_KERNEL(isnan_sr,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::IsnanSR,
+                   float,
+                   double,
+                   phi::dtype::float16,
+                   int,
+                   int64_t) {}
+
+PD_REGISTER_KERNEL(isfinite_sr,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::IsfiniteSR,
+                   float,
+                   double,
+                   phi::dtype::float16,
+                   int,
+                   int64_t) {}
+
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+PD_REGISTER_KERNEL(isinf_sr,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::IsinfSR,
+                   float,
+                   double,
+                   phi::dtype::float16,
+                   int,
+                   int64_t) {}
+
+PD_REGISTER_KERNEL(isnan_sr,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::IsnanSR,
+                   float,
+                   double,
+                   phi::dtype::float16,
+                   int,
+                   int64_t) {}
+
+PD_REGISTER_KERNEL(isfinite_sr,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::IsfiniteSR,
+                   float,
+                   double,
+                   phi::dtype::float16,
+                   int,
+                   int64_t) {}
+#endif
diff --git a/paddle/phi/kernels/selected_rows/isfinite_kernel.h b/paddle/phi/kernels/selected_rows/isfinite_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..948d8c89477a25dd24f8edca374701b42c8de5e4
--- /dev/null
+++ b/paddle/phi/kernels/selected_rows/isfinite_kernel.h
@@ -0,0 +1,31 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/phi/core/selected_rows.h"
+
+namespace phi {
+
+#define DEFINE_ISFINITE_SR(isfinite_sr)   \
+  template <typename T, typename Context> \
+  void isfinite_sr(                       \
+      const Context& ctx, const SelectedRows& x, SelectedRows* out);
+
+DEFINE_ISFINITE_SR(IsinfSR)
+DEFINE_ISFINITE_SR(IsnanSR)
+DEFINE_ISFINITE_SR(IsfiniteSR)
+#undef DEFINE_ISFINITE_SR
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/selected_rows/isfinite_kernel_impl.h b/paddle/phi/kernels/selected_rows/isfinite_kernel_impl.h
new file mode 100644
index 0000000000000000000000000000000000000000..c53abdf996c477d097cc58ac36d944944cbcab9b
--- /dev/null
+++ b/paddle/phi/kernels/selected_rows/isfinite_kernel_impl.h
@@ -0,0 +1,39 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/kernels/funcs/isfinite_functor.h"
+#include "paddle/phi/kernels/selected_rows/isfinite_kernel.h"
+
+namespace phi {
+
+template <typename T, typename Context, typename Functor>
+inline void IsfiniteSRImpl(const Context& ctx,
+                           const SelectedRows& x,
+                           SelectedRows* out);
+
+#define DEFINE_ISFINITE_SR(isfinite_sr, functor)                      \
+  template <typename T, typename Context>                             \
+  void isfinite_sr(                                                   \
+      const Context& ctx, const SelectedRows& x, SelectedRows* out) { \
+    IsfiniteSRImpl<T, Context, functor>(ctx, x, out);                 \
+  }
+
+DEFINE_ISFINITE_SR(IsinfSR, funcs::InfinityV2Functor)
+DEFINE_ISFINITE_SR(IsnanSR, funcs::NANV2Functor)
+DEFINE_ISFINITE_SR(IsfiniteSR, funcs::IsfiniteV2Functor)
+#undef DEFINE_ISFINITE_SR
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/selected_rows/scale_kernel.cc b/paddle/phi/kernels/selected_rows/scale_kernel.cc
index 094b6f4d12022be07910bac68f09d201040b364a..38a0cb75101b7ea5e3f423aa997e003977c56030 100644
--- a/paddle/phi/kernels/selected_rows/scale_kernel.cc
+++ b/paddle/phi/kernels/selected_rows/scale_kernel.cc
@@ -12,21 +12,23 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/phi/kernels/scale_kernel.h"
+#include "paddle/phi/kernels/selected_rows/scale_kernel.h"
 
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/common/bfloat16.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/scale_kernel.h"
 namespace phi {
+namespace sr {
 
 template <typename T, typename Context>
-void ScaleSR(const Context& dev_ctx,
-             const SelectedRows& x,
-             const Scalar& scale,
-             float bias,
-             bool bias_after_scale,
-             SelectedRows* out) {
+void ScaleKernel(const Context& dev_ctx,
+                 const SelectedRows& x,
+                 const Scalar& scale,
+                 float bias,
+                 bool bias_after_scale,
+                 SelectedRows* out) {
   if (x.value().Holder() != out->value().Holder() ||
       x.value().data() != out->value().data()) {
     out->set_rows(x.rows());
@@ -36,12 +38,13 @@ void ScaleSR(const Context& dev_ctx,
       dev_ctx, x.value(), scale, bias, bias_after_scale, out->mutable_value());
 }
 
+}  // namespace sr
 }  // namespace phi
 
 PD_REGISTER_KERNEL(scale_sr,
                    CPU,
                    ALL_LAYOUT,
-                   phi::ScaleSR,
+                   phi::sr::ScaleKernel,
                    float,
                    double,
                    phi::dtype::bfloat16,
@@ -55,7 +58,7 @@ PD_REGISTER_KERNEL(scale_sr,
 PD_REGISTER_KERNEL(scale_sr,
                    GPU,
                    ALL_LAYOUT,
-                   phi::ScaleSR,
+                   phi::sr::ScaleKernel,
                    float,
                    double,
                    phi::dtype::float16,
diff --git a/paddle/phi/kernels/selected_rows/scale_kernel.h b/paddle/phi/kernels/selected_rows/scale_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..85c2c4ddff0333cdd0603f2fb4235b17bbdaffad
--- /dev/null
+++ b/paddle/phi/kernels/selected_rows/scale_kernel.h
@@ -0,0 +1,32 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/phi/common/scalar.h"
+#include "paddle/phi/core/selected_rows.h"
+
+namespace phi {
+namespace sr {
+
+template <typename T, typename Context>
+void ScaleKernel(const Context& dev_ctx,
+                 const SelectedRows& x,
+                 const Scalar& scale,
+                 float bias,
+                 bool bias_after_scale,
+                 SelectedRows* out);
+
+}  // namespace sr
+}  // namespace phi
diff --git a/paddle/phi/kernels/selected_rows/shape_kernel.cc b/paddle/phi/kernels/selected_rows/shape_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9bcd5d8544e2d73961d72115023446d427e8895e
--- /dev/null
+++ b/paddle/phi/kernels/selected_rows/shape_kernel.cc
@@ -0,0 +1,70 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/phi/kernels/selected_rows/shape_kernel.h"
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/common/float16.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+namespace phi {
+namespace sr {
+
+template <typename T, typename Context>
+void ShapeKernel(const Context& ctx,
+                 const SelectedRows& input,
+                 DenseTensor* out) {
+  auto in_var = input;
+  phi::DDim in_dims;
+  in_dims = in_var.value().dims();
+  auto out_t = out;
+  out_t->Resize({in_dims.size()});
+  auto out_data = ctx.template HostAlloc<int32_t>(out_t);
+  for (int i = 0; i < in_dims.size(); ++i) {
+    out_data[i] = in_dims[i];
+  }
+}
+
+}  // namespace sr
+}  // namespace phi
+
+PD_REGISTER_KERNEL(shape_sr,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::sr::ShapeKernel,
+                   bool,
+                   int,
+                   int8_t,
+                   uint8_t,
+                   int64_t,
+                   float,
+                   double,
+                   phi::dtype::complex<float>,
+                   phi::dtype::complex<double>) {}
+
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+PD_REGISTER_KERNEL(shape_sr,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::sr::ShapeKernel,
+                   bool,
+                   int,
+                   int8_t,
+                   uint8_t,
+                   int64_t,
+                   float,
+                   double,
+                   phi::dtype::complex<float>,
+                   phi::dtype::complex<double>) {}
+#endif
diff --git a/paddle/fluid/platform/dynload/lapack.cc b/paddle/phi/kernels/selected_rows/shape_kernel.h
similarity index 60%
rename from paddle/fluid/platform/dynload/lapack.cc
rename to paddle/phi/kernels/selected_rows/shape_kernel.h
index 5a21bb4d041d9b02897b81fd9af8fb58983a7838..86ba52982b5967481f50cb661b15c84ed3751edd 100644
--- a/paddle/fluid/platform/dynload/lapack.cc
+++ b/paddle/phi/kernels/selected_rows/shape_kernel.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -12,16 +12,17 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/platform/dynload/lapack.h"
+#pragma once
 
-namespace paddle {
-namespace platform {
-namespace dynload {
+#include "paddle/phi/core/selected_rows.h"
 
-#define DEFINE_WRAP(__name) DynLoad__##__name __name
+namespace phi {
+namespace sr {
 
-LAPACK_ROUTINE_EACH(DEFINE_WRAP);
+template <typename T, typename Context>
+void ShapeKernel(const Context& ctx,
+                 const SelectedRows& input,
+                 DenseTensor* out);
 
-}  // namespace dynload
-}  // namespace platform
-}  // namespace paddle
+}  // namespace sr
+}  // namespace phi
diff --git a/paddle/phi/kernels/selected_rows/uniform_random_kernel.cc b/paddle/phi/kernels/selected_rows/uniform_random_kernel.cc
index 881180b71b151aa48a16dcd15871d4a7cd656fb7..b3dd1d1b7d2a025a80123f480a9deeb0f7ca5932 100644
--- a/paddle/phi/kernels/selected_rows/uniform_random_kernel.cc
+++ b/paddle/phi/kernels/selected_rows/uniform_random_kernel.cc
@@ -12,22 +12,27 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/phi/kernels/uniform_random_kernel.h"
+#include "paddle/phi/kernels/selected_rows/uniform_random_kernel.h"
+
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/uniform_random_kernel.h"
 
 namespace phi {
+namespace sr {
 
 template <typename T, typename Context>
-void UniformRandomRawSRKernel(const Context& dev_ctx,
-                              const ScalarArray& shape,
-                              DataType dtype,
-                              float min,
-                              float max,
-                              int seed,
-                              int diag_num,
-                              int diag_step,
-                              float diag_val,
-                              SelectedRows* out) {
+void UniformRandomRawKernel(const Context& dev_ctx,
+                            const ScalarArray& shape,
+                            DataType dtype,
+                            float min,
+                            float max,
+                            int seed,
+                            int diag_num,
+                            int diag_step,
+                            float diag_val,
+                            SelectedRows* out) {
   phi::UniformRandomRawKernel<T>(dev_ctx,
                                  shape,
                                  dtype,
@@ -41,23 +46,24 @@ void UniformRandomRawSRKernel(const Context& dev_ctx,
 }
 
 template <typename T, typename Context>
-void UniformRandomSRKernel(const Context& dev_ctx,
-                           const ScalarArray& shape,
-                           DataType dtype,
-                           float min,
-                           float max,
-                           int seed,
-                           SelectedRows* out) {
+void UniformRandomKernel(const Context& dev_ctx,
+                         const ScalarArray& shape,
+                         DataType dtype,
+                         float min,
+                         float max,
+                         int seed,
+                         SelectedRows* out) {
   phi::UniformRandomKernel<T>(
       dev_ctx, shape, dtype, min, max, seed, out->mutable_value());
 }
 
+}  // namespace sr
 }  // namespace phi
 
 PD_REGISTER_KERNEL(uniform_random_raw_sr,
                    CPU,
                    ALL_LAYOUT,
-                   phi::UniformRandomRawSRKernel,
+                   phi::sr::UniformRandomRawKernel,
                    float,
                    double,
                    phi::dtype::bfloat16) {}
@@ -65,7 +71,7 @@ PD_REGISTER_KERNEL(uniform_random_raw_sr,
 PD_REGISTER_KERNEL(uniform_random_sr,
                    CPU,
                    ALL_LAYOUT,
-                   phi::UniformRandomSRKernel,
+                   phi::sr::UniformRandomKernel,
                    float,
                    double,
                    phi::dtype::bfloat16) {}
@@ -75,14 +81,14 @@ PD_REGISTER_KERNEL(uniform_random_sr,
 PD_REGISTER_KERNEL(uniform_random_raw_sr,
                    GPU,
                    ALL_LAYOUT,
-                   phi::UniformRandomRawSRKernel,
+                   phi::sr::UniformRandomRawKernel,
                    float,
                    double) {}
 
 PD_REGISTER_KERNEL(uniform_random_sr,
                    GPU,
                    ALL_LAYOUT,
-                   phi::UniformRandomSRKernel,
+                   phi::sr::UniformRandomKernel,
                    float,
                    double) {}
 #endif
diff --git a/paddle/phi/kernels/selected_rows/uniform_random_kernel.h b/paddle/phi/kernels/selected_rows/uniform_random_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..aee7a4c7aaf62d9250246f326ca009268af92d5e
--- /dev/null
+++ b/paddle/phi/kernels/selected_rows/uniform_random_kernel.h
@@ -0,0 +1,45 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/common/scalar_array.h"
+#include "paddle/phi/core/selected_rows.h"
+
+namespace phi {
+namespace sr {
+
+template <typename T, typename Context>
+void UniformRandomRawKernel(const Context& dev_ctx,
+                            const ScalarArray& shape,
+                            DataType dtype,
+                            float min,
+                            float max,
+                            int seed,
+                            int diag_num,
+                            int diag_step,
+                            float diag_val,
+                            SelectedRows* out);
+
+template <typename T, typename Context>
+void UniformRandomKernel(const Context& dev_ctx,
+                         const ScalarArray& shape,
+                         DataType dtype,
+                         float min,
+                         float max,
+                         int seed,
+                         SelectedRows* out);
+
+}  // namespace sr
+}  // namespace phi
diff --git a/paddle/phi/kernels/set_value_grad_kernel.h b/paddle/phi/kernels/set_value_grad_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..6a028b0c8dc50fb068de1ded367990c409bd45cb
--- /dev/null
+++ b/paddle/phi/kernels/set_value_grad_kernel.h
@@ -0,0 +1,34 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/common/scalar_array.h"
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void SetValueGradKernel(const Context& dev_ctx,
+                        const DenseTensor& out_grad,
+                        const ScalarArray& starts,
+                        const ScalarArray& ends,
+                        const ScalarArray& steps,
+                        const std::vector<int64_t>& axes,
+                        const std::vector<int64_t>& decrease_axes,
+                        const std::vector<int64_t>& none_axes,
+                        DenseTensor* x_grad,
+                        DenseTensor* value_grad);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/set_value_kernel.h b/paddle/phi/kernels/set_value_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..271691b1a3596f773cb67d605c76d6b8f142c1b6
--- /dev/null
+++ b/paddle/phi/kernels/set_value_kernel.h
@@ -0,0 +1,49 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/common/scalar.h"
+#include "paddle/phi/common/scalar_array.h"
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/infermeta/unary.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void SetTensorValueKernel(const Context& dev_ctx,
+                          const DenseTensor& x,
+                          const DenseTensor& value,
+                          const ScalarArray& starts,
+                          const ScalarArray& ends,
+                          const ScalarArray& steps,
+                          const std::vector<int64_t>& axes,
+                          const std::vector<int64_t>& decrease_axes,
+                          const std::vector<int64_t>& none_axes,
+                          DenseTensor* out);
+
+template <typename T, typename Context>
+void SetValueKernel(const Context& dev_ctx,
+                    const DenseTensor& x,
+                    const ScalarArray& starts,
+                    const ScalarArray& ends,
+                    const ScalarArray& steps,
+                    const std::vector<int64_t>& axes,
+                    const std::vector<int64_t>& decrease_axes,
+                    const std::vector<int64_t>& none_axes,
+                    const std::vector<int64_t>& shape,
+                    const std::vector<Scalar>& values,
+                    DenseTensor* out);
+
+}  // namespace phi
diff --git a/paddle/infrt/kernel/phi/allocator_kernels.cc b/paddle/phi/kernels/shape_kernel.h
similarity index 75%
rename from paddle/infrt/kernel/phi/allocator_kernels.cc
rename to paddle/phi/kernels/shape_kernel.h
index eba12e688b4ae2cf9bdd4fa46bb479be882b02fc..444c481812e88dfb6413d8af51dca4f1b704efbf 100644
--- a/paddle/infrt/kernel/phi/allocator_kernels.cc
+++ b/paddle/phi/kernels/shape_kernel.h
@@ -12,14 +12,15 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/infrt/kernel/phi/allocator_kernels.h"
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
 
-namespace infrt {
-namespace kernel {
 namespace phi {
 
-backends::CpuPhiAllocator CreateCpuAllocator() { return {}; }
+template <typename T, typename Context>
+void ShapeKernel(const Context& ctx,
+                 const DenseTensor& input,
+                 DenseTensor* out);
 
 }  // namespace phi
-}  // namespace kernel
-}  // namespace infrt
diff --git a/paddle/phi/kernels/shard_index_kernel.h b/paddle/phi/kernels/shard_index_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..54ad9a14fa023e41d5261cfe9f03039a1fa031ec
--- /dev/null
+++ b/paddle/phi/kernels/shard_index_kernel.h
@@ -0,0 +1,30 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void ShardIndexKernel(const Context& dev_ctx,
+                      const DenseTensor& in,
+                      int index_num,
+                      int nshards,
+                      int shard_id,
+                      int ignore_value,
+                      DenseTensor* out);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/sigmoid_cross_entropy_with_logits_grad_kernel.h b/paddle/phi/kernels/sigmoid_cross_entropy_with_logits_grad_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..6bc75b7670fcc2100908af2543455964be324b54
--- /dev/null
+++ b/paddle/phi/kernels/sigmoid_cross_entropy_with_logits_grad_kernel.h
@@ -0,0 +1,30 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void SigmoidCrossEntropyWithLogitsGradKernel(const Context& dev_ctx,
+                                             const DenseTensor& x,
+                                             const DenseTensor& label,
+                                             const DenseTensor& out_grad,
+                                             bool normalize,
+                                             int ignore_index,
+                                             DenseTensor* in_grad);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/sigmoid_cross_entropy_with_logits_kernel.h b/paddle/phi/kernels/sigmoid_cross_entropy_with_logits_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..7ea3e6589f7ed070eb6229a24fb75e7444e94151
--- /dev/null
+++ b/paddle/phi/kernels/sigmoid_cross_entropy_with_logits_kernel.h
@@ -0,0 +1,29 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void SigmoidCrossEntropyWithLogitsKernel(const Context& dev_ctx,
+                                         const DenseTensor& x,
+                                         const DenseTensor& label,
+                                         bool normalize,
+                                         int ignore_index,
+                                         DenseTensor* out);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/sign_kernel.h b/paddle/phi/kernels/sign_kernel.h
index 7ee1145012dbd94d077bc229e0ffb8e833eb52a4..4b5900d90f45daa01c117b9f1649a152734c5b76 100644
--- a/paddle/phi/kernels/sign_kernel.h
+++ b/paddle/phi/kernels/sign_kernel.h
@@ -25,7 +25,7 @@ void SignKernel(const Context& dev_ctx, const DenseTensor& x, DenseTensor* out);
 
 template <typename T, typename Context>
 DenseTensor Sign(const Context& dev_ctx, const DenseTensor& x) {
-  auto dense_out = phi::Empty<T, Context>(dev_ctx);
+  DenseTensor dense_out;
   MetaTensor meta_out(&dense_out);
   UnchangedInferMeta(x, &meta_out);
   SignKernel<T, Context>(dev_ctx, x, &dense_out);
diff --git a/paddle/phi/kernels/softmax_kernel.h b/paddle/phi/kernels/softmax_kernel.h
index ca69d652770aacd01191f5c3ca685276f0f2336f..4edd562ca885301b02b8ecc737c8590831e3cac4 100644
--- a/paddle/phi/kernels/softmax_kernel.h
+++ b/paddle/phi/kernels/softmax_kernel.h
@@ -19,20 +19,10 @@ limitations under the License. */
 
 namespace phi {
 
-template <typename T, typename Context>
-void SoftmaxRawKernel(const Context& dev_ctx,
-                      const DenseTensor& x,
-                      int axis,
-                      DenseTensor* out);
-
 template <typename T, typename Context>
 void SoftmaxKernel(const Context& dev_ctx,
                    const DenseTensor& x,
                    int axis,
-                   DataType dtype,
-                   DenseTensor* out) {
-  auto cast_x = phi::Cast<T, Context>(dev_ctx, x, dtype);
-  phi::SoftmaxRawKernel<T, Context>(dev_ctx, axis, out);
-}
+                   DenseTensor* out);
 
 }  // namespace phi
diff --git a/paddle/phi/kernels/sparse/CMakeLists.txt b/paddle/phi/kernels/sparse/CMakeLists.txt
index 3e4a968b7a8a569fc518366c321a57e2738bf12a..eaea6d952167c149f8add498768b26dd0d54f16a 100644
--- a/paddle/phi/kernels/sparse/CMakeLists.txt
+++ b/paddle/phi/kernels/sparse/CMakeLists.txt
@@ -1,3 +1,3 @@
 
-set(SPARSE_KERNEL_DEPS dense_tensor sparse_coo_tensor sparse_csr_tensor kernel_context kernel_factory arg_map_context convert_utils lod_utils)
+set(SPARSE_KERNEL_DEPS dense_tensor sparse_coo_tensor sparse_csr_tensor kernel_context kernel_factory arg_map_context convert_utils lod_utils math_function custom_kernel)
 register_kernels(DEPS ${SPARSE_KERNEL_DEPS} SUB_DIR "sparse_kernel")
diff --git a/paddle/phi/kernels/sparse/convolution_grad_kernel.h b/paddle/phi/kernels/sparse/convolution_grad_kernel.h
index 1a6ac852448a5f4a25248d2a2b6919a301a04874..42bde442e1e063a355d2eabb2963865a2ff45bcb 100644
--- a/paddle/phi/kernels/sparse/convolution_grad_kernel.h
+++ b/paddle/phi/kernels/sparse/convolution_grad_kernel.h
@@ -13,9 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/sparse_coo_tensor.h"
 #include "paddle/phi/kernels/empty_kernel.h"
+#include "paddle/phi/kernels/sparse/convolution_kernel.h"
 
 namespace phi {
 namespace sparse {
@@ -30,6 +32,7 @@ void Conv3dGradKernel(const Context& dev_ctx,
                       const std::vector<int>& dilations,
                       const std::vector<int>& strides,
                       const int groups,
+                      const bool subm,
                       DenseTensor* x_grad,
                       DenseTensor* kernel_grad);
 
@@ -42,9 +45,13 @@ std::vector<DenseTensor> Conv3dGrad(const Context& dev_ctx,
                                     const std::vector<int>& paddings,
                                     const std::vector<int>& dilations,
                                     const std::vector<int>& strides,
-                                    const int groups) {
-  DenseTensor x_grad = phi::Empty<T, Context>(dev_ctx);
-  DenseTensor kernel_grad = phi::Empty<T, Context>(dev_ctx);
+                                    const int groups,
+                                    const bool subm) {
+  DenseTensor x_grad =
+      phi::Empty<Context>(dev_ctx, DenseTensorMeta(x.dtype(), {1}, x.layout()));
+  DenseTensor kernel_grad = phi::Empty<Context>(
+      dev_ctx, DenseTensorMeta(kernel.dtype(), {1}, kernel.layout()));
+  // TODO(zhangkaihuo): call InferMeta func here
   Conv3dGradKernel<T, Context>(dev_ctx,
                                x,
                                rulebook,
@@ -54,6 +61,7 @@ std::vector<DenseTensor> Conv3dGrad(const Context& dev_ctx,
                                dilations,
                                strides,
                                groups,
+                               subm,
                                &x_grad,
                                &kernel_grad);
   std::vector<DenseTensor> out(2);
diff --git a/paddle/phi/kernels/sparse/convolution_kernel.h b/paddle/phi/kernels/sparse/convolution_kernel.h
index 71160a6365dc778e40476af960f21443cac698e5..ff2cf94edb5a378b5d43d569a869fcd705ef12bd 100644
--- a/paddle/phi/kernels/sparse/convolution_kernel.h
+++ b/paddle/phi/kernels/sparse/convolution_kernel.h
@@ -14,108 +14,15 @@ limitations under the License. */
 
 #pragma once
 
+#include "paddle/phi/api/lib/utils/storage.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/sparse_coo_tensor.h"
 #include "paddle/phi/kernels/empty_kernel.h"
+#include "paddle/phi/kernels/funcs/sparse/convolution.h"
 
 namespace phi {
 namespace sparse {
 
-struct Dims4D {
-  int dims[4];
-  Dims4D(const int batch, const int x, const int y, const int z) {
-    dims[0] = batch;
-    dims[1] = z;
-    dims[2] = y;
-    dims[3] = x;
-  }
-  HOSTDEVICE const int& operator[](int i) const { return dims[i]; }
-};
-
-// Judge whether the current position x is in (lower, upper)
-inline HOSTDEVICE bool Check(const int& x,
-                             const int& kx,
-                             const int& pad,
-                             const int& stride,
-                             const int dilation,
-                             const int kdim,
-                             const int xdim) {
-  const int lower = x - dilation * kx + pad;
-  const int uper = x + (kdim - kx - 1) * dilation - pad;
-  return (lower >= 0 && lower % stride == 0 && uper < xdim);
-}
-
-// Check whether the current position(x, y, z) is legal:
-// Judge the minimum and maximum values at each latitude
-inline HOSTDEVICE bool Check(const Dims4D& dims,
-                             const Dims4D& kernel_dims,
-                             const Dims4D& paddings,
-                             const Dims4D& dilations,
-                             const Dims4D& strides,
-                             const int x,
-                             const int y,
-                             const int z,
-                             const int kx,
-                             const int ky,
-                             const int kz) {
-  bool x_valid = Check(
-      x, kx, paddings[3], strides[3], dilations[3], kernel_dims[3], dims[3]);
-  bool y_valid = Check(
-      y, ky, paddings[2], strides[2], dilations[2], kernel_dims[2], dims[2]);
-  bool z_valid = Check(
-      z, kz, paddings[1], strides[1], dilations[1], kernel_dims[1], dims[1]);
-  return (x_valid && y_valid && z_valid);
-}
-
-template <typename Dim>
-inline HOSTDEVICE int PointToIndex(const int& batch,
-                                   const int& x,
-                                   const int& y,
-                                   const int& z,
-                                   const Dim& dims) {
-  return batch * dims[1] * dims[2] * dims[3] + z * dims[2] * dims[3] +
-         y * dims[3] + x;
-}
-
-template <typename Dim>
-inline HOSTDEVICE void IndexToPoint(
-    const int index, const Dim& dims, int* batch, int* x, int* y, int* z) {
-  int n = index;
-  *x = n % dims[3];
-  n /= dims[3];
-  *y = n % dims[2];
-  n /= dims[2];
-  *z = n % dims[1];
-  n /= dims[1];
-  *batch = n;
-}
-
-inline void GetOutShape(const DDim& x_dims,
-                        const DDim& kernel_dims,
-                        const std::vector<int>& paddings,
-                        const std::vector<int>& dilations,
-                        const std::vector<int>& strides,
-                        DDim* out_dims) {
-  PADDLE_ENFORCE_EQ(
-      x_dims.size(),
-      5,
-      phi::errors::InvalidArgument("the shape of x should be (N, D, H, W, C)"));
-  PADDLE_ENFORCE_EQ(kernel_dims.size(),
-                    5,
-                    phi::errors::InvalidArgument(
-                        "the shape of kernel should be (D, H, W, C, OC)"));
-
-  // infer out shape
-  (*out_dims)[0] = x_dims[0];
-  (*out_dims)[4] = kernel_dims[4];
-  for (int i = 1; i < 4; i++) {
-    (*out_dims)[i] = (x_dims[i] + 2 * paddings[i - 1] -
-                      dilations[i - 1] * (kernel_dims[i - 1] - 1) - 1) /
-                         strides[i - 1] +
-                     1;
-  }
-}
-
 template <typename T, typename Context>
 void Conv3dKernel(const Context& dev_ctx,
                   const SparseCooTensor& x,
@@ -124,6 +31,7 @@ void Conv3dKernel(const Context& dev_ctx,
                   const std::vector<int>& dilations,
                   const std::vector<int>& strides,
                   const int groups,
+                  const bool subm,
                   SparseCooTensor* out,
                   DenseTensor* rulebook);
 
@@ -135,12 +43,23 @@ SparseCooTensor Conv3d(const Context& dev_ctx,
                        const std::vector<int>& dilations,
                        const std::vector<int>& strides,
                        const int groups,
+                       const bool subm,
                        DenseTensor* rulebook) {
-  DenseTensor indices = phi::Empty<T, Context>(dev_ctx);
-  DenseTensor values = phi::Empty<T, Context>(dev_ctx);
+  DenseTensor indices = phi::Empty<Context>(
+      dev_ctx, DenseTensorMeta(DataType::INT32, {1}, DataLayout::NCHW));
+  DenseTensor values =
+      phi::Empty<Context>(dev_ctx, DenseTensorMeta(x.dtype(), {1}, x.layout()));
   SparseCooTensor coo(indices, values, x.dims());
-  Conv3dKernel<T, Context>(
-      dev_ctx, x, kernel, paddings, dilations, strides, groups, &coo, rulebook);
+  Conv3dKernel<T, Context>(dev_ctx,
+                           x,
+                           kernel,
+                           paddings,
+                           dilations,
+                           strides,
+                           groups,
+                           subm,
+                           &coo,
+                           rulebook);
   return coo;
 }
 
diff --git a/paddle/phi/kernels/sparse/cpu/convolution.h b/paddle/phi/kernels/sparse/cpu/convolution.h
index ab2fef5320f716b6bc780ad14b8e2adef44427dd..64c32df18971c4d66873b02a61220a5bed8db005 100644
--- a/paddle/phi/kernels/sparse/cpu/convolution.h
+++ b/paddle/phi/kernels/sparse/cpu/convolution.h
@@ -16,8 +16,6 @@ limitations under the License. */
 
 #include <set>
 
-#include "paddle/phi/api/lib/utils/allocator.h"
-#include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/core/sparse_coo_tensor.h"
@@ -28,6 +26,8 @@ limitations under the License. */
 namespace phi {
 namespace sparse {
 
+using Dims4D = phi::funcs::sparse::Dims4D;
+
 // such as: kernel(3, 3, 3), kernel_size = 27
 // counter_per_weight: (kernel_size)
 // TODO(zhangkaihuo): optimize performance with multithreading
@@ -39,15 +39,13 @@ void ProductRuleBook(const Context& dev_ctx,
                      const std::vector<int>& dilations,
                      const std::vector<int>& strides,
                      const DDim& out_dims,
+                     const bool subm,
                      DenseTensor* rulebook,
                      DenseTensor* counter_per_kernel) {
   const auto& kernel_dims = kernel.dims();
   const int64_t non_zero_num = x.nnz();
   const auto& non_zero_indices = x.non_zero_indices();
   const int* indices_ptr = non_zero_indices.data<int>();
-  dev_ctx.Alloc(counter_per_kernel,
-                counter_per_kernel->dtype(),
-                sizeof(int) * counter_per_kernel->numel());
   int* counter_ptr = counter_per_kernel->data<int>();
   int kernel_size = kernel_dims[0] * kernel_dims[1] * kernel_dims[2];
   memset(counter_ptr, 0, kernel_size * sizeof(int));
@@ -62,11 +60,25 @@ void ProductRuleBook(const Context& dev_ctx,
   const Dims4D c_strides(1, strides[2], strides[1], strides[0]);
   const Dims4D c_dilations(1, dilations[2], dilations[1], dilations[0]);
 
+  std::set<int> hash_in;
+  if (subm) {
+    for (int i = 0; i < non_zero_num; i++) {
+      int batch = indices_ptr[i];
+      int in_z = indices_ptr[i + non_zero_num];
+      int in_y = indices_ptr[i + 2 * non_zero_num];
+      int in_x = indices_ptr[i + 3 * non_zero_num];
+      int index = phi::funcs::sparse::PointToIndex<DDim>(
+          batch, in_x, in_y, in_z, x_dims);
+      hash_in.insert(index);
+    }
+  }
+
   auto f_calc_rulebook = [&](int* rulebook_ptr) {
     int kernel_index = 0, rulebook_index = 0;
     for (int kz = 0; kz < kernel_dims[0]; kz++) {
       for (int ky = 0; ky < kernel_dims[1]; ky++) {
         for (int kx = 0; kx < kernel_dims[2]; kx++) {
+          ++kernel_index;
           for (int64_t i = 0; i < non_zero_num; i++) {
             int batch = indices_ptr[i];
             int in_z = indices_ptr[i + non_zero_num];
@@ -75,31 +87,38 @@ void ProductRuleBook(const Context& dev_ctx,
             int out_z = (in_z + paddings[0] - kz * dilations[0]) / strides[0];
             int out_y = (in_y + paddings[1] - ky * dilations[1]) / strides[1];
             int out_x = (in_x + paddings[2] - kx * dilations[2]) / strides[2];
-            if (Check(c_x_dims,
-                      c_kernel_dims,
-                      c_paddings,
-                      c_dilations,
-                      c_strides,
-                      in_x,
-                      in_y,
-                      in_z,
-                      kx,
-                      ky,
-                      kz)) {
+            if (phi::funcs::sparse::Check(c_x_dims,
+                                          c_kernel_dims,
+                                          c_paddings,
+                                          c_dilations,
+                                          c_strides,
+                                          in_x,
+                                          in_y,
+                                          in_z,
+                                          kx,
+                                          ky,
+                                          kz)) {
+              if (subm) {
+                int out_index = phi::funcs::sparse::PointToIndex<DDim>(
+                    batch, out_x, out_y, out_z, out_dims);
+                if (hash_in.find(out_index) == hash_in.end()) {
+                  continue;
+                }
+              }
+
               if (rulebook_ptr == nullptr) {
-                counter_ptr[kernel_index] += 1;
+                counter_ptr[kernel_index - 1] += 1;
                 ++rulebook_len;
               } else {
-                rulebook_ptr[rulebook_index] = kernel_index;
+                rulebook_ptr[rulebook_index] = kernel_index - 1;
                 rulebook_ptr[rulebook_index + rulebook_len] = i;  // in_i
                 rulebook_ptr[rulebook_index + rulebook_len * 2] =
-                    PointToIndex<DDim>(
+                    phi::funcs::sparse::PointToIndex<DDim>(
                         batch, out_x, out_y, out_z, out_dims);  // out_index
                 ++rulebook_index;
               }
             }
           }
-          ++kernel_index;
         }
       }
     }
@@ -107,7 +126,9 @@ void ProductRuleBook(const Context& dev_ctx,
 
   f_calc_rulebook(nullptr);
   // alloc the rulebook
-  rulebook->ResizeAndAllocate({3, rulebook_len});
+  DenseTensorMeta rulebook_meta(
+      DataType::INT32, {3, rulebook_len}, DataLayout::NCHW);
+  rulebook->set_meta(rulebook_meta);
   dev_ctx.Alloc(rulebook, rulebook->dtype(), rulebook->numel() * sizeof(int));
   int* rulebook_ptr = rulebook->data<int>();
   f_calc_rulebook(rulebook_ptr);
@@ -136,14 +157,12 @@ void UpdateRulebookAndOutIndex(const Context& dev_ctx,
       x.dtype(), {out_non_zero_num, out_channels}, x.layout());
   phi::DenseTensor out_indices = phi::Empty(dev_ctx, std::move(indices_meta));
   phi::DenseTensor out_values = phi::Empty(dev_ctx, std::move(values_meta));
-  dev_ctx.Alloc(
-      &out_indices, out_indices.dtype(), out_indices.numel() * sizeof(int));
   int* out_indices_ptr = out_indices.data<int>();
   int i = 0;
   for (auto it = out_indexs.begin(); it != out_indexs.end(); it++, i++) {
     const int index = *it;
     int batch, x, y, z;
-    IndexToPoint<DDim>(index, out_dims, &batch, &x, &y, &z);
+    phi::funcs::sparse::IndexToPoint<DDim>(index, out_dims, &batch, &x, &y, &z);
     out_indices_ptr[i] = batch;
     out_indices_ptr[i + out_non_zero_num] = z;
     out_indices_ptr[i + out_non_zero_num * 2] = y;
diff --git a/paddle/phi/kernels/sparse/cpu/convolution_grad_kernel.cc b/paddle/phi/kernels/sparse/cpu/convolution_grad_kernel.cc
index d4f770ce8713aa84c7f87f0e49bf8468467ffdbf..5d7b381b7cb0beef7e69608ce7f732d8cdf9d222 100644
--- a/paddle/phi/kernels/sparse/cpu/convolution_grad_kernel.cc
+++ b/paddle/phi/kernels/sparse/cpu/convolution_grad_kernel.cc
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #include "paddle/phi/kernels/sparse/convolution_grad_kernel.h"
 #include "paddle/phi/kernels/funcs/blas/blas.h"
+#include "paddle/phi/kernels/funcs/math_function.h"
 #include "paddle/phi/kernels/sparse/cpu/convolution.h"
 
 namespace phi {
@@ -37,6 +38,7 @@ void Conv3dGradKernel(const Context& dev_ctx,
                       const std::vector<int>& dilations,
                       const std::vector<int>& strides,
                       const int groups,
+                      const bool subm,
                       DenseTensor* x_grad,
                       DenseTensor* kernel_grad) {
   const auto& kernel_dims = kernel.dims();
@@ -60,47 +62,66 @@ void Conv3dGradKernel(const Context& dev_ctx,
   phi::DenseTensor out_grad_features =
       phi::Empty(dev_ctx, std::move(out_grad_features_meta));
 
-  dev_ctx.Alloc(
-      &in_features, in_features.dtype(), sizeof(T) * in_features.numel());
   T* in_features_ptr = in_features.data<T>();
-  dev_ctx.Alloc(
-      &d_x_features, d_x_features.dtype(), sizeof(T) * d_x_features.numel());
   T* d_x_features_ptr = d_x_features.data<T>();
-  dev_ctx.Alloc(&out_grad_features,
-                out_grad_features.dtype(),
-                sizeof(T) * out_grad_features.numel());
   T* out_grad_features_ptr = out_grad_features.data<T>();
   kernel_grad->Resize(kernel_dims);
   dev_ctx.Alloc(
       kernel_grad, kernel_grad->dtype(), kernel_grad->numel() * sizeof(T));
   T* d_kernel_ptr = kernel_grad->data<T>();
+  memset(d_kernel_ptr, 0, sizeof(T) * kernel_grad->numel());
 
-  Gather<T>(x.non_zero_elements().data<T>(),
-            rulebook_ptr + rulebook_len,
-            rulebook_len,
-            in_channels,
-            in_features_ptr);
-  Gather<T>(out_grad.non_zero_elements().data<T>(),
-            rulebook_ptr + rulebook_len * 2,
-            rulebook_len,
-            out_channels,
-            out_grad_features_ptr);
-
+  int half_kernel_size = kernel_size / 2;
   auto blas = phi::funcs::GetBlas<Context, T>(dev_ctx);
+  x_grad->Resize(x.non_zero_elements().dims());
+  dev_ctx.Alloc(x_grad, x_grad->dtype(), sizeof(T) * x_grad->numel());
+  T* x_grad_values_ptr = x_grad->data<T>();
+  memset(x_grad_values_ptr, 0, sizeof(T) * x_grad->numel());
+  memset(d_x_features_ptr, 0, sizeof(T) * d_x_features.numel());
+
   std::vector<int> offsets(kernel_size + 1), counter(kernel_size, 0);
   for (int i = 0; i < rulebook_len; i++) {
     counter[rulebook_ptr[i]] += 1;
   }
-  int offset = 0;
+  int offset = 0, max_count = 0;
   for (int i = 0; i < kernel_size; i++) {
     offsets[i] = offset;
     offset += counter[i];
+    if (i < half_kernel_size) {
+      max_count = std::max(max_count, counter[i]);
+    }
   }
   offsets[kernel_size] = offset;
 
+  if (subm) {
+    phi::funcs::sparse::SubmPreProcess<T, Context>(dev_ctx,
+                                                   x,
+                                                   kernel,
+                                                   out_grad,
+                                                   in_channels,
+                                                   out_channels,
+                                                   half_kernel_size,
+                                                   kernel_grad,
+                                                   x_grad);
+    if (max_count == 0) {
+      return;
+    }
+  }
+
+  Gather<T>(x.non_zero_elements().data<T>(),
+            rulebook_ptr + rulebook_len,
+            rulebook_len,
+            in_channels,
+            in_features_ptr);
+  Gather<T>(out_grad.non_zero_elements().data<T>(),
+            rulebook_ptr + rulebook_len * 2,
+            rulebook_len,
+            out_channels,
+            out_grad_features_ptr);
+
   const T* kernel_ptr = kernel.data<T>();
   for (int i = 0; i < kernel_size; i++) {
-    if (counter[i] <= 0) {
+    if (counter[i] <= 0 || (subm && i == half_kernel_size)) {
       continue;
     }
 
@@ -141,10 +162,6 @@ void Conv3dGradKernel(const Context& dev_ctx,
   }
 
   // 4. scatter
-  x_grad->Resize(x.non_zero_elements().dims());
-  dev_ctx.Alloc(x_grad, x_grad->dtype(), sizeof(T) * x_grad->numel());
-  T* x_grad_values_ptr = x_grad->data<T>();
-  memset(x_grad_values_ptr, 0, sizeof(T) * x_grad->numel());
   Scatter<T>(d_x_features_ptr,
              rulebook.data<int>() + rulebook_len,
              rulebook_len,
@@ -155,12 +172,11 @@ void Conv3dGradKernel(const Context& dev_ctx,
 }  // namespace sparse
 }  // namespace phi
 
-PD_REGISTER_KERNEL(sparse_conv_grad,
+PD_REGISTER_KERNEL(sparse_conv3d_grad,
                    CPU,
                    ALL_LAYOUT,
                    phi::sparse::Conv3dGradKernel,
                    float,
                    double) {
   kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO);
-  kernel->InputAt(3).SetDataLayout(phi::DataLayout::SPARSE_COO);
 }
diff --git a/paddle/phi/kernels/sparse/cpu/convolution_kernel.cc b/paddle/phi/kernels/sparse/cpu/convolution_kernel.cc
index fdf255bd542e66245b44b2ec906dc207ee51a422..746ca04a826c020201e53803dd2ec83519cf576e 100644
--- a/paddle/phi/kernels/sparse/cpu/convolution_kernel.cc
+++ b/paddle/phi/kernels/sparse/cpu/convolution_kernel.cc
@@ -12,13 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/phi/kernels/sparse/convolution_kernel.h"
-#include "paddle/phi/api/lib/utils/allocator.h"
-#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/kernels/sparse/cpu/convolution.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/core/tensor_meta.h"
 #include "paddle/phi/kernels/funcs/blas/blas.h"
-#include "paddle/phi/kernels/sparse/cpu/convolution.h"
 
 namespace phi {
 namespace sparse {
@@ -36,6 +33,7 @@ void Conv3dKernel(const Context& dev_ctx,
                   const std::vector<int>& dilations,
                   const std::vector<int>& strides,
                   const int groups,
+                  const bool subm,
                   SparseCooTensor* out,
                   DenseTensor* rulebook) {
   // update padding and dilation
@@ -46,25 +44,31 @@ void Conv3dKernel(const Context& dev_ctx,
   const auto& kernel_dims = kernel.dims();
   int kernel_size = kernel_dims[0] * kernel_dims[1] * kernel_dims[2];
   DDim out_dims = {1, 1, 1, 1, 1};
-  GetOutShape(x_dims, kernel_dims, paddings, dilations, strides, &out_dims);
+  phi::funcs::sparse::GetOutShape(
+      x_dims, kernel_dims, paddings, dilations, strides, &out_dims);
   const int in_channels = kernel_dims[3];
   const int out_channels = kernel_dims[4];
 
+  std::vector<int> subm_paddings(paddings), subm_strides(strides);
+  if (subm) {
+    phi::funcs::sparse::ResetSubmKernelSizeAndStrides(
+        kernel.dims(), &subm_paddings, &subm_strides);
+  }
   // Second algorithm:
   // https://pdfs.semanticscholar.org/5125/a16039cabc6320c908a4764f32596e018ad3.pdf
   // 1. product rulebook
   DenseTensorMeta counter_meta(
       DataType::INT32, {kernel_size}, DataLayout::NCHW);
-  // DenseTensor rulebook = phi::Empty<int, Context>(dev_ctx);
   DenseTensor counter_per_kernel = phi::Empty(dev_ctx, std::move(counter_meta));
 
   ProductRuleBook<T, Context>(dev_ctx,
                               x,
                               kernel,
-                              paddings,
+                              subm_paddings,
                               dilations,
-                              strides,
+                              subm_strides,
                               out_dims,
+                              subm,
                               rulebook,
                               &counter_per_kernel);
 
@@ -83,8 +87,6 @@ void Conv3dKernel(const Context& dev_ctx,
       phi::Empty(dev_ctx, std::move(in_features_meta));
   phi::DenseTensor out_features =
       phi::Empty(dev_ctx, std::move(out_features_meta));
-  dev_ctx.Alloc(&in_features, x.dtype(), sizeof(T) * in_features.numel());
-  dev_ctx.Alloc(&out_features, x.dtype(), sizeof(T) * out_features.numel());
   T* in_features_ptr = in_features.data<T>();
   T* out_features_ptr = out_features.data<T>();
 
@@ -130,9 +132,6 @@ void Conv3dKernel(const Context& dev_ctx,
   }
 
   // 4. scatter
-  dev_ctx.Alloc(out->mutable_non_zero_elements(),
-                out->mutable_non_zero_elements()->dtype(),
-                sizeof(T) * in_features.numel());
   T* out_values_ptr = out->mutable_non_zero_elements()->data<T>();
   memset(out_values_ptr, 0, sizeof(T) * out->nnz() * out_channels);
   Scatter<T>(out_features_ptr,
diff --git a/paddle/phi/kernels/sparse/cpu/sparse_utils_kernel.cc b/paddle/phi/kernels/sparse/cpu/sparse_utils_kernel.cc
index ba89135641e0e67daa84cd526d8b389953ef1862..50e95ee0b8a4876a65b8ba7d09fd2d112eac2b30 100644
--- a/paddle/phi/kernels/sparse/cpu/sparse_utils_kernel.cc
+++ b/paddle/phi/kernels/sparse/cpu/sparse_utils_kernel.cc
@@ -14,9 +14,9 @@ limitations under the License. */
 
 #include "paddle/phi/kernels/sparse/sparse_utils_kernel.h"
 #include "paddle/phi/api/lib/utils/allocator.h"
-#include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/core/tensor_meta.h"
+#include "paddle/phi/kernels/funcs/sparse/common_shape.h"
 
 namespace phi {
 namespace sparse {
@@ -71,7 +71,8 @@ void DenseToSparseCooKernel(const Context& dev_ctx,
   int64_t non_zero_num = GetNonZeroNum<T>(x, sparse_dim);
 
   const auto place = dev_ctx.GetPlace();
-  const auto values_dims = InferDenseDims(x_dims, sparse_dim, non_zero_num);
+  const auto values_dims =
+      phi::funcs::sparse::InferDenseDims(x_dims, sparse_dim, non_zero_num);
   DenseTensorMeta indices_meta(DataType::INT64,
                                {sparse_dim, static_cast<int64_t>(non_zero_num)},
                                DataLayout::NCHW);
diff --git a/paddle/phi/kernels/sparse/gpu/convolution.cu.h b/paddle/phi/kernels/sparse/gpu/convolution.cu.h
new file mode 100644
index 0000000000000000000000000000000000000000..8826fd7cf87e0a7a4a8251b4da823f18190f4a38
--- /dev/null
+++ b/paddle/phi/kernels/sparse/gpu/convolution.cu.h
@@ -0,0 +1,143 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <thrust/execution_policy.h>
+#include <thrust/remove.h>
+#include <thrust/sort.h>
+#include <thrust/unique.h>
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/backends/gpu/gpu_info.h"
+#include "paddle/phi/backends/gpu/gpu_launch_config.h"
+#include "paddle/phi/kernels/funcs/index_impl.cu.h"
+#include "paddle/phi/kernels/sparse/convolution_kernel.h"
+
+namespace phi {
+namespace sparse {
+
+// TODO(zhangkaihuo): After the GatherCUDAKernel is migrated to phi, replace
+// this kernel with phi::GatherCUDAKernel;
+// Vectorization can be used to improve read and write bandwidth
+/**
+ * brief: gather data from params according to indices
+ * params: the inputs
+ * indices: the indices you want to gather
+ * output: the outputs
+ * index_size: the size of indices
+ * slice_size: slice size corresponding to each index, here is the channel size
+**/
+template <typename T, typename IndexT = int>
+__global__ void GatherKernel(const T* params,
+                             const IndexT* indices,
+                             T* output,
+                             size_t index_size,
+                             size_t slice_size) {
+  CUDA_KERNEL_LOOP_TYPE(i, index_size * slice_size, int64_t) {
+    int64_t indices_i = i / slice_size;
+    int64_t slice_i = i - indices_i * slice_size;  // offset inside the slice
+    IndexT gather_i = indices[indices_i];
+    int64_t params_i = gather_i * slice_size + slice_i;
+    *(output + i) = *(params + params_i);
+  }
+}
+
+/**
+ * brief: scatter add
+ * input: the inputs
+ * unique_value: refer to UpdateIndexKernel notes
+ * out_index: the output feature index
+ * non_zero_num: the number of output features
+ * rulebook_len: the length of rulebook
+ * channels: the output channel size
+ * out: the outputs
+**/
+template <typename T>
+__global__ void ScatterKernel(const T* input,
+                              const int* unique_value,
+                              const int* out_index,
+                              const int non_zero_num,
+                              const int rulebook_len,
+                              const int channels,
+                              T* out,
+                              const bool subm = false) {
+  int tid = threadIdx.x + blockIdx.x * blockDim.x;
+  for (int i = tid; i < non_zero_num * channels; i += gridDim.x * blockDim.x) {
+    int indices_i = i / channels;
+    int channels_i = i - indices_i * channels;
+
+    int start = unique_value[indices_i];
+    int end = indices_i == non_zero_num - 1 ? rulebook_len
+                                            : unique_value[indices_i + 1];
+    // max(end-start) = kernel_size
+    T sum = static_cast<T>(0);
+    if (subm) {
+      sum = out[indices_i * channels + channels_i];
+    }
+    for (int j = start; j < end; j++) {
+      const int out_feature_i = out_index[j];
+      sum += input[out_feature_i * channels + channels_i];
+    }
+    out[indices_i * channels + channels_i] = sum;
+  }
+}
+
+template <typename Context>
+inline int* SortedAndUniqueIndex(const Context& dev_ctx,
+                                 const int* rulebook_ptr,
+                                 const int len,
+                                 DenseTensor* out_index,
+                                 DenseTensor* unique_key,
+                                 DenseTensor* unique_value) {
+  phi::IndexKernel<int, kps::IdentityFunctor<int>>(
+      dev_ctx, out_index, kps::IdentityFunctor<int>());
+  phi::IndexKernel<int, kps::IdentityFunctor<int>>(
+      dev_ctx, unique_value, kps::IdentityFunctor<int>());
+
+  phi::backends::gpu::GpuMemcpyAsync(unique_key->data<int>(),
+                                     rulebook_ptr,
+                                     sizeof(int) * len,
+#ifdef PADDLE_WITH_HIP
+                                     hipMemcpyDeviceToDevice,
+#else
+                                     cudaMemcpyDeviceToDevice,
+#endif
+                                     dev_ctx.stream());
+// compared with thrust::sort_by_key, thrust::merge_by_key may achieved higher
+// performance, but thrust::merge_by_key limited by data size
+#ifdef PADDLE_WITH_HIP
+  thrust::sort_by_key(thrust::hip::par.on(dev_ctx.stream()),
+#else
+  thrust::sort_by_key(thrust::cuda::par.on(dev_ctx.stream()),
+#endif
+                      unique_key->data<int>(),
+                      unique_key->data<int>() + len,
+                      out_index->data<int>());
+
+  // 4. unique
+  thrust::pair<int*, int*> new_end =
+#ifdef PADDLE_WITH_HIP
+      thrust::unique_by_key(thrust::hip::par.on(dev_ctx.stream()),
+#else
+      thrust::unique_by_key(thrust::cuda::par.on(dev_ctx.stream()),
+#endif
+                            unique_key->data<int>(),
+                            unique_key->data<int>() + len,
+                            unique_value->data<int>());
+  return new_end.first;
+}
+
+}  // namespace sparse
+}  // namespace phi
diff --git a/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu b/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..d6d992d0f4b651b1a9a47cdddcae116a215a0e57
--- /dev/null
+++ b/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu
@@ -0,0 +1,233 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/backends/gpu/gpu_info.h"
+#include "paddle/phi/backends/gpu/gpu_launch_config.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/core/tensor_meta.h"
+#include "paddle/phi/kernels/funcs/blas/blas.h"
+#include "paddle/phi/kernels/funcs/math_function.h"
+#include "paddle/phi/kernels/sparse/convolution_grad_kernel.h"
+#include "paddle/phi/kernels/sparse/gpu/convolution.cu.h"
+
+namespace phi {
+namespace sparse {
+
+// rulebook[3, rulebook_len]:
+//[
+//  [kernel_index],
+//  [in_i],
+//  [out_i],
+//]
+// x_grad = out_grad * transpose(kenrel)
+// kernel_grad = transpose(x) * out_grad
+template <typename T, typename Context>
+void Conv3dGradKernel(const Context& dev_ctx,
+                      const SparseCooTensor& x,
+                      const DenseTensor& rulebook,
+                      const DenseTensor& kernel,
+                      const SparseCooTensor& out_grad,
+                      const std::vector<int>& paddings,
+                      const std::vector<int>& dilations,
+                      const std::vector<int>& strides,
+                      const int groups,
+                      const bool subm,
+                      DenseTensor* x_grad,
+                      DenseTensor* kernel_grad) {
+  const auto& kernel_dims = kernel.dims();
+  const int kernel_size = kernel_dims[0] * kernel_dims[1] * kernel_dims[2];
+  const int in_channels = kernel_dims[3];
+  const int out_channels = kernel_dims[4];
+  const int* rulebook_ptr = rulebook.data<int>();
+
+  const int rulebook_len = rulebook.dims()[1];
+
+  DenseTensorMeta in_features_meta(
+      x.dtype(), {rulebook_len, in_channels}, DataLayout::NCHW);
+  DenseTensorMeta d_x_features_meta(
+      x.dtype(), {rulebook_len, in_channels}, DataLayout::NCHW);
+  DenseTensorMeta out_grad_features_meta(
+      x.dtype(), {rulebook_len, out_channels}, DataLayout::NCHW);
+  phi::DenseTensor in_features =
+      phi::Empty(dev_ctx, std::move(in_features_meta));
+  phi::DenseTensor d_x_features =
+      phi::Empty(dev_ctx, std::move(d_x_features_meta));
+  phi::DenseTensor out_grad_features =
+      phi::Empty(dev_ctx, std::move(out_grad_features_meta));
+
+  T* in_features_ptr = in_features.data<T>();
+  T* d_x_features_ptr = d_x_features.data<T>();
+  T* out_grad_features_ptr = out_grad_features.data<T>();
+  kernel_grad->ResizeAndAllocate(kernel_dims);
+  T* d_kernel_ptr = kernel_grad->data<T>();
+  phi::funcs::SetConstant<Context, T> set_zero;
+  set_zero(dev_ctx, kernel_grad, static_cast<T>(0.0f));
+
+  int half_kernel_size = kernel_size / 2;
+  auto blas = phi::funcs::GetBlas<Context, T>(dev_ctx);
+  x_grad->ResizeAndAllocate(x.non_zero_elements().dims());
+  T* x_grad_values_ptr = x_grad->data<T>();
+  set_zero(dev_ctx, x_grad, static_cast<T>(0.0f));
+  set_zero(dev_ctx, &d_x_features, static_cast<T>(0.0f));
+
+  std::vector<int> offsets(kernel_size + 1), counter(kernel_size, 0),
+      h_counter(rulebook_len, 0);
+  phi::backends::gpu::GpuMemcpyAsync(&h_counter[0],
+                                     rulebook_ptr,
+                                     rulebook_len * sizeof(int),
+#ifdef PADDLE_WITH_HIP
+                                     hipMemcpyDeviceToHost,
+#else
+                                     cudaMemcpyDeviceToHost,
+#endif
+
+                                     dev_ctx.stream());
+  dev_ctx.Wait();
+
+  for (int i = 0; i < rulebook_len; i++) {
+    counter[h_counter[i]] += 1;
+  }
+  int offset = 0, max_count = 0;
+  for (int i = 0; i < kernel_size; i++) {
+    offsets[i] = offset;
+    offset += counter[i];
+    if (i < half_kernel_size) {
+      max_count = std::max(max_count, counter[i]);
+    }
+  }
+  offsets[kernel_size] = offset;
+
+  if (subm) {
+    phi::funcs::sparse::SubmPreProcess<T, Context>(dev_ctx,
+                                                   x,
+                                                   kernel,
+                                                   out_grad,
+                                                   in_channels,
+                                                   out_channels,
+                                                   half_kernel_size,
+                                                   kernel_grad,
+                                                   x_grad);
+    if (max_count == 0) {
+      return;
+    }
+  }
+
+  auto config = phi::backends::gpu::GetGpuLaunchConfig1D(
+      dev_ctx, rulebook_len * in_channels, 1);
+  GatherKernel<T, int><<<config.block_per_grid.x,
+                         config.thread_per_block.x,
+                         0,
+                         dev_ctx.stream()>>>(x.non_zero_elements().data<T>(),
+                                             rulebook_ptr + rulebook_len,
+                                             in_features_ptr,
+                                             rulebook_len,
+                                             in_channels);
+
+  config = phi::backends::gpu::GetGpuLaunchConfig1D(
+      dev_ctx, rulebook_len * out_channels, 1);
+  GatherKernel<T, int><<<config.block_per_grid.x,
+                         config.thread_per_block.x,
+                         0,
+                         dev_ctx.stream()>>>(
+      out_grad.non_zero_elements().data<T>(),
+      rulebook_ptr + rulebook_len * 2,
+      out_grad_features_ptr,
+      rulebook_len,
+      out_channels);
+
+  const T* kernel_ptr = kernel.data<T>();
+  for (int i = 0; i < kernel_size; i++) {
+    if (counter[i] <= 0 || (subm && i == half_kernel_size)) {
+      continue;
+    }
+
+    const int M = counter[i];
+    const int K = in_channels;
+    const int N = out_channels;
+    T* tmp_in_ptr = in_features_ptr + offsets[i] * in_channels;
+    T* tmp_out_grad_ptr = out_grad_features_ptr + offsets[i] * out_channels;
+    const T* tmp_kernel_ptr = kernel_ptr + i * in_channels * out_channels;
+    T* tmp_d_x_ptr = d_x_features_ptr + offsets[i] * out_channels;
+    T* tmp_d_kernel_ptr = d_kernel_ptr + i * in_channels * out_channels;
+
+    // call gemm: d_kernel = transpose(x) * out_grad
+    // (in_channels, n) * (n, out_channels)
+    blas.GEMM(CblasTrans,
+              CblasNoTrans,
+              M,
+              N,
+              K,
+              static_cast<T>(1),
+              tmp_in_ptr,
+              tmp_out_grad_ptr,
+              static_cast<T>(0),
+              tmp_d_kernel_ptr);
+
+    // call gemm: d_x = out_grad * transpose(kernel)
+    // (n, out_channels) * (out_channels, in_channels)
+    blas.GEMM(CblasNoTrans,
+              CblasTrans,
+              M,
+              K,
+              N,
+              static_cast<T>(1),
+              tmp_out_grad_ptr,
+              tmp_kernel_ptr,
+              static_cast<T>(0),
+              tmp_d_x_ptr);
+  }
+
+  // 4. scatter
+  x_grad->ResizeAndAllocate(x.non_zero_elements().dims());
+  DenseTensorMeta index_meta(DataType::INT32, {rulebook_len}, DataLayout::NCHW);
+  DenseTensor out_index = phi::Empty(dev_ctx, std::move(index_meta));
+  DenseTensor unique_key = phi::Empty(dev_ctx, std::move(index_meta));
+  DenseTensor unique_value = phi::Empty(dev_ctx, std::move(index_meta));
+
+  SortedAndUniqueIndex(dev_ctx,
+                       rulebook_ptr + rulebook_len,
+                       rulebook_len,
+                       &out_index,
+                       &unique_key,
+                       &unique_value);
+
+  config = phi::backends::gpu::GetGpuLaunchConfig1D(
+      dev_ctx, rulebook_len * in_channels, 1);
+
+  ScatterKernel<T><<<config.block_per_grid.x,
+                     config.thread_per_block.x,
+                     0,
+                     dev_ctx.stream()>>>(d_x_features_ptr,
+                                         unique_value.data<int>(),
+                                         out_index.data<int>(),
+                                         x.nnz(),
+                                         rulebook_len,
+                                         in_channels,
+                                         x_grad_values_ptr,
+                                         subm);
+}
+
+}  // namespace sparse
+}  // namespace phi
+
+PD_REGISTER_KERNEL(sparse_conv3d_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::sparse::Conv3dGradKernel,
+                   float,
+                   double,
+                   phi::dtype::float16) {
+  kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO);
+}
diff --git a/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu b/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..1a0c7e9b972145fbc98cdb9dfee0267a9eaa9f90
--- /dev/null
+++ b/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu
@@ -0,0 +1,679 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <thrust/execution_policy.h>
+#include <thrust/remove.h>
+#include <thrust/sort.h>
+#include <thrust/unique.h>
+
+#include "paddle/phi/api/lib/utils/allocator.h"
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/backends/gpu/gpu_info.h"
+#include "paddle/phi/backends/gpu/gpu_launch_config.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/core/tensor_meta.h"
+#include "paddle/phi/kernels/funcs/blas/blas.h"
+#include "paddle/phi/kernels/funcs/index_impl.cu.h"
+#include "paddle/phi/kernels/funcs/math_function.h"
+#include "paddle/phi/kernels/primitive/compute_primitives.h"
+#include "paddle/phi/kernels/sparse/convolution_kernel.h"
+#include "paddle/phi/kernels/sparse/gpu/convolution.cu.h"
+
+namespace phi {
+namespace sparse {
+
+using Dims4D = phi::funcs::sparse::Dims4D;
+
+__global__ void SetFlagAndUpdateCounterKernel(const int* indexs,
+                                              const int n,
+                                              const int rulebook_len,
+                                              const int kernel_size,
+                                              int* rulebook_ptr,
+                                              int* counter_ptr) {
+  int tid = threadIdx.x + blockIdx.x * blockDim.x;
+  extern __shared__ int cache_count[];  // kernel_size
+  for (int i = threadIdx.x; i < kernel_size; i += blockDim.x) {
+    cache_count[i] = 0;
+  }
+  __syncthreads();
+
+  for (int i = tid; i < n; i += gridDim.x * blockDim.x) {
+    int index = indexs[i];
+    int kernel_index = rulebook_ptr[index];
+    rulebook_ptr[index + rulebook_len] = -1;
+    rulebook_ptr[index + 2 * rulebook_len] = -1;
+    rulebook_ptr[index] = -1;
+    atomicAdd(&cache_count[kernel_index], 1);
+  }
+  __syncthreads();
+
+  for (int i = threadIdx.x; i < kernel_size; i += blockDim.x) {
+    atomicSub(&counter_ptr[i], cache_count[i]);
+  }
+}
+
+/**
+ * @brief: update the out index and indices
+ * unique_keys: save the index of the output feature list
+ * unique_values: indiates the index of key before deduplication
+ * out_indexs: indicates the position of the output index in the rulebook
+ * rulebook_len: indicates the length of rulebook
+ * out_dims: indicates the output dims
+ * out_indices: the indices of output, out_indices = IndexToPoint(unique_keys)
+ * rulebook_out_indexs: the output index in rulebook
+**/
+__global__ void UpdateIndexKernel(const int* unique_keys,
+                                  const int* unique_values,
+                                  const int* out_indexs,
+                                  const int non_zero_num,
+                                  const int rulebook_len,
+                                  const Dims4D out_dims,
+                                  int* out_indices,
+                                  int* rulebook_out_indexs) {
+  int tid = threadIdx.x + blockIdx.x * blockDim.x;
+  for (int i = tid; i < non_zero_num; i += gridDim.x * blockDim.x) {
+    const int index = unique_keys[i];
+    int batch, x, y, z;
+    phi::funcs::sparse::IndexToPoint<Dims4D>(
+        index, out_dims, &batch, &x, &y, &z);
+    // get out indices
+    out_indices[i] = batch;
+    out_indices[i + non_zero_num] = z;
+    out_indices[i + non_zero_num * 2] = y;
+    out_indices[i + non_zero_num * 3] = x;
+
+    // update rulebook
+    int start = unique_values[i];
+    int end = i == non_zero_num - 1 ? rulebook_len : unique_values[i + 1];
+    // max(end-start) = kernel_size
+    for (int j = start; j < end; j++) {
+      rulebook_out_indexs[out_indexs[j]] = i;
+    }
+  }
+}
+
+/**
+ * @brief product rulebook
+ * for input_i in x_indices:
+ *   if input_i participate in the convolution calculation:
+ *       infer the output_i by input_i and kernel_i
+ *       save output_i
+ *
+ * x_indices: the indices of input features
+ * x_dims: the input dims
+ * kernel_dims: the kernel dims
+ * out_dims: the output dims
+ * non_zero_num: the number of input features
+ * rulebook: the rulebook to save the kernel index, input index and output index
+ * counter: save the number of times each location in the kernel participates in
+ *the caculation
+**/
+__global__ void ProductRuleBookKernel(const int* x_indices,
+                                      const Dims4D x_dims,
+                                      const Dims4D kernel_dims,
+                                      const Dims4D out_dims,
+                                      const int64_t non_zero_num,
+                                      const Dims4D paddings,
+                                      const Dims4D dilations,
+                                      const Dims4D strides,
+                                      const bool subm,
+                                      int* rulebook,
+                                      int* counter,
+                                      int* in_indexs) {
+  int tid = threadIdx.x + blockIdx.x * blockDim.x;
+  extern __shared__ int counter_buf[];  // kernel_size
+  const int kernel_size = kernel_dims[3] * kernel_dims[2] * kernel_dims[1];
+  const int offset = kernel_size * non_zero_num;
+  for (int i = threadIdx.x; i < kernel_size; i += blockDim.x) {
+    counter_buf[i] = 0;
+  }
+  __syncthreads();
+
+  for (int i = tid; i < non_zero_num; i += gridDim.x * blockDim.x) {
+    int kernel_index = 0;
+    int batch = x_indices[i];
+    int in_z = x_indices[i + non_zero_num];
+    int in_y = x_indices[i + 2 * non_zero_num];
+    int in_x = x_indices[i + 3 * non_zero_num];
+    if (subm) {
+      in_indexs[i] = PointToIndex(batch, in_x, in_y, in_z, x_dims);
+    }
+    for (int kz = 0; kz < kernel_dims[1]; kz++) {
+      for (int ky = 0; ky < kernel_dims[2]; ky++) {
+        for (int kx = 0; kx < kernel_dims[3]; kx++) {
+          int in_i = -1, out_index = -1, kernel_i = -1;
+          if (phi::funcs::sparse::Check(x_dims,
+                                        kernel_dims,
+                                        paddings,
+                                        dilations,
+                                        strides,
+                                        in_x,
+                                        in_y,
+                                        in_z,
+                                        kx,
+                                        ky,
+                                        kz)) {
+            int out_z = (in_z + paddings[1] - kz * dilations[1]) / strides[1];
+            int out_y = (in_y + paddings[2] - ky * dilations[2]) / strides[2];
+            int out_x = (in_x + paddings[3] - kx * dilations[3]) / strides[3];
+            in_i = i;
+            out_index = phi::funcs::sparse::PointToIndex<Dims4D>(
+                batch, out_x, out_y, out_z, out_dims);
+            atomicAdd(&counter_buf[kernel_index], 1);
+            kernel_i = kernel_index;
+          }
+          rulebook[kernel_index * non_zero_num + i] = kernel_i;
+          rulebook[kernel_index * non_zero_num + offset + i] = in_i;
+          rulebook[kernel_index * non_zero_num + offset * 2 + i] = out_index;
+          ++kernel_index;
+        }
+      }
+    }
+  }
+  __syncthreads();
+  for (int i = threadIdx.x; i < kernel_size; i += blockDim.x) {
+    atomicAdd(&counter[i], counter_buf[i]);
+  }
+}
+
+// brief: calculation the distance between start and end
+__global__ void DistanceKernel(const int* start,
+                               const int* end,
+                               int* distance) {
+  if (threadIdx.x == 0) {
+    *distance = end - start;
+  }
+}
+
+// the basic algorithm can refer to convolution_kernel.cc or
+// the second paper
+// example:
+// 1. the rulebook:
+//  the kernel_index:                       0, 0, 0, 1, 1, 1, 2, 2, ....
+//  the out_index(key):                     20, 30, 33, 30, 33, 20, 25
+// 2. mark the index of out_index(value):   0, 1, 2, 3, 4, 5, 6, ....
+// 3. sorted the (key, value)
+// 4. unique the (key, value):
+//  unique_key:     20, 25, 30, 33
+//  unique_values:  0, 2, 3, 5
+//  the index of unique_values is: 0, 1, 2, 3
+// 5. update the out_index by unique_key, uniqe_value and the index of
+// unique_value:
+//  the new out_index: 0, 2, 3, 2, 3, 0, 1
+template <typename T, typename Context>
+int ProductRuleBook(const Context& dev_ctx,
+                    const SparseCooTensor& x,
+                    const DenseTensor& kernel,
+                    const std::vector<int>& paddings,
+                    const std::vector<int>& dilations,
+                    const std::vector<int>& strides,
+                    const DDim& out_dims,
+                    const bool subm,
+                    DenseTensor* rulebook,
+                    DenseTensor* counter_per_kernel,
+                    DenseTensor* offsets_per_kernel,
+                    DenseTensor* out_index,
+                    DenseTensor* unique_key,
+                    DenseTensor* unique_value,
+                    SparseCooTensor* out,
+                    std::vector<int>* h_counter,
+                    std::vector<int>* h_offsets) {
+  const auto& kernel_dims = kernel.dims();
+  const int64_t non_zero_num = x.nnz();
+  const auto& non_zero_indices = x.non_zero_indices();
+  const int* indices_ptr = non_zero_indices.data<int>();
+  DenseTensor in_indexs = phi::Empty<Context>(
+      dev_ctx, DenseTensorMeta(DataType::INT32, {x.nnz()}, DataLayout::NCHW));
+  int* counter_ptr = counter_per_kernel->data<int>();
+  int* offsets_ptr = offsets_per_kernel->data<int>();
+  int kernel_size = kernel_dims[0] * kernel_dims[1] * kernel_dims[2];
+  const int rulebook_rows = 3;
+  const int rulebook_cols = kernel_size * non_zero_num;
+  rulebook->ResizeAndAllocate({rulebook_rows, rulebook_cols});
+  int* rulebook_ptr = rulebook->data<int>();
+
+  const auto x_dims = x.dims();
+  Dims4D d_x_dims(x_dims[0], x_dims[3], x_dims[2], x_dims[1]);
+  Dims4D d_kernel_dims(1, kernel_dims[2], kernel_dims[1], kernel_dims[0]);
+  Dims4D d_out_dims(out_dims[0], out_dims[3], out_dims[2], out_dims[1]);
+  Dims4D d_paddings(1, paddings[2], paddings[1], paddings[0]);
+  Dims4D d_strides(1, strides[2], strides[1], strides[0]);
+  Dims4D d_dilations(1, dilations[2], dilations[1], dilations[0]);
+
+  // 1. product rule book
+  phi::funcs::SetConstant<Context, int> set_zero;
+  set_zero(dev_ctx, counter_per_kernel, 0);
+  auto config =
+      phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, non_zero_num, 1);
+
+  ProductRuleBookKernel<<<config.block_per_grid.x,
+                          config.thread_per_block.x,
+                          kernel_size * sizeof(int),
+                          dev_ctx.stream()>>>(indices_ptr,
+                                              d_x_dims,
+                                              d_kernel_dims,
+                                              d_out_dims,
+                                              non_zero_num,
+                                              d_paddings,
+                                              d_dilations,
+                                              d_strides,
+                                              subm,
+                                              rulebook_ptr,
+                                              counter_ptr,
+                                              in_indexs.data<int>());
+
+// 2. remove -1
+#ifdef PADDLE_WITH_HIP
+  int* last = thrust::remove(thrust::hip::par.on(dev_ctx.stream()),
+#else
+  int* last = thrust::remove(thrust::cuda::par.on(dev_ctx.stream()),
+#endif
+                             rulebook_ptr,
+                             rulebook_ptr + rulebook_rows * rulebook_cols,
+                             -1);
+
+  DistanceKernel<<<1, 1, 0, dev_ctx.stream()>>>(
+      rulebook_ptr, last, rulebook_ptr + 3 * kernel_size * non_zero_num - 1);
+  int rulebook_len = 0;
+  phi::backends::gpu::GpuMemcpyAsync(
+      &rulebook_len,
+      rulebook_ptr + 3 * kernel_size * non_zero_num - 1,
+      sizeof(int),
+#ifdef PADDLE_WITH_HIP
+      hipMemcpyDeviceToHost,
+#else
+      cudaMemcpyDeviceToHost,
+#endif
+      dev_ctx.stream());
+  rulebook_len /= 3;
+  dev_ctx.Wait();
+
+  if (subm) {
+    // At present, hashtable is not used to map the input and output indexes.
+    // At present, the intermediate output index is generated by normal
+    // convolution,
+    // and then the intermediate output index is subtracted from the input index
+    // to obain the rulebook.
+    // get difference
+    int32_t* A_key_ptr = rulebook_ptr + 2 * rulebook_len;
+    int32_t* B_key_ptr = in_indexs.data<int>();
+    DenseTensor A_val = phi::Empty<Context>(
+        dev_ctx,
+        DenseTensorMeta(DataType::INT32, {rulebook_len}, DataLayout::NCHW));
+    DenseTensor B_val = phi::Empty<Context>(
+        dev_ctx, DenseTensorMeta(DataType::INT32, {x.nnz()}, DataLayout::NCHW));
+    phi::IndexKernel<int, kps::IdentityFunctor<int>>(
+        dev_ctx, &A_val, kps::IdentityFunctor<int>());
+    phi::IndexKernel<int, kps::IdentityFunctor<int>>(
+        dev_ctx, &B_val, kps::IdentityFunctor<int>());
+    DenseTensor key_result = phi::Empty<Context>(
+        dev_ctx,
+        DenseTensorMeta(DataType::INT32, {rulebook_len + 1}, DataLayout::NCHW));
+    DenseTensor val_result = phi::Empty<Context>(
+        dev_ctx,
+        DenseTensorMeta(DataType::INT32, {rulebook_len}, DataLayout::NCHW));
+
+#ifdef PADDLE_WITH_HIP
+    thrust::exclusive_scan(thrust::hip::par.on(dev_ctx.stream()),
+#else
+    thrust::exclusive_scan(thrust::cuda::par.on(dev_ctx.stream()),
+#endif
+                           counter_ptr,
+                           counter_ptr + kernel_size,
+                           offsets_ptr);
+    std::vector<int> offsets(kernel_size, 0);
+    // TODO(zhangkaihuo): used unified memcpy interface
+    phi::backends::gpu::GpuMemcpyAsync(offsets.data(),
+                                       offsets_ptr,
+                                       kernel_size * sizeof(int),
+#ifdef PADDLE_WITH_HIP
+                                       hipMemcpyDeviceToHost,
+#else
+                                       cudaMemcpyDeviceToHost,
+#endif
+                                       dev_ctx.stream());
+    dev_ctx.Wait();
+
+    thrust::pair<int*, int*> end;
+    // Because set_diff does not support duplicate data, set_diff is performed
+    // separately for each segment of data.
+    // TODO(zhangkaihuo): Using hashtable here may get better performance,
+    // further tests ared needed.
+    for (int i = 0; i < kernel_size; i++) {
+      int start = offsets[i];
+      int stop = i == kernel_size - 1 ? rulebook_len : offsets[i + 1];
+      int* key_result_start = (i == 0 ? key_result.data<int>() : end.first);
+      int* val_result_start = i == 0 ? val_result.data<int>() : end.second;
+      end =
+#ifdef PADDLE_WITH_HIP
+          thrust::set_difference_by_key(thrust::hip::par.on(dev_ctx.stream()),
+#else
+          thrust::set_difference_by_key(thrust::cuda::par.on(dev_ctx.stream()),
+#endif
+                                        A_key_ptr + start,
+                                        A_key_ptr + stop,
+                                        B_key_ptr,
+                                        B_key_ptr + x.nnz(),
+                                        A_val.data<int>() + start,
+                                        B_val.data<int>(),
+                                        key_result_start,
+                                        val_result_start);
+    }
+
+    DistanceKernel<<<1, 1, 0, dev_ctx.stream()>>>(
+        key_result.data<int>(),
+        end.first,
+        key_result.data<int>() + rulebook_len);
+    int len = 0;
+    phi::backends::gpu::GpuMemcpyAsync(&len,
+                                       key_result.data<int>() + rulebook_len,
+                                       sizeof(int),
+#ifdef PADDLE_WITH_HIP
+                                       hipMemcpyDeviceToHost,
+#else
+                                       cudaMemcpyDeviceToHost,
+#endif
+                                       dev_ctx.stream());
+    dev_ctx.Wait();
+    // set the diff value = -1, and update counter
+    auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, len, 1);
+    SetFlagAndUpdateCounterKernel<<<config.block_per_grid.x,
+                                    config.thread_per_block,
+                                    kernel_size * sizeof(int),
+                                    dev_ctx.stream()>>>(val_result.data<int>(),
+                                                        len,
+                                                        rulebook_len,
+                                                        kernel_size,
+                                                        rulebook_ptr,
+                                                        counter_ptr);
+// remove -1
+#ifdef PADDLE_WITH_HIP
+    int* last = thrust::remove(thrust::hip::par.on(dev_ctx.stream()),
+#else
+    int* last = thrust::remove(thrust::cuda::par.on(dev_ctx.stream()),
+#endif
+                               rulebook_ptr,
+                               rulebook_ptr + 3 * rulebook_len,
+                               -1);
+    DistanceKernel<<<1, 1, 0, dev_ctx.stream()>>>(
+        rulebook_ptr, last, key_result.data<int>() + rulebook_len);
+    phi::backends::gpu::GpuMemcpyAsync(&rulebook_len,
+                                       key_result.data<int>() + rulebook_len,
+                                       sizeof(int),
+#ifdef PADDLE_WITH_HIP
+                                       hipMemcpyDeviceToHost,
+#else
+                                       cudaMemcpyDeviceToHost,
+#endif
+                                       dev_ctx.stream());
+    dev_ctx.Wait();
+    rulebook_len /= 3;
+  }
+
+#ifdef PADDLE_WITH_HIP
+  thrust::exclusive_scan(thrust::hip::par.on(dev_ctx.stream()),
+#else
+  thrust::exclusive_scan(thrust::cuda::par.on(dev_ctx.stream()),
+#endif
+                         counter_ptr,
+                         counter_ptr + kernel_size,
+                         offsets_ptr);
+
+#ifdef PADDLE_WITH_HIP
+  phi::backends::gpu::GpuMemcpyAsync(&(*h_counter)[0],
+                                     counter_ptr,
+                                     kernel_size * sizeof(int),
+                                     hipMemcpyDeviceToHost,
+                                     dev_ctx.stream());
+  phi::backends::gpu::GpuMemcpyAsync(&(*h_offsets)[0],
+                                     offsets_ptr,
+                                     kernel_size * sizeof(int),
+                                     hipMemcpyDeviceToHost,
+                                     dev_ctx.stream());
+#else
+  phi::backends::gpu::GpuMemcpyAsync(&(*h_counter)[0],
+                                     counter_ptr,
+                                     kernel_size * sizeof(int),
+                                     cudaMemcpyDeviceToHost,
+                                     dev_ctx.stream());
+  phi::backends::gpu::GpuMemcpyAsync(&(*h_offsets)[0],
+                                     offsets_ptr,
+                                     kernel_size * sizeof(int),
+                                     cudaMemcpyDeviceToHost,
+                                     dev_ctx.stream());
+#endif
+  rulebook->Resize({rulebook_rows, rulebook_len});
+
+  // 3. sorted or merge the out index
+  out_index->ResizeAndAllocate({rulebook_len});
+  unique_value->ResizeAndAllocate({rulebook_len});
+  unique_key->ResizeAndAllocate({rulebook_len});
+  int* out_index_ptr = out_index->data<int>();
+  int* unique_value_ptr = unique_value->data<int>();
+  int* unique_key_ptr = unique_key->data<int>();
+
+  int* new_end = SortedAndUniqueIndex(dev_ctx,
+                                      rulebook_ptr + 2 * rulebook_len,
+                                      rulebook_len,
+                                      out_index,
+                                      unique_key,
+                                      unique_value);
+  // thrust::distance doesn't support stream parameters
+  // const int out_non_zero_num = thrust::distance(unique_key_ptr,
+  // new_end.first);
+  DistanceKernel<<<1, 1>>>(unique_key_ptr,
+                           new_end,
+                           rulebook_ptr + rulebook_rows * rulebook_cols - 1);
+  int out_non_zero_num = 0;
+#ifdef PADDLE_WITH_HIP
+  phi::backends::gpu::GpuMemcpyAsync(
+      &out_non_zero_num,
+      rulebook_ptr + rulebook_rows * rulebook_cols - 1,
+      sizeof(int),
+      hipMemcpyDeviceToHost,
+      dev_ctx.stream());
+#else
+  phi::backends::gpu::GpuMemcpyAsync(
+      &out_non_zero_num,
+      rulebook_ptr + rulebook_rows * rulebook_cols - 1,
+      sizeof(int),
+      cudaMemcpyDeviceToHost,
+      dev_ctx.stream());
+#endif
+  dev_ctx.Wait();
+
+  // 5. update out_indices and rulebook by unique_value_ptr
+  const int64_t sparse_dim = 4;
+  DenseTensorMeta indices_meta(
+      DataType::INT32, {sparse_dim, out_non_zero_num}, DataLayout::NCHW);
+  DenseTensorMeta values_meta(
+      x.dtype(), {out_non_zero_num, kernel_dims[4]}, x.layout());
+  phi::DenseTensor out_indices = phi::Empty(dev_ctx, std::move(indices_meta));
+  phi::DenseTensor out_values = phi::Empty(dev_ctx, std::move(values_meta));
+
+  int* out_indices_ptr = out_indices.data<int>();
+
+  config =
+      phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, out_non_zero_num, 1);
+  UpdateIndexKernel<<<config.block_per_grid.x,
+                      config.thread_per_block.x,
+                      0,
+                      dev_ctx.stream()>>>(unique_key_ptr,
+                                          unique_value_ptr,
+                                          out_index_ptr,
+                                          out_non_zero_num,
+                                          rulebook_len,
+                                          d_out_dims,
+                                          out_indices_ptr,
+                                          rulebook_ptr + 2 * rulebook_len);
+  out->SetMember(out_indices, out_values, out_dims, true);
+  return rulebook_len;
+}
+
+/**
+ * x: (N, D, H, W, C)
+ * kernel: (D, H, W, C, OC)
+ * out: (N, D, H, W, OC)
+**/
+template <typename T, typename Context>
+void Conv3dKernel(const Context& dev_ctx,
+                  const SparseCooTensor& x,
+                  const DenseTensor& kernel,
+                  const std::vector<int>& paddings,
+                  const std::vector<int>& dilations,
+                  const std::vector<int>& strides,
+                  const int groups,
+                  const bool subm,
+                  SparseCooTensor* out,
+                  DenseTensor* rulebook) {
+  // update padding and dilation
+  // Currently, only support x.layout is NDHWC, groups = 1
+  // if x.layout != NDHWC then transpose(x), transpose(weight)
+
+  const auto& x_dims = x.dims();
+  const auto& kernel_dims = kernel.dims();
+  int kernel_size = kernel_dims[0] * kernel_dims[1] * kernel_dims[2];
+  DDim out_dims = {1, 1, 1, 1, 1};
+  phi::funcs::sparse::GetOutShape(
+      x_dims, kernel_dims, paddings, dilations, strides, &out_dims);
+  out->set_dims(out_dims);
+  const int in_channels = kernel_dims[3];
+  const int out_channels = kernel_dims[4];
+  std::vector<int> offsets(kernel_size + 1), h_counter(kernel_size);
+
+  // Second algorithm:
+  // https://pdfs.semanticscholar.org/5125/a16039cabc6320c908a4764f32596e018ad3.pdf
+  // 1. product rulebook
+  DenseTensorMeta counter_meta(
+      DataType::INT32, {kernel_size}, DataLayout::NCHW);
+  DenseTensorMeta offsets_meta(
+      DataType::INT32, {kernel_size}, DataLayout::NCHW);
+  DenseTensor counter_per_kernel = phi::Empty(dev_ctx, std::move(counter_meta));
+  DenseTensor offsets_per_kernel = phi::Empty(dev_ctx, std::move(offsets_meta));
+  DenseTensorMeta index_meta(DataType::INT32, {1}, DataLayout::NCHW);
+  DenseTensor out_index = phi::Empty(dev_ctx, std::move(index_meta));
+  DenseTensor unique_key = phi::Empty(dev_ctx, std::move(index_meta));
+  DenseTensor unique_value = phi::Empty(dev_ctx, std::move(index_meta));
+
+  std::vector<int> subm_paddings(paddings), subm_strides(strides);
+  if (subm) {
+    phi::funcs::sparse::ResetSubmKernelSizeAndStrides(
+        kernel.dims(), &subm_paddings, &subm_strides);
+  }
+
+  int n = ProductRuleBook<T, Context>(dev_ctx,
+                                      x,
+                                      kernel,
+                                      subm_paddings,
+                                      dilations,
+                                      subm_strides,
+                                      out_dims,
+                                      subm,
+                                      rulebook,
+                                      &counter_per_kernel,
+                                      &offsets_per_kernel,
+                                      &out_index,
+                                      &unique_key,
+                                      &unique_value,
+                                      out,
+                                      &h_counter,
+                                      &offsets);
+
+  const int* counter_ptr = counter_per_kernel.data<int>();
+  const int* offsets_ptr = counter_per_kernel.data<int>();
+  const int* rulebook_ptr = rulebook->data<int>();
+
+  // 2. gather
+  DenseTensorMeta in_features_meta(
+      x.dtype(), {n, in_channels}, DataLayout::NCHW);
+  DenseTensorMeta out_features_meta(
+      x.dtype(), {n, out_channels}, DataLayout::NCHW);
+  phi::DenseTensor in_features =
+      phi::Empty(dev_ctx, std::move(in_features_meta));
+  phi::DenseTensor out_features =
+      phi::Empty(dev_ctx, std::move(out_features_meta));
+  T* in_features_ptr = in_features.data<T>();
+  T* out_features_ptr = out_features.data<T>();
+  phi::funcs::SetConstant<Context, T> set_zero;
+  set_zero(dev_ctx, &out_features, static_cast<T>(0.0f));
+
+  auto config =
+      phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, n * in_channels, 1);
+  GatherKernel<T, int><<<config.block_per_grid.x,
+                         config.thread_per_block.x,
+                         0,
+                         dev_ctx.stream()>>>(x.non_zero_elements().data<T>(),
+                                             rulebook_ptr + n,
+                                             in_features_ptr,
+                                             n,
+                                             in_channels);
+
+  // 3. call gemm for every werght
+  auto blas = phi::funcs::GetBlas<Context, T>(dev_ctx);
+  auto* out_values = out->mutable_non_zero_elements();
+  T* out_values_ptr = out_values->data<T>();
+
+  const T* kernel_ptr = kernel.data<T>();
+  for (int i = 0; i < kernel_size; i++) {
+    if (h_counter[i] <= 0) {
+      continue;
+    }
+
+    // call gemm: (n, in_channels) * (in_channels, out_channels)
+    const int M = h_counter[i];
+    const int K = in_channels;
+    const int N = out_channels;
+    T* tmp_in_ptr = in_features_ptr + offsets[i] * in_channels;
+    const T* tmp_kernel_ptr = kernel_ptr + i * K * N;
+    T* tmp_out_ptr = out_features_ptr + offsets[i] * out_channels;
+
+    blas.GEMM(CblasNoTrans,
+              CblasNoTrans,
+              M,
+              N,
+              K,
+              static_cast<T>(1),
+              tmp_in_ptr,
+              tmp_kernel_ptr,
+              static_cast<T>(0),
+              tmp_out_ptr);
+  }
+
+  // 4. scatter
+  config = phi::backends::gpu::GetGpuLaunchConfig1D(
+      dev_ctx, out->nnz() * out_channels, 1);
+  ScatterKernel<T><<<config.block_per_grid.x,
+                     config.thread_per_block.x,
+                     0,
+                     dev_ctx.stream()>>>(out_features_ptr,
+                                         unique_value.data<int>(),
+                                         out_index.data<int>(),
+                                         out->nnz(),
+                                         n,
+                                         out_channels,
+                                         out_values_ptr);
+}
+
+}  // namespace sparse
+}  // namespace phi
+
+PD_REGISTER_KERNEL(sparse_conv3d,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::sparse::Conv3dKernel,
+                   float,
+                   double,
+                   phi::dtype::float16) {
+  kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO);
+}
diff --git a/paddle/phi/kernels/sparse/gpu/sparse_utils_kernel.cu b/paddle/phi/kernels/sparse/gpu/sparse_utils_kernel.cu
index 2e741111fb1489aef5bdc51de637b77eec9d28a7..8048180e425ead98e6db15514caf38c406a2aebf 100644
--- a/paddle/phi/kernels/sparse/gpu/sparse_utils_kernel.cu
+++ b/paddle/phi/kernels/sparse/gpu/sparse_utils_kernel.cu
@@ -16,8 +16,10 @@ limitations under the License. */
 #include <thrust/remove.h>
 
 #include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/backends/gpu/gpu_launch_config.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/core/tensor_meta.h"
+#include "paddle/phi/kernels/funcs/sparse/common_shape.h"
 #include "paddle/phi/kernels/sparse/sparse_utils_kernel.h"
 
 namespace phi {
@@ -115,14 +117,16 @@ void DenseToSparseCooKernel(const Context& dev_ctx,
   PADDLE_ENFORCE_GPU_SUCCESS(
       cudaMemsetAsync(nums_ptr, 0, sizeof(int), dev_ctx.stream()));
 #endif
-  int grid_size = 1, block_size = 1;
-  GetGpuLaunchConfig1D(dev_ctx, rows, &grid_size, &block_size);
+  auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, rows, 1);
 
   auto temp_indexs_meta =
       phi::DenseTensorMeta(DataType::INT32, {rows}, phi::DataLayout::NCHW);
   DenseTensor temp_indexs = phi::Empty(dev_ctx, std::move(temp_indexs_meta));
   int* temp_indexs_ptr = temp_indexs.mutable_data<int>(place);
-  GetNonZeroNums<<<grid_size, block_size, 0, dev_ctx.stream()>>>(
+  GetNonZeroNums<<<config.block_per_grid.x,
+                   config.thread_per_block.x,
+                   0,
+                   dev_ctx.stream()>>>(
       x_data, rows, cols, nums_ptr, temp_indexs_ptr);
 #ifdef PADDLE_WITH_HIP
   thrust::remove(thrust::hip::par.on(dev_ctx.stream()),
@@ -167,7 +171,8 @@ void DenseToSparseCooKernel(const Context& dev_ctx,
 
   dev_ctx.Wait();  // wait the copy
 
-  const auto values_dims = InferDenseDims(x_dims, sparse_dim, non_zero_num);
+  const auto values_dims =
+      phi::funcs::sparse::InferDenseDims(x_dims, sparse_dim, non_zero_num);
   DenseTensorMeta indices_meta(DataType::INT64,
                                {sparse_dim, static_cast<int64_t>(non_zero_num)},
                                DataLayout::NCHW);
@@ -184,16 +189,18 @@ void DenseToSparseCooKernel(const Context& dev_ctx,
   T* sparse_data = values.mutable_data<T>(place);
 
   // 3. calc indices by indexs and get values by indexs
-  GetGpuLaunchConfig1D(dev_ctx, non_zero_num, &grid_size, &block_size);
-  GetNonZeroElementsAndIndices<<<grid_size, block_size, 0, dev_ctx.stream()>>>(
-      x_data,
-      sparse_dim,
-      cols,
-      d_x_dims.data<int64_t>(),
-      non_zero_num,
-      temp_indexs_ptr,
-      indices_data,
-      sparse_data);
+  config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, non_zero_num, 1);
+  GetNonZeroElementsAndIndices<<<config.block_per_grid.x,
+                                 config.thread_per_block.x,
+                                 0,
+                                 dev_ctx.stream()>>>(x_data,
+                                                     sparse_dim,
+                                                     cols,
+                                                     d_x_dims.data<int64_t>(),
+                                                     non_zero_num,
+                                                     temp_indexs_ptr,
+                                                     indices_data,
+                                                     sparse_data);
   out->SetMember(indices, values, x_dims, true);
 }
 
@@ -263,10 +270,9 @@ void SparseCsrToCooKernel(const Context& dev_ctx,
   int* offsets_ptr = batchs == 1 ? nullptr : offsets.mutable_data<int>(place);
   T* coo_values_data = values.mutable_data<T>(place);
 
-  int grid_size = 1, block_size = 1;
   if (batchs > 1) {
-    GetGpuLaunchConfig1D(dev_ctx, batchs, &grid_size, &block_size);
-    GetBatchSizes<<<grid_size, block_size>>>(
+    auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, batchs, 1);
+    GetBatchSizes<<<config.block_per_grid.x, config.thread_per_block.x>>>(
         csr_crows_data, rows, batchs, offsets_ptr);
 
 #ifdef PADDLE_WITH_HIP
@@ -279,9 +285,10 @@ void SparseCsrToCooKernel(const Context& dev_ctx,
                            offsets_ptr);
   }
 
-  GetGpuLaunchConfig1D(dev_ctx, rows, &grid_size, &block_size);
-  dim3 grids(grid_size, batchs, 1);
-  ConvertCsrCrowsToCooRows<<<grids, block_size>>>(
+  auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, rows, 1);
+  config.block_per_grid.y = batchs;
+  ConvertCsrCrowsToCooRows<<<config.block_per_grid,
+                             config.thread_per_block.x>>>(
       csr_crows_data, offsets_ptr, coo_rows_data, batch_ptr, rows);
 
 #ifdef PADDLE_WITH_HIP
@@ -404,21 +411,29 @@ void SparseCooToCsrKernel(const Context& dev_ctx,
     // TODO(zhangkahuo): call coalesced() to distinct and sort the indices
   }
 
-  int grid_size = 1, block_size = 1;
-  GetGpuLaunchConfig1D(dev_ctx, batchs, &grid_size, &block_size);
+  auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, batchs, 1);
   if (batchs > 1) {
     DenseTensorMeta batchs_meta(DataType::INT64, {batchs}, DataLayout::NCHW);
     phi::DenseTensor batchs_offset(
         phi::make_intrusive<paddle::experimental::SharedStorage>(place),
         std::move(batchs_meta));
     int64_t* batchs_offset_ptr = batchs_offset.mutable_data<int64_t>(place);
-    GetBatchsOffset<<<grid_size, block_size, 0, dev_ctx.stream()>>>(
+    GetBatchsOffset<<<config.block_per_grid.x,
+                      config.thread_per_block.x,
+                      0,
+                      dev_ctx.stream()>>>(
         batchs_ptr, non_zero_num, batchs_offset_ptr);
-    dim3 grids(grid_size, batchs, 1);
-    ConvertCooRowsToCsrCrows<<<grids, block_size, 0, dev_ctx.stream()>>>(
+    config.block_per_grid.y = batchs;
+    ConvertCooRowsToCsrCrows<<<config.block_per_grid,
+                               config.thread_per_block.x,
+                               0,
+                               dev_ctx.stream()>>>(
         batchs_offset_ptr, coo_rows_data, csr_crows_data, rows, non_zero_num);
   } else {
-    ConvertCooRowsToCsrCrows<<<grid_size, block_size, 0, dev_ctx.stream()>>>(
+    ConvertCooRowsToCsrCrows<<<config.block_per_grid.x,
+                               config.thread_per_block.x,
+                               0,
+                               dev_ctx.stream()>>>(
         nullptr, coo_rows_data, csr_crows_data, rows, non_zero_num);
   }
 
@@ -522,12 +537,13 @@ void SparseCooToDenseKernel(const Context& dev_ctx,
   PADDLE_ENFORCE_GPU_SUCCESS(
       cudaMemsetAsync(out_data, 0, sizeof(T) * out->numel(), dev_ctx.stream()));
 #endif
-  int grid_size = 1, block_size = 1;
-  GetGpuLaunchConfig1D(dev_ctx, non_zero_num, &grid_size, &block_size);
+  auto config =
+      phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, non_zero_num, 1);
 
-  KernelSparseCooToDense<
-      T,
-      int64_t><<<grid_size, block_size, 0, dev_ctx.stream()>>>(
+  KernelSparseCooToDense<T, int64_t><<<config.block_per_grid.x,
+                                       config.thread_per_block.x,
+                                       0,
+                                       dev_ctx.stream()>>>(
       indices.data<int64_t>(),
       d_sparse_offsets.data<int64_t>(),
       x_data,
diff --git a/paddle/phi/kernels/sparse/sparse_utils_kernel.h b/paddle/phi/kernels/sparse/sparse_utils_kernel.h
index d96d134a26b08a0208122a7ea9a62ce07c033d51..da05eb3d3cf7682e376efe59aaea8d09d1b6c757 100644
--- a/paddle/phi/kernels/sparse/sparse_utils_kernel.h
+++ b/paddle/phi/kernels/sparse/sparse_utils_kernel.h
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #pragma once
 
+#include "paddle/phi/api/lib/utils/storage.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/sparse_coo_tensor.h"
 #include "paddle/phi/core/sparse_csr_tensor.h"
@@ -22,37 +23,6 @@ limitations under the License. */
 namespace phi {
 namespace sparse {
 
-inline const DDim InferDenseDims(const DDim& x_dims,
-                                 const int64_t sparse_dim,
-                                 const int64_t non_zero_num) {
-  auto dense_dim = x_dims.size() - sparse_dim;
-  DDim values_dims;
-  if (dense_dim) {
-    std::vector<int64_t> dense_dim_vec(dense_dim + 1);
-    dense_dim_vec[0] = non_zero_num;
-    memcpy(&dense_dim_vec[1],
-           x_dims.Get() + sparse_dim,
-           dense_dim * sizeof(x_dims[0]));
-    values_dims = phi::make_ddim(dense_dim_vec);
-  } else {
-    values_dims = phi::make_ddim({non_zero_num});
-  }
-  return values_dims;
-}
-
-template <typename Context>
-inline void GetGpuLaunchConfig1D(const Context& dev_ctx,
-                                 const int64_t n,
-                                 int* grid_size,
-                                 int* block_size) {
-  const int MAX_BLOCK_DIM = dev_ctx.GetMaxThreadsPerBlock();
-  const int MAX_GRID_DIM = dev_ctx.GetMaxPhysicalThreadCount() / MAX_BLOCK_DIM;
-  *block_size = (n >= MAX_BLOCK_DIM) ? MAX_BLOCK_DIM
-                                     : (1 << static_cast<int>(std::log2(n)));
-  *grid_size = n / *block_size;
-  *grid_size = (*grid_size >= MAX_GRID_DIM) ? MAX_GRID_DIM : *grid_size;
-}
-
 template <typename T, typename Context>
 void DenseToSparseCooKernel(const Context& dev_ctx,
                             const DenseTensor& x,
@@ -63,8 +33,8 @@ template <typename T, typename Context>
 SparseCooTensor DenseToSparseCoo(const Context& dev_ctx,
                                  const DenseTensor& x,
                                  const int64_t sparse_dim) {
-  DenseTensor indices = phi::Empty<T, Context>(dev_ctx);
-  DenseTensor values = phi::Empty<T, Context>(dev_ctx);
+  DenseTensor indices;
+  DenseTensor values;
   SparseCooTensor coo(indices, values, x.dims());
   DenseToSparseCooKernel<T, Context>(dev_ctx, x, sparse_dim, &coo);
   return coo;
@@ -78,8 +48,8 @@ void SparseCsrToCooKernel(const Context& dev_ctx,
 template <typename T, typename Context>
 SparseCooTensor SparseCsrToCoo(const Context& dev_ctx,
                                const SparseCsrTensor& x) {
-  DenseTensor indices = phi::Empty<T, Context>(dev_ctx);
-  DenseTensor values = phi::Empty<T, Context>(dev_ctx);
+  DenseTensor indices;
+  DenseTensor values;
   SparseCooTensor coo(indices, values, x.dims());
   SparseCsrToCooKernel<T, Context>(dev_ctx, x, &coo);
   return coo;
@@ -93,9 +63,9 @@ void SparseCooToCsrKernel(const Context& dev_ctx,
 template <typename T, typename Context>
 SparseCsrTensor SparseCooToCsr(const Context& dev_ctx,
                                const SparseCooTensor& x) {
-  DenseTensor non_zero_crows = phi::Empty<int64_t, Context>(dev_ctx);
-  DenseTensor non_zero_cols = phi::Empty<int64_t, Context>(dev_ctx);
-  DenseTensor non_zero_elements = phi::Empty<T, Context>(dev_ctx);
+  DenseTensor non_zero_crows;
+  DenseTensor non_zero_cols;
+  DenseTensor non_zero_elements;
   SparseCsrTensor csr(
       non_zero_crows, non_zero_cols, non_zero_elements, x.dims());
   SparseCooToCsrKernel<T, Context>(dev_ctx, x, &csr);
@@ -113,8 +83,8 @@ void DenseToSparseCsrKernel(const Context& dev_ctx,
                     phi::errors::InvalidArgument(
                         "SparseCsrTensor only support 2-D or 3-D Tensor."));
   const int64_t sparse_dim = x_dims.size() == 2 ? 2 : 3;
-  DenseTensor indices = phi::Empty<T, Context>(dev_ctx);
-  DenseTensor values = phi::Empty<T, Context>(dev_ctx);
+  DenseTensor indices;
+  DenseTensor values;
   SparseCooTensor coo(indices, values, x.dims());
   DenseToSparseCooKernel<T, Context>(dev_ctx, x, sparse_dim, &coo);
   SparseCooToCsrKernel<T, Context>(dev_ctx, coo, out);
@@ -122,9 +92,9 @@ void DenseToSparseCsrKernel(const Context& dev_ctx,
 
 template <typename T, typename Context>
 SparseCsrTensor DenseToSparseCsr(const Context& dev_ctx, const DenseTensor& x) {
-  DenseTensor non_zero_crows = phi::Empty<int64_t, Context>(dev_ctx);
-  DenseTensor non_zero_cols = phi::Empty<int64_t, Context>(dev_ctx);
-  DenseTensor non_zero_elements = phi::Empty<T, Context>(dev_ctx);
+  DenseTensor non_zero_crows;
+  DenseTensor non_zero_cols;
+  DenseTensor non_zero_elements;
   SparseCsrTensor csr(
       non_zero_crows, non_zero_cols, non_zero_elements, x.dims());
   DenseToSparseCsrKernel<T, Context>(dev_ctx, x, &csr);
@@ -148,8 +118,8 @@ template <typename T, typename Context>
 void SparseCsrToDenseKernel(const Context& dev_ctx,
                             const SparseCsrTensor& x,
                             DenseTensor* out) {
-  DenseTensor indices = phi::Empty<T, Context>(dev_ctx);
-  DenseTensor values = phi::Empty<T, Context>(dev_ctx);
+  DenseTensor indices;
+  DenseTensor values;
   SparseCooTensor coo(indices, values, x.dims());
   SparseCsrToCooKernel<T, Context>(dev_ctx, x, &coo);
   SparseCooToDenseKernel<T, Context>(dev_ctx, coo, out);
diff --git a/paddle/phi/kernels/split_kernel.h b/paddle/phi/kernels/split_kernel.h
index 840fe4366ce7eaca82608612dfb41cc7f7783f4c..e42b25e60c42291b3949c144c4aba6f62ec98a44 100644
--- a/paddle/phi/kernels/split_kernel.h
+++ b/paddle/phi/kernels/split_kernel.h
@@ -50,7 +50,7 @@ std::vector<DenseTensor> Split(const Context& dev_ctx,
   result.reserve(out_number);
 
   for (size_t i = 0; i < out_number; ++i) {
-    result.emplace_back(phi::Empty<T, Context>(dev_ctx));
+    result.emplace_back(DenseTensor());
     out_meta.emplace_back(&result.back());
     out_meta_ptr.push_back(&out_meta.back());
   }
diff --git a/paddle/phi/kernels/take_along_axis_grad_kernel.h b/paddle/phi/kernels/take_along_axis_grad_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..a312c235f66fc6f58f7bbb2a29f78d8ff4b427eb
--- /dev/null
+++ b/paddle/phi/kernels/take_along_axis_grad_kernel.h
@@ -0,0 +1,29 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void TakeAlongAxisGradKernel(const Context& dev_ctx,
+                             const DenseTensor& x,
+                             const DenseTensor& index,
+                             const DenseTensor& out_grad,
+                             int axis,
+                             DenseTensor* x_grad);
+
+}  // namespace  phi
diff --git a/paddle/phi/kernels/take_along_axis_kernel.h b/paddle/phi/kernels/take_along_axis_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..e8fb78556d9bbe9c6ceced660a03c7ddc3dd1dc6
--- /dev/null
+++ b/paddle/phi/kernels/take_along_axis_kernel.h
@@ -0,0 +1,28 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void TakeAlongAxisKernel(const Context& dev_ctx,
+                         const DenseTensor& x,
+                         const DenseTensor& index,
+                         int axis,
+                         DenseTensor* out);
+
+}  // namespace  phi
diff --git a/paddle/phi/kernels/tile_grad_kernel.h b/paddle/phi/kernels/tile_grad_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..830276c28e05395c36854426e560fbe6f2642589
--- /dev/null
+++ b/paddle/phi/kernels/tile_grad_kernel.h
@@ -0,0 +1,31 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/common/scalar_array.h"
+#include "paddle/phi/core/dense_tensor.h"
+
+#define MAX_RANK_SUPPORTED 6
+
+namespace phi {
+
+template <typename T, typename Context>
+void TileGradKernel(const Context& dev_ctx,
+                    const DenseTensor& x,
+                    const DenseTensor& out_grad,
+                    const ScalarArray& repeat_times,
+                    DenseTensor* x_grad);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/tile_kernel.h b/paddle/phi/kernels/tile_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..924d0149fe34598753f0d02cfa3b087e2e7493f2
--- /dev/null
+++ b/paddle/phi/kernels/tile_kernel.h
@@ -0,0 +1,30 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/common/scalar_array.h"
+#include "paddle/phi/core/dense_tensor.h"
+
+#define MAX_RANK_SUPPORTED 6
+
+namespace phi {
+
+template <typename T, typename Context>
+void TileKernel(const Context& dev_ctx,
+                const DenseTensor& x,
+                const ScalarArray& repeat_times,
+                DenseTensor* out);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/top_k_grad_kernel.h b/paddle/phi/kernels/top_k_grad_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..f577b982c575dc7c158ade047bc2668cec358725
--- /dev/null
+++ b/paddle/phi/kernels/top_k_grad_kernel.h
@@ -0,0 +1,32 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void TopkGradKernel(const Context& dev_ctx,
+                    const DenseTensor& out_grad,
+                    const DenseTensor& x,
+                    const DenseTensor& indices,
+                    int k,
+                    int axis,
+                    bool largest,
+                    bool sorted,
+                    DenseTensor* x_grad);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/top_k_kernel.h b/paddle/phi/kernels/top_k_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..fea76e448b5438634b8475c4fb783688d62c4905
--- /dev/null
+++ b/paddle/phi/kernels/top_k_kernel.h
@@ -0,0 +1,32 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/common/scalar.h"
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void TopkKernel(const Context& dev_ctx,
+                const DenseTensor& x,
+                const Scalar& k_scalar,
+                int axis,
+                bool largest,
+                bool sorted,
+                DenseTensor* out,
+                DenseTensor* indices);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/transpose_kernel.h b/paddle/phi/kernels/transpose_kernel.h
index 303b4a9a8f05d440dc0a2878574cc951ef5ec1a7..b8d7fbaa2757d76db1005ce57498d181046d77c9 100644
--- a/paddle/phi/kernels/transpose_kernel.h
+++ b/paddle/phi/kernels/transpose_kernel.h
@@ -15,7 +15,10 @@
 #pragma once
 
 #include <vector>
+
 #include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/infermeta/unary.h"
+#include "paddle/phi/kernels/empty_kernel.h"
 
 namespace phi {
 
@@ -25,4 +28,26 @@ void TransposeKernel(const Context& dev_ctx,
                      const std::vector<int>& axis,
                      DenseTensor* out);
 
+template <typename T, typename Context>
+DenseTensor Transpose(const Context& dev_ctx,
+                      const DenseTensor& x,
+                      const std::vector<int>& axis) {
+  DenseTensor dense_out;
+  MetaTensor meta_out(&dense_out);
+  TransposeInferMeta(x, axis, &meta_out);
+  TransposeKernel<T, Context>(dev_ctx, x, axis, &dense_out);
+  return dense_out;
+}
+
+template <typename T, typename Context>
+DenseTensor TransposeLast2Dim(const Context& dev_ctx, const DenseTensor& x) {
+  size_t rank = x.dims().size();
+  std::vector<int> axis(rank);
+  for (size_t i = 0; i < rank; ++i) {
+    axis[i] = i;
+  }
+  std::swap(axis[rank - 1], axis[rank - 2]);
+  return Transpose<T, Context>(dev_ctx, x, axis);
+}
+
 }  // namespace phi
diff --git a/paddle/phi/kernels/triangular_solve_grad_kernel.h b/paddle/phi/kernels/triangular_solve_grad_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..eb5a5ab461a1dcbbdec916dff57e65df5d9cfd9b
--- /dev/null
+++ b/paddle/phi/kernels/triangular_solve_grad_kernel.h
@@ -0,0 +1,36 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/ddim.h"
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void TriangularSolveGradKernel(const Context& dev_ctx,
+                               const DenseTensor& x,
+                               const DenseTensor& y,
+                               const DenseTensor& out,
+                               const DenseTensor& dout,
+                               bool upper,
+                               bool transpose,
+                               bool unitriangular,
+                               DenseTensor* dx,
+                               DenseTensor* dy);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/triangular_solve_kernel.h b/paddle/phi/kernels/triangular_solve_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..833de3f8439ee843577306ff146fa01ad4225390
--- /dev/null
+++ b/paddle/phi/kernels/triangular_solve_kernel.h
@@ -0,0 +1,30 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void TriangularSolveKernel(const Context& dev_ctx,
+                           const DenseTensor& x,
+                           const DenseTensor& y,
+                           bool upper,
+                           bool transpose,
+                           bool unitriangular,
+                           DenseTensor* out);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/truncated_gaussian_random_kernel.h b/paddle/phi/kernels/truncated_gaussian_random_kernel.h
index 0370cc431fef9cab69861b7f707f65c897e20fa2..c4c13578a989961839600d8ee403e478c76d1345 100644
--- a/paddle/phi/kernels/truncated_gaussian_random_kernel.h
+++ b/paddle/phi/kernels/truncated_gaussian_random_kernel.h
@@ -20,6 +20,7 @@
 #include "paddle/phi/common/scalar_array.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/device_context.h"
+#include "paddle/phi/infermeta/nullary.h"
 
 namespace phi {
 
@@ -140,25 +141,15 @@ T Erfinv(T x) {
 template <typename T>
 struct TruncatedNormal {
   T mean, std;
-  T a_normal_cdf;
-  T b_normal_cdf;
-  TruncatedNormal(T mean, T std) : mean(mean), std(std) {
-    auto normal_cdf = [](T x) {
-      return (1.0 + std::erf(x / std::sqrt(2.0))) / 2.0;
-    };
-    a_normal_cdf = normal_cdf(-2.0);
-    b_normal_cdf = normal_cdf(2.0);
-  }
-
+  TruncatedNormal(T mean, T std) : mean(mean), std(std) {}
   T operator()(T value) const {
-    auto p = a_normal_cdf + (b_normal_cdf - a_normal_cdf) * value;
-    return std::sqrt(2.0) * Erfinv(2 * p - 1) * std + mean;
+    return std::sqrt(2.0) * Erfinv(value) * std + mean;
   }
 };
 
 template <typename T, typename Context>
-void TruncatedGaussianRandomKernel(const Context& ctx,
-                                   const ScalarArray& shape,
+void TruncatedGaussianRandomKernel(const Context& dev_ctx,
+                                   const std::vector<int>& shape,
                                    float mean,
                                    float std,
                                    int seed,
diff --git a/paddle/phi/kernels/uniform_random_kernel.h b/paddle/phi/kernels/uniform_random_kernel.h
index 5bba127278541e61b142dbeb7d00f10ed3f8437b..36ce4c3f9eef58a55d41e0230c3ebfc3e768e6b3 100644
--- a/paddle/phi/kernels/uniform_random_kernel.h
+++ b/paddle/phi/kernels/uniform_random_kernel.h
@@ -17,7 +17,6 @@
 #include "paddle/phi/common/scalar_array.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/device_context.h"
-#include "paddle/phi/core/selected_rows.h"
 
 namespace phi {
 
@@ -42,25 +41,4 @@ void UniformRandomKernel(const Context& dev_ctx,
                          int seed,
                          DenseTensor* out);
 
-template <typename T, typename Context>
-void UniformRandomRawSRKernel(const Context& dev_ctx,
-                              const ScalarArray& shape,
-                              DataType dtype,
-                              float min,
-                              float max,
-                              int seed,
-                              int diag_num,
-                              int diag_step,
-                              float diag_val,
-                              SelectedRows* out);
-
-template <typename T, typename Context>
-void UniformRandomSRKernel(const Context& dev_ctx,
-                           const ScalarArray& shape,
-                           DataType dtype,
-                           float min,
-                           float max,
-                           int seed,
-                           SelectedRows* out);
-
 }  // namespace phi
diff --git a/paddle/phi/kernels/viterbi_decode_kernel.h b/paddle/phi/kernels/viterbi_decode_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..27eb94d89cec4a30a056490b40c13332fd808711
--- /dev/null
+++ b/paddle/phi/kernels/viterbi_decode_kernel.h
@@ -0,0 +1,30 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void ViterbiDecodeKernel(const Context& dev_ctx,
+                         const DenseTensor& input,
+                         const DenseTensor& transition,
+                         const DenseTensor& length,
+                         bool include_bos_eos_tag,
+                         DenseTensor* scores,
+                         DenseTensor* path);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/where_index_kernel.h b/paddle/phi/kernels/where_index_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..68b094637c8d55ba424f61f3afea642db073c13f
--- /dev/null
+++ b/paddle/phi/kernels/where_index_kernel.h
@@ -0,0 +1,26 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void WhereIndexKernel(const Context& dev_ctx,
+                      const DenseTensor& condition,
+                      DenseTensor* out);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/xpu/where_index_kernel.cc b/paddle/phi/kernels/xpu/where_index_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f6653e57f6eadf3e359a9281ce5737277d83b206
--- /dev/null
+++ b/paddle/phi/kernels/xpu/where_index_kernel.cc
@@ -0,0 +1,72 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/where_index_kernel.h"
+
+#include "paddle/fluid/memory/memcpy.h"
+#include "paddle/fluid/platform/device/xpu/xpu_header.h"
+#include "paddle/phi/backends/xpu/xpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void WhereIndexKernel(const Context& dev_ctx,
+                      const DenseTensor& condition,
+                      DenseTensor* out) {
+  const T* cond_data = condition.data<T>();
+  auto numel = condition.numel();
+  auto dims = condition.dims();
+  const int rank = dims.size();
+
+  xpu::ctx_guard RAII_GUARD(dev_ctx.x_context());
+  int* true_num = RAII_GUARD.alloc_l3_or_gm<int32_t>(1);
+  int true_num_cpu;
+  int ret = xpu::nonzero_count(dev_ctx.x_context(), cond_data, true_num, numel);
+  PADDLE_ENFORCE_EQ(
+      ret,
+      XPU_SUCCESS,
+      phi::errors::External(
+          "XPU nonzero_count kernel return wrong value[%d %s] in WhereIndex",
+          ret,
+          XPUAPIErrorMsg[ret]));
+
+  paddle::memory::Copy(phi::CPUPlace(),
+                       static_cast<void*>(&true_num_cpu),
+                       dev_ctx.GetPlace(),
+                       static_cast<void*>(true_num),
+                       sizeof(int32_t));
+
+  out->Resize(phi::make_ddim({static_cast<int64_t>(true_num_cpu), rank}));
+  auto* out_data = dev_ctx.template Alloc<int64_t>(out);
+
+  if (true_num_cpu == 0) {
+    return;
+  }
+
+  auto condition_shape = phi::vectorize<int>(dims);
+  ret = xpu::where(
+      dev_ctx.x_context(), cond_data, out_data, condition_shape, true_num_cpu);
+  PADDLE_ENFORCE_EQ(ret,
+                    XPU_SUCCESS,
+                    phi::errors::External(
+                        "XPU masked_select kernel return wrong value[%d %s]",
+                        ret,
+                        XPUAPIErrorMsg[ret]));
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(
+    where_index, XPU, ALL_LAYOUT, phi::WhereIndexKernel, int, bool, float) {}
diff --git a/paddle/phi/kernels/yolo_box_kernel.h b/paddle/phi/kernels/yolo_box_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..9553d300cad2b78719958077aea297a500c9359a
--- /dev/null
+++ b/paddle/phi/kernels/yolo_box_kernel.h
@@ -0,0 +1,36 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void YoloBoxKernel(const Context& dev_ctx,
+                   const DenseTensor& x,
+                   const DenseTensor& img_size,
+                   const std::vector<int>& anchors,
+                   int class_num,
+                   float conf_thresh,
+                   int downsample_ratio,
+                   bool clip_bbox,
+                   float scale_x_y,
+                   bool iou_aware,
+                   float iou_aware_factor,
+                   DenseTensor* boxes,
+                   DenseTensor* scores);
+
+}  // namespace phi
diff --git a/paddle/phi/ops/compat/activation_sig.cc b/paddle/phi/ops/compat/activation_sig.cc
new file mode 100644
index 0000000000000000000000000000000000000000..cbfca5b17ae995a89360c6d6d4987028d95dc281
--- /dev/null
+++ b/paddle/phi/ops/compat/activation_sig.cc
@@ -0,0 +1,120 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/phi/core/compat/op_utils.h"
+
+namespace phi {
+
+#define DefineActGradDepXOpArgMap(func_name, op_name, attrs) \
+  KernelSignature func_name##GradOpArgumentMapping(          \
+      const ArgumentMappingContext& ctx) {                   \
+    return KernelSignature(op_name "_grad",                  \
+                           {"X", GradVarName("Out")},        \
+                           {attrs},                          \
+                           {GradVarName("X")});              \
+  }
+
+#define DefineActGradDepOutOpArgMap(func_name, op_name, attrs) \
+  KernelSignature func_name##GradOpArgumentMapping(            \
+      const ArgumentMappingContext& ctx) {                     \
+    return KernelSignature(op_name "_grad",                    \
+                           {"Out", GradVarName("Out")},        \
+                           {attrs},                            \
+                           {GradVarName("X")});                \
+  }
+
+#define comma ,
+
+DefineActGradDepXOpArgMap(Cos, "cos", );                           // NOLINT
+DefineActGradDepXOpArgMap(Tan, "tan", );                           // NOLINT
+DefineActGradDepXOpArgMap(Acos, "acos", );                         // NOLINT
+DefineActGradDepXOpArgMap(Sin, "sin", );                           // NOLINT
+DefineActGradDepXOpArgMap(Asin, "asin", );                         // NOLINT
+DefineActGradDepXOpArgMap(Atan, "atan", );                         // NOLINT
+DefineActGradDepXOpArgMap(Sinh, "sinh", );                         // NOLINT
+DefineActGradDepXOpArgMap(Cosh, "cosh", );                         // NOLINT
+DefineActGradDepXOpArgMap(Asinh, "asinh", );                       // NOLINT
+DefineActGradDepXOpArgMap(Acosh, "acosh", );                       // NOLINT
+DefineActGradDepXOpArgMap(Atanh, "atanh", );                       // NOLINT
+DefineActGradDepXOpArgMap(BRelu, "brelu", "t_min" comma "t_max");  // NOLINT
+DefineActGradDepXOpArgMap(LeakyRelu, "leaky_relu", "alpha");       // NOLINT
+DefineActGradDepXOpArgMap(ThresholdedRelu,
+                          "thresholded_relu",
+                          "threshold");  // NOLINT
+
+DefineActGradDepOutOpArgMap(Relu, "relu", );  // NOLINT
+DefineActGradDepOutOpArgMap(Tanh, "tanh", );  // NOLINT
+
+KernelSignature ReluDoubleGradOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature("relu_double_grad", {"Out", "DDX"}, {}, {"DDOut"});
+}
+
+KernelSignature TanhDoubleGradOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature(
+      "tanh_double_grad", {"Out", "DDX", "DOut"}, {}, {"DOutNew", "DDOut"});
+}
+
+KernelSignature TanhTripleGradOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature("tanh_triple_grad",
+                         {"Out", "DDX", "DOut", "D_DDOut", "D_DOut_New"},
+                         {},
+                         {"D_OutNew", "D_DOut", "D_DDx"});
+}
+
+KernelSignature LeakyReluDoubleGradOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature(
+      "leaky_relu_double_grad", {"X", "DDX"}, {"alpha"}, {"DDOut"});
+}
+
+KernelSignature LeakyReluOpArgumentMapping(const ArgumentMappingContext& ctx) {
+  return KernelSignature("leaky_relu", {"X"}, {"alpha"}, {"Out"});
+}
+
+}  // namespace phi
+
+PD_REGISTER_BASE_KERNEL_NAME(relu_grad_grad, relu_double_grad);
+PD_REGISTER_BASE_KERNEL_NAME(tanh_grad_grad, tanh_double_grad);
+PD_REGISTER_BASE_KERNEL_NAME(leaky_relu_grad_grad, leaky_relu_double_grad);
+
+PD_REGISTER_ARG_MAPPING_FN(cos_grad, phi::CosGradOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(tan_grad, phi::TanGradOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(acos_grad, phi::AcosGradOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(sin_grad, phi::SinGradOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(asin_grad, phi::AsinGradOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(atan_grad, phi::AtanGradOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(sinh_grad, phi::SinhGradOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(cosh_grad, phi::CoshGradOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(asinh_grad, phi::AsinhGradOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(acosh_grad, phi::AcoshGradOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(atanh_grad, phi::AtanhGradOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(relu_grad, phi::ReluGradOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(relu_grad_grad,
+                           phi::ReluDoubleGradOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(tanh_grad, phi::TanhGradOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(tanh_grad_grad,
+                           phi::TanhDoubleGradOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(tanh_triple_grad,
+                           phi::TanhTripleGradOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(brelu_grad, phi::BReluGradOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(leaky_relu, phi::LeakyReluOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(leaky_relu_grad,
+                           phi::LeakyReluGradOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(leaky_relu_grad_grad,
+                           phi::LeakyReluDoubleGradOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(thresholded_relu_grad,
+                           phi::ThresholdedReluGradOpArgumentMapping);
diff --git a/paddle/phi/ops/compat/allclose_sig.cc b/paddle/phi/ops/compat/allclose_sig.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e5c4fc027b54225cfdbcc67498eed18789922bd3
--- /dev/null
+++ b/paddle/phi/ops/compat/allclose_sig.cc
@@ -0,0 +1,49 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/core/compat/op_utils.h"
+
+namespace phi {
+
+KernelSignature AllCloseOpArgumentMapping(const ArgumentMappingContext& ctx) {
+  if (ctx.HasInput("Rtol")) {
+    if (ctx.HasInput("Atol")) {
+      return KernelSignature("allclose",
+                             {"Input", "Other"},
+                             {"Rtol", "Atol", "equal_nan"},
+                             {"Out"});
+    } else {
+      return KernelSignature("allclose",
+                             {"Input", "Other"},
+                             {"Rtol", "atol", "equal_nan"},
+                             {"Out"});
+    }
+  } else {
+    if (ctx.HasInput("Atol")) {
+      return KernelSignature("allclose",
+                             {"Input", "Other"},
+                             {"rtol", "Atol", "equal_nan"},
+                             {"Out"});
+    } else {
+      return KernelSignature("allclose",
+                             {"Input", "Other"},
+                             {"rtol", "atol", "equal_nan"},
+                             {"Out"});
+    }
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_ARG_MAPPING_FN(allclose, phi::AllCloseOpArgumentMapping);
diff --git a/paddle/phi/ops/compat/argsort_sig.cc b/paddle/phi/ops/compat/argsort_sig.cc
new file mode 100644
index 0000000000000000000000000000000000000000..62133a441ff126af9c1d65cf7c8af6f6571d8b32
--- /dev/null
+++ b/paddle/phi/ops/compat/argsort_sig.cc
@@ -0,0 +1,29 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/core/compat/op_utils.h"
+
+namespace phi {
+
+KernelSignature ArgsortGradOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature("argsort_grad",
+                         {"Indices", "X", GradVarName("Out")},
+                         {"axis", "descending"},
+                         {GradVarName("X")});
+}
+
+}  // namespace phi
+
+PD_REGISTER_ARG_MAPPING_FN(argsort_grad, phi::ArgsortGradOpArgumentMapping);
diff --git a/paddle/phi/ops/compat/batch_norm_sig.cc b/paddle/phi/ops/compat/batch_norm_sig.cc
new file mode 100644
index 0000000000000000000000000000000000000000..011d4c12ecefc5b69eec4bf15425aaa648666159
--- /dev/null
+++ b/paddle/phi/ops/compat/batch_norm_sig.cc
@@ -0,0 +1,89 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/core/compat/op_utils.h"
+
+namespace phi {
+
+KernelSignature BatchNormOpArgumentMapping(const ArgumentMappingContext& ctx) {
+  return KernelSignature("batch_norm",
+                         {"X", "Scale", "Bias", "Mean", "Variance"},
+                         {"momentum",
+                          "epsilon",
+                          "data_layout",
+                          "is_test",
+                          "use_global_stats",
+                          "trainable_statistics",
+                          "fuse_with_relu"},
+                         {"Y",
+                          "MeanOut",
+                          "VarianceOut",
+                          "SavedMean",
+                          "SavedVariance",
+                          "ReserveSpace"});
+}
+
+KernelSignature BatchNormGradOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature(
+      "batch_norm_grad",
+      {GradVarName("Y"),
+       "X",
+       "Scale",
+       "Bias",
+       "SavedMean",
+       "SavedVariance",
+       "ReserveSpace",
+       "Mean",
+       "Variance"},
+      {"momentum",
+       "epsilon",
+       "data_layout",
+       "is_test",
+       "use_global_stats",
+       "trainable_statistics",
+       "fuse_with_relu"},
+      {GradVarName("X"), GradVarName("Scale"), GradVarName("Bias")});
+}
+
+KernelSignature BatchNormGradGradOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature("batch_norm_grad_grad",
+                         {"DDX",
+                          "DDScale",
+                          "DDBias",
+                          "DY",
+                          "X",
+                          "Scale",
+                          "SavedMean",
+                          "SavedVariance",
+                          "Mean",
+                          "Variance"},
+                         {"momentum",
+                          "epsilon",
+                          "data_layout",
+                          "is_test",
+                          "use_global_stats",
+                          "trainable_statistics",
+                          "fuse_with_relu"},
+                         {"DX", "DScale", "DDY"});
+}
+
+}  // namespace phi
+
+PD_REGISTER_ARG_MAPPING_FN(batch_norm, phi::BatchNormOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(batch_norm_grad,
+                           phi::BatchNormGradOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(batch_norm_grad_grad,
+                           phi::BatchNormGradGradOpArgumentMapping);
diff --git a/paddle/phi/ops/compat/bincount_sig.cc b/paddle/phi/ops/compat/bincount_sig.cc
new file mode 100644
index 0000000000000000000000000000000000000000..35067c256ed495b0e2156bef87c943334e0ef61f
--- /dev/null
+++ b/paddle/phi/ops/compat/bincount_sig.cc
@@ -0,0 +1,25 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/core/compat/op_utils.h"
+
+namespace phi {
+
+KernelSignature BincountOpArgumentMapping(const ArgumentMappingContext& ctx) {
+  return KernelSignature("bincount", {"X", "Weights"}, {"minlength"}, {"Out"});
+}
+
+}  // namespace phi
+
+PD_REGISTER_ARG_MAPPING_FN(bincount, phi::BincountOpArgumentMapping);
diff --git a/paddle/phi/ops/compat/cholesky_solve_sig.cc b/paddle/phi/ops/compat/cholesky_solve_sig.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6a9759f8352a0e073fcd282c6ec40f73adea9e7f
--- /dev/null
+++ b/paddle/phi/ops/compat/cholesky_solve_sig.cc
@@ -0,0 +1,30 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/core/compat/op_utils.h"
+
+namespace phi {
+
+KernelSignature CholeskySolveGradOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature("cholesky_solve_grad",
+                         {"X", "Y", "Out", GradVarName("Out")},
+                         {"upper"},
+                         {GradVarName("X"), GradVarName("Y")});
+}
+
+}  // namespace phi
+
+PD_REGISTER_ARG_MAPPING_FN(cholesky_solve_grad,
+                           phi::CholeskySolveGradOpArgumentMapping);
diff --git a/paddle/phi/ops/compat/compare_sig.cc b/paddle/phi/ops/compat/compare_sig.cc
new file mode 100644
index 0000000000000000000000000000000000000000..964c7be3db3cff7ce79cf5ec619fc6d26fb99e0e
--- /dev/null
+++ b/paddle/phi/ops/compat/compare_sig.cc
@@ -0,0 +1,56 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/core/compat/op_utils.h"
+
+namespace phi {
+
+KernelSignature LessThanArgumentMapping(const ArgumentMappingContext& ctx) {
+  return KernelSignature("less_than", {"X", "Y"}, {"axis"}, {"Out"});
+}
+
+KernelSignature LessEqualArgumentMapping(const ArgumentMappingContext& ctx) {
+  return KernelSignature("less_equal", {"X", "Y"}, {"axis"}, {"Out"});
+}
+
+KernelSignature GreaterThanArgumentMapping(const ArgumentMappingContext& ctx) {
+  return KernelSignature("greater_than", {"X", "Y"}, {"axis"}, {"Out"});
+}
+
+KernelSignature GreaterEqualArgumentMapping(const ArgumentMappingContext& ctx) {
+  return KernelSignature("greater_equal", {"X", "Y"}, {"axis"}, {"Out"});
+}
+
+KernelSignature EqualArgumentMapping(const ArgumentMappingContext& ctx) {
+  return KernelSignature("equal", {"X", "Y"}, {"axis"}, {"Out"});
+}
+
+KernelSignature NotEqualArgumentMapping(const ArgumentMappingContext& ctx) {
+  return KernelSignature("not_equal", {"X", "Y"}, {"axis"}, {"Out"});
+}
+
+KernelSignature EqualAllArgumentMapping(const ArgumentMappingContext& ctx) {
+  return KernelSignature("equal_all", {"X", "Y"}, {}, {"Out"});
+}
+
+}  // namespace phi
+
+PD_REGISTER_ARG_MAPPING_FN(less_than, phi::LessThanArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(less_equal, phi::LessEqualArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(greater_than, phi::GreaterThanArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(greater_equal, phi::GreaterEqualArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(equal, phi::EqualArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(not_equal, phi::NotEqualArgumentMapping);
+
+PD_REGISTER_ARG_MAPPING_FN(equal_all, phi::EqualAllArgumentMapping);
diff --git a/paddle/phi/ops/compat/conv2d_sig.cc b/paddle/phi/ops/compat/conv2d_sig.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a755fdb19ec4b86d4b5265c1d6bce5eecdb9b5b3
--- /dev/null
+++ b/paddle/phi/ops/compat/conv2d_sig.cc
@@ -0,0 +1,70 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/core/compat/op_utils.h"
+
+namespace phi {
+
+KernelSignature Conv2dOpArgumentMapping(const ArgumentMappingContext& ctx) {
+  return KernelSignature("conv2d",
+                         {"Input", "Filter"},
+                         {"strides",
+                          "paddings",
+                          "padding_algorithm",
+                          "groups",
+                          "dilations",
+                          "data_format",
+                          "use_addto",
+                          "workspace_size_MB",
+                          "exhaustive_search"},
+                         {"Output"});
+}
+
+KernelSignature Conv2dGradOpArgumentMapping(const ArgumentMappingContext& ctx) {
+  return KernelSignature("conv2d_grad",
+                         {GradVarName("Output"), "Input", "Filter"},
+                         {"strides",
+                          "paddings",
+                          "padding_algorithm",
+                          "groups",
+                          "dilations",
+                          "data_format",
+                          "use_addto",
+                          "workspace_size_MB",
+                          "exhaustive_search"},
+                         {GradVarName("Input"), GradVarName("Filter")});
+}
+
+KernelSignature Conv2dDoubleGradOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature("conv2d_grad_grad",
+                         {"DDInput", "DDFilter", "DOutput", "Input", "Filter"},
+                         {"strides",
+                          "paddings",
+                          "padding_algorithm",
+                          "groups",
+                          "dilations",
+                          "data_format",
+                          "use_addto",
+                          "workspace_size_MB",
+                          "exhaustive_search"},
+                         {"DDOutput", "DInput", "DFilter"});
+}
+
+}  // namespace phi
+
+PD_REGISTER_ARG_MAPPING_FN(conv2d, phi::Conv2dOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(conv2d_grad, phi::Conv2dGradOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(conv2d_grad_grad,
+                           phi::Conv2dDoubleGradOpArgumentMapping);
diff --git a/paddle/phi/ops/compat/conv3d_sig.cc b/paddle/phi/ops/compat/conv3d_sig.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a036afac82a8d49455b1a226e62f5fe757d4b4b9
--- /dev/null
+++ b/paddle/phi/ops/compat/conv3d_sig.cc
@@ -0,0 +1,70 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/core/compat/op_utils.h"
+
+namespace phi {
+
+KernelSignature Conv3dOpArgumentMapping(const ArgumentMappingContext& ctx) {
+  return KernelSignature("conv3d",
+                         {"Input", "Filter"},
+                         {"strides",
+                          "paddings",
+                          "padding_algorithm",
+                          "groups",
+                          "dilations",
+                          "data_format",
+                          "use_addto",
+                          "workspace_size_MB",
+                          "exhaustive_search"},
+                         {"Output"});
+}
+
+KernelSignature Conv3dGradOpArgumentMapping(const ArgumentMappingContext& ctx) {
+  return KernelSignature("conv2d_grad",
+                         {GradVarName("Output"), "Input", "Filter"},
+                         {"strides",
+                          "paddings",
+                          "padding_algorithm",
+                          "groups",
+                          "dilations",
+                          "data_format",
+                          "use_addto",
+                          "workspace_size_MB",
+                          "exhaustive_search"},
+                         {GradVarName("Input"), GradVarName("Filter")});
+}
+
+KernelSignature Conv3dDoubleGradOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature("conv3d_grad_grad",
+                         {"DDInput", "DDFilter", "DOutput", "Input", "Filter"},
+                         {"strides",
+                          "paddings",
+                          "padding_algorithm",
+                          "groups",
+                          "dilations",
+                          "data_format",
+                          "use_addto",
+                          "workspace_size_MB",
+                          "exhaustive_search"},
+                         {"DDOutput", "DInput", "DFilter"});
+}
+
+}  // namespace phi
+
+PD_REGISTER_ARG_MAPPING_FN(conv3d, phi::Conv3dOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(conv3d_grad, phi::Conv3dGradOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(conv3d_grad_grad,
+                           phi::Conv3dDoubleGradOpArgumentMapping);
diff --git a/paddle/phi/ops/compat/cumprod_sig.cc b/paddle/phi/ops/compat/cumprod_sig.cc
new file mode 100644
index 0000000000000000000000000000000000000000..59b4eabfa47aa2693c4070ca3e4d8c54f5a42d32
--- /dev/null
+++ b/paddle/phi/ops/compat/cumprod_sig.cc
@@ -0,0 +1,29 @@
+
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/core/compat/op_utils.h"
+
+namespace phi {
+
+KernelSignature CumprodGradGradOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature("cumprod_grad",
+                         {"X", "Out", GradVarName("Out")},
+                         {"dim"},
+                         {GradVarName("X")});
+}
+
+}  // namespace phi
+PD_REGISTER_ARG_MAPPING_FN(cumprod_grad, phi::CumprodGradGradOpArgumentMapping);
diff --git a/paddle/phi/ops/compat/depthwise_conv2d_sig.cc b/paddle/phi/ops/compat/depthwise_conv2d_sig.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e2b6801f73bcdb0f20090e3ea7e75b7257bde4e3
--- /dev/null
+++ b/paddle/phi/ops/compat/depthwise_conv2d_sig.cc
@@ -0,0 +1,77 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/core/compat/op_utils.h"
+
+namespace phi {
+
+KernelSignature DepthwiseConv2dOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature("depthwise_conv2d",
+                         {"Input", "Filter"},
+                         {"strides",
+                          "paddings",
+                          "padding_algorithm",
+                          "groups",
+                          "dilations",
+                          "data_format",
+                          "use_addto",
+                          "workspace_size_MB",
+                          "exhaustive_search",
+                          "fuse_relu_before_depthwise_conv"},
+                         {"Output"});
+}
+
+KernelSignature DepthwiseConv2dGradOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature("depthwise_conv2d_grad",
+                         {GradVarName("Output"), "Input", "Filter"},
+                         {"strides",
+                          "paddings",
+                          "padding_algorithm",
+                          "groups",
+                          "dilations",
+                          "data_format",
+                          "use_addto",
+                          "workspace_size_MB",
+                          "exhaustive_search",
+                          "fuse_relu_before_depthwise_conv"},
+                         {GradVarName("Input"), GradVarName("Filter")});
+}
+
+KernelSignature DepthwiseConv2dDoubleGradOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature("depthwise_conv2d_grad_grad",
+                         {"DDInput", "DDFilter", "DOutput", "Input", "Filter"},
+                         {"strides",
+                          "paddings",
+                          "padding_algorithm",
+                          "groups",
+                          "dilations",
+                          "data_format",
+                          "use_addto",
+                          "workspace_size_MB",
+                          "exhaustive_search",
+                          "fuse_relu_before_depthwise_conv"},
+                         {"DDOutput", "DInput", "DFilter"});
+}
+
+}  // namespace phi
+
+PD_REGISTER_ARG_MAPPING_FN(depthwise_conv2d,
+                           phi::DepthwiseConv2dOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(depthwise_conv2d_grad,
+                           phi::DepthwiseConv2dGradOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(depthwise_conv2d_grad_grad,
+                           phi::DepthwiseConv2dDoubleGradOpArgumentMapping);
diff --git a/paddle/phi/ops/compat/diag_sig.cc b/paddle/phi/ops/compat/diag_sig.cc
index 0a14b9095c8343f47e1d6aa039c9aced963984ce..f3245b922c0d913a87b58f813bd0ca142ecb6287 100644
--- a/paddle/phi/ops/compat/diag_sig.cc
+++ b/paddle/phi/ops/compat/diag_sig.cc
@@ -20,8 +20,15 @@ KernelSignature DiagOpArgumentMapping(const ArgumentMappingContext& ctx) {
   return KernelSignature("diag", {"X"}, {"offset", "padding_value"}, {"Out"});
 }
 
+KernelSignature DiagGradOpArgumentMapping(const ArgumentMappingContext& ctx) {
+  return KernelSignature(
+      "diag_grad", {"X", GradVarName("Out")}, {"offset"}, {GradVarName("X")});
+}
+
 }  // namespace phi
 
 PD_REGISTER_BASE_KERNEL_NAME(diag_v2, diag);
+PD_REGISTER_BASE_KERNEL_NAME(diag_v2_grad, diag_grad);
 
 PD_REGISTER_ARG_MAPPING_FN(diag_v2, phi::DiagOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(diag_v2_grad, phi::DiagGradOpArgumentMapping);
diff --git a/paddle/phi/ops/compat/digamma_sig.cc b/paddle/phi/ops/compat/digamma_sig.cc
index fa693f92c6fe3ade527953b632bf94cf4c1b10c1..12ef3056f1e680398e7ded901e72ed201d2f4a17 100644
--- a/paddle/phi/ops/compat/digamma_sig.cc
+++ b/paddle/phi/ops/compat/digamma_sig.cc
@@ -19,7 +19,7 @@ namespace phi {
 KernelSignature DigammaGradOpArgumentMapping(
     const ArgumentMappingContext& ctx) {
   return KernelSignature(
-      "digamma_grad", {GradVarName("Out"), "X"}, {}, {GradVarName("X")});
+      "digamma_grad", {"X", GradVarName("Out")}, {}, {GradVarName("X")});
 }
 
 }  // namespace phi
diff --git a/paddle/phi/ops/compat/dist_sig.cc b/paddle/phi/ops/compat/dist_sig.cc
new file mode 100644
index 0000000000000000000000000000000000000000..18a30b9b840483d7df3b3b009d079aea35a7d6bc
--- /dev/null
+++ b/paddle/phi/ops/compat/dist_sig.cc
@@ -0,0 +1,28 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/phi/core/compat/op_utils.h"
+
+namespace phi {
+
+KernelSignature DistGradOpArgumentMapping(const ArgumentMappingContext& ctx) {
+  return KernelSignature("dist_grad",
+                         {"X", "Y", "Out", GradVarName("Out")},
+                         {"p"},
+                         {GradVarName("X"), GradVarName("Y")});
+}
+
+}  // namespace phi
+
+PD_REGISTER_ARG_MAPPING_FN(dist_grad, phi::DistGradOpArgumentMapping);
diff --git a/paddle/phi/ops/compat/dropout_sig.cc b/paddle/phi/ops/compat/dropout_sig.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6bf229c98bd07face5a5ba7778318cf1662f29a9
--- /dev/null
+++ b/paddle/phi/ops/compat/dropout_sig.cc
@@ -0,0 +1,38 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/core/compat/op_utils.h"
+
+namespace phi {
+
+KernelSignature DropoutOpArgumentMapping(const ArgumentMappingContext& ctx) {
+  return KernelSignature(
+      "dropout",
+      {"X", "Seed"},
+      {"dropout_prob", "is_test", "dropout_implementation", "seed", "fix_seed"},
+      {"Out", "Mask"});
+}
+
+KernelSignature DropoutGradOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature("dropout_grad",
+                         {"Mask", GradVarName("Out")},
+                         {"dropout_prob", "is_test", "dropout_implementation"},
+                         {GradVarName("X")});
+}
+
+}  // namespace phi
+
+PD_REGISTER_ARG_MAPPING_FN(dropout, phi::DropoutOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(dropout_grad, phi::DropoutGradOpArgumentMapping);
diff --git a/paddle/phi/ops/compat/eigh_sig.cc b/paddle/phi/ops/compat/eigh_sig.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e50a9a5a12a56493b8b81a1eacdb12051a3c362f
--- /dev/null
+++ b/paddle/phi/ops/compat/eigh_sig.cc
@@ -0,0 +1,31 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/core/compat/op_utils.h"
+
+namespace phi {
+
+KernelSignature EighGradOpArgumentMapping(const ArgumentMappingContext& ctx) {
+  return KernelSignature("eigh_grad",
+                         {"Eigenvalues",
+                          "Eigenvectors",
+                          GradVarName("Eigenvalues"),
+                          GradVarName("Eigenvectors")},
+                         {},
+                         {GradVarName("X")});
+}
+
+}  // namespace phi
+
+PD_REGISTER_ARG_MAPPING_FN(eigh_grad, phi::EighGradOpArgumentMapping);
diff --git a/paddle/phi/ops/compat/elementwise_sig.cc b/paddle/phi/ops/compat/elementwise_sig.cc
index cddebcbce1273a2f88d7b1ce50f1c340d313ecf0..1d2aaa04f05d205483dbda5c738c7499ad068881 100644
--- a/paddle/phi/ops/compat/elementwise_sig.cc
+++ b/paddle/phi/ops/compat/elementwise_sig.cc
@@ -100,6 +100,69 @@ KernelSignature ElementwiseSubGradOpArgumentMapping(
   return KernelSignature("unregistered", {}, {}, {});
 }
 
+KernelSignature ElementwiseSubDoubleGradOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature(
+      "subtract_double_grad", {"Y", "DDX", "DDY", "DOut"}, {"axis"}, {"DDOut"});
+}
+
+KernelSignature ElementwiseDivGradOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature("divide_grad",
+                         {"X", "Y", "Out", GradVarName("Out")},
+                         {"axis"},
+                         {GradVarName("X"), GradVarName("Y")});
+}
+
+KernelSignature ElementwiseFMinGradOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature("elementwise_fmin_grad",
+                         {"X", "Y", GradVarName("Out")},
+                         {"axis"},
+                         {GradVarName("X"), GradVarName("Y")});
+}
+
+KernelSignature ElementwiseDivDoubleGradOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature("divide_double_grad",
+                         {"Y", "Out", "DX", "DDX", "DDY"},
+                         {"axis"},
+                         {GradVarName("Y"), "DOut", "DDOut"});
+}
+
+KernelSignature ElementwiseMulGradOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature("multiply_grad",
+                         {"X", "Y", GradVarName("Out")},
+                         {"axis"},
+                         {GradVarName("X"), GradVarName("Y")});
+}
+
+KernelSignature ElementwiseFMaxGradOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature("elementwise_fmax_grad",
+                         {"X", "Y", GradVarName("Out")},
+                         {"axis"},
+                         {GradVarName("X"), GradVarName("Y")});
+}
+
+KernelSignature ElementwiseMulDoubleGradOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature("multiply_double_grad",
+                         {"X", "Y", "DOut", "DDX", "DDY"},
+                         {"axis"},
+                         {GradVarName("X"), GradVarName("Y"), "DDOut"});
+}
+
+KernelSignature ElementwiseMulTripleGradOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature(
+      "multiply_triple_grad",
+      {"X", "Y", "DOut", "DDX", "DDY", "D_DX", "D_DY", "D_DDOut"},
+      {"axis"},
+      {"D_X", "D_Y", "D_DOut", "D_DDX", "D_DDY"});
+}
+
 }  // namespace phi
 
 PD_REGISTER_BASE_KERNEL_NAME(elementwise_add, add);
@@ -110,6 +173,12 @@ PD_REGISTER_BASE_KERNEL_NAME(elementwise_add_grad, add_grad);
 PD_REGISTER_BASE_KERNEL_NAME(elementwise_add_grad_grad, add_double_grad);
 PD_REGISTER_BASE_KERNEL_NAME(elementwise_add_triple_grad, add_triple_grad);
 PD_REGISTER_BASE_KERNEL_NAME(elementwise_sub_grad, subtract_grad);
+PD_REGISTER_BASE_KERNEL_NAME(elementwise_sub_grad_grad, subtract_double_grad);
+PD_REGISTER_BASE_KERNEL_NAME(elementwise_div_grad, divide_grad);
+PD_REGISTER_BASE_KERNEL_NAME(elementwise_div_grad_grad, divide_double_grad);
+PD_REGISTER_BASE_KERNEL_NAME(elementwise_mul_grad, multiply_grad);
+PD_REGISTER_BASE_KERNEL_NAME(elementwise_mul_grad_grad, multiply_double_grad);
+PD_REGISTER_BASE_KERNEL_NAME(elementwise_mul_triple_grad, multiply_triple_grad);
 
 PD_REGISTER_ARG_MAPPING_FN(elementwise_add,
                            phi::ElementwiseAddOpArgumentMapping);
@@ -127,3 +196,21 @@ PD_REGISTER_ARG_MAPPING_FN(elementwise_add_triple_grad,
                            phi::ElementwiseAddTripleGradOpArgumentMapping);
 PD_REGISTER_ARG_MAPPING_FN(elementwise_sub_grad,
                            phi::ElementwiseSubGradOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(elementwise_sub_grad_grad,
+                           phi::ElementwiseSubDoubleGradOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(elementwise_div_grad,
+                           phi::ElementwiseDivGradOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(elementwise_div_grad_grad,
+                           phi::ElementwiseDivDoubleGradOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(elementwise_mul_grad,
+                           phi::ElementwiseMulGradOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(elementwise_mul_grad_grad,
+                           phi::ElementwiseMulDoubleGradOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(elementwise_mul_triple_grad,
+                           phi::ElementwiseMulTripleGradOpArgumentMapping);
+
+PD_REGISTER_ARG_MAPPING_FN(elementwise_fmax_grad,
+                           phi::ElementwiseFMaxGradOpArgumentMapping);
+
+PD_REGISTER_ARG_MAPPING_FN(elementwise_fmin_grad,
+                           phi::ElementwiseFMinGradOpArgumentMapping);
diff --git a/paddle/phi/ops/compat/erf_sig.cc b/paddle/phi/ops/compat/erf_sig.cc
new file mode 100644
index 0000000000000000000000000000000000000000..784727a98042db820b4f83bac84014ebd0e1302e
--- /dev/null
+++ b/paddle/phi/ops/compat/erf_sig.cc
@@ -0,0 +1,26 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/core/compat/op_utils.h"
+
+namespace phi {
+
+KernelSignature ErfGradOpArgumentMapping(const ArgumentMappingContext& ctx) {
+  return KernelSignature(
+      "erf_grad", {"X", GradVarName("Out")}, {}, {GradVarName("X")});
+}
+
+}  // namespace phi
+
+PD_REGISTER_ARG_MAPPING_FN(erf_grad, phi::ErfGradOpArgumentMapping);
diff --git a/paddle/phi/ops/compat/expand_as_sig.cc b/paddle/phi/ops/compat/expand_as_sig.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a616b63c10b3c6ef9bca8c906655da99e8912244
--- /dev/null
+++ b/paddle/phi/ops/compat/expand_as_sig.cc
@@ -0,0 +1,38 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/core/compat/op_utils.h"
+
+namespace phi {
+
+KernelSignature ExpandAsOpArgumentMapping(const ArgumentMappingContext& ctx) {
+  return KernelSignature("expand_as", {"X", "Y"}, {"target_shape"}, {"Out"});
+}
+
+KernelSignature ExpandAsGradOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature("expand_as_grad",
+                         {"X", GradVarName("Out")},
+                         {"target_shape"},
+                         {GradVarName("X")});
+}
+
+}  // namespace phi
+
+PD_REGISTER_BASE_KERNEL_NAME(expand_as_v2, expand_as);
+PD_REGISTER_BASE_KERNEL_NAME(expand_as_v2_grad, expand_as_grad);
+
+PD_REGISTER_ARG_MAPPING_FN(expand_as_v2, phi::ExpandAsOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(expand_as_v2_grad,
+                           phi::ExpandAsGradOpArgumentMapping);
diff --git a/paddle/phi/ops/compat/gather_scatter_sig.cc b/paddle/phi/ops/compat/gather_scatter_sig.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f71e30f85b09df041b02fbd4f34b69c0e85f92da
--- /dev/null
+++ b/paddle/phi/ops/compat/gather_scatter_sig.cc
@@ -0,0 +1,46 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/core/compat/op_utils.h"
+
+namespace phi {
+
+KernelSignature GatherNdGradArgumentMapping(const ArgumentMappingContext& ctx) {
+  return KernelSignature("gather_nd_grad",
+                         {"X", "Index", GradVarName("Out")},
+                         {},
+                         {GradVarName("X")});
+}
+
+KernelSignature ScatterGradArgumentMapping(const ArgumentMappingContext& ctx) {
+  return KernelSignature("scatter_grad",
+                         {"Ids", "Updates", GradVarName("Out")},
+                         {"overwrite"},
+                         {GradVarName("X"), GradVarName("Updates")});
+}
+
+KernelSignature ScatterNdAddGradArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature("scatter_nd_add_grad",
+                         {"Index", "Updates", GradVarName("Out")},
+                         {},
+                         {GradVarName("X"), GradVarName("Updates")});
+}
+
+}  // namespace phi
+
+PD_REGISTER_ARG_MAPPING_FN(gather_nd_grad, phi::GatherNdGradArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(scatter_grad, phi::ScatterGradArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(scatter_nd_add_grad,
+                           phi::ScatterNdAddGradArgumentMapping);
diff --git a/paddle/phi/ops/compat/gaussian_random_sig.cc b/paddle/phi/ops/compat/gaussian_random_sig.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2f2b157e4c0f950c75ce2d8a66455127f51752fa
--- /dev/null
+++ b/paddle/phi/ops/compat/gaussian_random_sig.cc
@@ -0,0 +1,54 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/core/compat/op_utils.h"
+
+namespace phi {
+
+KernelSignature GaussianRandomOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  const auto& shape = paddle::any_cast<std::vector<int64_t>>(ctx.Attr("shape"));
+  if (ctx.InputSize("ShapeTensorList") > 0) {
+    // Infer output shape by Attr("shape") in CompileTime if it is specified.
+    if (!ctx.IsRuntime() && !shape.empty()) {
+      return KernelSignature("gaussian_random",
+                             {},
+                             {"shape", "mean", "std", "seed", "dtype"},
+                             {"Out"});
+    } else {
+      return KernelSignature(
+          "gaussian_random",
+          {},
+          {"ShapeTensorList", "mean", "std", "seed", "dtype"},
+          {"Out"});
+    }
+  }
+
+  if (ctx.HasInput("ShapeTensor") && shape.empty()) {
+    return KernelSignature("gaussian_random",
+                           {},
+                           {"ShapeTensor", "mean", "std", "seed", "dtype"},
+                           {"Out"});
+  }
+
+  return KernelSignature("gaussian_random",
+                         {},
+                         {"shape", "mean", "std", "seed", "dtype"},
+                         {"Out"});
+}
+
+}  // namespace phi
+
+PD_REGISTER_ARG_MAPPING_FN(gaussian_random,
+                           phi::GaussianRandomOpArgumentMapping);
diff --git a/paddle/phi/ops/compat/graph_send_recv_sig.cc b/paddle/phi/ops/compat/graph_send_recv_sig.cc
new file mode 100644
index 0000000000000000000000000000000000000000..dacb8b25a89f9c151198f573c3fc4fda37e2939e
--- /dev/null
+++ b/paddle/phi/ops/compat/graph_send_recv_sig.cc
@@ -0,0 +1,31 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/phi/core/compat/op_utils.h"
+
+namespace phi {
+
+KernelSignature GraphSendRecvGradOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature(
+      "graph_send_recv_grad",
+      {GradVarName("Out"), "X", "Out", "Src_index", "Dst_index", "Dst_count"},
+      {"pool_type"},
+      {GradVarName("X")});
+}
+
+}  // namespace phi
+
+PD_REGISTER_ARG_MAPPING_FN(graph_send_recv_grad,
+                           phi::GraphSendRecvGradOpArgumentMapping);
diff --git a/paddle/phi/ops/compat/isclose_sig.cc b/paddle/phi/ops/compat/isclose_sig.cc
new file mode 100644
index 0000000000000000000000000000000000000000..08632e990958dd5e19d46c7c5a0ba093c10e65f1
--- /dev/null
+++ b/paddle/phi/ops/compat/isclose_sig.cc
@@ -0,0 +1,50 @@
+
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/core/compat/op_utils.h"
+
+namespace phi {
+
+KernelSignature IscloseOpArgumentMapping(const ArgumentMappingContext& ctx) {
+  if (ctx.HasInput("Rtol")) {
+    if (ctx.HasInput("Atol")) {
+      return KernelSignature("isclose",
+                             {"Input", "Other"},
+                             {"Rtol", "Atol", "equal_nan"},
+                             {"Out"});
+
+    } else {
+      return KernelSignature("isclose",
+                             {"Input", "Other"},
+                             {"Rtol", "atol", "equal_nan"},
+                             {"Out"});
+    }
+  } else {
+    if (ctx.HasInput("Atol")) {
+      return KernelSignature("isclose",
+                             {"Input", "Other"},
+                             {"rtol", "Atol", "equal_nan"},
+                             {"Out"});
+    } else {
+      return KernelSignature("isclose",
+                             {"Input", "Other"},
+                             {"rtol", "atol", "equal_nan"},
+                             {"Out"});
+    }
+  }
+}
+
+}  // namespace phi
+PD_REGISTER_ARG_MAPPING_FN(isclose, phi::IscloseOpArgumentMapping);
diff --git a/paddle/phi/ops/compat/isfinite_sig.cc b/paddle/phi/ops/compat/isfinite_sig.cc
new file mode 100644
index 0000000000000000000000000000000000000000..218b4c2f962c48fda233d290e0bb1bdb667a724d
--- /dev/null
+++ b/paddle/phi/ops/compat/isfinite_sig.cc
@@ -0,0 +1,19 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/core/compat/op_utils.h"
+
+PD_REGISTER_BASE_KERNEL_NAME(isinf_v2, isinf);
+PD_REGISTER_BASE_KERNEL_NAME(isnan_v2, isnan);
+PD_REGISTER_BASE_KERNEL_NAME(isfinite_v2, isfinite);
diff --git a/paddle/phi/ops/compat/kldiv_loss_sig.cc b/paddle/phi/ops/compat/kldiv_loss_sig.cc
new file mode 100644
index 0000000000000000000000000000000000000000..22d2f074e9f13c7ba65c6bcbb4b5542881d4128c
--- /dev/null
+++ b/paddle/phi/ops/compat/kldiv_loss_sig.cc
@@ -0,0 +1,30 @@
+
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/core/compat/op_utils.h"
+
+namespace phi {
+
+KernelSignature KLDivLossGradOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature("kldiv_loss_grad",
+                         {"X", "Target", GradVarName("Loss")},
+                         {"reduction"},
+                         {GradVarName("X")});
+}
+
+}  // namespace phi
+PD_REGISTER_ARG_MAPPING_FN(kldiv_loss_grad,
+                           phi::KLDivLossGradOpArgumentMapping);
diff --git a/paddle/phi/ops/compat/kron_sig.cc b/paddle/phi/ops/compat/kron_sig.cc
new file mode 100644
index 0000000000000000000000000000000000000000..06b6545f58e7c12964f82fd8b6199270c519c16a
--- /dev/null
+++ b/paddle/phi/ops/compat/kron_sig.cc
@@ -0,0 +1,28 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/core/compat/op_utils.h"
+
+namespace phi {
+
+KernelSignature KronGradOpArgumentMapping(const ArgumentMappingContext& ctx) {
+  return KernelSignature("kron_grad",
+                         {"X", "Y", GradVarName("Out")},
+                         {},
+                         {GradVarName("X"), GradVarName("Y")});
+}
+
+}  // namespace phi
+
+PD_REGISTER_ARG_MAPPING_FN(kron_grad, phi::KronGradOpArgumentMapping);
diff --git a/paddle/phi/ops/compat/lgamma_sig.cc b/paddle/phi/ops/compat/lgamma_sig.cc
new file mode 100644
index 0000000000000000000000000000000000000000..968ad4923ba7b4410f0643335c275059e6ea7bea
--- /dev/null
+++ b/paddle/phi/ops/compat/lgamma_sig.cc
@@ -0,0 +1,25 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/core/compat/op_utils.h"
+
+namespace phi {
+
+KernelSignature LgammaGradOpArgumentMapping(const ArgumentMappingContext& ctx) {
+  return KernelSignature(
+      "lgamma_grad", {GradVarName("Out"), "X"}, {}, {GradVarName("X")});
+}
+
+}  // namespace phi
+PD_REGISTER_ARG_MAPPING_FN(lgamma_grad, phi::LgammaGradOpArgumentMapping);
diff --git a/paddle/phi/ops/compat/log_loss_sig.cc b/paddle/phi/ops/compat/log_loss_sig.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c4ae746e975a7ef7fe3de26cbe16aa221bca8164
--- /dev/null
+++ b/paddle/phi/ops/compat/log_loss_sig.cc
@@ -0,0 +1,29 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/core/compat/op_utils.h"
+
+namespace phi {
+
+KernelSignature LogLossGradOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature("log_loss_grad",
+                         {"Predicted", "Labels", GradVarName("Loss")},
+                         {"epsilon"},
+                         {GradVarName("Predicted")});
+}
+
+}  // namespace phi
+
+PD_REGISTER_ARG_MAPPING_FN(log_loss_grad, phi::LogLossGradOpArgumentMapping);
diff --git a/paddle/phi/ops/compat/matrix_power_sig.cc b/paddle/phi/ops/compat/matrix_power_sig.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4c9ad4e74ab460c905f5c9e11f64cf8fa332dad0
--- /dev/null
+++ b/paddle/phi/ops/compat/matrix_power_sig.cc
@@ -0,0 +1,30 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/core/compat/op_utils.h"
+
+namespace phi {
+
+KernelSignature MatrixPowerGradOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature("matrix_power_grad",
+                         {"X", "Out", GradVarName("Out")},
+                         {"n"},
+                         {GradVarName("X")});
+}
+
+}  // namespace phi
+
+PD_REGISTER_ARG_MAPPING_FN(matrix_power_grad,
+                           phi::MatrixPowerGradOpArgumentMapping);
diff --git a/paddle/phi/ops/compat/matrix_rank_sig.cc b/paddle/phi/ops/compat/matrix_rank_sig.cc
new file mode 100644
index 0000000000000000000000000000000000000000..40dc29579b40194f57911df0bfb426de4369d9b3
--- /dev/null
+++ b/paddle/phi/ops/compat/matrix_rank_sig.cc
@@ -0,0 +1,38 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/phi/core/compat/op_utils.h"
+
+namespace phi {
+
+// we have to return every specific KernelSignature for infrt now
+KernelSignature MatrixRankOpArgumentMapping(const ArgumentMappingContext& ctx) {
+  if (ctx.HasInput("TolTensor")) {
+    return KernelSignature("matrix_rank_tol",
+                           {"X", "TolTensor"},
+                           {"use_default_tol", "hermitian"},
+                           {"Out"});
+  } else {
+    return KernelSignature("matrix_rank",
+                           {"X"},
+                           {
+                               "tol", "use_default_tol", "hermitian",
+                           },
+                           {"Out"});
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_ARG_MAPPING_FN(matrix_rank, phi::MatrixRankOpArgumentMapping);
diff --git a/paddle/phi/ops/compat/maxout_sig.cc b/paddle/phi/ops/compat/maxout_sig.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d16dd1c8617fe224c3e3fb7ab8dfc6cb5b2d2d63
--- /dev/null
+++ b/paddle/phi/ops/compat/maxout_sig.cc
@@ -0,0 +1,33 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/core/compat/op_utils.h"
+
+namespace phi {
+
+KernelSignature MaxoutArgumentMapping(const ArgumentMappingContext& ctx) {
+  return KernelSignature("maxout", {"X"}, {"groups", "axis"}, {"Out"});
+}
+
+KernelSignature MaxoutGradArgumentMapping(const ArgumentMappingContext& ctx) {
+  return KernelSignature("maxout_grad",
+                         {"X", "Out", GradVarName("Out")},
+                         {"groups", "axis"},
+                         {GradVarName("X")});
+}
+
+}  // namespace phi
+
+PD_REGISTER_ARG_MAPPING_FN(maxout, phi::MaxoutArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(maxout_grad, phi::MaxoutGradArgumentMapping);
diff --git a/paddle/phi/ops/compat/multi_dot_sig.cc b/paddle/phi/ops/compat/multi_dot_sig.cc
new file mode 100644
index 0000000000000000000000000000000000000000..598cbd980f3cc5ece0168b9cbb4f91e654f3f8ab
--- /dev/null
+++ b/paddle/phi/ops/compat/multi_dot_sig.cc
@@ -0,0 +1,27 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/core/compat/op_utils.h"
+
+namespace phi {
+
+KernelSignature MultiDotGradOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature(
+      "multi_dot_grad", {GradVarName("Out"), "X"}, {}, {GradVarName("X")});
+}
+
+}  // namespace phi
+
+PD_REGISTER_ARG_MAPPING_FN(multi_dot_grad, phi::MultiDotGradOpArgumentMapping);
diff --git a/paddle/phi/ops/compat/mv_sig.cc b/paddle/phi/ops/compat/mv_sig.cc
index ab0d31ee31dab99125a28d5b9f662d25d8e408d0..0012f8e1ccb41169175b9f539b839850bd901b82 100644
--- a/paddle/phi/ops/compat/mv_sig.cc
+++ b/paddle/phi/ops/compat/mv_sig.cc
@@ -16,10 +16,6 @@
 
 namespace phi {
 
-KernelSignature MvOpArgumentMapping(const ArgumentMappingContext& ctx) {
-  return KernelSignature("mv", {"X", "Vec"}, {}, {"Out"});
-}
-
 KernelSignature MvGradOpArgumentMapping(const ArgumentMappingContext& ctx) {
   return KernelSignature("mv_grad",
                          {"X", "Vec", GradVarName("Out")},
@@ -29,5 +25,4 @@ KernelSignature MvGradOpArgumentMapping(const ArgumentMappingContext& ctx) {
 
 }  // namespace phi
 
-PD_REGISTER_ARG_MAPPING_FN(mv, phi::MvOpArgumentMapping);
 PD_REGISTER_ARG_MAPPING_FN(mv_grad, phi::MvGradOpArgumentMapping);
diff --git a/paddle/phi/ops/compat/nll_loss_sig.cc b/paddle/phi/ops/compat/nll_loss_sig.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f274d7f77c5c0aac853263b28cf24076944f4fd9
--- /dev/null
+++ b/paddle/phi/ops/compat/nll_loss_sig.cc
@@ -0,0 +1,39 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/core/compat/op_utils.h"
+
+namespace phi {
+
+KernelSignature NllLossOpArgumentMapping(const ArgumentMappingContext& ctx) {
+  // TODO(xiongkun): can't remove the forward mapping, because the Weight is
+  // optional
+  return KernelSignature("nll_loss",
+                         {"X", "Label", "Weight"},
+                         {"ignore_index", "reduction"},
+                         {"Out", "Total_weight"});
+}
+
+KernelSignature NllLossGradOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature(
+      "nll_loss_grad",
+      {"X", "Label", "Total_weight", "Weight", GradVarName("Out")},
+      {"ignore_index", "reduction"},
+      {GradVarName("X")});
+}
+
+}  // namespace phi
+PD_REGISTER_ARG_MAPPING_FN(nll_loss_grad, phi::NllLossGradOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(nll_loss, phi::NllLossOpArgumentMapping);
diff --git a/paddle/phi/ops/compat/norm_sig.cc b/paddle/phi/ops/compat/norm_sig.cc
index 81d294b84248578d5f29b3b2d432e81f3499e9fa..a74db9b5686c8d79c4a59bb55a33264443ddf886 100644
--- a/paddle/phi/ops/compat/norm_sig.cc
+++ b/paddle/phi/ops/compat/norm_sig.cc
@@ -23,7 +23,7 @@ KernelSignature NormOpArgumentMapping(const ArgumentMappingContext& ctx) {
 
 KernelSignature NormGradOpArgumentMapping(const ArgumentMappingContext& ctx) {
   return KernelSignature("norm_grad",
-                         {GradVarName("Out"), "X", "Norm"},
+                         {"X", "Norm", GradVarName("Out")},
                          {"axis", "epsilon", "is_test"},
                          {GradVarName("X")});
 }
diff --git a/paddle/phi/ops/compat/one_hot_sig.cc b/paddle/phi/ops/compat/one_hot_sig.cc
new file mode 100644
index 0000000000000000000000000000000000000000..655969093c889aa32ae780f1de3c9c7c81a78eb1
--- /dev/null
+++ b/paddle/phi/ops/compat/one_hot_sig.cc
@@ -0,0 +1,37 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/core/compat/op_utils.h"
+
+namespace phi {
+
+KernelSignature OneHotOpArgumentMapping(const ArgumentMappingContext& ctx) {
+  if (ctx.HasInput("depth_tensor")) {
+    return KernelSignature("one_hot_raw",
+                           {"X"},
+                           {"depth_tensor", "dtype", "allow_out_of_range"},
+                           {"Out"});
+  } else {
+    return KernelSignature("one_hot_raw",
+                           {"X"},
+                           {"depth", "dtype", "allow_out_of_range"},
+                           {"Out"});
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_BASE_KERNEL_NAME(one_hot_v2, one_hot);
+
+PD_REGISTER_ARG_MAPPING_FN(one_hot_v2, phi::OneHotOpArgumentMapping);
diff --git a/paddle/phi/ops/compat/pad_sig.cc b/paddle/phi/ops/compat/pad_sig.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4eadbfa98beded121c4e6738384487a9ec10be42
--- /dev/null
+++ b/paddle/phi/ops/compat/pad_sig.cc
@@ -0,0 +1,28 @@
+
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/core/compat/op_utils.h"
+
+namespace phi {
+
+KernelSignature PadGradOpArgumentMapping(const ArgumentMappingContext& ctx) {
+  return KernelSignature("pad_grad",
+                         {GradVarName("Out")},
+                         {"paddings", "pad_value"},
+                         {GradVarName("X")});
+}
+
+}  // namespace phi
+PD_REGISTER_ARG_MAPPING_FN(pad_grad, phi::PadGradOpArgumentMapping);
diff --git a/paddle/phi/ops/compat/pool_sig.cc b/paddle/phi/ops/compat/pool_sig.cc
new file mode 100644
index 0000000000000000000000000000000000000000..390d3db5e785ba7642213e9b7a8db2b718ff19f0
--- /dev/null
+++ b/paddle/phi/ops/compat/pool_sig.cc
@@ -0,0 +1,154 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/core/compat/op_utils.h"
+
+namespace phi {
+
+KernelSignature Pool2dOpArgumentMapping(const ArgumentMappingContext& ctx) {
+  return KernelSignature("pool2d",
+                         {"X"},
+                         {"ksize",
+                          "strides",
+                          "paddings",
+                          "ceil_mode",
+                          "exclusive",
+                          "data_format",
+                          "pooling_type",
+                          "global_pooling",
+                          "adaptive",
+                          "padding_algorithm"},
+                         {"Out"});
+}
+
+KernelSignature Pool2dGradOpArgumentMapping(const ArgumentMappingContext& ctx) {
+  return KernelSignature("pool2d_grad",
+                         {"X", "Out", GradVarName("Out")},
+                         {"ksize",
+                          "strides",
+                          "paddings",
+                          "ceil_mode",
+                          "exclusive",
+                          "data_format",
+                          "pooling_type",
+                          "global_pooling",
+                          "adaptive",
+                          "padding_algorithm"},
+                         {GradVarName("X")});
+}
+
+KernelSignature Pool2dDoubleGradOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature("pool2d_double_grad",
+                         {"X"},
+                         {"ksize",
+                          "strides",
+                          "paddings",
+                          "ceil_mode",
+                          "exclusive",
+                          "data_format",
+                          "pooling_type",
+                          "global_pooling",
+                          "adaptive",
+                          "padding_algorithm"},
+                         {"Out"});
+}
+
+KernelSignature MaxPool2dWithIndexOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature(
+      "max_pool2d_with_index",
+      {"X"},
+      {"ksize", "strides", "paddings", "global_pooling", "adaptive"},
+      {"Out", "Mask"});
+}
+
+KernelSignature MaxPool2dWithIndexGradOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature(
+      "max_pool2d_with_index_grad",
+      {"X", "Mask", GradVarName("Out")},
+      {"ksize", "strides", "paddings", "global_pooling", "adaptive"},
+      {GradVarName("X")});
+}
+
+KernelSignature Pool3dOpArgumentMapping(const ArgumentMappingContext& ctx) {
+  return KernelSignature("pool3d",
+                         {"X"},
+                         {"ksize",
+                          "strides",
+                          "paddings",
+                          "ceil_mode",
+                          "exclusive",
+                          "data_format",
+                          "pooling_type",
+                          "global_pooling",
+                          "adaptive",
+                          "padding_algorithm"},
+                         {"Out"});
+}
+
+KernelSignature Pool3dGradOpArgumentMapping(const ArgumentMappingContext& ctx) {
+  return KernelSignature("pool3d_grad",
+                         {"X", "Out", GradVarName("Out")},
+                         {"ksize",
+                          "strides",
+                          "paddings",
+                          "ceil_mode",
+                          "exclusive",
+                          "data_format",
+                          "pooling_type",
+                          "global_pooling",
+                          "adaptive",
+                          "padding_algorithm"},
+                         {GradVarName("X")});
+}
+
+KernelSignature MaxPool3dWithIndexOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature(
+      "max_pool3d_with_index",
+      {"X"},
+      {"ksize", "strides", "paddings", "global_pooling", "adaptive"},
+      {"Out", "Mask"});
+}
+
+KernelSignature MaxPool3dWithIndexGradOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature(
+      "max_pool3d_with_index_grad",
+      {"X", "Mask", GradVarName("Out")},
+      {"ksize", "strides", "paddings", "global_pooling", "adaptive"},
+      {GradVarName("X")});
+}
+
+}  // namespace phi
+
+PD_REGISTER_ARG_MAPPING_FN(pool2d, phi::Pool2dOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(pool2d_grad, phi::Pool2dGradOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(pool2d_double_grad,
+                           phi::Pool2dDoubleGradOpArgumentMapping);
+
+PD_REGISTER_ARG_MAPPING_FN(max_pool2d_with_index,
+                           phi::MaxPool2dWithIndexOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(max_pool2d_with_index_grad,
+                           phi::MaxPool2dWithIndexGradOpArgumentMapping);
+
+PD_REGISTER_ARG_MAPPING_FN(pool3d, phi::Pool3dOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(pool3d_grad, phi::Pool3dGradOpArgumentMapping);
+
+PD_REGISTER_ARG_MAPPING_FN(max_pool3d_with_index,
+                           phi::MaxPool3dWithIndexOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(max_pool3d_with_index_grad,
+                           phi::MaxPool3dWithIndexGradOpArgumentMapping);
diff --git a/paddle/phi/ops/compat/psroi_pool_sig.cc b/paddle/phi/ops/compat/psroi_pool_sig.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4d694d9a7759d9e3cdf0c385164a367260f2a020
--- /dev/null
+++ b/paddle/phi/ops/compat/psroi_pool_sig.cc
@@ -0,0 +1,40 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/core/compat/op_utils.h"
+
+namespace phi {
+
+KernelSignature PsroiPoolOpArgumentMapping(const ArgumentMappingContext& ctx) {
+  return KernelSignature(
+      "psroi_pool",
+      {"X", "ROIs", "RoisNum"},
+      {"pooled_height", "pooled_width", "output_channels", "spatial_scale"},
+      {"Out"});
+}
+
+KernelSignature PsroiPoolGradOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature(
+      "psroi_pool_grad",
+      {"X", "ROIs", "RoisNum", GradVarName("Out")},
+      {"pooled_height", "pooled_width", "output_channels", "spatial_scale"},
+      {GradVarName("X")});
+}
+
+}  // namespace phi
+
+PD_REGISTER_ARG_MAPPING_FN(psroi_pool, phi::PsroiPoolOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(psroi_pool_grad,
+                           phi::PsroiPoolGradOpArgumentMapping);
diff --git a/paddle/phi/ops/compat/put_along_axis_sig.cc b/paddle/phi/ops/compat/put_along_axis_sig.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5f8dc1cf4cd711c1d39c0730a2a8b4ab86c57bea
--- /dev/null
+++ b/paddle/phi/ops/compat/put_along_axis_sig.cc
@@ -0,0 +1,38 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/core/compat/op_utils.h"
+
+namespace phi {
+
+KernelSignature PutAlongAxisArgumentMapping(const ArgumentMappingContext& ctx) {
+  return KernelSignature("put_along_axis",
+                         {"Input", "Index", "Value"},
+                         {"Axis", "Reduce"},
+                         {"Result"});
+}
+
+KernelSignature PutAlongAxisGradArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature("put_along_axis_grad",
+                         {"Input", "Index", GradVarName("Result")},
+                         {"Axis", "Reduce"},
+                         {GradVarName("Input"), GradVarName("Value")});
+}
+
+}  // namespace phi
+
+PD_REGISTER_ARG_MAPPING_FN(put_along_axis, phi::PutAlongAxisArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(put_along_axis_grad,
+                           phi::PutAlongAxisGradArgumentMapping);
diff --git a/paddle/phi/ops/compat/reduce_sig.cc b/paddle/phi/ops/compat/reduce_sig.cc
index 92839fb3030752739e2387df1d449d611970d045..dcb00fe1b0cceb978ad24eda10ef78e339642d75 100644
--- a/paddle/phi/ops/compat/reduce_sig.cc
+++ b/paddle/phi/ops/compat/reduce_sig.cc
@@ -21,7 +21,7 @@ KernelSignature ReduceSumOpArgumentMapping(const ArgumentMappingContext& ctx) {
     bool reduce_all = paddle::any_cast<bool>(ctx.Attr("reduce_all"));
     // When ctx is InferShapeArgumentMappingContext, the reduce_all is used in
     // InferShape, so we must return the "sum_raw" KernelSignature.
-    // And the InferMeta function(i.e. ReduceInferMetaBase) is accordance with
+    // And the InferMeta function(i.e. SumRawInferMeta) is accordance with
     // the "sum_raw" KernelSignature
     if (ctx.IsForInferShape() || reduce_all) {
       return KernelSignature("sum_raw",
@@ -40,8 +40,8 @@ KernelSignature ReduceMeanOpArgumentMapping(const ArgumentMappingContext& ctx) {
     bool reduce_all = paddle::any_cast<bool>(ctx.Attr("reduce_all"));
     // When ctx is InferShapeArgumentMappingContext, the reduce_all is used in
     // InferShape, so we must return the "mean_raw" KernelSignature.
-    // And the InferMeta function(i.e. MeanRawInferMeta) is accordance with the
-    // "mean_raw" KernelSignature
+    // And the InferMeta function(i.e. ReduceInferMetaBase) is accordance with
+    // the "mean_raw" KernelSignature
     if (ctx.IsForInferShape() || reduce_all) {
       return KernelSignature(
           "mean_raw", {"X"}, {"dim", "keep_dim", "reduce_all"}, {"Out"});
@@ -56,11 +56,97 @@ KernelSignature ReduceProdOpArgumentMapping(const ArgumentMappingContext& ctx) {
       "reduce_prod", {"X"}, {"dim", "keep_dim", "reduce_all"}, {"Out"});
 }
 
+KernelSignature ReduceMaxOpArgumentMapping(const ArgumentMappingContext& ctx) {
+  if (ctx.IsDenseTensorInput("X")) {
+    bool reduce_all = paddle::any_cast<bool>(ctx.Attr("reduce_all"));
+    // When ctx is InferShapeArgumentMappingContext, the reduce_all is used in
+    // InferShape, so we must return the "max_raw" KernelSignature.
+    // And the InferMeta function(i.e. ReduceInferMetaBase) is accordance with
+    // the "max_raw" KernelSignature
+    if (ctx.IsForInferShape() || reduce_all) {
+      return KernelSignature(
+          "max_raw", {"X"}, {"dim", "keep_dim", "reduce_all"}, {"Out"});
+    }
+    return KernelSignature("max", {"X"}, {"dim", "keep_dim"}, {"Out"});
+  }
+  return KernelSignature("unregistered", {}, {}, {});
+}
+
+KernelSignature ReduceMinOpArgumentMapping(const ArgumentMappingContext& ctx) {
+  if (ctx.IsDenseTensorInput("X")) {
+    bool reduce_all = paddle::any_cast<bool>(ctx.Attr("reduce_all"));
+    // When ctx is InferShapeArgumentMappingContext, the reduce_all is used in
+    // InferShape, so we must return the "min_raw" KernelSignature.
+    // And the InferMeta function(i.e. ReduceInferMetaBase) is accordance with
+    // the "min_raw" KernelSignature
+    if (ctx.IsForInferShape() || reduce_all) {
+      return KernelSignature(
+          "min_raw", {"X"}, {"dim", "keep_dim", "reduce_all"}, {"Out"});
+    }
+    return KernelSignature("min", {"X"}, {"dim", "keep_dim"}, {"Out"});
+  }
+  return KernelSignature("unregistered", {}, {}, {});
+}
+
+KernelSignature ReduceAnyOpArgumentMapping(const ArgumentMappingContext& ctx) {
+  if (ctx.IsDenseTensorInput("X")) {
+    bool reduce_all = paddle::any_cast<bool>(ctx.Attr("reduce_all"));
+    // When ctx is InferShapeArgumentMappingContext, the reduce_all is used in
+    // InferShape, so we must return the "any_raw" KernelSignature.
+    // And the InferMeta function(i.e. ReduceInferMetaBase) is accordance with
+    // the "any_raw" KernelSignature
+    if (ctx.IsForInferShape() || reduce_all) {
+      return KernelSignature(
+          "any_raw", {"X"}, {"dim", "keep_dim", "reduce_all"}, {"Out"});
+    }
+    return KernelSignature("any", {"X"}, {"dim", "keep_dim"}, {"Out"});
+  }
+  return KernelSignature("unregistered", {}, {}, {});
+}
+
+KernelSignature ReduceAllOpArgumentMapping(const ArgumentMappingContext& ctx) {
+  if (ctx.IsDenseTensorInput("X")) {
+    bool reduce_all = paddle::any_cast<bool>(ctx.Attr("reduce_all"));
+    // When ctx is InferShapeArgumentMappingContext, the reduce_all is used in
+    // InferShape, so we must return the "all_raw" KernelSignature.
+    // And the InferMeta function(i.e. ReduceInferMetaBase) is accordance with
+    // the "all_raw" KernelSignature
+    if (ctx.IsForInferShape() || reduce_all) {
+      return KernelSignature(
+          "all_raw", {"X"}, {"dim", "keep_dim", "reduce_all"}, {"Out"});
+    }
+    return KernelSignature("all", {"X"}, {"dim", "keep_dim"}, {"Out"});
+  }
+  return KernelSignature("unregistered", {}, {}, {});
+}
+
+KernelSignature ReduceSumGradOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature(
+      "sum_grad",
+      {"X", GradVarName("Out")},
+      {"dim", "keep_dim", "reduce_all", "in_dtype", "out_dtype"},
+      {GradVarName("X")});
+}
+
 }  // namespace phi
 
 PD_REGISTER_BASE_KERNEL_NAME(reduce_sum, sum);
 PD_REGISTER_BASE_KERNEL_NAME(reduce_mean, mean);
+PD_REGISTER_BASE_KERNEL_NAME(reduce_max, max);
+PD_REGISTER_BASE_KERNEL_NAME(reduce_min, min);
+PD_REGISTER_BASE_KERNEL_NAME(reduce_all, all);
+PD_REGISTER_BASE_KERNEL_NAME(reduce_any, any);
+
+PD_REGISTER_BASE_KERNEL_NAME(reduce_sum_grad, sum_grad);
 
 PD_REGISTER_ARG_MAPPING_FN(reduce_sum, phi::ReduceSumOpArgumentMapping);
 PD_REGISTER_ARG_MAPPING_FN(reduce_mean, phi::ReduceMeanOpArgumentMapping);
 PD_REGISTER_ARG_MAPPING_FN(reduce_prod, phi::ReduceProdOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(reduce_max, phi::ReduceMaxOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(reduce_min, phi::ReduceMinOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(reduce_all, phi::ReduceAllOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(reduce_any, phi::ReduceAnyOpArgumentMapping);
+
+PD_REGISTER_ARG_MAPPING_FN(reduce_sum_grad,
+                           phi::ReduceSumGradOpArgumentMapping);
diff --git a/paddle/phi/ops/compat/roi_align_sig.cc b/paddle/phi/ops/compat/roi_align_sig.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0549103b6fbcb8b2367c34c8a44fb3b52f318859
--- /dev/null
+++ b/paddle/phi/ops/compat/roi_align_sig.cc
@@ -0,0 +1,32 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/core/compat/op_utils.h"
+
+namespace phi {
+
+KernelSignature ROIAlignOpArgumentMapping(const ArgumentMappingContext& ctx) {
+  return KernelSignature("roi_align",
+                         {"X", "ROIs", "RoisNum"},
+                         {"pooled_height",
+                          "pooled_width",
+                          "spatial_scale",
+                          "sampling_ratio",
+                          "aligned"},
+                         {"Out"});
+}
+
+}  // namespace phi
+
+PD_REGISTER_ARG_MAPPING_FN(roi_align, phi::ROIAlignOpArgumentMapping);
diff --git a/paddle/phi/ops/compat/segment_pool_sig.cc b/paddle/phi/ops/compat/segment_pool_sig.cc
new file mode 100644
index 0000000000000000000000000000000000000000..97646a2ac31d33fe8b0bd09c1a205122b4f3fd6c
--- /dev/null
+++ b/paddle/phi/ops/compat/segment_pool_sig.cc
@@ -0,0 +1,33 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/core/compat/op_utils.h"
+
+namespace phi {
+
+KernelSignature SegmentPoolGradOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature(
+      "segment_pool_grad",
+      {
+          "X", "SegmentIds", "Out", "SummedIds", GradVarName("Out"),
+      },
+      {"pooltype"},
+      {GradVarName("X")});
+}
+
+}  // namespace phi
+
+PD_REGISTER_ARG_MAPPING_FN(segment_pool_grad,
+                           phi::SegmentPoolGradOpArgumentMapping);
diff --git a/paddle/phi/ops/compat/set_value_sig.cc b/paddle/phi/ops/compat/set_value_sig.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9653250bded84f8ff87f613f6e17e50e351504fa
--- /dev/null
+++ b/paddle/phi/ops/compat/set_value_sig.cc
@@ -0,0 +1,838 @@
+
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/core/compat/op_utils.h"
+
+namespace phi {
+
+KernelSignature SetValueOpArgumentMapping(const ArgumentMappingContext& ctx) {
+  if (ctx.IsDenseTensorInput("Input")) {
+    if (ctx.HasInput("StartsTensorList")) {
+      if (ctx.HasInput("EndsTensorList")) {
+        if (ctx.HasInput("StepsTensorList")) {
+          if (ctx.HasInput("ValueTensor")) {
+            return KernelSignature("set_value_with_tensor",
+                                   {"Input", "ValueTensor"},
+                                   {"StartsTensorList",
+                                    "EndsTensorList",
+                                    "StepsTensorList",
+                                    "axes",
+                                    "decrease_axes",
+                                    "none_axes"},
+                                   {"Out"});
+          } else if (ctx.HasAttr("fp32_values") &&
+                     !paddle::any_cast<std::vector<float>>(
+                          ctx.Attr("fp32_values"))
+                          .empty()) {
+            return KernelSignature("set_value",
+                                   {"Input"},
+                                   {"StartsTensorList",
+                                    "EndsTensorList",
+                                    "StepsTensorList",
+                                    "axes",
+                                    "decrease_axes",
+                                    "none_axes",
+                                    "shape",
+                                    "fp32_values"},
+                                   {"Out"});
+          } else if (ctx.HasAttr("fp64_values") &&
+                     !paddle::any_cast<std::vector<double>>(
+                          ctx.Attr("fp64_values"))
+                          .empty()) {
+            return KernelSignature("set_value",
+                                   {"Input"},
+                                   {"StartsTensorList",
+                                    "EndsTensorList",
+                                    "StepsTensorList",
+                                    "axes",
+                                    "decrease_axes",
+                                    "none_axes",
+                                    "shape",
+                                    "fp64_values"},
+                                   {"Out"});
+          } else if (ctx.HasAttr("int32_values") &&
+                     !paddle::any_cast<std::vector<int>>(
+                          ctx.Attr("int32_values"))
+                          .empty()) {
+            return KernelSignature("set_value",
+                                   {"Input"},
+                                   {"StartsTensorList",
+                                    "EndsTensorList",
+                                    "StepsTensorList",
+                                    "axes",
+                                    "decrease_axes",
+                                    "none_axes",
+                                    "shape",
+                                    "int32_values"},
+                                   {"Out"});
+          } else if (ctx.HasAttr("int64_values") &&
+                     !paddle::any_cast<std::vector<int64_t>>(
+                          ctx.Attr("int64_values"))
+                          .empty()) {
+            return KernelSignature("set_value",
+                                   {"Input"},
+                                   {"StartsTensorList",
+                                    "EndsTensorList",
+                                    "StepsTensorList",
+                                    "axes",
+                                    "decrease_axes",
+                                    "none_axes",
+                                    "shape",
+                                    "int64_values"},
+                                   {"Out"});
+          } else if (ctx.HasAttr("bool_values") &&
+                     !paddle::any_cast<std::vector<int>>(
+                          ctx.Attr("bool_values"))
+                          .empty()) {
+            return KernelSignature("set_value",
+                                   {"Input"},
+                                   {"StartsTensorList",
+                                    "EndsTensorList",
+                                    "StepsTensorList",
+                                    "axes",
+                                    "decrease_axes",
+                                    "none_axes",
+                                    "shape",
+                                    "bool_values"},
+                                   {"Out"});
+          }
+        } else {
+          if (ctx.HasInput("ValueTensor")) {
+            return KernelSignature("set_value_with_tensor",
+                                   {"Input", "ValueTensor"},
+                                   {"StartsTensorList",
+                                    "EndsTensorList",
+                                    "steps",
+                                    "axes",
+                                    "decrease_axes",
+                                    "none_axes"},
+                                   {"Out"});
+          } else if (ctx.HasAttr("fp32_values") &&
+                     !paddle::any_cast<std::vector<float>>(
+                          ctx.Attr("fp32_values"))
+                          .empty()) {
+            return KernelSignature("set_value",
+                                   {"Input"},
+                                   {"StartsTensorList",
+                                    "EndsTensorList",
+                                    "steps",
+                                    "axes",
+                                    "decrease_axes",
+                                    "none_axes",
+                                    "shape",
+                                    "fp32_values"},
+                                   {"Out"});
+          } else if (ctx.HasAttr("fp64_values") &&
+                     !paddle::any_cast<std::vector<double>>(
+                          ctx.Attr("fp64_values"))
+                          .empty()) {
+            return KernelSignature("set_value",
+                                   {"Input"},
+                                   {"StartsTensorList",
+                                    "EndsTensorList",
+                                    "steps",
+                                    "axes",
+                                    "decrease_axes",
+                                    "none_axes",
+                                    "shape",
+                                    "fp64_values"},
+                                   {"Out"});
+          } else if (ctx.HasAttr("int32_values") &&
+                     !paddle::any_cast<std::vector<int>>(
+                          ctx.Attr("int32_values"))
+                          .empty()) {
+            return KernelSignature("set_value",
+                                   {"Input"},
+                                   {"StartsTensorList",
+                                    "EndsTensorList",
+                                    "steps",
+                                    "axes",
+                                    "decrease_axes",
+                                    "none_axes",
+                                    "shape",
+                                    "int32_values"},
+                                   {"Out"});
+          } else if (ctx.HasAttr("int64_values") &&
+                     !paddle::any_cast<std::vector<int64_t>>(
+                          ctx.Attr("int64_values"))
+                          .empty()) {
+            return KernelSignature("set_value",
+                                   {"Input"},
+                                   {"StartsTensorList",
+                                    "EndsTensorList",
+                                    "steps",
+                                    "axes",
+                                    "decrease_axes",
+                                    "none_axes",
+                                    "shape",
+                                    "int64_values"},
+                                   {"Out"});
+          } else if (ctx.HasAttr("bool_values") &&
+                     !paddle::any_cast<std::vector<int>>(
+                          ctx.Attr("bool_values"))
+                          .empty()) {
+            return KernelSignature("set_value",
+                                   {"Input"},
+                                   {"StartsTensorList",
+                                    "EndsTensorList",
+                                    "steps",
+                                    "axes",
+                                    "decrease_axes",
+                                    "none_axes",
+                                    "shape",
+                                    "bool_values"},
+                                   {"Out"});
+          }
+        }
+      } else {
+        if (ctx.HasInput("StepsTensorList")) {
+          if (ctx.HasInput("ValueTensor")) {
+            return KernelSignature("set_value_with_tensor",
+                                   {"Input", "ValueTensor"},
+                                   {"StartsTensorList",
+                                    "ends",
+                                    "StepsTensorList",
+                                    "axes",
+                                    "decrease_axes",
+                                    "none_axes"},
+                                   {"Out"});
+          } else if (ctx.HasAttr("fp32_values") &&
+                     !paddle::any_cast<std::vector<float>>(
+                          ctx.Attr("fp32_values"))
+                          .empty()) {
+            return KernelSignature("set_value",
+                                   {"Input"},
+                                   {"StartsTensorList",
+                                    "ends",
+                                    "StepsTensorList",
+                                    "axes",
+                                    "decrease_axes",
+                                    "none_axes",
+                                    "shape",
+                                    "fp32_values"},
+                                   {"Out"});
+          } else if (ctx.HasAttr("fp64_values") &&
+                     !paddle::any_cast<std::vector<double>>(
+                          ctx.Attr("fp64_values"))
+                          .empty()) {
+            return KernelSignature("set_value",
+                                   {"Input"},
+                                   {"StartsTensorList",
+                                    "ends",
+                                    "StepsTensorList",
+                                    "axes",
+                                    "decrease_axes",
+                                    "none_axes",
+                                    "shape",
+                                    "fp64_values"},
+                                   {"Out"});
+          } else if (ctx.HasAttr("int32_values") &&
+                     !paddle::any_cast<std::vector<int>>(
+                          ctx.Attr("int32_values"))
+                          .empty()) {
+            return KernelSignature("set_value",
+                                   {"Input"},
+                                   {"StartsTensorList",
+                                    "ends",
+                                    "StepsTensorList",
+                                    "axes",
+                                    "decrease_axes",
+                                    "none_axes",
+                                    "shape",
+                                    "int32_values"},
+                                   {"Out"});
+          } else if (ctx.HasAttr("int64_values") &&
+                     !paddle::any_cast<std::vector<int64_t>>(
+                          ctx.Attr("int64_values"))
+                          .empty()) {
+            return KernelSignature("set_value",
+                                   {"Input"},
+                                   {"StartsTensorList",
+                                    "ends",
+                                    "StepsTensorList",
+                                    "axes",
+                                    "decrease_axes",
+                                    "none_axes",
+                                    "shape",
+                                    "int64_values"},
+                                   {"Out"});
+          } else if (ctx.HasAttr("bool_values") &&
+                     !paddle::any_cast<std::vector<int>>(
+                          ctx.Attr("bool_values"))
+                          .empty()) {
+            return KernelSignature("set_value",
+                                   {"Input"},
+                                   {"StartsTensorList",
+                                    "ends",
+                                    "StepsTensorList",
+                                    "axes",
+                                    "decrease_axes",
+                                    "none_axes",
+                                    "shape",
+                                    "bool_values"},
+                                   {"Out"});
+          }
+        } else {
+          if (ctx.HasInput("ValueTensor")) {
+            return KernelSignature("set_value_with_tensor",
+                                   {"Input", "ValueTensor"},
+                                   {"StartsTensorList",
+                                    "ends",
+                                    "steps",
+                                    "axes",
+                                    "decrease_axes",
+                                    "none_axes"},
+                                   {"Out"});
+          } else if (ctx.HasAttr("fp32_values") &&
+                     !paddle::any_cast<std::vector<float>>(
+                          ctx.Attr("fp32_values"))
+                          .empty()) {
+            return KernelSignature("set_value",
+                                   {"Input"},
+                                   {"StartsTensorList",
+                                    "ends",
+                                    "steps",
+                                    "axes",
+                                    "decrease_axes",
+                                    "none_axes",
+                                    "shape",
+                                    "fp32_values"},
+                                   {"Out"});
+          } else if (ctx.HasAttr("fp64_values") &&
+                     !paddle::any_cast<std::vector<double>>(
+                          ctx.Attr("fp64_values"))
+                          .empty()) {
+            return KernelSignature("set_value",
+                                   {"Input"},
+                                   {"StartsTensorList",
+                                    "ends",
+                                    "steps",
+                                    "axes",
+                                    "decrease_axes",
+                                    "none_axes",
+                                    "shape",
+                                    "fp64_values"},
+                                   {"Out"});
+          } else if (ctx.HasAttr("int32_values") &&
+                     !paddle::any_cast<std::vector<int>>(
+                          ctx.Attr("int32_values"))
+                          .empty()) {
+            return KernelSignature("set_value",
+                                   {"Input"},
+                                   {"StartsTensorList",
+                                    "ends",
+                                    "steps",
+                                    "axes",
+                                    "decrease_axes",
+                                    "none_axes",
+                                    "shape",
+                                    "int32_values"},
+                                   {"Out"});
+          } else if (ctx.HasAttr("int64_values") &&
+                     !paddle::any_cast<std::vector<int64_t>>(
+                          ctx.Attr("int64_values"))
+                          .empty()) {
+            return KernelSignature("set_value",
+                                   {"Input"},
+                                   {"StartsTensorList",
+                                    "ends",
+                                    "steps",
+                                    "axes",
+                                    "decrease_axes",
+                                    "none_axes",
+                                    "shape",
+                                    "int64_values"},
+                                   {"Out"});
+          } else if (ctx.HasAttr("bool_values") &&
+                     !paddle::any_cast<std::vector<int>>(
+                          ctx.Attr("bool_values"))
+                          .empty()) {
+            return KernelSignature("set_value",
+                                   {"Input"},
+                                   {"StartsTensorList",
+                                    "ends",
+                                    "steps",
+                                    "axes",
+                                    "decrease_axes",
+                                    "none_axes",
+                                    "shape",
+                                    "bool_values"},
+                                   {"Out"});
+          }
+        }
+      }
+    } else {
+      if (ctx.HasInput("EndsTensorList")) {
+        if (ctx.HasInput("StepsTensorList")) {
+          if (ctx.HasInput("ValueTensor")) {
+            return KernelSignature("set_value_with_tensor",
+                                   {"Input", "ValueTensor"},
+                                   {"starts",
+                                    "EndsTensorList",
+                                    "StepsTensorList",
+                                    "axes",
+                                    "decrease_axes",
+                                    "none_axes"},
+                                   {"Out"});
+          } else if (ctx.HasAttr("fp32_values") &&
+                     !paddle::any_cast<std::vector<float>>(
+                          ctx.Attr("fp32_values"))
+                          .empty()) {
+            return KernelSignature("set_value",
+                                   {"Input"},
+                                   {"starts",
+                                    "EndsTensorList",
+                                    "StepsTensorList",
+                                    "axes",
+                                    "decrease_axes",
+                                    "none_axes",
+                                    "shape",
+                                    "fp32_values"},
+                                   {"Out"});
+          } else if (ctx.HasAttr("fp64_values") &&
+                     !paddle::any_cast<std::vector<double>>(
+                          ctx.Attr("fp64_values"))
+                          .empty()) {
+            return KernelSignature("set_value",
+                                   {"Input"},
+                                   {"starts",
+                                    "EndsTensorList",
+                                    "StepsTensorList",
+                                    "axes",
+                                    "decrease_axes",
+                                    "none_axes",
+                                    "shape",
+                                    "fp64_values"},
+                                   {"Out"});
+          } else if (ctx.HasAttr("int32_values") &&
+                     !paddle::any_cast<std::vector<int>>(
+                          ctx.Attr("int32_values"))
+                          .empty()) {
+            return KernelSignature("set_value",
+                                   {"Input"},
+                                   {"starts",
+                                    "EndsTensorList",
+                                    "StepsTensorList",
+                                    "axes",
+                                    "decrease_axes",
+                                    "none_axes",
+                                    "shape",
+                                    "int32_values"},
+                                   {"Out"});
+          } else if (ctx.HasAttr("int64_values") &&
+                     !paddle::any_cast<std::vector<int64_t>>(
+                          ctx.Attr("int64_values"))
+                          .empty()) {
+            return KernelSignature("set_value",
+                                   {"Input"},
+                                   {"starts",
+                                    "EndsTensorList",
+                                    "StepsTensorList",
+                                    "axes",
+                                    "decrease_axes",
+                                    "none_axes",
+                                    "shape",
+                                    "int64_values"},
+                                   {"Out"});
+          } else if (ctx.HasAttr("bool_values") &&
+                     !paddle::any_cast<std::vector<int>>(
+                          ctx.Attr("bool_values"))
+                          .empty()) {
+            return KernelSignature("set_value",
+                                   {"Input"},
+                                   {"starts",
+                                    "EndsTensorList",
+                                    "StepsTensorList",
+                                    "axes",
+                                    "decrease_axes",
+                                    "none_axes",
+                                    "shape",
+                                    "bool_values"},
+                                   {"Out"});
+          }
+        } else {
+          if (ctx.HasInput("ValueTensor")) {
+            return KernelSignature("set_value_with_tensor",
+                                   {"Input", "ValueTensor"},
+                                   {"starts",
+                                    "EndsTensorList",
+                                    "steps",
+                                    "axes",
+                                    "decrease_axes",
+                                    "none_axes"},
+                                   {"Out"});
+          } else if (ctx.HasAttr("fp32_values") &&
+                     !paddle::any_cast<std::vector<float>>(
+                          ctx.Attr("fp32_values"))
+                          .empty()) {
+            return KernelSignature("set_value",
+                                   {"Input"},
+                                   {"starts",
+                                    "EndsTensorList",
+                                    "steps",
+                                    "axes",
+                                    "decrease_axes",
+                                    "none_axes",
+                                    "shape",
+                                    "fp32_values"},
+                                   {"Out"});
+          } else if (ctx.HasAttr("fp64_values") &&
+                     !paddle::any_cast<std::vector<double>>(
+                          ctx.Attr("fp64_values"))
+                          .empty()) {
+            return KernelSignature("set_value",
+                                   {"Input"},
+                                   {"starts",
+                                    "EndsTensorList",
+                                    "steps",
+                                    "axes",
+                                    "decrease_axes",
+                                    "none_axes",
+                                    "shape",
+                                    "fp64_values"},
+                                   {"Out"});
+          } else if (ctx.HasAttr("int32_values") &&
+                     !paddle::any_cast<std::vector<int>>(
+                          ctx.Attr("int32_values"))
+                          .empty()) {
+            return KernelSignature("set_value",
+                                   {"Input"},
+                                   {"starts",
+                                    "EndsTensorList",
+                                    "steps",
+                                    "axes",
+                                    "decrease_axes",
+                                    "none_axes",
+                                    "shape",
+                                    "int32_values"},
+                                   {"Out"});
+          } else if (ctx.HasAttr("int64_values") &&
+                     !paddle::any_cast<std::vector<int64_t>>(
+                          ctx.Attr("int64_values"))
+                          .empty()) {
+            return KernelSignature("set_value",
+                                   {"Input"},
+                                   {"starts",
+                                    "EndsTensorList",
+                                    "steps",
+                                    "axes",
+                                    "decrease_axes",
+                                    "none_axes",
+                                    "shape",
+                                    "int64_values"},
+                                   {"Out"});
+          } else if (ctx.HasAttr("bool_values") &&
+                     !paddle::any_cast<std::vector<int>>(
+                          ctx.Attr("bool_values"))
+                          .empty()) {
+            return KernelSignature("set_value",
+                                   {"Input"},
+                                   {"starts",
+                                    "EndsTensorList",
+                                    "steps",
+                                    "axes",
+                                    "decrease_axes",
+                                    "none_axes",
+                                    "shape",
+                                    "bool_values"},
+                                   {"Out"});
+          }
+        }
+      } else {
+        if (ctx.HasInput("StepsTensorList")) {
+          if (ctx.HasInput("ValueTensor")) {
+            return KernelSignature("set_value_with_tensor",
+                                   {"Input", "ValueTensor"},
+                                   {"starts",
+                                    "ends",
+                                    "StepsTensorList",
+                                    "axes",
+                                    "decrease_axes",
+                                    "none_axes"},
+                                   {"Out"});
+          } else if (ctx.HasAttr("fp32_values") &&
+                     !paddle::any_cast<std::vector<float>>(
+                          ctx.Attr("fp32_values"))
+                          .empty()) {
+            return KernelSignature("set_value",
+                                   {"Input"},
+                                   {"starts",
+                                    "ends",
+                                    "StepsTensorList",
+                                    "axes",
+                                    "decrease_axes",
+                                    "none_axes",
+                                    "shape",
+                                    "fp32_values"},
+                                   {"Out"});
+          } else if (ctx.HasAttr("fp64_values") &&
+                     !paddle::any_cast<std::vector<double>>(
+                          ctx.Attr("fp64_values"))
+                          .empty()) {
+            return KernelSignature("set_value",
+                                   {"Input"},
+                                   {"starts",
+                                    "ends",
+                                    "StepsTensorList",
+                                    "axes",
+                                    "decrease_axes",
+                                    "none_axes",
+                                    "shape",
+                                    "fp64_values"},
+                                   {"Out"});
+          } else if (ctx.HasAttr("int32_values") &&
+                     !paddle::any_cast<std::vector<int>>(
+                          ctx.Attr("int32_values"))
+                          .empty()) {
+            return KernelSignature("set_value",
+                                   {"Input"},
+                                   {"starts",
+                                    "ends",
+                                    "StepsTensorList",
+                                    "axes",
+                                    "decrease_axes",
+                                    "none_axes",
+                                    "shape",
+                                    "int32_values"},
+                                   {"Out"});
+          } else if (ctx.HasAttr("int64_values") &&
+                     !paddle::any_cast<std::vector<int64_t>>(
+                          ctx.Attr("int64_values"))
+                          .empty()) {
+            return KernelSignature("set_value",
+                                   {"Input"},
+                                   {"starts",
+                                    "ends",
+                                    "StepsTensorList",
+                                    "axes",
+                                    "decrease_axes",
+                                    "none_axes",
+                                    "shape",
+                                    "int64_values"},
+                                   {"Out"});
+          } else if (ctx.HasAttr("bool_values") &&
+                     !paddle::any_cast<std::vector<int>>(
+                          ctx.Attr("bool_values"))
+                          .empty()) {
+            return KernelSignature("set_value",
+                                   {"Input"},
+                                   {"starts",
+                                    "ends",
+                                    "StepsTensorList",
+                                    "axes",
+                                    "decrease_axes",
+                                    "none_axes",
+                                    "shape",
+                                    "bool_values"},
+                                   {"Out"});
+          }
+        } else {
+          if (ctx.HasInput("ValueTensor")) {
+            return KernelSignature("set_value_with_tensor",
+                                   {"Input", "ValueTensor"},
+                                   {"starts",
+                                    "ends",
+                                    "steps",
+                                    "axes",
+                                    "decrease_axes",
+                                    "none_axes"},
+                                   {"Out"});
+          } else if (ctx.HasAttr("fp32_values") &&
+                     !paddle::any_cast<std::vector<float>>(
+                          ctx.Attr("fp32_values"))
+                          .empty()) {
+            return KernelSignature("set_value",
+                                   {"Input"},
+                                   {"starts",
+                                    "ends",
+                                    "steps",
+                                    "axes",
+                                    "decrease_axes",
+                                    "none_axes",
+                                    "shape",
+                                    "fp32_values"},
+                                   {"Out"});
+          } else if (ctx.HasAttr("fp64_values") &&
+                     !paddle::any_cast<std::vector<double>>(
+                          ctx.Attr("fp64_values"))
+                          .empty()) {
+            return KernelSignature("set_value",
+                                   {"Input"},
+                                   {"starts",
+                                    "ends",
+                                    "steps",
+                                    "axes",
+                                    "decrease_axes",
+                                    "none_axes",
+                                    "shape",
+                                    "fp64_values"},
+                                   {"Out"});
+          } else if (ctx.HasAttr("int32_values") &&
+                     !paddle::any_cast<std::vector<int>>(
+                          ctx.Attr("int32_values"))
+                          .empty()) {
+            return KernelSignature("set_value",
+                                   {"Input"},
+                                   {"starts",
+                                    "ends",
+                                    "steps",
+                                    "axes",
+                                    "decrease_axes",
+                                    "none_axes",
+                                    "shape",
+                                    "int32_values"},
+                                   {"Out"});
+          } else if (ctx.HasAttr("int64_values") &&
+                     !paddle::any_cast<std::vector<int64_t>>(
+                          ctx.Attr("int64_values"))
+                          .empty()) {
+            return KernelSignature("set_value",
+                                   {"Input"},
+                                   {"starts",
+                                    "ends",
+                                    "steps",
+                                    "axes",
+                                    "decrease_axes",
+                                    "none_axes",
+                                    "shape",
+                                    "int64_values"},
+                                   {"Out"});
+          } else if (ctx.HasAttr("bool_values") &&
+                     !paddle::any_cast<std::vector<int>>(
+                          ctx.Attr("bool_values"))
+                          .empty()) {
+            return KernelSignature("set_value",
+                                   {"Input"},
+                                   {"starts",
+                                    "ends",
+                                    "steps",
+                                    "axes",
+                                    "decrease_axes",
+                                    "none_axes",
+                                    "shape",
+                                    "bool_values"},
+                                   {"Out"});
+          }
+        }
+      }
+    }
+  }
+  return KernelSignature("unregistered", {}, {}, {});
+}
+
+KernelSignature SetValueGradOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  if (ctx.HasInput("StartsTensorList")) {
+    if (ctx.HasInput("EndsTensorList")) {
+      if (ctx.HasInput("StepsTensorList")) {
+        return KernelSignature(
+            "set_value_grad",
+            {GradVarName("Out")},
+            {"StartsTensorList",
+             "EndsTensorList",
+             "StepsTensorList",
+             "axes",
+             "decrease_axes",
+             "none_axes"},
+            {GradVarName("Input"), GradVarName("ValueTensor")});
+      } else {
+        return KernelSignature(
+            "set_value_grad",
+            {GradVarName("Out")},
+            {"StartsTensorList",
+             "EndsTensorList",
+             "steps",
+             "axes",
+             "decrease_axes",
+             "none_axes"},
+            {GradVarName("Input"), GradVarName("ValueTensor")});
+      }
+    } else {
+      if (ctx.HasInput("StepsTensorList")) {
+        return KernelSignature(
+            "set_value_grad",
+            {GradVarName("Out")},
+            {"StartsTensorList",
+             "ends",
+             "StepsTensorList",
+             "axes",
+             "decrease_axes",
+             "none_axes"},
+            {GradVarName("Input"), GradVarName("ValueTensor")});
+      } else {
+        return KernelSignature(
+            "set_value_grad",
+            {GradVarName("Out")},
+            {"StartsTensorList",
+             "ends",
+             "steps",
+             "axes",
+             "decrease_axes",
+             "none_axes"},
+            {GradVarName("Input"), GradVarName("ValueTensor")});
+      }
+    }
+  } else {
+    if (ctx.HasInput("EndsTensorList")) {
+      if (ctx.HasInput("StepsTensorList")) {
+        return KernelSignature(
+            "set_value_grad",
+            {GradVarName("Out")},
+            {"starts",
+             "EndsTensorList",
+             "StepsTensorList",
+             "axes",
+             "decrease_axes",
+             "none_axes"},
+            {GradVarName("Input"), GradVarName("ValueTensor")});
+      } else {
+        return KernelSignature(
+            "set_value_grad",
+            {GradVarName("Out")},
+            {"starts",
+             "EndsTensorList",
+             "steps",
+             "axes",
+             "decrease_axes",
+             "none_axes"},
+            {GradVarName("Input"), GradVarName("ValueTensor")});
+      }
+    } else {
+      if (ctx.HasInput("StepsTensorList")) {
+        return KernelSignature(
+            "set_value_grad",
+            {GradVarName("Out")},
+            {"starts",
+             "ends",
+             "StepsTensorList",
+             "axes",
+             "decrease_axes",
+             "none_axes"},
+            {GradVarName("Input"), GradVarName("ValueTensor")});
+      } else {
+        return KernelSignature(
+            "set_value_grad",
+            {GradVarName("Out")},
+            {"starts", "ends", "steps", "axes", "decrease_axes", "none_axes"},
+            {GradVarName("Input"), GradVarName("ValueTensor")});
+      }
+    }
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_ARG_MAPPING_FN(set_value, phi::SetValueOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(set_value_grad, phi::SetValueGradOpArgumentMapping);
diff --git a/paddle/phi/ops/compat/sigmoid_cross_entropy_with_logits_sig.cc b/paddle/phi/ops/compat/sigmoid_cross_entropy_with_logits_sig.cc
new file mode 100644
index 0000000000000000000000000000000000000000..61ad9627a9612d62318939af8efda3a541cfa606
--- /dev/null
+++ b/paddle/phi/ops/compat/sigmoid_cross_entropy_with_logits_sig.cc
@@ -0,0 +1,31 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/core/compat/op_utils.h"
+
+namespace phi {
+
+KernelSignature SigmoidCrossEntropyWithLogitsKernelGradOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature("sigmoid_cross_entropy_with_logits_grad",
+                         {"X", "Label", GradVarName("Out")},
+                         {"normalize", "ignore_index"},
+                         {GradVarName("X")});
+}
+
+}  // namespace phi
+
+PD_REGISTER_ARG_MAPPING_FN(
+    sigmoid_cross_entropy_with_logits_grad,
+    phi::SigmoidCrossEntropyWithLogitsKernelGradOpArgumentMapping);
diff --git a/paddle/phi/ops/compat/take_along_axis_sig.cc b/paddle/phi/ops/compat/take_along_axis_sig.cc
new file mode 100644
index 0000000000000000000000000000000000000000..27a996a270ddf4a36ad694836f45cb304f9a8f4c
--- /dev/null
+++ b/paddle/phi/ops/compat/take_along_axis_sig.cc
@@ -0,0 +1,37 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/core/compat/op_utils.h"
+
+namespace phi {
+
+KernelSignature TakeAlongAxisArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature(
+      "take_along_axis", {"Input", "Index"}, {"Axis"}, {"Result"});
+}
+
+KernelSignature TakeAlongAxisGradArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature("take_along_axis_grad",
+                         {"Input", "Index", GradVarName("Result")},
+                         {"Axis"},
+                         {GradVarName("Input")});
+}
+
+}  // namespace phi
+
+PD_REGISTER_ARG_MAPPING_FN(take_along_axis, phi::TakeAlongAxisArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(take_along_axis_grad,
+                           phi::TakeAlongAxisGradArgumentMapping);
diff --git a/paddle/phi/ops/compat/tile_sig.cc b/paddle/phi/ops/compat/tile_sig.cc
new file mode 100644
index 0000000000000000000000000000000000000000..49a6d02225d931f1dc2d3324cb13c2c620f5dfe6
--- /dev/null
+++ b/paddle/phi/ops/compat/tile_sig.cc
@@ -0,0 +1,51 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/phi/core/compat/op_utils.h"
+
+namespace phi {
+
+KernelSignature TileOpArgumentMapping(const ArgumentMappingContext& ctx) {
+  if (ctx.HasInput("RepeatTimes")) {
+    return KernelSignature("tile", {"X"}, {"RepeatTimes"}, {"Out"});
+  } else if (ctx.InputSize("repeat_times_tensor") > 0) {
+    return KernelSignature("tile", {"X"}, {"repeat_times_tensor"}, {"Out"});
+  } else {
+    return KernelSignature("tile", {"X"}, {"repeat_times"}, {"Out"});
+  }
+}
+
+KernelSignature TileGradOpArgumentMapping(const ArgumentMappingContext& ctx) {
+  if (ctx.HasInput("RepeatTimes")) {
+    return KernelSignature("tile_grad",
+                           {"X", GradVarName("Out")},
+                           {"RepeatTimes"},
+                           {GradVarName("X")});
+  } else if (ctx.InputSize("repeat_times_tensor") > 0) {
+    return KernelSignature("tile_grad",
+                           {"X", GradVarName("Out")},
+                           {"repeat_times_tensor"},
+                           {GradVarName("X")});
+  } else {
+    return KernelSignature("tile_grad",
+                           {"X", GradVarName("Out")},
+                           {"repeat_times"},
+                           {GradVarName("X")});
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_ARG_MAPPING_FN(tile, phi::TileOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(tile_grad, phi::TileGradOpArgumentMapping);
diff --git a/paddle/phi/ops/compat/top_k_sig.cc b/paddle/phi/ops/compat/top_k_sig.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9bf922b3d1b58979df25fbccc4d1dfe680f858a4
--- /dev/null
+++ b/paddle/phi/ops/compat/top_k_sig.cc
@@ -0,0 +1,42 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/core/compat/op_utils.h"
+
+namespace phi {
+
+KernelSignature TopkOpArgumentMapping(const ArgumentMappingContext& ctx) {
+  if (ctx.HasInput("K")) {
+    return KernelSignature(
+        "top_k", {"X"}, {"K", "axis", "largest", "sorted"}, {"Out", "Indices"});
+
+  } else {
+    return KernelSignature(
+        "top_k", {"X"}, {"k", "axis", "largest", "sorted"}, {"Out", "Indices"});
+  }
+}
+
+KernelSignature TopkGradOpArgumentMapping(const ArgumentMappingContext& ctx) {
+  return KernelSignature("top_k_grad",
+                         {GradVarName("Out"), "X", "Indices"},
+                         {"k", "axis", "largest", "sorted"},
+                         {GradVarName("X")});
+}
+
+}  // namespace phi
+
+PD_REGISTER_BASE_KERNEL_NAME(top_k_v2, top_k);
+PD_REGISTER_BASE_KERNEL_NAME(top_k_v2_grad, top_k_grad);
+PD_REGISTER_ARG_MAPPING_FN(top_k_v2, phi::TopkOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(top_k_v2_grad, phi::TopkGradOpArgumentMapping);
diff --git a/paddle/phi/ops/compat/triangular_solve_sig.cc b/paddle/phi/ops/compat/triangular_solve_sig.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c56af3e21e53e9ded6d01ad7fdb9c0fb5609ea6c
--- /dev/null
+++ b/paddle/phi/ops/compat/triangular_solve_sig.cc
@@ -0,0 +1,30 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/core/compat/op_utils.h"
+
+namespace phi {
+
+KernelSignature TriangularSolveGradOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature("triangular_solve_grad",
+                         {"X", "Y", "Out", GradVarName("Out")},
+                         {"upper", "transpose", "unitriangular"},
+                         {GradVarName("X"), GradVarName("Y")});
+}
+
+}  // namespace phi
+
+PD_REGISTER_ARG_MAPPING_FN(triangular_solve_grad,
+                           phi::TriangularSolveGradOpArgumentMapping);
diff --git a/paddle/phi/ops/compat/yolo_box_sig.cc b/paddle/phi/ops/compat/yolo_box_sig.cc
new file mode 100644
index 0000000000000000000000000000000000000000..bb39e72a64f5075908ca9b28d5f685fb0d6b6c9f
--- /dev/null
+++ b/paddle/phi/ops/compat/yolo_box_sig.cc
@@ -0,0 +1,35 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/core/compat/op_utils.h"
+
+namespace phi {
+
+KernelSignature YoloBoxOpArgumentMapping(const ArgumentMappingContext& ctx) {
+  return KernelSignature("yolo_box",
+                         {"X", "ImgSize"},
+                         {"anchors",
+                          "class_num",
+                          "conf_thresh",
+                          "downsample_ratio",
+                          "clip_bbox",
+                          "scale_x_y",
+                          "iou_aware",
+                          "iou_aware_factor"},
+                         {"Boxes", "Scores"});
+}
+
+}  // namespace phi
+
+PD_REGISTER_ARG_MAPPING_FN(yolo_box, phi::YoloBoxOpArgumentMapping);
diff --git a/paddle/phi/tests/api/CMakeLists.txt b/paddle/phi/tests/api/CMakeLists.txt
index cde085423e482e62a280815700ead9a0b6c64262..d998ab9435c027afcae0446fe3d31b3f000f1d15 100644
--- a/paddle/phi/tests/api/CMakeLists.txt
+++ b/paddle/phi/tests/api/CMakeLists.txt
@@ -1,27 +1,29 @@
 if(WITH_ROCM)
-  hip_test(test_phi_tensor SRCS test_pten_tensor.cc DEPS phi_tensor phi_function_api glog)
+  hip_test(test_phi_tensor SRCS test_pten_tensor.cc DEPS phi_tensor glog)
 else()
-  cc_test(test_phi_tensor SRCS test_pten_tensor.cc DEPS phi_tensor phi_function_api glog)
+  cc_test(test_phi_tensor SRCS test_pten_tensor.cc DEPS phi_tensor glog)
 endif()
 
 cc_test(test_phi_exception SRCS test_pten_exception.cc DEPS gtest)
 
-cc_test(test_mean_api SRCS test_mean_api.cc DEPS phi_tensor phi_api phi_api_utils)
-cc_test(test_dot_api SRCS test_dot_api.cc DEPS phi_tensor phi_api phi_api_utils)
-cc_test(test_matmul_api SRCS test_matmul_api.cc DEPS phi_tensor phi_api phi_api_utils)
-cc_test(test_empty_api SRCS test_empty_api.cc DEPS phi_tensor phi_api phi_api_utils)
-cc_test(test_fill_api SRCS test_fill_api.cc DEPS phi_tensor phi_api phi_api_utils)
-cc_test(test_flatten_api SRCS test_flatten_api.cc DEPS phi_tensor phi_api phi_api_utils)
-cc_test(test_elementwise_api SRCS test_elementwise_api.cc DEPS phi_tensor phi_api phi_api_utils)
-cc_test(test_cast_api SRCS test_cast_api.cc DEPS phi_tensor phi_api phi_api_utils)
-cc_test(test_reshape_api SRCS test_reshape_api.cc DEPS phi_tensor phi_api phi_api_utils)
-cc_test(test_to_api SRCS test_to_api.cc DEPS phi_tensor phi_api phi_api_utils)
-cc_test(test_slice_api SRCS test_slice_api.cc DEPS phi_tensor phi_api phi_api_utils)
-cc_test(test_sum_api SRCS test_sum_api.cc DEPS phi_tensor phi_api phi_api_utils)
-cc_test(test_scale_api SRCS test_scale_api.cc DEPS phi_tensor phi_api phi_api_utils)
-cc_test(test_scale_benchmark SRCS test_scale_benchmark.cc DEPS phi_tensor phi_api phi_api_utils)
-cc_test(test_conj_api SRCS test_conj_api.cc DEPS phi_tensor phi_api phi_api_utils)
-cc_test(test_concat_api SRCS test_concat_api.cc DEPS phi_tensor phi_api phi_api_utils)
-cc_test(test_split_api SRCS test_split_api.cc DEPS phi_tensor phi_api phi_api_utils)
-cc_test(test_data_transform SRCS test_data_transform.cc DEPS phi_tensor phi_api phi_api_utils)
-cc_test(test_sparse_utils_api SRCS test_sparse_utils_api.cc DEPS phi_tensor phi_api phi_api_utils)
+set(COMMON_API_TEST_DEPS phi_tensor phi_api phi_api_utils)
+cc_test(test_mean_api SRCS test_mean_api.cc DEPS ${COMMON_API_TEST_DEPS})
+cc_test(test_dot_api SRCS test_dot_api.cc DEPS ${COMMON_API_TEST_DEPS})
+cc_test(test_matmul_api SRCS test_matmul_api.cc DEPS ${COMMON_API_TEST_DEPS})
+cc_test(test_empty_api SRCS test_empty_api.cc DEPS ${COMMON_API_TEST_DEPS})
+cc_test(test_fill_api SRCS test_fill_api.cc DEPS ${COMMON_API_TEST_DEPS})
+cc_test(test_flatten_api SRCS test_flatten_api.cc DEPS ${COMMON_API_TEST_DEPS})
+cc_test(test_elementwise_api SRCS test_elementwise_api.cc DEPS ${COMMON_API_TEST_DEPS})
+cc_test(test_cast_api SRCS test_cast_api.cc DEPS ${COMMON_API_TEST_DEPS})
+cc_test(test_reshape_api SRCS test_reshape_api.cc DEPS ${COMMON_API_TEST_DEPS})
+cc_test(test_to_api SRCS test_to_api.cc DEPS ${COMMON_API_TEST_DEPS})
+cc_test(test_slice_api SRCS test_slice_api.cc DEPS ${COMMON_API_TEST_DEPS})
+cc_test(test_sum_api SRCS test_sum_api.cc DEPS ${COMMON_API_TEST_DEPS})
+cc_test(test_scale_api SRCS test_scale_api.cc DEPS ${COMMON_API_TEST_DEPS})
+cc_test(test_scale_benchmark SRCS test_scale_benchmark.cc DEPS ${COMMON_API_TEST_DEPS})
+cc_test(test_conj_api SRCS test_conj_api.cc DEPS ${COMMON_API_TEST_DEPS})
+cc_test(test_concat_api SRCS test_concat_api.cc DEPS ${COMMON_API_TEST_DEPS})
+cc_test(test_split_api SRCS test_split_api.cc DEPS ${COMMON_API_TEST_DEPS})
+cc_test(test_data_transform SRCS test_data_transform.cc DEPS ${COMMON_API_TEST_DEPS})
+cc_test(test_sparse_utils_api SRCS test_sparse_utils_api.cc DEPS ${COMMON_API_TEST_DEPS})
+cc_test(test_sparse_conv_api SRCS test_sparse_conv_api.cc DEPS ${COMMON_API_TEST_DEPS})
diff --git a/paddle/phi/tests/api/scale_api.h b/paddle/phi/tests/api/scale_api.h
index d93f00129b9a14170b979dfd23eb6e292e996ce8..6b9bb7aecefe6fabb2334e6f4d150a3937628f34 100644
--- a/paddle/phi/tests/api/scale_api.h
+++ b/paddle/phi/tests/api/scale_api.h
@@ -20,6 +20,7 @@
 #include "paddle/phi/api/lib/api_registry.h"
 #include "paddle/phi/api/lib/kernel_dispatch.h"
 #include "paddle/phi/api/lib/utils/allocator.h"
+#include "paddle/phi/api/lib/utils/storage.h"
 #include "paddle/phi/common/scalar.h"
 #include "paddle/phi/common/scalar_array.h"
 #include "paddle/phi/core/kernel_registry.h"
diff --git a/paddle/phi/tests/api/test_cast_api.cc b/paddle/phi/tests/api/test_cast_api.cc
index 276a70066ba73f3831c2a4bab0b47566848599f5..5448fb9d424709c34dc2008ef1ad7042c02d6630 100644
--- a/paddle/phi/tests/api/test_cast_api.cc
+++ b/paddle/phi/tests/api/test_cast_api.cc
@@ -21,6 +21,9 @@ limitations under the License. */
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/kernel_registry.h"
 
+PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(cast, CPU, ALL_LAYOUT);
+
 namespace paddle {
 namespace tests {
 
diff --git a/paddle/phi/tests/api/test_concat_api.cc b/paddle/phi/tests/api/test_concat_api.cc
index d5a36f56bfa1b34a23e8bc4738fcec04225858a3..824b72b97ac129cc0f011dc2bc7279b4c1b362a9 100644
--- a/paddle/phi/tests/api/test_concat_api.cc
+++ b/paddle/phi/tests/api/test_concat_api.cc
@@ -21,6 +21,8 @@ limitations under the License. */
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/kernel_registry.h"
 
+PD_DECLARE_KERNEL(concat, CPU, ALL_LAYOUT);
+
 namespace paddle {
 namespace tests {
 
diff --git a/paddle/phi/tests/api/test_conj_api.cc b/paddle/phi/tests/api/test_conj_api.cc
index 9c438de9297fb675fa391e3f32cee4761631e5b6..62a588dff12804391df971681b959d84a9c3ca7b 100644
--- a/paddle/phi/tests/api/test_conj_api.cc
+++ b/paddle/phi/tests/api/test_conj_api.cc
@@ -21,6 +21,8 @@ limitations under the License. */
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/kernel_registry.h"
 
+PD_DECLARE_KERNEL(conj, CPU, ALL_LAYOUT);
+
 namespace paddle {
 namespace tests {
 
diff --git a/paddle/phi/tests/api/test_data_transform.cc b/paddle/phi/tests/api/test_data_transform.cc
index a3c497bd427ae040b33dce241a70ecaafee5fbcc..dd008ff36d50a8e30166d478b916ac180d564c7c 100644
--- a/paddle/phi/tests/api/test_data_transform.cc
+++ b/paddle/phi/tests/api/test_data_transform.cc
@@ -19,6 +19,16 @@ limitations under the License. */
 #include "paddle/phi/common/complex.h"
 #include "paddle/phi/core/compat/convert_utils.h"
 #include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(matmul, CPU, ALL_LAYOUT);
+
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+PD_DECLARE_KERNEL(full, GPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(matmul, GPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(copy, GPU, ALL_LAYOUT);
+#endif
 
 namespace paddle {
 namespace tests {
diff --git a/paddle/phi/tests/api/test_dot_api.cc b/paddle/phi/tests/api/test_dot_api.cc
index e48004653d638121b2b7e6e7047d3dfb51cd76c1..3fcd4e8a01d126f2d4f0fbaaacb7580b2a6b3163 100644
--- a/paddle/phi/tests/api/test_dot_api.cc
+++ b/paddle/phi/tests/api/test_dot_api.cc
@@ -21,6 +21,8 @@ limitations under the License. */
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/kernel_registry.h"
 
+PD_DECLARE_KERNEL(dot, CPU, ALL_LAYOUT);
+
 namespace paddle {
 namespace tests {
 
diff --git a/paddle/phi/tests/api/test_elementwise_api.cc b/paddle/phi/tests/api/test_elementwise_api.cc
index cebf4e003aafa928341481952606f18642881a1c..d4013a788c76cb4f049574ee893320088fe7ac2c 100644
--- a/paddle/phi/tests/api/test_elementwise_api.cc
+++ b/paddle/phi/tests/api/test_elementwise_api.cc
@@ -21,6 +21,8 @@ limitations under the License. */
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/kernel_registry.h"
 
+PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT);
+
 namespace paddle {
 namespace tests {
 
diff --git a/paddle/phi/tests/api/test_empty_api.cc b/paddle/phi/tests/api/test_empty_api.cc
index dc5618f0aae8ad2df85ef6c466457a7c6b303ad5..48adbe1bd2682c3096cb76e24ddc8e2f719a532e 100644
--- a/paddle/phi/tests/api/test_empty_api.cc
+++ b/paddle/phi/tests/api/test_empty_api.cc
@@ -21,6 +21,8 @@ limitations under the License. */
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/kernel_registry.h"
 
+PD_DECLARE_KERNEL(empty, CPU, ALL_LAYOUT);
+
 namespace paddle {
 namespace tests {
 
diff --git a/paddle/phi/tests/api/test_fill_api.cc b/paddle/phi/tests/api/test_fill_api.cc
index 9b434aef811956bc6697193947a4756689c5dba3..bf57574d39093cc81a47f8385097627eb98648af 100644
--- a/paddle/phi/tests/api/test_fill_api.cc
+++ b/paddle/phi/tests/api/test_fill_api.cc
@@ -21,6 +21,8 @@ limitations under the License. */
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/kernel_registry.h"
 
+PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT);
+
 namespace paddle {
 namespace tests {
 
diff --git a/paddle/phi/tests/api/test_flatten_api.cc b/paddle/phi/tests/api/test_flatten_api.cc
index e1360e8e27bff08397031e15377a1cf4a04c4ec7..f1c8935e266409dfd4aa4f86d59a87c2767b4eb9 100644
--- a/paddle/phi/tests/api/test_flatten_api.cc
+++ b/paddle/phi/tests/api/test_flatten_api.cc
@@ -21,6 +21,8 @@ limitations under the License. */
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/kernel_registry.h"
 
+PD_DECLARE_KERNEL(flatten, CPU, ALL_LAYOUT);
+
 namespace paddle {
 namespace tests {
 
diff --git a/paddle/phi/tests/api/test_matmul_api.cc b/paddle/phi/tests/api/test_matmul_api.cc
index 2a3dd9c7dff62071fcd7dcf18cddcc5946ff7480..e2c324a6775c8f697c5871dcfe10a830244a103c 100644
--- a/paddle/phi/tests/api/test_matmul_api.cc
+++ b/paddle/phi/tests/api/test_matmul_api.cc
@@ -26,6 +26,15 @@ limitations under the License. */
 
 // See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/platform/device_context.h"
+
+PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(matmul, CPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(matmul_double_grad, CPU, ALL_LAYOUT);
+
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+PD_DECLARE_KERNEL(matmul, GPU, ALL_LAYOUT);
+#endif
+
 namespace paddle {
 namespace tests {
 
diff --git a/paddle/phi/tests/api/test_mean_api.cc b/paddle/phi/tests/api/test_mean_api.cc
index 53be1b1e9dc9c29bddad810d740d0f7f05d28aa0..af47f2cd7714a2431b62acdedc61fea5a139bc49 100644
--- a/paddle/phi/tests/api/test_mean_api.cc
+++ b/paddle/phi/tests/api/test_mean_api.cc
@@ -21,6 +21,8 @@ limitations under the License. */
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/kernel_registry.h"
 
+PD_DECLARE_KERNEL(mean, CPU, ALL_LAYOUT);
+
 namespace paddle {
 namespace tests {
 
diff --git a/paddle/phi/tests/api/test_pten_tensor.cc b/paddle/phi/tests/api/test_pten_tensor.cc
index dc2883c1794e2c986ed5446981b749f5f4dd0bc2..74ed648f3ee6ec4e2b763f4a57cc58ce79fb25b9 100644
--- a/paddle/phi/tests/api/test_pten_tensor.cc
+++ b/paddle/phi/tests/api/test_pten_tensor.cc
@@ -16,6 +16,13 @@
 #include "gtest/gtest.h"
 #include "paddle/phi/api/include/tensor.h"
 #include "paddle/phi/api/lib/ext_compat_utils.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+PD_DECLARE_KERNEL(copy, CPU, ALL_LAYOUT);
+
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+PD_DECLARE_KERNEL(copy, GPU, ALL_LAYOUT);
+#endif
 
 namespace paddle {
 namespace tests {
diff --git a/paddle/phi/tests/api/test_reshape_api.cc b/paddle/phi/tests/api/test_reshape_api.cc
index 60281a9f4992344e19b8a05a3002d964695de94c..4a857e2d1dcda906ec2be5f1d9a20b51dbb53b22 100644
--- a/paddle/phi/tests/api/test_reshape_api.cc
+++ b/paddle/phi/tests/api/test_reshape_api.cc
@@ -21,6 +21,9 @@ limitations under the License. */
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/kernel_registry.h"
 
+PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(reshape, CPU, ALL_LAYOUT);
+
 namespace paddle {
 namespace tests {
 
diff --git a/paddle/phi/tests/api/test_scale_api.cc b/paddle/phi/tests/api/test_scale_api.cc
index 52e8ae630e0e535e68e5fc9b76c1e9aca01ddd2c..a40ecc8485e4a96e96f40d513a153df4cc2eb0c3 100644
--- a/paddle/phi/tests/api/test_scale_api.cc
+++ b/paddle/phi/tests/api/test_scale_api.cc
@@ -19,8 +19,13 @@ limitations under the License. */
 
 #include "paddle/phi/api/lib/utils/allocator.h"
 #include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/core/selected_rows.h"
 
+PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(scale, CPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(scale_sr, CPU, ALL_LAYOUT);
+
 namespace paddle {
 namespace tests {
 
diff --git a/paddle/phi/tests/api/test_scale_benchmark.cc b/paddle/phi/tests/api/test_scale_benchmark.cc
index 9c0b0fc576ebc7363d9c726a759f077a212c7eb8..05a5563344966e4d0664a83051dc84b3f2da499a 100644
--- a/paddle/phi/tests/api/test_scale_benchmark.cc
+++ b/paddle/phi/tests/api/test_scale_benchmark.cc
@@ -19,9 +19,12 @@ limitations under the License. */
 
 #include "paddle/phi/api/lib/utils/allocator.h"
 #include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/tests/api/scale_api.h"
 #include "paddle/phi/tests/core/timer.h"
 
+PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT);
+
 namespace paddle {
 namespace tests {
 
diff --git a/paddle/phi/tests/api/test_slice_api.cc b/paddle/phi/tests/api/test_slice_api.cc
index c3f5fdcb36d62c926c6afc65bcc1ed9327983c69..ee2ade0229f1f3c9b59b87c2ffb37cf42ec1b531 100644
--- a/paddle/phi/tests/api/test_slice_api.cc
+++ b/paddle/phi/tests/api/test_slice_api.cc
@@ -19,6 +19,8 @@ limitations under the License. */
 #include "paddle/phi/api/include/tensor.h"
 #include "paddle/phi/core/kernel_registry.h"
 
+PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT);
+
 namespace paddle {
 namespace tests {
 
diff --git a/paddle/phi/tests/api/test_sparse_conv_api.cc b/paddle/phi/tests/api/test_sparse_conv_api.cc
new file mode 100644
index 0000000000000000000000000000000000000000..7c4aa164259071667e3d90994759c05454f407ff
--- /dev/null
+++ b/paddle/phi/tests/api/test_sparse_conv_api.cc
@@ -0,0 +1,176 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See
+the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include <memory>
+
+#include "paddle/phi/api/include/api.h"
+
+#include "paddle/phi/api/include/sparse_api.h"
+
+#include "paddle/phi/api/lib/utils/allocator.h"
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/core/sparse_coo_tensor.h"
+
+PD_DECLARE_KERNEL(sparse_conv3d, CPU, ALL_LAYOUT);
+
+template <typename T>
+void TestConv3dBase(const std::vector<int>& indices,
+                    const std::vector<T>& features,
+                    const phi::DDim& x_dims,
+                    const std::vector<T>& kernel,
+                    const phi::DDim& kernel_dims,
+                    const std::vector<int>& correct_out_indices,
+                    const std::vector<T>& correct_out_features,
+                    const phi::DDim& correct_out_dims,
+                    const int non_zero_num,
+                    const std::vector<int>& paddings,
+                    const std::vector<int>& strides,
+                    const std::vector<int>& dilations,
+                    const float diff = 1e-3) {
+  const auto alloc = std::make_unique<paddle::experimental::DefaultAllocator>(
+      paddle::platform::CPUPlace());
+
+  const int in_channels = kernel_dims[3];
+  const int out_channels = kernel_dims[4];
+
+  phi::DenseTensor indices_tensor(
+      alloc.get(),
+      phi::DenseTensorMeta(
+          phi::DataType::INT32, {4, non_zero_num}, phi::DataLayout::NCHW));
+  memcpy(
+      indices_tensor.data<int>(), indices.data(), indices.size() * sizeof(int));
+
+  phi::DenseTensor features_tensor(
+      alloc.get(),
+      phi::DenseTensorMeta(paddle::experimental::CppTypeToDataType<T>::Type(),
+                           {non_zero_num, in_channels},
+                           phi::DataLayout::NHWC));
+  memcpy(
+      features_tensor.data<T>(), features.data(), features.size() * sizeof(T));
+
+  auto x_tensor = std::make_shared<phi::SparseCooTensor>(
+      indices_tensor, features_tensor, x_dims);
+  paddle::experimental::Tensor x(x_tensor);
+
+  auto kernel_tensor = std::make_shared<phi::DenseTensor>(
+      alloc.get(),
+      phi::DenseTensorMeta(paddle::experimental::CppTypeToDataType<T>::Type(),
+                           kernel_dims,
+                           phi::DataLayout::NHWC));
+  paddle::experimental::Tensor weight(kernel_tensor);
+
+  memcpy(kernel_tensor->mutable_data<T>(paddle::platform::CPUPlace()),
+         kernel.data(),
+         kernel.size() * sizeof(T));
+
+  if (!std::is_same<T, phi::dtype::float16>::value) {
+    auto outs = paddle::experimental::sparse::conv3d(
+        x, weight, paddings, dilations, strides, 1, false);
+
+    auto out = std::dynamic_pointer_cast<phi::SparseCooTensor>(
+        std::get<0>(outs).impl());
+    ASSERT_EQ(correct_out_dims.size(), out->dims().size());
+    for (int i = 0; i < correct_out_dims.size(); i++) {
+      ASSERT_EQ(correct_out_dims[i], out->dims()[i]);
+    }
+    ASSERT_EQ((int64_t)correct_out_features.size() / out_channels, out->nnz());
+
+    int cmp_indices = memcmp(correct_out_indices.data(),
+                             out->non_zero_indices().data<int>(),
+                             correct_out_indices.size() * sizeof(int));
+    ASSERT_EQ(cmp_indices, 0);
+
+    for (uint64_t i = 0; i < correct_out_features.size(); i++) {
+      float tmp = std::fabs(static_cast<float>(
+          correct_out_features[i] - out->non_zero_elements().data<T>()[i]));
+      ASSERT_LT(tmp, diff);
+    }
+  }
+}
+
+void TestConv3d(const std::vector<int>& indices,
+                const std::vector<float>& features,
+                const phi::DDim& x_dims,
+                const std::vector<float>& kernel,
+                const phi::DDim& kernel_dims,
+                const std::vector<int>& correct_out_indices,
+                const std::vector<float>& correct_out_features,
+                const phi::DDim& correct_out_dims,
+                const int non_zero_num,
+                const std::vector<int>& paddings,
+                const std::vector<int>& strides,
+                const std::vector<int>& dilations) {
+  // test float
+  TestConv3dBase<float>(indices,
+                        features,
+                        x_dims,
+                        kernel,
+                        kernel_dims,
+                        correct_out_indices,
+                        correct_out_features,
+                        correct_out_dims,
+                        non_zero_num,
+                        paddings,
+                        strides,
+                        dilations);
+}
+
+TEST(API, sparse_conv2d) {
+  const auto alloc = std::make_shared<paddle::experimental::DefaultAllocator>(
+      paddle::platform::CPUPlace());
+  const int in_channels = 1;
+  const int out_channels = 1;
+  phi::DDim x_dims = {1, 1, 5, 5, in_channels};
+  phi::DDim kernel_dims = {1, 3, 3, in_channels, out_channels};
+  phi::DDim out_dims = {1, 1, 3, 3, out_channels};
+  std::vector<int> paddings = {0, 0, 0};
+  std::vector<int> strides = {1, 1, 1};
+  std::vector<int> dilations = {1, 1, 1};
+
+  const int non_zero_num = 3;
+  std::vector<int> indices_flatten = {0, 0, 0, 0, 0, 0, 0, 4, 0, 3, 2, 4};
+
+  std::vector<float> features = {-0.79394531, -0.3125, -0.55029297};
+  // 3*3*3=27
+  std::vector<float> kernel = {0.65820312,
+                               0.75048828,
+                               0.21411133,
+                               0.17370605,
+                               0.85546875,
+                               0.53076172,
+                               0.28833008,
+                               0.71044922,
+                               0.00659943};
+
+  std::vector<int> out_indices_flatten = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                                          0, 0, 2, 2, 2, 1, 2, 0, 1, 2};
+
+  std::vector<float> out_features = {
+      -0.17004, -0.71338, -0.00206, -0.22205, -0.09009};
+
+  TestConv3d(indices_flatten,
+             features,
+             x_dims,
+             kernel,
+             kernel_dims,
+             out_indices_flatten,
+             out_features,
+             out_dims,
+             non_zero_num,
+             paddings,
+             strides,
+             dilations);
+}
diff --git a/paddle/phi/tests/api/test_sparse_utils_api.cc b/paddle/phi/tests/api/test_sparse_utils_api.cc
index 819122a9b36650f8aa793b7d47ac2c0b6feb8f8e..8595782be35ab677ef40eb31b8a09237e90f359a 100644
--- a/paddle/phi/tests/api/test_sparse_utils_api.cc
+++ b/paddle/phi/tests/api/test_sparse_utils_api.cc
@@ -24,6 +24,8 @@ limitations under the License. */
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/core/sparse_coo_tensor.h"
 
+PD_DECLARE_KERNEL(dense_to_sparse_coo, CPU, ALL_LAYOUT);
+
 TEST(API, to_sparse_coo) {
   const auto alloc = std::make_shared<paddle::experimental::DefaultAllocator>(
       paddle::platform::CPUPlace());
diff --git a/paddle/phi/tests/api/test_split_api.cc b/paddle/phi/tests/api/test_split_api.cc
index 0b836a010586d775ced1e7c196b1b5139ac42fc1..1b84e7793cf6a864aaa3d9aa5ae7be8144e68c07 100644
--- a/paddle/phi/tests/api/test_split_api.cc
+++ b/paddle/phi/tests/api/test_split_api.cc
@@ -21,6 +21,8 @@
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/kernel_registry.h"
 
+PD_DECLARE_KERNEL(split, CPU, ALL_LAYOUT);
+
 namespace paddle {
 namespace tests {
 
diff --git a/paddle/phi/tests/api/test_sum_api.cc b/paddle/phi/tests/api/test_sum_api.cc
index 80620b8e61c57668c8b090af798fa04c0da4779b..9781d70d2b9136071d4dde9a947af14f8d8b8e87 100644
--- a/paddle/phi/tests/api/test_sum_api.cc
+++ b/paddle/phi/tests/api/test_sum_api.cc
@@ -21,6 +21,8 @@ limitations under the License. */
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/kernel_registry.h"
 
+PD_DECLARE_KERNEL(sum, CPU, ALL_LAYOUT);
+
 namespace paddle {
 namespace tests {
 
diff --git a/paddle/phi/tests/api/test_to_api.cc b/paddle/phi/tests/api/test_to_api.cc
index d337a0b601a00d6ae0423b7184d4d2c5cc6ef2b8..66c478e4c000150b8bfd50d6cb5b79e90d7a5d80 100644
--- a/paddle/phi/tests/api/test_to_api.cc
+++ b/paddle/phi/tests/api/test_to_api.cc
@@ -21,6 +21,11 @@ limitations under the License. */
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/kernel_registry.h"
 
+PD_DECLARE_KERNEL(copy, CPU, ALL_LAYOUT);
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+PD_DECLARE_KERNEL(copy, GPU, ALL_LAYOUT);
+#endif
+
 namespace paddle {
 namespace tests {
 
diff --git a/paddle/phi/tests/core/CMakeLists.txt b/paddle/phi/tests/core/CMakeLists.txt
index 5356bac9fbd808f1f75eb13f4406d6d0661e60bd..de9bd7a4d479c88bc747ee8933e65689948a4c3d 100644
--- a/paddle/phi/tests/core/CMakeLists.txt
+++ b/paddle/phi/tests/core/CMakeLists.txt
@@ -1,4 +1,4 @@
-cc_test(test_custom_kernel SRCS test_custom_kernel.cc DEPS phi_custom_kernel)
+cc_test(test_custom_kernel SRCS test_custom_kernel.cc DEPS custom_kernel)
 cc_test(test_dense_tensor SRCS test_dense_tensor.cc DEPS dense_tensor)
 cc_test(test_intrusive_ptr SRCS test_intrusive_ptr.cc)
 cc_test(test_type_info SRCS test_type_info.cc)
diff --git a/paddle/phi/tests/core/test_custom_kernel.cc b/paddle/phi/tests/core/test_custom_kernel.cc
index 69922c055cbac5fe3c3947d0d8d63ee4a1262a4c..6fe34a6891a35efb0af7e1785a2341988142be79 100644
--- a/paddle/phi/tests/core/test_custom_kernel.cc
+++ b/paddle/phi/tests/core/test_custom_kernel.cc
@@ -43,7 +43,7 @@ template <typename T, typename Context>
 void FakeDot(const Context& dev_ctx,
              const phi::DenseTensor& x,
              const phi::DenseTensor& y,
-             const std::vector<phi::DenseTensor>& fake_input_vec,
+             const std::vector<const phi::DenseTensor*>& fake_input_vec,
              bool fake_attr_bool,
              int fake_attr_int,
              float fake_attr_float,
@@ -172,7 +172,9 @@ TEST(CustomKernel, custom_kernel_dot) {
               fake_dot_kernels.end());
 
   // register
-  phi::RegisterCustomKernels(phi::CustomKernelMap::Instance());
+  phi::CustomKernelMap::Instance().RegisterCustomKernels();
+
+  EXPECT_EQ(0, static_cast<int>(custom_fake_dot_kernels.size()));
 
   EXPECT_TRUE(fake_dot_kernels.find(
                   phi::KernelKey(backend, layout, phi::DataType::FLOAT32)) !=
diff --git a/paddle/phi/tests/core/test_meta_fn_utils.cc b/paddle/phi/tests/core/test_meta_fn_utils.cc
index f4288c2aa2f9418eeff489aa53fe685aa4a155ec..399112d09c2ad55364b5035e7b759b53d0abaea8 100644
--- a/paddle/phi/tests/core/test_meta_fn_utils.cc
+++ b/paddle/phi/tests/core/test_meta_fn_utils.cc
@@ -52,7 +52,7 @@ TEST(MetaFnFactory, InferMetaFnExists) {
   phi::InferMetaContext ctx;
   ctx.EmplaceBackInput(shared_meat_x);
   ctx.EmplaceBackOutput(shared_meta_out);
-  ctx.SetMetaConfig(/*is_runtime=*/true);
+  ctx.SetMetaConfig({/*is_runtime =*/true, /*is_run_mkldnn_kernel=*/false});
   phi::MetaFnFactory::Instance().Get("sign")(&ctx);
 
   EXPECT_EQ(dense_out1.dims().size(), dense_out2.dims().size());
@@ -78,7 +78,7 @@ TEST(MetaFnFactory, CopyInferMetaFn) {
   ctx.EmplaceBackAttr(Backend::CPU);
   ctx.EmplaceBackAttr(false);
   ctx.EmplaceBackOutput(shared_meta_out);
-  ctx.SetMetaConfig(/*is_runtime=*/true);
+  ctx.SetMetaConfig({/*is_runtime =*/true, /*is_run_mkldnn_kernel=*/false});
   phi::MetaFnFactory::Instance().Get("copy_to")(&ctx);
 
   EXPECT_EQ(dense_out1.dims().size(), dense_out2.dims().size());
@@ -105,7 +105,7 @@ TEST(MetaFnFactory, SplitInferMetaFn) {
   ctx.EmplaceBackAttr(num_or_sections);
   ctx.EmplaceBackAttr(axis);
   ctx.EmplaceBackOutputs(out);
-  ctx.SetMetaConfig(/*is_runtime=*/true);
+  ctx.SetMetaConfig({/*is_runtime =*/true, /*is_run_mkldnn_kernel=*/false});
   phi::MetaFnFactory::Instance().Get("split")(&ctx);
 
   ASSERT_EQ(dense_out1.dims().size(), 2);
diff --git a/paddle/phi/tests/kernels/CMakeLists.txt b/paddle/phi/tests/kernels/CMakeLists.txt
index c92e10f8dd74af072bb8836d65898e2fc9a79bcc..317dcce92c8edd1bb76b080cdb578d37eb8b1f58 100644
--- a/paddle/phi/tests/kernels/CMakeLists.txt
+++ b/paddle/phi/tests/kernels/CMakeLists.txt
@@ -22,3 +22,5 @@ endif()
 if(WITH_ROCM)
     hip_test(test_math_function_gpu SRCS test_math_function.cu DEPS math_function)
 endif()
+
+cc_test(test_cpu_vec SRCS test_cpu_vec.cc DEPS blas cpu_info)
diff --git a/paddle/phi/tests/kernels/test_concat_dev_api.cc b/paddle/phi/tests/kernels/test_concat_dev_api.cc
index 55dd6dce1aaaeddf7e1acfd1c458b15042d5a629..7f954085f601cdc0321fb133abd716112367b113 100644
--- a/paddle/phi/tests/kernels/test_concat_dev_api.cc
+++ b/paddle/phi/tests/kernels/test_concat_dev_api.cc
@@ -53,7 +53,7 @@ TEST(DEV_API, concat) {
     }
   }
 
-  std::vector<phi::DenseTensor> inputs = {dense_x, dense_y};
+  std::vector<const phi::DenseTensor*> inputs = {&dense_x, &dense_y};
 
   // 2. test API
   phi::CPUContext dev_ctx;
diff --git a/paddle/fluid/operators/math/cpu_vec_test.cc b/paddle/phi/tests/kernels/test_cpu_vec.cc
similarity index 75%
rename from paddle/fluid/operators/math/cpu_vec_test.cc
rename to paddle/phi/tests/kernels/test_cpu_vec.cc
index 859afec3781ef68b308b584aca8055367632de56..271143f9f6f1e710aa95fa8e9be5821b8d191734 100644
--- a/paddle/fluid/operators/math/cpu_vec_test.cc
+++ b/paddle/phi/tests/kernels/test_cpu_vec.cc
@@ -18,7 +18,10 @@ limitations under the License. */
 
 #include "glog/logging.h"
 #include "gtest/gtest.h"
-#include "paddle/fluid/operators/math/cpu_vec.h"
+#include "paddle/phi/kernels/funcs/cpu_vec.h"
+
+namespace phi {
+namespace tests {
 
 inline double GetCurrentUS() {
   struct timeval time;
@@ -62,7 +65,9 @@ void ref_relu(const int n, const T* x, T* y) {
 }
 
 template <typename T>
-void RandomVec(const int n, T* a, const T lower = static_cast<T>(-20.f),
+void RandomVec(const int n,
+               T* a,
+               const T lower = static_cast<T>(-20.f),
                const T upper = static_cast<T>(20.f)) {
   static unsigned int seed = 100;
   std::mt19937 rng(seed++);
@@ -73,7 +78,8 @@ void RandomVec(const int n, T* a, const T lower = static_cast<T>(-20.f),
 }
 
 template <typename T>
-void TestAndBench(const int n, std::function<void(const int, const T*, T*)> tgt,
+void TestAndBench(const int n,
+                  std::function<void(const int, const T*, T*)> tgt,
                   std::function<void(const int, const T*, T*)> ref) {
   std::vector<T> x(n);
   std::vector<T> ytgt(n), yref(n);
@@ -101,47 +107,48 @@ void TestAndBench(const int n, std::function<void(const int, const T*, T*)> tgt,
 
 TEST(CpuVecTest, sigmoid) {
   namespace platform = paddle::platform;
-  using namespace paddle::operators::math;  // NOLINT
+  using namespace phi::funcs;  // NOLINT
   for (auto sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) {
     TestAndBench<float>(sz, vec_sigmoid<float>, ref_sigmoid<float>);
-    TestAndBench<float>(sz, vec_sigmoid<float, platform::avx>,
-                        ref_sigmoid<float>);
-    TestAndBench<float>(sz, vec_sigmoid<float, platform::avx2>,
-                        ref_sigmoid<float>);
-    TestAndBench<float>(sz, vec_sigmoid<float, platform::avx512f>,
-                        ref_sigmoid<float>);
+    TestAndBench<float>(
+        sz, vec_sigmoid<float, platform::avx>, ref_sigmoid<float>);
+    TestAndBench<float>(
+        sz, vec_sigmoid<float, platform::avx2>, ref_sigmoid<float>);
+    TestAndBench<float>(
+        sz, vec_sigmoid<float, platform::avx512f>, ref_sigmoid<float>);
   }
   TestAndBench<double>(30, vec_sigmoid<double>, ref_sigmoid<double>);
 }
 
 TEST(CpuVecTest, tanh) {
   namespace platform = paddle::platform;
-  using namespace paddle::operators::math;  // NOLINT
+  using namespace phi::funcs;  // NOLINT
   for (auto sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) {
     TestAndBench<float>(sz, vec_tanh<float>, ref_tanh<float>);
     TestAndBench<float>(sz, vec_tanh<float, platform::avx>, ref_tanh<float>);
     TestAndBench<float>(sz, vec_tanh<float, platform::avx2>, ref_tanh<float>);
-    TestAndBench<float>(sz, vec_tanh<float, platform::avx512f>,
-                        ref_tanh<float>);
+    TestAndBench<float>(
+        sz, vec_tanh<float, platform::avx512f>, ref_tanh<float>);
   }
   TestAndBench<double>(30, vec_tanh<double>, ref_tanh<double>);
 }
 
 TEST(CpuVecTest, relu) {
   namespace platform = paddle::platform;
-  using namespace paddle::operators::math;  // NOLINT
+  using namespace phi::funcs;  // NOLINT
   for (auto sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) {
     TestAndBench<float>(sz, vec_relu<float>, ref_relu<float>);
     TestAndBench<float>(sz, vec_relu<float, platform::avx>, ref_relu<float>);
     TestAndBench<float>(sz, vec_relu<float, platform::avx2>, ref_relu<float>);
-    TestAndBench<float>(sz, vec_relu<float, platform::avx512f>,
-                        ref_relu<float>);
+    TestAndBench<float>(
+        sz, vec_relu<float, platform::avx512f>, ref_relu<float>);
   }
   TestAndBench<double>(30, vec_relu<double>, ref_relu<double>);
 }
 
 template <typename T>
-void compare_sum(size_t n, std::function<void(const size_t, const T*, T*)> tgt,
+void compare_sum(size_t n,
+                 std::function<void(const size_t, const T*, T*)> tgt,
                  std::function<void(const size_t, const T*, T*)> ref) {
   std::vector<T> x(n);
   T ytgt_data, yref_data;
@@ -155,18 +162,19 @@ void compare_sum(size_t n, std::function<void(const size_t, const T*, T*)> tgt,
 
 TEST(CpuVecTest, vec_sum) {
   namespace platform = paddle::platform;
-  using namespace paddle::operators::math;  // NOLINT
+  using namespace phi::funcs;  // NOLINT
   for (size_t sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) {
     compare_sum<float>(sz, vec_sum<float>, vec_sum<float, platform::isa_any>);
-    compare_sum<float>(sz, vec_sum<float, platform::avx>,
-                       vec_sum<float, platform::isa_any>);
+    compare_sum<float>(
+        sz, vec_sum<float, platform::avx>, vec_sum<float, platform::isa_any>);
   }
   compare_sum<double>(30U, vec_sum<double>, vec_sum<double, platform::isa_any>);
 }
 
 template <typename T>
 void compare_clip(
-    size_t n, T threshold,
+    size_t n,
+    T threshold,
     std::function<void(const size_t, const T, const T*, T*)> tgt,
     std::function<void(const size_t, const T, const T*, T*)> ref) {
   std::vector<T> x(n);
@@ -185,20 +193,23 @@ void compare_clip(
 
 TEST(CpuVecTest, vec_clip) {
   namespace platform = paddle::platform;
-  using namespace paddle::operators::math;  // NOLINT
+  using namespace phi::funcs;  // NOLINT
   for (size_t sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) {
-    compare_clip<float>(sz, -4.f, vec_clip<float>,
-                        vec_clip<float, platform::isa_any>);
-    compare_clip<float>(sz, -1.1f, vec_clip<float, platform::avx>,
+    compare_clip<float>(
+        sz, -4.f, vec_clip<float>, vec_clip<float, platform::isa_any>);
+    compare_clip<float>(sz,
+                        -1.1f,
+                        vec_clip<float, platform::avx>,
                         vec_clip<float, platform::isa_any>);
   }
-  compare_clip<double>(30U, 1.0, vec_clip<double>,
-                       vec_clip<double, platform::isa_any>);
+  compare_clip<double>(
+      30U, 1.0, vec_clip<double>, vec_clip<double, platform::isa_any>);
 }
 
 template <typename T>
 void compare_mul(
-    size_t n, std::function<void(const size_t, const T*, const T*, T*)> tgt,
+    size_t n,
+    std::function<void(const size_t, const T*, const T*, T*)> tgt,
     std::function<void(const size_t, const T*, const T*, T*)> ref) {
   std::vector<T> x(n), y(n);
   std::vector<T> ztgt(n), zref(n);
@@ -220,18 +231,19 @@ void compare_mul(
 
 TEST(CpuVecTest, vec_mul) {
   namespace platform = paddle::platform;
-  using namespace paddle::operators::math;  // NOLINT
+  using namespace phi::funcs;  // NOLINT
   for (size_t sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) {
     compare_mul<float>(sz, vec_mul<float>, vec_mul<float, platform::isa_any>);
-    compare_mul<float>(sz, vec_mul<float, platform::avx>,
-                       vec_mul<float, platform::isa_any>);
+    compare_mul<float>(
+        sz, vec_mul<float, platform::avx>, vec_mul<float, platform::isa_any>);
   }
   compare_mul<double>(30U, vec_mul<double>, vec_mul<double, platform::isa_any>);
 }
 
 template <typename T>
 void compare_mul_reduce(
-    size_t n, std::function<void(const size_t, const T*, const T*, T*)> tgt,
+    size_t n,
+    std::function<void(const size_t, const T*, const T*, T*)> tgt,
     std::function<void(const size_t, const T*, const T*, T*)> ref) {
   std::vector<T> x(n), y(n);
   T ztgt_data, zref_data;
@@ -249,19 +261,21 @@ void compare_mul_reduce(
 
 TEST(CpuVecTest, vec_mul_reduce) {
   namespace platform = paddle::platform;
-  using namespace paddle::operators::math;  // NOLINT
+  using namespace phi::funcs;  // NOLINT
   for (size_t sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) {
-    compare_mul_reduce<float>(sz, vec_mul_reduce<float>,
-                              vec_mul_reduce<float, platform::isa_any>);
-    compare_mul_reduce<float>(sz, vec_mul_reduce<float, platform::avx>,
+    compare_mul_reduce<float>(
+        sz, vec_mul_reduce<float>, vec_mul_reduce<float, platform::isa_any>);
+    compare_mul_reduce<float>(sz,
+                              vec_mul_reduce<float, platform::avx>,
                               vec_mul_reduce<float, platform::isa_any>);
   }
-  compare_mul_reduce<double>(30U, vec_mul_reduce<double>,
-                             vec_mul_reduce<double, platform::isa_any>);
+  compare_mul_reduce<double>(
+      30U, vec_mul_reduce<double>, vec_mul_reduce<double, platform::isa_any>);
 }
 
 template <typename T>
-void TestInplace(const int n, std::function<void(const int, const T*, T*)> tgt,
+void TestInplace(const int n,
+                 std::function<void(const int, const T*, T*)> tgt,
                  std::function<void(const int, const T*, T*)> ref) {
   std::vector<T> x(n);
   std::vector<T> ytgt(n), yref(n);
@@ -283,22 +297,22 @@ void TestInplace(const int n, std::function<void(const int, const T*, T*)> tgt,
 
 TEST(CpuVecTest, inplace_sigmoid) {
   namespace platform = paddle::platform;
-  using namespace paddle::operators::math;  // NOLINT
+  using namespace phi::funcs;  // NOLINT
   for (auto sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) {
     TestInplace<float>(sz, vec_sigmoid<float>, ref_sigmoid<float>);
-    TestInplace<float>(sz, vec_sigmoid<float, platform::avx>,
-                       ref_sigmoid<float>);
-    TestInplace<float>(sz, vec_sigmoid<float, platform::avx2>,
-                       ref_sigmoid<float>);
-    TestInplace<float>(sz, vec_sigmoid<float, platform::avx512f>,
-                       ref_sigmoid<float>);
+    TestInplace<float>(
+        sz, vec_sigmoid<float, platform::avx>, ref_sigmoid<float>);
+    TestInplace<float>(
+        sz, vec_sigmoid<float, platform::avx2>, ref_sigmoid<float>);
+    TestInplace<float>(
+        sz, vec_sigmoid<float, platform::avx512f>, ref_sigmoid<float>);
   }
   TestInplace<double>(30, vec_sigmoid<double>, ref_sigmoid<double>);
 }
 
 TEST(CpuVecTest, inplace_tanh) {
   namespace platform = paddle::platform;
-  using namespace paddle::operators::math;  // NOLINT
+  using namespace phi::funcs;  // NOLINT
   for (auto sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) {
     TestInplace<float>(sz, vec_tanh<float>, ref_tanh<float>);
     TestInplace<float>(sz, vec_tanh<float, platform::avx>, ref_tanh<float>);
@@ -310,7 +324,7 @@ TEST(CpuVecTest, inplace_tanh) {
 
 TEST(CpuVecTest, inplace_relu) {
   namespace platform = paddle::platform;
-  using namespace paddle::operators::math;  // NOLINT
+  using namespace phi::funcs;  // NOLINT
   for (auto sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) {
     TestInplace<float>(sz, vec_relu<float>, ref_relu<float>);
     TestInplace<float>(sz, vec_relu<float, platform::avx>, ref_relu<float>);
@@ -319,3 +333,5 @@ TEST(CpuVecTest, inplace_relu) {
   }
   TestInplace<double>(30, vec_relu<double>, ref_relu<double>);
 }
+}  // namespace tests
+}  // namespace phi
diff --git a/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc b/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc
index 00b2a256a9504595dd8ac4ffd492564557f2d783..37a69a176c6e1ded81a8449da3c571442bd94e78 100644
--- a/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc
+++ b/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc
@@ -15,6 +15,7 @@ limitations under the License. */
 #include <gtest/gtest.h>
 #include <memory>
 
+#include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/common/place.h"
 #include "paddle/phi/kernels/copy_kernel.h"
 #include "paddle/phi/kernels/sparse/convolution_grad_kernel.h"
@@ -63,7 +64,8 @@ void TestConv3dBase(const std::vector<int>& indices,
                     const float diff = 1e-3,
                     const bool backward = false,
                     const std::vector<T> features_grad = {},
-                    const std::vector<T> kernel_grad = {}) {
+                    const std::vector<T> kernel_grad = {},
+                    const bool subm = false) {
   phi::CPUContext dev_ctx_cpu;
   dev_ctx_cpu.SetAllocator(
       paddle::memory::allocation::AllocatorFacade::Instance()
@@ -77,9 +79,6 @@ void TestConv3dBase(const std::vector<int>& indices,
   DenseTensor indices_tensor = phi::Empty(
       dev_ctx_cpu,
       DenseTensorMeta(DataType::INT32, {4, non_zero_num}, DataLayout::NCHW));
-  dev_ctx_cpu.Alloc(&indices_tensor,
-                    indices_tensor.dtype(),
-                    sizeof(int) * indices_tensor.numel());
   memcpy(
       indices_tensor.data<int>(), indices.data(), indices.size() * sizeof(int));
   DenseTensor features_tensor = phi::Empty(
@@ -87,9 +86,6 @@ void TestConv3dBase(const std::vector<int>& indices,
       DenseTensorMeta(paddle::experimental::CppTypeToDataType<T>::Type(),
                       {non_zero_num, in_channels},
                       DataLayout::NHWC));
-  dev_ctx_cpu.Alloc(&features_tensor,
-                    features_tensor.dtype(),
-                    features_tensor.numel() * sizeof(T));
   memcpy(
       features_tensor.data<T>(), features.data(), features.size() * sizeof(T));
 
@@ -100,12 +96,18 @@ void TestConv3dBase(const std::vector<int>& indices,
       DenseTensorMeta(paddle::experimental::CppTypeToDataType<T>::Type(),
                       kernel_dims,
                       DataLayout::NHWC));
-  dev_ctx_cpu.Alloc(
-      &kernel_tensor, kernel_tensor.dtype(), kernel_tensor.numel() * sizeof(T));
   memcpy(kernel_tensor.data<T>(), kernel.data(), kernel.size() * sizeof(T));
 
+  auto f_verify = [&](const T* real_data, const std::vector<T>& correct_data) {
+    for (uint64_t i = 0; i < correct_data.size(); i++) {
+      float tmp = std::fabs(static_cast<float>(correct_data[i] - real_data[i]));
+      ASSERT_LT(tmp, diff);
+    }
+  };
+
   if (!std::is_same<T, phi::dtype::float16>::value) {
-    DenseTensor rulebook = phi::Empty<int, phi::CPUContext>(dev_ctx_cpu);
+    DenseTensor rulebook = phi::Empty(
+        dev_ctx_cpu, DenseTensorMeta(DataType::INT32, {1}, DataLayout::NCHW));
     SparseCooTensor out = sparse::Conv3d<T>(dev_ctx_cpu,
                                             x_tensor,
                                             kernel_tensor,
@@ -113,6 +115,7 @@ void TestConv3dBase(const std::vector<int>& indices,
                                             dilations,
                                             strides,
                                             1,
+                                            subm,
                                             &rulebook);
 
     ASSERT_EQ(correct_out_dims.size(), out.dims().size());
@@ -126,15 +129,6 @@ void TestConv3dBase(const std::vector<int>& indices,
                              correct_out_indices.size() * sizeof(int));
     ASSERT_EQ(cmp_indices, 0);
 
-    auto f_verify = [&](const T* real_data,
-                        const std::vector<T>& correct_data) {
-      for (uint64_t i = 0; i < correct_data.size(); i++) {
-        float tmp =
-            std::fabs(static_cast<float>(correct_data[i] - real_data[i]));
-        ASSERT_LT(tmp, diff);
-      }
-    };
-
     f_verify(out.non_zero_elements().data<T>(), correct_out_features);
 
     if (backward) {
@@ -146,11 +140,120 @@ void TestConv3dBase(const std::vector<int>& indices,
                                                              paddings,
                                                              dilations,
                                                              strides,
-                                                             1);
+                                                             1,
+                                                             subm);
       f_verify(grads[0].data<T>(), features_grad);
       f_verify(grads[1].data<T>(), kernel_grad);
     }
   }
+
+// test gpu
+#if defined(PADDLE_WITH_CUDA)
+  phi::GPUContext dev_ctx_gpu;
+  dev_ctx_gpu.PartialInitWithoutAllocator();
+  dev_ctx_gpu.SetAllocator(
+      paddle::memory::allocation::AllocatorFacade::Instance()
+          .GetAllocator(dev_ctx_gpu.GetPlace(), dev_ctx_gpu.stream())
+          .get());
+  dev_ctx_gpu.SetHostAllocator(
+      paddle::memory::allocation::AllocatorFacade::Instance()
+          .GetAllocator(phi::CPUPlace())
+          .get());
+  dev_ctx_gpu.PartialInitWithAllocator();
+
+  DenseTensor d_indices_tensor = phi::Empty(
+      dev_ctx_gpu,
+      DenseTensorMeta(DataType::INT32, {4, non_zero_num}, DataLayout::NCHW));
+  phi::Copy(
+      dev_ctx_gpu, indices_tensor, phi::GPUPlace(), true, &d_indices_tensor);
+
+  DenseTensor d_features_tensor = phi::Empty(
+      dev_ctx_gpu,
+      DenseTensorMeta(paddle::experimental::CppTypeToDataType<T>::Type(),
+                      {non_zero_num, in_channels},
+                      DataLayout::NHWC));
+  phi::Copy(
+      dev_ctx_gpu, features_tensor, phi::GPUPlace(), true, &d_features_tensor);
+
+  SparseCooTensor d_x_tensor(d_indices_tensor, d_features_tensor, x_dims);
+
+  DenseTensor d_kernel_tensor = phi::Empty(
+      dev_ctx_gpu,
+      DenseTensorMeta(paddle::experimental::CppTypeToDataType<T>::Type(),
+                      kernel_dims,
+                      DataLayout::NHWC));
+  phi::Copy(
+      dev_ctx_gpu, kernel_tensor, phi::GPUPlace(), true, &d_kernel_tensor);
+
+  DenseTensor d_rulebook = phi::Empty(
+      dev_ctx_gpu, DenseTensorMeta(DataType::INT32, {1}, DataLayout::NCHW));
+  SparseCooTensor d_out = sparse::Conv3d<T>(dev_ctx_gpu,
+                                            d_x_tensor,
+                                            d_kernel_tensor,
+                                            paddings,
+                                            dilations,
+                                            strides,
+                                            1,
+                                            subm,
+                                            &d_rulebook);
+
+  ASSERT_EQ(correct_out_dims.size(), d_out.dims().size());
+  ASSERT_EQ((int64_t)correct_out_features.size() / out_channels, d_out.nnz());
+  for (int i = 0; i < correct_out_dims.size(); i++) {
+    ASSERT_EQ(correct_out_dims[i], d_out.dims()[i]);
+  }
+
+  DenseTensor h_indices_tensor = phi::Empty(
+      dev_ctx_cpu,
+      DenseTensorMeta(DataType::INT32, {4, d_out.nnz()}, DataLayout::NCHW));
+  phi::Copy(dev_ctx_gpu,
+            d_out.non_zero_indices(),
+            phi::CPUPlace(),
+            true,
+            &h_indices_tensor);
+
+  int cmp_indices2 = memcmp(correct_out_indices.data(),
+                            h_indices_tensor.data<int>(),
+                            correct_out_indices.size() * sizeof(int));
+  ASSERT_EQ(cmp_indices2, 0);
+
+  DenseTensor h_features_tensor = phi::Empty(
+      dev_ctx_cpu,
+      DenseTensorMeta(paddle::experimental::CppTypeToDataType<T>::Type(),
+                      {d_out.nnz()},
+                      d_out.layout()));
+
+  phi::Copy(dev_ctx_gpu,
+            d_out.non_zero_elements(),
+            phi::CPUPlace(),
+            true,
+            &h_features_tensor);
+  f_verify(h_features_tensor.data<T>(), correct_out_features);
+
+  if (backward) {
+    std::vector<DenseTensor> grads = sparse::Conv3dGrad<T>(dev_ctx_gpu,
+                                                           d_x_tensor,
+                                                           d_rulebook,
+                                                           d_kernel_tensor,
+                                                           d_out,
+                                                           paddings,
+                                                           dilations,
+                                                           strides,
+                                                           1,
+                                                           subm);
+    DenseTensor h_features_grad = phi::Empty(
+        dev_ctx_cpu,
+        DenseTensorMeta(grads[0].dtype(), grads[0].dims(), grads[0].layout()));
+    phi::Copy(dev_ctx_gpu, grads[0], phi::CPUPlace(), true, &h_features_grad);
+    f_verify(h_features_grad.data<T>(), features_grad);
+
+    DenseTensor h_kernel_grad = phi::Empty(
+        dev_ctx_cpu,
+        DenseTensorMeta(grads[1].dtype(), grads[1].dims(), grads[1].layout()));
+    phi::Copy(dev_ctx_gpu, grads[1], phi::CPUPlace(), true, &h_kernel_grad);
+    f_verify(h_kernel_grad.data<T>(), kernel_grad);
+  }
+#endif
 }
 
 void TestConv3d(const std::vector<int>& indices,
@@ -168,7 +271,8 @@ void TestConv3d(const std::vector<int>& indices,
                 const float diff = 1e-3,
                 const bool backward = false,
                 const std::vector<float> features_grad = {},
-                const std::vector<float> kernel_grad = {}) {
+                const std::vector<float> kernel_grad = {},
+                const bool subm = false) {
   // test float
   TestConv3dBase<float>(indices,
                         features,
@@ -185,7 +289,8 @@ void TestConv3d(const std::vector<int>& indices,
                         diff,
                         backward,
                         features_grad,
-                        kernel_grad);
+                        kernel_grad,
+                        subm);
   // test double
   TestConv3dBase<double>(indices,
                          cast<float, double>(features),
@@ -202,7 +307,8 @@ void TestConv3d(const std::vector<int>& indices,
                          diff,
                          backward,
                          cast<float, double>(features_grad),
-                         cast<float, double>(kernel_grad));
+                         cast<float, double>(kernel_grad),
+                         subm);
 }
 
 TEST(DEV_API, sparse_conv3d) {
@@ -563,5 +669,101 @@ TEST(DEV_API, sparse_conv3d_backward) {
              kernel_grad);
 }
 
+TEST(DEV_API, sparse_conv2d_subm) {
+  const int in_channels = 1;
+  const int out_channels = 1;
+  DDim x_dims = {1, 1, 4, 5, in_channels};
+  DDim kernel_dims = {1, 3, 3, in_channels, out_channels};
+  DDim out_dims = {1, 1, 4, 5, out_channels};
+  std::vector<int> paddings = {0, 1, 1};
+  std::vector<int> strides = {1, 1, 1};
+  std::vector<int> dilations = {1, 1, 1};
+
+  const int non_zero_num = 4;
+  std::vector<int> indices_flatten = {
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 3, 3, 3, 2, 2, 3};
+
+  std::vector<float> features = {0.8854, 0.6505, -0.1999, 0.3583};
+  // 3*3*3=27
+  std::vector<float> kernel = {
+      0.9364, 0.9460, 0.6564, 0.7999, 0.2013, 0.3812, 0.5474, 0.1016, 0.3368};
+
+  std::vector<int> out_indices_flatten = {
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 3, 3, 3, 2, 2, 3};
+
+  std::vector<float> out_features = {0.1782, 0.2313, 0.7117, 0.5214};
+
+  std::vector<float> features_grad = {0.0359, 1.2080, 0.5838, 0.4541};
+  std::vector<float> kernel_grad = {
+      0.3391, 0.4630, 0.0000, -0.1042, 0.3528, 0.2550, 0.0000, -0.0462, 0.0829};
+
+  TestConv3d(indices_flatten,
+             features,
+             x_dims,
+             kernel,
+             kernel_dims,
+             out_indices_flatten,
+             out_features,
+             out_dims,
+             non_zero_num,
+             paddings,
+             strides,
+             dilations,
+             1e-3,
+             true,
+             features_grad,
+             kernel_grad,
+             true);
+}
+
+TEST(DEV_API, sparse_conv3d_subm) {
+  const int in_channels = 1;
+  const int out_channels = 1;
+  DDim x_dims = {1, 4, 4, 5, in_channels};
+  DDim kernel_dims = {3, 3, 3, in_channels, out_channels};
+  DDim out_dims = {1, 4, 4, 5, out_channels};
+  std::vector<int> paddings = {1, 1, 1};
+  std::vector<int> strides = {1, 1, 1};
+  std::vector<int> dilations = {1, 1, 1};
+
+  const int non_zero_num = 3;
+  std::vector<int> indices_flatten = {0, 0, 0, 1, 3, 3, 2, 0, 2, 0, 3, 1};
+
+  std::vector<float> features = {-0.9578, 0.1572, 0.1036};
+  // 3*3*3=27
+  std::vector<float> kernel = {
+      0.1367, 0.4534, 0.2138, 0.8264, 0.7534, 0.3270, 0.2880, 0.1562, 0.7770,
+      0.6902, 0.1981, 0.1369, 0.6582, 0.7582, 0.5640, 0.8894, 0.7350, 0.1845,
+      0.6892, 0.3654, 0.6076, 0.0326, 0.8412, 0.5289, 0.9824, 0.8235, 0.9802};
+
+  std::vector<int> out_indices_flatten = {0, 0, 0, 1, 3, 3, 2, 0, 2, 0, 3, 1};
+
+  std::vector<float> out_features = {-0.7262, 0.1192, 0.0785};
+
+  std::vector<float> features_grad = {-0.5506, 0.0904, 0.0595};
+  std::vector<float> kernel_grad = {
+      0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
+      0.0000, 0.0000, 0.0000, 0.0000, 0.7224, 0.0000, 0.0000, 0.0000, 0.0000,
+      0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000};
+
+  TestConv3d(indices_flatten,
+             features,
+             x_dims,
+             kernel,
+             kernel_dims,
+             out_indices_flatten,
+             out_features,
+             out_dims,
+             non_zero_num,
+             paddings,
+             strides,
+             dilations,
+             1e-3,
+             true,
+             features_grad,
+             kernel_grad,
+             true);
+}
+
 }  // namespace tests
 }  // namespace phi
diff --git a/paddle/phi/tests/kernels/test_sparse_utils_dev_api.cc b/paddle/phi/tests/kernels/test_sparse_utils_dev_api.cc
index 3e2ad0495f3ba85836dc08afa3f4fa4ed0b10afd..b8f214b79e290c2e102fc2c08dab2ddc6a61dd71 100644
--- a/paddle/phi/tests/kernels/test_sparse_utils_dev_api.cc
+++ b/paddle/phi/tests/kernels/test_sparse_utils_dev_api.cc
@@ -90,6 +90,10 @@ void TestDenseToSparseCoo(const DenseTensor& dense_x,
 
   phi::CPUContext dev_ctx_cpu;
   dev_ctx_cpu.Init();
+  dev_ctx_cpu.SetAllocator(
+      paddle::memory::allocation::AllocatorFacade::Instance()
+          .GetAllocator(phi::CPUPlace())
+          .get());
 
   // 1. test cpu
   auto cpu_sparse_out =
@@ -300,6 +304,11 @@ void TestSparseCsrToCoo(const DDim& dense_dims,
 
   // 1. test cpu
   phi::CPUContext dev_ctx_cpu;
+  dev_ctx_cpu.Init();
+  dev_ctx_cpu.SetAllocator(
+      paddle::memory::allocation::AllocatorFacade::Instance()
+          .GetAllocator(phi::CPUPlace())
+          .get());
   auto cpu_sparse_out = sparse::SparseCsrToCoo<T>(dev_ctx_cpu, csr);
   CheckResult<T, int64_t>(&dev_ctx_cpu,
                           cpu_sparse_out,
@@ -473,6 +482,11 @@ void TestCooToCsr(const DDim& dense_dims,
 
   // 1. test cpu
   phi::CPUContext dev_ctx_cpu;
+  dev_ctx_cpu.Init();
+  dev_ctx_cpu.SetAllocator(
+      paddle::memory::allocation::AllocatorFacade::Instance()
+          .GetAllocator(phi::CPUPlace())
+          .get());
   auto cpu_sparse_out = sparse::SparseCooToCsr<T>(dev_ctx_cpu, coo);
   CheckCsrResult<T, int64_t>(&dev_ctx_cpu,
                              cpu_sparse_out,
@@ -563,6 +577,11 @@ void TestDenseToSparseCsr(const DenseTensor& dense_x,
   const auto alloc = std::make_shared<paddle::experimental::DefaultAllocator>(
       paddle::platform::CPUPlace());
   phi::CPUContext dev_ctx_cpu;
+  dev_ctx_cpu.Init();
+  dev_ctx_cpu.SetAllocator(
+      paddle::memory::allocation::AllocatorFacade::Instance()
+          .GetAllocator(phi::CPUPlace())
+          .get());
 
   // 1. test cpu
   auto cpu_sparse_out = sparse::DenseToSparseCsr<T>(dev_ctx_cpu, dense_x);
@@ -667,6 +686,11 @@ void TestSparseCooToDense(const DDim& dense_dims,
                           const int64_t non_zero_num,
                           const int64_t sparse_dim) {
   phi::CPUContext dev_ctx_cpu;
+  dev_ctx_cpu.Init();
+  dev_ctx_cpu.SetAllocator(
+      paddle::memory::allocation::AllocatorFacade::Instance()
+          .GetAllocator(phi::CPUPlace())
+          .get());
   const auto alloc = std::make_shared<paddle::experimental::DefaultAllocator>(
       paddle::platform::CPUPlace());
 
@@ -836,6 +860,11 @@ void TestSparseCsrToDense(const DDim& dense_dims,
 
   // 1. test cpu
   phi::CPUContext dev_ctx_cpu;
+  dev_ctx_cpu.Init();
+  dev_ctx_cpu.SetAllocator(
+      paddle::memory::allocation::AllocatorFacade::Instance()
+          .GetAllocator(phi::CPUPlace())
+          .get());
   DenseTensor cpu_sparse_out = sparse::SparseCsrToDense<T>(dev_ctx_cpu, csr);
   int cmp_cpu = memcmp(cpu_sparse_out.data<T>(),
                        dense_data.data(),
diff --git a/paddle/phi/tests/ops/test_op_signature.cc b/paddle/phi/tests/ops/test_op_signature.cc
index a6c9a27de7dc5a3a9f09d4c336fe9b50e4d453a5..36923972ea4145a63101f84eeb5da76d73ffce75 100644
--- a/paddle/phi/tests/ops/test_op_signature.cc
+++ b/paddle/phi/tests/ops/test_op_signature.cc
@@ -114,5 +114,468 @@ TEST(ARG_MAP, fill_constant) {
   ASSERT_EQ(signature9.name, "full_sr");
 }
 
+TEST(ARG_MAP, set_value) {
+  TestArgumentMappingContext arg_case(
+      {"Input", "StartsTensorList", "EndsTensorList", "StepsTensorList"},
+      {},
+      {{"fp32_values", paddle::any{std::vector<float>{1}}}},
+      {"Out"},
+      {});
+  ASSERT_EQ(
+      OpUtilsMap::Instance().GetArgumentMappingFn("set_value")(arg_case).name,
+      "set_value");
+
+  TestArgumentMappingContext arg_case1(
+      {"Input", "StartsTensorList", "EndsTensorList", "StepsTensorList"},
+      {},
+      {{"fp64_values", paddle::any{std::vector<double>{1}}}},
+      {"Out"},
+      {});
+  ASSERT_EQ(
+      OpUtilsMap::Instance().GetArgumentMappingFn("set_value")(arg_case1).name,
+      "set_value");
+
+  TestArgumentMappingContext arg_case2(
+      {"Input", "StartsTensorList", "EndsTensorList", "StepsTensorList"},
+      {},
+      {{"int32_values", paddle::any{std::vector<int>{1}}}},
+      {"Out"},
+      {});
+  ASSERT_EQ(
+      OpUtilsMap::Instance().GetArgumentMappingFn("set_value")(arg_case2).name,
+      "set_value");
+
+  TestArgumentMappingContext arg_case3(
+      {"Input", "StartsTensorList", "EndsTensorList", "StepsTensorList"},
+      {},
+      {{"int64_values", paddle::any{std::vector<int64_t>{1}}}},
+      {"Out"},
+      {});
+  ASSERT_EQ(
+      OpUtilsMap::Instance().GetArgumentMappingFn("set_value")(arg_case3).name,
+      "set_value");
+
+  TestArgumentMappingContext arg_case4(
+      {"Input", "StartsTensorList", "EndsTensorList", "StepsTensorList"},
+      {},
+      {{"bool_values", paddle::any{std::vector<int>{1}}}},
+      {"Out"},
+      {});
+  ASSERT_EQ(
+      OpUtilsMap::Instance().GetArgumentMappingFn("set_value")(arg_case4).name,
+      "set_value");
+
+  TestArgumentMappingContext arg_case5(
+      {"Input", "StartsTensorList", "EndsTensorList", "ValueTensor"},
+      {},
+      {},
+      {"Out"},
+      {});
+  ASSERT_EQ(
+      OpUtilsMap::Instance().GetArgumentMappingFn("set_value")(arg_case5).name,
+      "set_value_with_tensor");
+
+  TestArgumentMappingContext arg_case6(
+      {"Input", "StartsTensorList", "EndsTensorList"},
+      {},
+      {{"fp64_values", paddle::any{std::vector<double>{1}}}},
+      {"Out"},
+      {});
+  ASSERT_EQ(
+      OpUtilsMap::Instance().GetArgumentMappingFn("set_value")(arg_case6).name,
+      "set_value");
+
+  TestArgumentMappingContext arg_case7(
+      {"Input", "StartsTensorList", "EndsTensorList"},
+      {},
+      {{"int32_values", paddle::any{std::vector<int>{1}}}},
+      {"Out"},
+      {});
+  ASSERT_EQ(
+      OpUtilsMap::Instance().GetArgumentMappingFn("set_value")(arg_case7).name,
+      "set_value");
+
+  TestArgumentMappingContext arg_case8(
+      {"Input", "StartsTensorList", "EndsTensorList"},
+      {},
+      {{"int64_values", paddle::any{std::vector<int64_t>{1}}}},
+      {"Out"},
+      {});
+  ASSERT_EQ(
+      OpUtilsMap::Instance().GetArgumentMappingFn("set_value")(arg_case8).name,
+      "set_value");
+
+  TestArgumentMappingContext arg_case9(
+      {"Input", "StartsTensorList", "EndsTensorList"},
+      {},
+      {{"bool_values", paddle::any{std::vector<int>{1}}}},
+      {"Out"},
+      {});
+  ASSERT_EQ(
+      OpUtilsMap::Instance().GetArgumentMappingFn("set_value")(arg_case9).name,
+      "set_value");
+
+  TestArgumentMappingContext arg_case10(
+      {"Input", "StartsTensorList", "StepsTensorList", "ValueTensor"},
+      {},
+      {},
+      {"Out"},
+      {});
+  ASSERT_EQ(
+      OpUtilsMap::Instance().GetArgumentMappingFn("set_value")(arg_case10).name,
+      "set_value_with_tensor");
+
+  TestArgumentMappingContext arg_case11(
+      {"Input", "StartsTensorList", "StepsTensorList"},
+      {},
+      {{"fp64_values", paddle::any{std::vector<double>{1}}}},
+      {"Out"},
+      {});
+  ASSERT_EQ(
+      OpUtilsMap::Instance().GetArgumentMappingFn("set_value")(arg_case11).name,
+      "set_value");
+
+  TestArgumentMappingContext arg_case12(
+      {"Input", "StartsTensorList", "StepsTensorList"},
+      {},
+      {{"int32_values", paddle::any{std::vector<int>{1}}}},
+      {"Out"},
+      {});
+  ASSERT_EQ(
+      OpUtilsMap::Instance().GetArgumentMappingFn("set_value")(arg_case12).name,
+      "set_value");
+
+  TestArgumentMappingContext arg_case13(
+      {"Input", "StartsTensorList", "StepsTensorList"},
+      {},
+      {{"int64_values", paddle::any{std::vector<int64_t>{1}}}},
+      {"Out"},
+      {});
+  ASSERT_EQ(
+      OpUtilsMap::Instance().GetArgumentMappingFn("set_value")(arg_case13).name,
+      "set_value");
+
+  TestArgumentMappingContext arg_case14(
+      {"Input", "StartsTensorList", "StepsTensorList"},
+      {},
+      {{"bool_values", paddle::any{std::vector<int>{1}}}},
+      {"Out"},
+      {});
+  ASSERT_EQ(
+      OpUtilsMap::Instance().GetArgumentMappingFn("set_value")(arg_case14).name,
+      "set_value");
+
+  TestArgumentMappingContext arg_case15(
+      {"Input", "StartsTensorList", "ValueTensor"}, {}, {}, {"Out"}, {});
+  ASSERT_EQ(
+      OpUtilsMap::Instance().GetArgumentMappingFn("set_value")(arg_case15).name,
+      "set_value_with_tensor");
+
+  TestArgumentMappingContext arg_case16(
+      {"Input", "StartsTensorList", "StepsTensorList"},
+      {},
+      {{"fp32_values", paddle::any{std::vector<float>{1}}}},
+      {"Out"},
+      {});
+  ASSERT_EQ(
+      OpUtilsMap::Instance().GetArgumentMappingFn("set_value")(arg_case16).name,
+      "set_value");
+
+  TestArgumentMappingContext arg_case17(
+      {"Input", "StartsTensorList", "StepsTensorList"},
+      {},
+      {{"fp64_values", paddle::any{std::vector<double>{1}}}},
+      {"Out"},
+      {});
+  ASSERT_EQ(
+      OpUtilsMap::Instance().GetArgumentMappingFn("set_value")(arg_case17).name,
+      "set_value");
+
+  TestArgumentMappingContext arg_case18(
+      {"Input", "StartsTensorList", "StepsTensorList"},
+      {},
+      {{"int32_values", paddle::any{std::vector<int>{1}}}},
+      {"Out"},
+      {});
+  ASSERT_EQ(
+      OpUtilsMap::Instance().GetArgumentMappingFn("set_value")(arg_case18).name,
+      "set_value");
+
+  TestArgumentMappingContext arg_case19(
+      {"Input", "StartsTensorList", "StepsTensorList"},
+      {},
+      {{"int64_values", paddle::any{std::vector<int64_t>{1}}}},
+      {"Out"},
+      {});
+  ASSERT_EQ(
+      OpUtilsMap::Instance().GetArgumentMappingFn("set_value")(arg_case19).name,
+      "set_value");
+
+  TestArgumentMappingContext arg_case20(
+      {"Input", "StartsTensorList", "StepsTensorList"},
+      {},
+      {{"bool_values", paddle::any{std::vector<int>{1}}}},
+      {"Out"},
+      {});
+  ASSERT_EQ(
+      OpUtilsMap::Instance().GetArgumentMappingFn("set_value")(arg_case20).name,
+      "set_value");
+
+  TestArgumentMappingContext arg_case21(
+      {"Input", "EndsTensorList", "StepsTensorList", "ValueTensor"},
+      {},
+      {},
+      {"Out"},
+      {});
+  ASSERT_EQ(
+      OpUtilsMap::Instance().GetArgumentMappingFn("set_value")(arg_case21).name,
+      "set_value_with_tensor");
+
+  TestArgumentMappingContext arg_case22(
+      {"Input", "EndsTensorList", "StepsTensorList"},
+      {},
+      {{"fp64_values", paddle::any{std::vector<double>{1}}}},
+      {"Out"},
+      {});
+  ASSERT_EQ(
+      OpUtilsMap::Instance().GetArgumentMappingFn("set_value")(arg_case22).name,
+      "set_value");
+
+  TestArgumentMappingContext arg_case23(
+      {"Input", "EndsTensorList", "StepsTensorList"},
+      {},
+      {{"int32_values", paddle::any{std::vector<int>{1}}}},
+      {"Out"},
+      {});
+  ASSERT_EQ(
+      OpUtilsMap::Instance().GetArgumentMappingFn("set_value")(arg_case23).name,
+      "set_value");
+
+  TestArgumentMappingContext arg_case24(
+      {"Input", "EndsTensorList", "StepsTensorList"},
+      {},
+      {{"int64_values", paddle::any{std::vector<int64_t>{1}}}},
+      {"Out"},
+      {});
+  ASSERT_EQ(
+      OpUtilsMap::Instance().GetArgumentMappingFn("set_value")(arg_case24).name,
+      "set_value");
+
+  TestArgumentMappingContext arg_case25(
+      {"Input", "EndsTensorList", "StepsTensorList"},
+      {},
+      {{"bool_values", paddle::any{std::vector<int>{1}}}},
+      {"Out"},
+      {});
+  ASSERT_EQ(
+      OpUtilsMap::Instance().GetArgumentMappingFn("set_value")(arg_case25).name,
+      "set_value");
+
+  TestArgumentMappingContext arg_case26(
+      {"Input", "EndsTensorList", "ValueTensor"}, {}, {}, {"Out"}, {});
+  ASSERT_EQ(
+      OpUtilsMap::Instance().GetArgumentMappingFn("set_value")(arg_case26).name,
+      "set_value_with_tensor");
+
+  TestArgumentMappingContext arg_case27(
+      {"Input", "EndsTensorList"},
+      {},
+      {{"fp32_values", paddle::any{std::vector<float>{1}}}},
+      {"Out"},
+      {});
+  ASSERT_EQ(
+      OpUtilsMap::Instance().GetArgumentMappingFn("set_value")(arg_case27).name,
+      "set_value");
+
+  TestArgumentMappingContext arg_case28(
+      {"Input", "EndsTensorList"},
+      {},
+      {{"fp64_values", paddle::any{std::vector<double>{1}}}},
+      {"Out"},
+      {});
+  ASSERT_EQ(
+      OpUtilsMap::Instance().GetArgumentMappingFn("set_value")(arg_case28).name,
+      "set_value");
+
+  TestArgumentMappingContext arg_case29(
+      {"Input", "EndsTensorList"},
+      {},
+      {{"int32_values", paddle::any{std::vector<int>{1}}}},
+      {"Out"},
+      {});
+  ASSERT_EQ(
+      OpUtilsMap::Instance().GetArgumentMappingFn("set_value")(arg_case29).name,
+      "set_value");
+
+  TestArgumentMappingContext arg_case30(
+      {"Input", "EndsTensorList"},
+      {},
+      {{"int64_values", paddle::any{std::vector<int64_t>{1}}}},
+      {"Out"},
+      {});
+  ASSERT_EQ(
+      OpUtilsMap::Instance().GetArgumentMappingFn("set_value")(arg_case30).name,
+      "set_value");
+
+  TestArgumentMappingContext arg_case31(
+      {"Input", "EndsTensorList"},
+      {},
+      {{"bool_values", paddle::any{std::vector<int>{1}}}},
+      {"Out"},
+      {});
+  ASSERT_EQ(
+      OpUtilsMap::Instance().GetArgumentMappingFn("set_value")(arg_case31).name,
+      "set_value");
+
+  TestArgumentMappingContext arg_case32(
+      {"Input", "StepsTensorList", "ValueTensor"}, {}, {}, {"Out"}, {});
+  ASSERT_EQ(
+      OpUtilsMap::Instance().GetArgumentMappingFn("set_value")(arg_case32).name,
+      "set_value_with_tensor");
+
+  TestArgumentMappingContext arg_case33(
+      {"Input", "StepsTensorList"},
+      {},
+      {{"fp32_values", paddle::any{std::vector<float>{1}}}},
+      {"Out"},
+      {});
+  ASSERT_EQ(
+      OpUtilsMap::Instance().GetArgumentMappingFn("set_value")(arg_case33).name,
+      "set_value");
+
+  TestArgumentMappingContext arg_case34(
+      {"Input", "StepsTensorList"},
+      {},
+      {{"fp64_values", paddle::any{std::vector<double>{1}}}},
+      {"Out"},
+      {});
+  ASSERT_EQ(
+      OpUtilsMap::Instance().GetArgumentMappingFn("set_value")(arg_case34).name,
+      "set_value");
+
+  TestArgumentMappingContext arg_case35(
+      {"Input", "StepsTensorList"},
+      {},
+      {{"int32_values", paddle::any{std::vector<int>{1}}}},
+      {"Out"},
+      {});
+  ASSERT_EQ(
+      OpUtilsMap::Instance().GetArgumentMappingFn("set_value")(arg_case35).name,
+      "set_value");
+
+  TestArgumentMappingContext arg_case36(
+      {"Input", "StepsTensorList"},
+      {},
+      {{"int64_values", paddle::any{std::vector<int64_t>{1}}}},
+      {"Out"},
+      {});
+  ASSERT_EQ(
+      OpUtilsMap::Instance().GetArgumentMappingFn("set_value")(arg_case36).name,
+      "set_value");
+
+  TestArgumentMappingContext arg_case37(
+      {"Input", "StepsTensorList"},
+      {},
+      {{"bool_values", paddle::any{std::vector<int>{1}}}},
+      {"Out"},
+      {});
+  ASSERT_EQ(
+      OpUtilsMap::Instance().GetArgumentMappingFn("set_value")(arg_case37).name,
+      "set_value");
+}
+
+TEST(ARG_MAP, set_value_grad) {
+  TestArgumentMappingContext arg_case(
+      {"Out@GRAD", "StartsTensorList", "EndsTensorList"},
+      {},
+      {},
+      {"Input@GRAD", "ValueTensor@GRAD"},
+      {});
+  ASSERT_EQ(OpUtilsMap::Instance()
+                .GetArgumentMappingFn("set_value_grad")(arg_case)
+                .name,
+            "set_value_grad");
+
+  TestArgumentMappingContext arg_case1(
+      {"Out@GRAD", "StartsTensorList", "StepsTensorList"},
+      {},
+      {},
+      {"Input@GRAD", "ValueTensor@GRAD"},
+      {});
+  ASSERT_EQ(OpUtilsMap::Instance()
+                .GetArgumentMappingFn("set_value_grad")(arg_case1)
+                .name,
+            "set_value_grad");
+
+  TestArgumentMappingContext arg_case2({"Out@GRAD", "StartsTensorList"},
+                                       {},
+                                       {},
+                                       {"Input@GRAD", "ValueTensor@GRAD"},
+                                       {});
+  ASSERT_EQ(OpUtilsMap::Instance()
+                .GetArgumentMappingFn("set_value_grad")(arg_case2)
+                .name,
+            "set_value_grad");
+
+  TestArgumentMappingContext arg_case3(
+      {"Out@GRAD", "EndsTensorList", "StepsTensorList"},
+      {},
+      {},
+      {"Input@GRAD", "ValueTensor@GRAD"},
+      {});
+  ASSERT_EQ(OpUtilsMap::Instance()
+                .GetArgumentMappingFn("set_value_grad")(arg_case3)
+                .name,
+            "set_value_grad");
+
+  TestArgumentMappingContext arg_case4({"Out@GRAD", "EndsTensorList"},
+                                       {},
+                                       {},
+                                       {"Input@GRAD", "ValueTensor@GRAD"},
+                                       {});
+  ASSERT_EQ(OpUtilsMap::Instance()
+                .GetArgumentMappingFn("set_value_grad")(arg_case4)
+                .name,
+            "set_value_grad");
+
+  TestArgumentMappingContext arg_case5({"Out@GRAD", "StepsTensorList"},
+                                       {},
+                                       {},
+                                       {"Input@GRAD", "ValueTensor@GRAD"},
+                                       {});
+  ASSERT_EQ(OpUtilsMap::Instance()
+                .GetArgumentMappingFn("set_value_grad")(arg_case5)
+                .name,
+            "set_value_grad");
+}
+
+TEST(ARG_MAP, allclose) {
+  TestArgumentMappingContext arg_case1(
+      {"Input", "Other", "Rtol"},
+      {},
+      {{"atol", paddle::any(std::string{"1e-8"})},
+       {"equal_nan", paddle::any(false)}},
+      {"Out"},
+      {});
+  auto signature1 =
+      OpUtilsMap::Instance().GetArgumentMappingFn("allclose")(arg_case1);
+  ASSERT_EQ(signature1.name, "allclose");
+  auto attr_names1 = std::get<1>(signature1.args);
+  ASSERT_EQ(attr_names1[0], "Rtol");
+
+  TestArgumentMappingContext arg_case2(
+      {"Input", "Other", "Atol"},
+      {},
+      {{"rtol", paddle::any(std::string{"1e-5"})},
+       {"equal_nan", paddle::any(false)}},
+      {"Out"},
+      {});
+  auto signature2 =
+      OpUtilsMap::Instance().GetArgumentMappingFn("allclose")(arg_case2);
+  ASSERT_EQ(signature2.name, "allclose");
+  auto attr_names2 = std::get<1>(signature2.args);
+  ASSERT_EQ(attr_names2[1], "Atol");
+}
+
 }  // namespace tests
 }  // namespace phi
diff --git a/paddle/scripts/infrt_build.sh b/paddle/scripts/infrt_build.sh
index 75b27e4165d177c2285a6b109922ce159eeb8b82..76b45ff89f1869cc5e401b5c1b4151ad14158259 100755
--- a/paddle/scripts/infrt_build.sh
+++ b/paddle/scripts/infrt_build.sh
@@ -32,17 +32,18 @@ function update_pd_ops() {
    # compile and install paddle
    rm -rf ${PADDLE_ROOT}/build && mkdir -p ${PADDLE_ROOT}/build
    cd ${PADDLE_ROOT}/build
-   cmake .. -DWITH_PYTHON=ON -DWITH_GPU=OFF -DPYTHON_EXECUTABLE=`which python3` -DWITH_XBYAK=OFF -DWITH_NCCL=OFF -DWITH_RCCL=OFF -DWITH_CRYPTO=OFF
-   make -j8 paddle_python print_pten_kernels
+   cmake .. -DWITH_PYTHON=ON -DWITH_MKL=OFF -DWITH_GPU=OFF -DPYTHON_EXECUTABLE=`which python3` -DWITH_XBYAK=OFF -DWITH_NCCL=OFF -DWITH_RCCL=OFF -DWITH_CRYPTO=OFF
+   make -j8 paddle_python print_pten_kernels kernel_signature_generator
    cd ${PADDLE_ROOT}/build
    ./paddle/phi/tools/print_pten_kernels > ../tools/infrt/kernels.json
+   ./paddle/fluid/pybind/kernel_signature_generator > ../tools/infrt/kernel_signature.json
    cd python/dist/
    python3 -m pip uninstall -y paddlepaddle
    python3 -m pip install  *whl
    # update pd_ops.td
    cd ${PADDLE_ROOT}/tools/infrt/
    python3 generate_pd_op_dialect_from_paddle_op_maker.py
-   python3 generate_phi_kernel_dialect.py ./kernels.json
+   python3 generate_phi_kernel_dialect.py
 }
 
 function init() {
@@ -92,7 +93,7 @@ function infrt_gen_and_build() {
         exit 7;
     fi
 
-    make -j ${parallel_number} infrt infrtopt infrtexec test_infrt_exec trt-exec phi-exec infrt_lib_dist paddle-mlir-convert;build_error=$?
+    make -j ${parallel_number} infrt infrtopt infrtexec test_infrt_exec trt-exec phi-ir-exec phi-exec infrt_lib_dist paddle-mlir-convert;build_error=$?
     if [ "$build_error" != 0 ];then
         exit 7;
     fi
diff --git a/paddle/scripts/paddle_build.bat b/paddle/scripts/paddle_build.bat
index 35b2ce751b18fff2aac8dedfd09e5fe209d95533..75afa4ef43ff602d0fe4b9a6ce7c7c6ad5aab8a0 100644
--- a/paddle/scripts/paddle_build.bat
+++ b/paddle/scripts/paddle_build.bat
@@ -66,6 +66,7 @@ if not defined WITH_TESTING set WITH_TESTING=ON
 if not defined MSVC_STATIC_CRT set MSVC_STATIC_CRT=ON
 if not defined WITH_PYTHON set WITH_PYTHON=ON
 if not defined ON_INFER set ON_INFER=ON
+if not defined WITH_ONNXRUNTIME set WITH_ONNXRUNTIME=OFF
 if not defined WITH_INFERENCE_API_TEST set WITH_INFERENCE_API_TEST=ON
 if not defined WITH_STATIC_LIB set WITH_STATIC_LIB=ON
 if not defined WITH_TPCACHE set WITH_TPCACHE=OFF
@@ -757,7 +758,7 @@ for /F %%i in ("%libsize%") do (
 )
 
 cd /d %work_dir%\paddle\fluid\inference\api\demo_ci
-%cache_dir%\tools\busybox64.exe bash run.sh %work_dir:\=/% %WITH_MKL% %WITH_GPU% %cache_dir:\=/%/inference_demo %WITH_TENSORRT% %TENSORRT_ROOT% %MSVC_STATIC_CRT%
+%cache_dir%\tools\busybox64.exe bash run.sh %work_dir:\=/% %WITH_MKL% %WITH_GPU% %cache_dir:\=/%/inference_demo %WITH_TENSORRT% %TENSORRT_ROOT% %WITH_ONNXRUNTIME% %MSVC_STATIC_CRT%
 goto:eof
 
 :test_inference_error
@@ -857,7 +858,7 @@ echo    Step 7. Testing fluid library with infer_ut for inference ...
 echo    ========================================
 
 cd /d %work_dir%\paddle\fluid\inference\tests\infer_ut
-%cache_dir%\tools\busybox64.exe bash run.sh %work_dir:\=/% %WITH_MKL% %WITH_GPU% %cache_dir:\=/%/inference_demo %TENSORRT_ROOT% %MSVC_STATIC_CRT%
+%cache_dir%\tools\busybox64.exe bash run.sh %work_dir:\=/% %WITH_MKL% %WITH_GPU% %cache_dir:\=/%/inference_demo %TENSORRT_ROOT% %WITH_ONNXRUNTIME% %MSVC_STATIC_CRT%
 goto:eof
 
 :test_inference_ut_error
diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh
index 41e5e0469dcb4099a89a0517207b8f6d8d39632b..39676b916e50470ac9774f3564b4bdc3a8fcb20f 100755
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -229,6 +229,7 @@ function cmake_base() {
         -DWITH_CNCL=${WITH_CNCL:-OFF}
         -DWITH_XPU=${WITH_XPU:-OFF}
         -DWITH_MLU=${WITH_MLU:-OFF}
+        -DWITH_IPU=${WITH_IPU:-OFF}
         -DLITE_GIT_TAG=release/v2.10
         -DWITH_UNITY_BUILD=${WITH_UNITY_BUILD:-OFF}
         -DWITH_XPU_BKCL=${WITH_XPU_BKCL:-OFF}
@@ -242,6 +243,7 @@ function cmake_base() {
         -DWITH_FLUID_ONLY=${WITH_FLUID_ONLY:-OFF} 
         -DWITH_RECORD_BUILDTIME=${WITH_RECORD_BUILDTIME:-OFF}
         -DCUDA_ARCH_BIN="${CUDA_ARCH_BIN}"
+        -DWITH_ONNXRUNTIME=${WITH_ONNXRUNTIME:-OFF}
     ========================================
 EOF
     # Disable UNITTEST_USE_VIRTUALENV in docker because
@@ -279,6 +281,7 @@ EOF
         -DLITE_GIT_TAG=release/v2.10 \
         -DWITH_XPU=${WITH_XPU:-OFF} \
         -DWITH_MLU=${WITH_MLU:-OFF} \
+        -DWITH_IPU=${WITH_IPU:-OFF} \
         -DWITH_CNCL=${WITH_CNCL:-OFF} \
         -DXPU_SDK_ROOT=${XPU_SDK_ROOT:-""} \
         -DWITH_LITE=${WITH_LITE:-OFF} \
@@ -293,7 +296,9 @@ EOF
         -DWITH_FLUID_ONLY=${WITH_FLUID_ONLY:-OFF} \
         -DCUDA_ARCH_BIN="${CUDA_ARCH_BIN}" \
         -DWITH_RECORD_BUILDTIME=${WITH_RECORD_BUILDTIME:-OFF} \
-        -DWITH_UNITY_BUILD=${WITH_UNITY_BUILD:-OFF};build_error=$?
+        -DWITH_UNITY_BUILD=${WITH_UNITY_BUILD:-OFF}  \
+        -DWITH_ONNXRUNTIME=${WITH_ONNXRUNTIME:-OFF};build_error=$?
+        
     if [ "$build_error" != 0 ];then
         exit 7;
     fi
@@ -330,7 +335,7 @@ function check_style() {
 
     # pre-commit use python3.8.0 
     OLD_PATH=$PATH
-    export PATH=export PATH=/usr/local/python3.8.0/bin:/usr/local/python3.8.0/include:/usr/local/bin:${PATH}
+    export PATH=/usr/local/python3.8.0/bin:/usr/local/python3.8.0/include:/usr/local/bin:${PATH}
 
     pre-commit install
     clang-format --version
@@ -945,8 +950,17 @@ function generate_upstream_develop_api_spec() {
     git checkout .
     git checkout -b develop_base_pr upstream/$BRANCH
     startTime_firstBuild=`date +%s`
-    cmake_gen $1
-    build $2
+
+    dev_commit=`git log -1|head -1|awk '{print $2}'`
+    dev_url="https://xly-devops.bj.bcebos.com/PR/build_whl/0/${dev_commit}/paddlepaddle_gpu-0.0.0-cp37-cp37m-linux_x86_64.whl"
+    url_return=`curl -s -m 5 -IL ${dev_url} |awk 'NR==1{print $2}'`
+    if [ "$url_return" == '200' ];then
+        mkdir -p ${PADDLE_ROOT}/build/python/dist && wget -q -P ${PADDLE_ROOT}/build/python/dist ${dev_url}
+    else
+        cmake_gen $1
+        build $2
+    fi
+
     cp ${PADDLE_ROOT}/python/requirements.txt /tmp
     pr_whl_size=`du -m ${PADDLE_ROOT}/build/python/dist/*.whl|awk '{print $1}'`
     echo "pr_whl_size: ${pr_whl_size}"
@@ -1271,6 +1285,8 @@ function card_test() {
         CUDA_DEVICE_COUNT=$(rocm-smi -i | grep GPU | wc -l)
     elif [ "${WITH_MLU}" == "ON" ];then
         CUDA_DEVICE_COUNT=1
+    elif [ "${WITH_IPU}" == "ON" ];then
+        CUDA_DEVICE_COUNT=1
     else
         CUDA_DEVICE_COUNT=$(nvidia-smi -L | wc -l)
     fi
@@ -2228,6 +2244,130 @@ set -ex
     fi   
 }
 
+function parallel_test_base_ipu() {
+    mkdir -p ${PADDLE_ROOT}/build
+    cd ${PADDLE_ROOT}/build/python/paddle/fluid/tests/unittests/ipu
+    if [ ${WITH_TESTING:-ON} == "ON" ] ; then
+    cat <<EOF
+    ========================================
+    Running unit ipu tests ...
+    ========================================
+EOF
+
+set +x
+        test_cases=$(ctest -N -V) # get all test cases
+        get_quickly_disable_ut||disable_ut_quickly='disable_ut'   # indicate whether the case was in quickly disable list
+        while read -r line; do
+            if [[ "$line" == "" ]]; then
+                continue
+            fi
+            read testcase <<< $(echo "$line"|grep -oEi "\w+$")
+            if [[ "$single_card_tests" == "" ]]; then
+                single_card_tests="^$testcase$"
+            else
+                single_card_tests="$single_card_tests|^$testcase$"
+            fi
+        done <<< "$test_cases";
+
+        ut_actual_total_startTime_s=`date +%s`
+
+        card_test "$single_card_tests" 1 # run cases 1 job each time with single IPU
+        collect_failed_tests
+
+        # add unit test retry for IPU
+        rm -f $tmp_dir/*
+        exec_times=0
+        retry_unittests_record=''
+        retry_time=4
+        exec_time_array=('first' 'second' 'third' 'fourth')
+        parallel_failed_tests_exec_retry_threshold=120
+        exec_retry_threshold=30
+        is_retry_execuate=0
+        rerun_ut_startTime_s=`date +%s`
+        if [ -n "$failed_test_lists" ];then
+            if [ ${TIMEOUT_DEBUG_HELP:-OFF} == "ON" ];then
+                bash $PADDLE_ROOT/tools/timeout_debug_help.sh "$failed_test_lists"    # cat logs for tiemout uts which killed by ctest
+            fi
+            need_retry_ut_str=$(echo "$failed_test_lists" | grep -oEi "\-.+\(.+\)" | sed 's/(.\+)//' | sed 's/- //' )
+            need_retry_ut_arr=(${need_retry_ut_str})
+            need_retry_ut_count=${#need_retry_ut_arr[@]}
+            retry_unittests=$(echo "$failed_test_lists" | grep -oEi "\-.+\(.+\)" | sed 's/(.\+)//' | sed 's/- //' )
+            while ( [ $exec_times -lt $retry_time ] )
+                do
+                    if [[ "${exec_times}" == "0" ]] ;then
+                        if [ $need_retry_ut_count -lt $parallel_failed_tests_exec_retry_threshold ];then
+                            is_retry_execuate=0
+                        else
+                            is_retry_execuate=1
+                        fi
+                    elif [[ "${exec_times}" == "1" ]] ;then
+                        need_retry_ut_str=$(echo "$failed_test_lists" | grep -oEi "\-.+\(.+\)" | sed 's/(.\+)//' | sed 's/- //' )
+                        need_retry_ut_arr=(${need_retry_ut_str})
+                        need_retry_ut_count=${#need_retry_ut_arr[@]} 
+                        if [ $need_retry_ut_count -lt $exec_retry_threshold ];then
+                            is_retry_execuate=0
+                        else
+                            is_retry_execuate=1
+                        fi
+                    fi
+                    if [[ "$is_retry_execuate" == "0" ]];then
+                        set +e
+                        retry_unittests_record="$retry_unittests_record$failed_test_lists"
+                        failed_test_lists_ult=`echo "${failed_test_lists}" |grep -Po '[^ ].*$'`
+                        set -e
+                        if [[ "${exec_times}" == "1" ]] || [[ "${exec_times}" == "3" ]];then
+                            if [[ "${failed_test_lists}" == "" ]];then
+                                break
+                            else
+                                retry_unittests=$(echo "$failed_test_lists" | grep -oEi "\-.+\(.+\)" | sed 's/(.\+)//' | sed 's/- //' )
+                            fi
+                        fi
+                        echo "========================================="
+                        echo "This is the ${exec_time_array[$exec_times]} time to re-run"
+                        echo "========================================="
+                        echo "The following unittest will be re-run:"
+                        echo "${retry_unittests}"                    
+                        for line in ${retry_unittests[@]} ;
+                            do
+                                tmp_one_tmp="$( echo $single_card_tests | grep -oEi $line )"
+
+                                if [[ "$tmp_one_tmp" != ""  ]]; then
+                                    if [[ "$one_card_retry" == "" ]]; then
+                                        one_card_retry="^$line$"
+                                    else
+                                        one_card_retry="$one_card_retry|^$line$"
+                                    fi
+                                fi
+
+                            done
+
+                        if [[ "$one_card_retry" != "" ]]; then
+                            card_test "$one_card_retry" 1 # run cases 1 job each time with single GPU
+                        fi
+                        exec_times=$[$exec_times+1]
+                        failed_test_lists=''
+                        collect_failed_tests
+                        rm -f $tmp_dir/*
+                        one_card_retry=''
+                    else 
+                        break
+                    fi
+
+                done
+        fi
+
+        rerun_ut_endTime_s=`date +%s`
+        
+        echo "ipipe_log_param_Rerun_TestCases_Total_Time: $[ $rerun_ut_endTime_s - $rerun_ut_startTime_s ]s" >> ${PADDLE_ROOT}/build/build_summary.txt
+        ut_actual_total_endTime_s=`date +%s`
+        echo "ipipe_log_param_actual_TestCases_Total_Time: $[ $ut_actual_total_endTime_s - $ut_actual_total_startTime_s ]s" >> ${PADDLE_ROOT}/build/build_summary.txt
+        if [[ "$EXIT_CODE" != "0" ]]; then
+            show_ut_retry_result
+        fi
+set -ex
+    fi   
+}
+
 function parallel_test() {
     mkdir -p ${PADDLE_ROOT}/build
     cd ${PADDLE_ROOT}/build
@@ -2245,6 +2385,8 @@ function parallel_test() {
         parallel_test_base_npu
     elif [ "$WITH_MLU" == "ON" ];then
         parallel_test_base_mlu
+    elif [ "$WITH_IPU" == "ON" ];then
+        parallel_test_base_ipu
     else
         parallel_test_base_cpu ${PROC_RUN:-1}
     fi
@@ -2504,7 +2646,8 @@ EOF
     fi
     startTime_s=`date +%s`
     set +e
-    cmake .. -DWITH_DISTRIBUTE=OFF -DON_INFER=ON -DWITH_TENSORRT=ON -DCUDA_ARCH_NAME=${CUDA_ARCH_NAME:-Auto} -DWITH_PYTHON=${WITH_PYTHON:-ON};build_error=$?
+
+    cmake .. -DWITH_DISTRIBUTE=OFF -DON_INFER=ON -DWITH_TENSORRT=ON -DCUDA_ARCH_NAME=${CUDA_ARCH_NAME:-Auto} -DWITH_PYTHON=${WITH_PYTHON:-ON} -DWITH_ONNXRUNTIME=${WITH_ONNXRUNTIME:-OFF};build_error=$?
 
     # reset ccache zero stats for collect PR's actual hit rate
     ccache -z
@@ -2548,7 +2691,7 @@ EOF
     demo_ci_startTime_s=`date +%s`
     cd ${PADDLE_ROOT}/paddle/fluid/inference/api/demo_ci
     ./run.sh ${PADDLE_ROOT} ${WITH_MKL:-ON} ${WITH_GPU:-OFF} ${INFERENCE_DEMO_INSTALL_DIR} \
-             ${WITH_TENSORRT:-ON} ${TENSORRT_ROOT_DIR:-/usr}
+             ${WITH_TENSORRT:-ON} ${TENSORRT_ROOT_DIR:-/usr} ${WITH_ONNXRUNTIME:-ON}
     DEMO_EXIT_CODE=$?
     ./clean.sh
     demo_ci_endTime_s=`date +%s`
@@ -2558,7 +2701,7 @@ EOF
     infer_ut_startTime_s=`date +%s`
     cd ${PADDLE_ROOT}/paddle/fluid/inference/tests/infer_ut
     ./run.sh ${PADDLE_ROOT} ${WITH_MKL:-ON} ${WITH_GPU:-OFF} ${INFERENCE_DEMO_INSTALL_DIR} \
-             ${TENSORRT_ROOT_DIR:-/usr}
+             ${TENSORRT_ROOT_DIR:-/usr} ${WITH_ONNXRUNTIME:-ON}
     TEST_EXIT_CODE=$?
     infer_ut_endTime_s=`date +%s`
     echo "infer_ut tests Total time: $[ $infer_ut_endTime_s - $infer_ut_startTime_s ]s"
@@ -2754,17 +2897,20 @@ function build_pr_and_develop() {
     fi
 
     git fetch upstream develop
+    git checkout develop
     dev_commit=`git log -1|head -1|awk '{print $2}'`
     dev_url="https://xly-devops.bj.bcebos.com/PR/build_whl/0/${dev_commit}/paddlepaddle_gpu-0.0.0-cp37-cp37m-linux_x86_64.whl"
     url_return=`curl -s -m 5 -IL ${dev_url} |awk 'NR==1{print $2}'`
     if [ "$url_return" == '200' ];then
-        mkdir ${PADDLE_ROOT}/build/dev_whl && wget -P ${PADDLE_ROOT}/build/dev_whl ${dev_url}
+        mkdir ${PADDLE_ROOT}/build/dev_whl && wget -q -P ${PADDLE_ROOT}/build/dev_whl ${dev_url}
+        cp ${PADDLE_ROOT}/build/dev_whl/paddlepaddle_gpu-0.0.0-cp37-cp37m-linux_x86_64.whl ${PADDLE_ROOT}/build/python/dist
     else
         git checkout -b develop_base_pr upstream/$BRANCH
         cmake_gen_and_build ${PYTHON_ABI:-""} ${parallel_number}
         generate_api_spec "$1" "DEV"
         mkdir ${PADDLE_ROOT}/build/dev_whl && cp ${PADDLE_ROOT}/build/python/dist/*.whl ${PADDLE_ROOT}/build/dev_whl
     fi
+
 }
 
 function build_develop() {
@@ -3006,6 +3152,11 @@ function main() {
         parallel_test
         check_coverage
         ;;
+      check_ipu_coverage)
+        cmake_gen_and_build ${PYTHON_ABI:-""} ${parallel_number}
+        parallel_test
+        check_coverage
+        ;;
       reuse_so_cicheck_py35)
         reuse_so_cache
         parallel_test
diff --git a/paddle/testing/CMakeLists.txt b/paddle/testing/CMakeLists.txt
index eace7c41f4a3111e637bc20f02a4a55bccd46092..0cc68bf31617c4894a27ece3749ed643b34a52a1 100644
--- a/paddle/testing/CMakeLists.txt
+++ b/paddle/testing/CMakeLists.txt
@@ -1,5 +1,5 @@
 # for paddle test case
 
 if(WITH_TESTING)
-  cc_library(paddle_gtest_main SRCS paddle_gtest_main.cc DEPS init device_context memory gtest gflags)
+  cc_library(paddle_gtest_main SRCS paddle_gtest_main.cc DEPS init device_context memory gtest gflags proto_desc)
 endif()
diff --git a/python/paddle/amp/auto_cast.py b/python/paddle/amp/auto_cast.py
index 9ca29d509f60e5c3eb435e816a32cc14aa92e921..5132f23079f1fe6d76a1fc70ff9da229ec96abee 100644
--- a/python/paddle/amp/auto_cast.py
+++ b/python/paddle/amp/auto_cast.py
@@ -107,9 +107,9 @@ def decorate(models,
         import paddle
 
         model = paddle.nn.Conv2D(3, 2, 3, bias_attr=False)
-        optimzier = paddle.optimizer.SGD(parameters=model.parameters())
+        optimizer = paddle.optimizer.SGD(parameters=model.parameters())
 
-        model, optimizer = paddle.amp.decorate(models=model, optimizers=optimzier, level='O2')
+        model, optimizer = paddle.amp.decorate(models=model, optimizers=optimizer, level='O2')
 
         data = paddle.rand([10, 3, 32, 32])
 
@@ -122,7 +122,7 @@ def decorate(models,
         model2 = paddle.nn.Conv2D(3, 2, 3, bias_attr=False)
         optimizer2 = paddle.optimizer.Adam(parameters=model2.parameters())
 
-        models, optimizers = paddle.amp.decorate(models=[model, model2], optimizers=[optimzier, optimizer2], level='O2')
+        models, optimizers = paddle.amp.decorate(models=[model, model2], optimizers=[optimizer, optimizer2], level='O2')
 
         data = paddle.rand([10, 3, 32, 32])
 
diff --git a/python/paddle/autograd/backward_mode.py b/python/paddle/autograd/backward_mode.py
index 36ca048c51210ff7c12679731653ce026206b3c6..6fc6f7d3d494a28b822c3044716ec66867538a3d 100644
--- a/python/paddle/autograd/backward_mode.py
+++ b/python/paddle/autograd/backward_mode.py
@@ -81,15 +81,14 @@ def backward(tensors, grad_tensors=None, retain_graph=False):
         if isinstance(in_out_list, (list, tuple)):
             assert len(in_out_list) > 0, "{} connot be empyt".format(name)
             for each_var in in_out_list:
-                assert isinstance(
-                    each_var, paddle.
-                    Tensor), "Elements of {} must be paddle.Tensor".format(name)
+                assert isinstance(each_var, (
+                    paddle.Tensor, core.eager.Tensor
+                )), "Elements of {} must be paddle.Tensor".format(name)
             return in_out_list
         else:
-            assert isinstance(
-                in_out_list,
-                paddle.Tensor), "{} must be Tensor or list of Tensor".format(
-                    name)
+            assert isinstance(in_out_list, (
+                paddle.Tensor, core.eager.Tensor
+            )), "{} must be Tensor or list of Tensor".format(name)
             return [in_out_list]
 
     tensors = check_tensors(tensors, "tensors")
@@ -105,10 +104,13 @@ def backward(tensors, grad_tensors=None, retain_graph=False):
         for each_tensor in grad_tensors:
             if each_tensor is not None:
                 assert isinstance(
-                    each_tensor, paddle.Tensor
+                    each_tensor, (paddle.Tensor, core.eager.Tensor)
                 ), "The argument 'grad_tensors' of paddle.autograd.backward is invalid, it can be 'None', 'paddle.Tensor' or 'list[None/paddle.Tensor]'."
     else:
-        grad_tensors = [None] * len(tensors)
+        if core._in_eager_mode():
+            grad_tensors = []
+        else:
+            grad_tensors = [None] * len(tensors)
 
     if len(grad_tensors) > 0:
         assert len(tensors) == len(
@@ -116,5 +118,8 @@ def backward(tensors, grad_tensors=None, retain_graph=False):
 
     assert isinstance(retain_graph, bool), "retain_graph must be True or False"
 
-    core.dygraph_run_backward(tensors, grad_tensors, retain_graph,
-                              framework._dygraph_tracer())
+    if core._in_eager_mode():
+        core.eager.run_backward(tensors, grad_tensors, retain_graph)
+    else:
+        core.dygraph_run_backward(tensors, grad_tensors, retain_graph,
+                                  framework._dygraph_tracer())
diff --git a/python/paddle/distributed/__init__.py b/python/paddle/distributed/__init__.py
index fc299bc7b552b7a4ad6b2d9f1a9e8e5c3aefc560..a0ae9bc29dabe2172bfec4853315b4d6eb20c15b 100644
--- a/python/paddle/distributed/__init__.py
+++ b/python/paddle/distributed/__init__.py
@@ -55,6 +55,7 @@ from paddle.fluid.dygraph.parallel import ParallelEnv  # noqa: F401
 from . import cloud_utils  # noqa: F401
 from . import utils  # noqa: F401
 
+from .sharding import *  # noqa: F401
 
 __all__ = [  # noqa
       "spawn",
diff --git a/python/paddle/distributed/auto_parallel/converter.py b/python/paddle/distributed/auto_parallel/converter.py
new file mode 100644
index 0000000000000000000000000000000000000000..d88f9fe7501b56be255448a412fdcc6ec56cd13b
--- /dev/null
+++ b/python/paddle/distributed/auto_parallel/converter.py
@@ -0,0 +1,455 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import warnings
+import logging
+import numpy as np
+from ..utils import get_logger
+
+
+class Converter(object):
+    """
+    Converter is a class object for auto parallel to convert tensors from 
+    one parallel strategy to another one. Tensors will merge and slice value 
+    with their strategy when strategies are different.
+    """
+
+    def __init__(self, tensors_dict, pre_strategy, cur_strategy):
+        """
+        Args:
+            tensors_dict(dict): tensors' value of all ranks that to be converted. 
+                key is tensor's name(str), value is all ranks' data(list(numpy.ndarray))
+            pre_strategy(dict): tensors' distributed attribute of last training process.
+                key is tensor's name(str), value is tensor's distributed attribute in last 
+                training process.
+            cur_strategy(dict): tensors' distributed attribute of current rank.
+                key is tensor's name(str), value is tensor's distributed attribute in current
+                rank.
+        """
+        self._tensors_dict = self._check_tensor_dict(tensors_dict)
+        self._pre_strategy = self._check_pre_strategy(pre_strategy)
+        self._cur_strategy = self._check_cur_strategy(cur_strategy)
+        self._logger = get_logger(logging.INFO)
+
+    def _check_tensor_dict(self, tensors_dict):
+        if not tensors_dict:
+            raise ValueError("'tensors_dict' is None, "
+                             "the tensors to be converted cannot be None.")
+        if not isinstance(tensors_dict, dict):
+            raise TypeError(
+                "The type of 'tensors_dict' should be 'dict', but got '{}'.".
+                format(str(type(tensors_dict))))
+        return tensors_dict
+
+    def _check_pre_strategy(self, pre_strategy):
+        if not pre_strategy:
+            raise ValueError("'pre_strategy' is None, "
+                             "there are not tensors in pre process.")
+        if not isinstance(pre_strategy, dict):
+            raise TypeError("The type of 'pre_strategy' should be 'dict', "
+                            "but got '{}'.".format(str(type(pre_strategy))))
+        return pre_strategy
+
+    def _check_cur_strategy(self, cur_strategy):
+        if not cur_strategy:
+            warnings.warn("'cur_strategy' is None, "
+                          "there are not tensors in cur process")
+        if not isinstance(cur_strategy, dict):
+            raise TypeError("The type of 'cur_strategy' should be 'dict', "
+                            "but got '{}'.".format(str(type(cur_strategy))))
+        return cur_strategy
+
+    def convert(self, strict=True):
+        """
+        Convert tensors
+
+        Args:
+            strict(bool): whether to strict convert tensor with tensor's name. If False, it will
+            convert tensors by prefix matching. Otherwise, tensors will be converted with
+            their name strictly.
+
+        Returns:
+            converted tensors(dict)
+
+        Examples:
+            .. code-block:: python
+
+                import numpy as np
+                complete_tensors = np.arange(4).reshape([2, 2])
+                partitial_tensors = np.split(complete_tensors, 2, axis=0)
+                name = "tmp_0"
+                tensors_dict = {name: partitial_tensors}
+                strategy_1 = {
+                    name: {
+                        "process_shape": [2],
+                        "process_group": [0, 1],
+                        "dims_mapping": [0, -1]
+                    }
+                }
+                strategy_2 = {
+                    name: {
+                        "process_shape": [2],
+                        "process_group": [0, 1],
+                        "dims_mapping": [-1, -1]
+                    }
+                }
+                converter = Converter(tensors_dict, strategy_1, strategy_2)
+                result = converter.convert()
+                # the result's value is equal to `complete_tensors`
+        """
+        tensors_dict = {}
+        # the name which is in cur_process but not in pre_process
+        tensor_not_in_pre = []
+        # the name which is in pre_process but not in cur_process
+        tensor_not_in_cur = []
+        # the name which is in strategy but not in ckpt files
+        tensor_not_in_ckpt = []
+        self._logger.info("Start to convert tensors.")
+        for tensor_name in self._cur_strategy:
+            if tensor_name not in self._pre_strategy:
+                tensor_not_in_pre.append(tensor_name)
+                continue
+            if tensor_name not in self._tensors_dict:
+                tensor_not_in_ckpt.append(tensor_name)
+                continue
+            self._pre_name = tensor_name
+            self._cur_name = tensor_name
+            tensor_list = self._tensors_dict[tensor_name]
+            pre_dist_attr = self._pre_strategy[tensor_name]
+            cur_dist_attr = self._cur_strategy[tensor_name]
+            try:
+                tensors_dict[tensor_name] = Converter.merge_and_slice(
+                    tensor_list, pre_dist_attr, cur_dist_attr)
+            except ValueError as err:
+                raise ValueError("Fail to convert tensor '{}'. "
+                                 .format(str(tensor_name)) + str(err))
+
+        for tensor_name in self._pre_strategy:
+            if tensor_name not in self._cur_strategy:
+                tensor_not_in_cur.append(tensor_name)
+
+        if not strict:
+            tensors_dict, tensor_match_with_pre, tensor_match_with_cur = self.convert_with_prefix_match(
+                tensors_dict, tensor_not_in_pre, tensor_not_in_cur)
+        else:
+            tensors_dict, tensor_match_with_pre, tensor_match_with_cur = tensors_dict, [], []
+
+        tensor_not_in_pre = set(tensor_not_in_pre) - set(tensor_match_with_pre)
+        tensor_not_in_cur = set(tensor_not_in_cur) - set(tensor_match_with_cur)
+        if tensor_not_in_pre:
+            warnings.warn(
+                "tensors [{}] are not found in last training strategy."
+                .format(str(tensor_not_in_pre)))
+        if tensor_not_in_cur:
+            warnings.warn(
+                "tensors [{}] are not found in current training strategy."
+                .format(str(tensor_not_in_cur)))
+        if tensor_not_in_ckpt:
+            warnings.warn(
+                "tensors [{}] are found in pre_strategy, but are not found"
+                "in checkpoint files, please check your checkpoint files."
+                .format(str(tensor_not_in_ckpt)))
+
+        return tensors_dict
+
+    def convert_with_prefix_match(self, tensors_dict, tensor_not_in_pre,
+                                  tensor_not_in_cur):
+        # the name which in cur_process and can match with pre_process
+        tensor_match_with_pre = []
+        # the name which in pre_process and can match with cur_process
+        tensor_match_with_cur = []
+        for cur_name in tensor_not_in_pre:
+            prefix_name = cur_name
+            while prefix_name.find("_") != -1:
+                prefix_name = prefix_name[:prefix_name.rfind("_")]
+                for pre_name in tensor_not_in_cur:
+                    if prefix_name in pre_name:
+                        # 'cur_name' of cur_process can match with 'pre_name' of pre_process
+                        self._pre_name = pre_name
+                        self._cur_name = cur_name
+                        pre_tensor_list = self._tensors_dict[pre_name]
+                        pre_dist_attr = self._pre_strategy[pre_name]
+                        cur_dist_attr = self._cur_strategy[cur_name]
+                        try:
+                            tensors_dict[cur_name] = Converter.merge_and_slice(
+                                pre_tensor_list, pre_dist_attr, cur_dist_attr)
+                        except ValueError as err:
+                            raise ValueError(
+                                "Fail to convert tensor '{}' by '{}'. ".format(
+                                    str(cur_name), str(pre_name)) + str(err))
+                        self._logger.info(
+                            "tensor [{}] is matched with tensor [{}]".format(
+                                cur_name, pre_name))
+                        tensor_match_with_pre.append(cur_name)
+                        tensor_match_with_cur.append(pre_name)
+                        break
+                break
+
+        return tensors_dict, tensor_match_with_pre, tensor_match_with_cur
+
+    @staticmethod
+    def merge_and_slice(tensor_list, pre_dist_attr, cur_dist_attr):
+        """
+        Merge tensors with previous dist_attr and slice tensors with current dist_attr
+
+        Returns:
+            tensor(numpy.narray): a tensor's value of current rank.
+        """
+        assert isinstance(tensor_list, list)
+        assert all(isinstance(p, np.ndarray) for p in tensor_list)
+
+        if pre_dist_attr == cur_dist_attr:
+            # skip merge and slice tensor
+            rank_id = paddle.distributed.get_rank()
+            index = cur_dist_attr["process_group"].index(rank_id)
+            tensor = tensor_list[index]
+        else:
+            pre_dims_mapping = pre_dist_attr["dims_mapping"]
+            cur_dims_mapping = cur_dist_attr["dims_mapping"]
+            if len(set(pre_dims_mapping)) > 1 or -1 not in pre_dims_mapping:
+                # merge tensor
+                tensor = Converter.merge_with_dist_attr(tensor_list,
+                                                        pre_dist_attr)
+            else:
+                # skip merge tensor
+                tensor = tensor_list[0]
+
+            if len(set(cur_dims_mapping)) > 1 or -1 not in cur_dims_mapping:
+                # slice tensor
+                tensor = Converter.slice_with_dist_attr(tensor, cur_dist_attr)
+
+        return tensor
+
+    @staticmethod
+    def merge_with_dist_attr(tensor_list, dist_attr):
+        """ Merge tensor with distributed attribute """
+        from .reshard import _compute_complete_shape, _compute_partition_index
+
+        dims_mapping = dist_attr["dims_mapping"]
+        process_shape = dist_attr["process_shape"]
+        process_group = dist_attr["process_group"]
+        # get the complete shape of the tensor
+        complete_shape = _compute_complete_shape(tensor_list[0].shape,
+                                                 process_shape, dims_mapping)
+        # merge the tensor with dist_attr
+        partition_tensor_list = []
+        merged_partiton = []
+        for process in process_group:
+            partition_index = _compute_partition_index(
+                process, complete_shape, dims_mapping, process_shape,
+                process_group)
+            index = process_group.index(process)
+            if partition_index not in merged_partiton:
+                merged_partiton.append(partition_index)
+                Converter.merge(partition_tensor_list, tensor_list[index],
+                                partition_index, complete_shape)
+
+        if len(partition_tensor_list) != 1:
+            raise ValueError("Fail to merge tensor with dist_attr '{}'.".format(
+                str(dist_attr)))
+        complete_tensor = partition_tensor_list[0][0]
+        return complete_tensor
+
+    @staticmethod
+    def slice_with_dist_attr(tensor, dist_attr):
+        """ Slice tensor with distributed attribute """
+        dims_mapping = dist_attr["dims_mapping"]
+        process_shape = dist_attr["process_shape"]
+        process_group = dist_attr["process_group"]
+        # slice the tensor with dist_attr
+        partition_index_list = Converter._get_split_indices(
+            tensor.shape, dims_mapping, process_shape, process_group)
+        sliced_tensor_list = Converter.split(tensor, partition_index_list,
+                                             len(partition_index_list))
+        # get the current tensor's index in sliced_tensor_list
+        rank_id = paddle.distributed.get_rank()
+        sliced_tensor_index = Converter._get_sliced_index(
+            rank_id, tensor.shape, dims_mapping, process_shape, process_group)
+        if sliced_tensor_index not in range(len(sliced_tensor_list)):
+            raise ValueError("Fail to slice tensor with dist_attr '{}'.".format(
+                str(dist_attr)))
+        sliced_tensor = sliced_tensor_list[sliced_tensor_index]
+        return sliced_tensor
+
+    @staticmethod
+    def merge(partition_tensor_list, tensor, partition_index, complete_shape):
+        """
+        Merge partitial tensors to a complete.
+
+        Returns:
+            None
+
+        Examples:
+            .. code-block:: python
+
+                import numpy as np
+                partition_tensor_list = [(np.array([[[1.11, 1.12]]]), [[0,1],[0,1],[0,2]])]
+                tensor = np.array([[[1.13, 1.14]]])
+                partition_index = [[0,1],[0,1],[2,4]]
+
+                _merge_tensor(partition_tensor_list, tensor, partition_index)
+                # partition_tensor_list: [(np.array([[[1.11, 1.12, 1.13, 1.14]]]), [[0,1],[0,1],[0,4]])]
+        """
+        from .reshard import _compute_concat_info
+
+        if len(partition_tensor_list) == 1:
+            is_complete_data = True
+            for idx, item in enumerate(partition_tensor_list[0][1]):
+                if item[0] != 0 or item[1] != complete_shape[idx]:
+                    is_complete_data = False
+                    break
+            if is_complete_data:
+                return
+
+        if not partition_tensor_list:
+            partition_tensor_list.append((tensor, partition_index))
+        else:
+            i = 0
+            while i < len(partition_tensor_list):
+                concat_axis, first_order, new_partition = _compute_concat_info(
+                    partition_tensor_list[i][1], partition_index)
+                if concat_axis != -1:
+                    if first_order == 0:
+                        new_tensor = np.concatenate(
+                            (partition_tensor_list[i][0], tensor),
+                            axis=concat_axis)
+                    else:
+                        new_tensor = np.concatenate(
+                            (tensor, partition_tensor_list[i][0]),
+                            axis=concat_axis)
+
+                    partition_tensor_list.pop(i)
+                    Converter.merge(partition_tensor_list, new_tensor,
+                                    new_partition, complete_shape)
+                    break
+                i += 1
+
+    @staticmethod
+    def split(complete_tensor, partition_index_list, length):
+        """
+        Slice a complete tensor.
+
+        Returns:
+            sliced_tensor_list(list): sliced tensors with 'partition_index_list'
+
+        Examples:
+            .. code-block:: python
+
+                import numpy as np
+                complete_tensor = np.array([[[1.11, 1.12, 1.13, 1.14, 1.15, 1.16]]])
+                rank = 2
+                complete_shape = [1, 1, 6]
+                dims_mapping = [-1, -1, 0]
+                process_shape = [3]
+                process_group = [0, 1, 2]
+
+                sliced_tensor_list = split(complete_tensor, [[], [], [2, 4]], 3)
+                # [array([[[1.11, 1.12]]]), array([[[1.13, 1.14]]]), array([[[1.15, 1.16]]])]
+        """
+        sliced_tensor_list = []
+        axis = len(complete_tensor.shape) - length
+        sliced_tensor = np.split(
+            complete_tensor, partition_index_list[axis], axis=axis)
+        if length == 1:
+            return sliced_tensor
+        for tensor in sliced_tensor:
+            sliced_tensor_list.extend(
+                Converter.split(tensor, partition_index_list, length - 1))
+        return sliced_tensor_list
+
+    @staticmethod
+    def _get_split_indices(complete_shape, dims_mapping, process_shape,
+                           process_group):
+        """
+        Get split indices of every dimension.
+
+        Returns:
+            split_indices_list(list): the split indices of every dimension of the tensor
+
+        Examples:
+            .. code-block:: python
+
+                import numpy as np
+                complete_tensor = np.array([[[1.11, 1.12, 1.13, 1.14, 1.15, 1.16]]])
+                complete_shape = [1, 1, 6]
+                dims_mapping = [-1, -1, 0]
+                process_shape = [3]
+                process_group = [0, 1, 2]
+
+                index = _get_split_indices(complete_shape, dims_mapping, process_shape, process_group)
+                # index: [[], [], [2, 4]]
+        """
+        from .reshard import _compute_partition_index
+
+        split_indices_list = []
+        for process in process_group:
+            partition_index = _compute_partition_index(
+                process, complete_shape, dims_mapping, process_shape,
+                process_group)
+            if split_indices_list:
+                for dim in range(len(partition_index)):
+                    split_indices_list[dim].extend(partition_index[dim])
+            else:
+                split_indices_list = partition_index
+        split_indices_list = list(
+            map(lambda x, y: list(set(x) - set([y]) - set([0])),
+                split_indices_list, complete_shape))
+        split_indices_list = [sorted(x) for x in split_indices_list]
+        return split_indices_list
+
+    @staticmethod
+    def _get_sliced_index(rank_id, complete_shape, dims_mapping, process_shape,
+                          process_group):
+        """
+        Get sliced_tensor's index of current rank in all sliced tensors list.
+
+        Returns:
+            sliced_tensor_index(int): the index of sliced tensor in sliced_tensor_list
+
+        Examples:
+            .. code-block:: python
+
+                import numpy as np
+                complete_tensor = np.array([[[1.11, 1.12, 1.13, 1.14, 1.15, 1.16]]])
+                rank = 2
+                complete_shape = [1, 1, 6]
+                dims_mapping = [-1, -1, 0]
+                process_shape = [3]
+                process_group = [0, 1, 2]
+
+                slice_tensor = _slice_tensor(complete_tensor, [[], [], [2, 4]], 3)
+                # slice_tensor: 
+                # [array([[[1.11, 1.12]]]), array([[[1.13, 1.14]]]), array([[[1.15, 1.16]]])]
+
+                index = _get_sliced_index(rank, complete_shape, dims_mapping
+                                                process_shape, process_group)
+                # index: 2
+        """
+        from .reshard import _compute_partition_index
+
+        partition_index = _compute_partition_index(
+            rank_id, complete_shape, dims_mapping, process_shape, process_group)
+        sliced_index = 0
+        for i, shape in enumerate(complete_shape):
+            if dims_mapping[i] == -1:
+                slice_shape = shape
+            else:
+                slice_shape = shape // process_shape[dims_mapping[i]]
+            if shape == 1:
+                index = 0
+            else:
+                index = (partition_index[i][0] + 1) // slice_shape
+            sliced_index = sliced_index * (shape // slice_shape) + index
+        return sliced_index
diff --git a/python/paddle/distributed/auto_parallel/engine.py b/python/paddle/distributed/auto_parallel/engine.py
index 8efb9eb719237fd433b9fb02b0772eb6581319e4..56beb8957415d3c3c401fdbf754cb17fc5e253a7 100644
--- a/python/paddle/distributed/auto_parallel/engine.py
+++ b/python/paddle/distributed/auto_parallel/engine.py
@@ -99,11 +99,11 @@ class Engine:
             all_ranks = world_process_group.ranks
             for rank in all_ranks:
                 self._parallel(rank)
-        place = _get_device()
-        if isinstance(place, fluid.CUDAPlace):
+        self._place = _get_device()
+        if isinstance(self._place, fluid.CUDAPlace):
             self._place = fluid.CUDAPlace(ParallelEnv().dev_id)
         if self._executor is None:
-            self._executor = fluid.Executor(place)
+            self._executor = paddle.static.Executor(self._place)
 
     def _build(self):
         serial_main_prog = self._serial_main_progs.get(self.mode, None)
@@ -119,12 +119,13 @@ class Engine:
             labels = [s._create_feed_layer() for s in to_list(labels_spec)]
             self._input_vars = inputs
             self._label_vars = labels
-            feed_list = self._input_vars + self._label_vars
+            self._feed_vars = self._input_vars + self._label_vars
             outputs = to_list(self.model(*inputs))
             if self.mode != "predict" and self.loss:
                 loss = self.loss(*(outputs + labels))
                 self._loss_var = loss
 
+        self._fetch_vars = {"outputs": outputs, "loss": loss}
         self._serial_main_progs[self.mode] = serial_main_prog
         self._serial_startup_progs[self.mode] = serial_startup_prog
         self._dist_contexts[self.mode] = DistributedContext(
@@ -278,19 +279,32 @@ class Engine:
         dist_startup_prog = self._dist_startup_progs[self.mode][self._cur_rank]
         dist_context = self._dist_contexts[self.mode]
         dist_main_block = dist_main_prog.global_block()
+        serial_main_prog = self._serial_main_progs[self.mode]
+        serial_main_block = serial_main_prog.global_block()
         op_size = len(dist_main_block.ops)
         places = paddle.static.cuda_places()
         with fluid.program_guard(dist_main_prog, dist_startup_prog):
             dataloader = NonIterableGeneratorLoader(
                 dataset, feed_list, places, batch_size, epochs, steps_per_epoch)
         new_op_size = len(dist_main_block.ops)
-        for idx in range(new_op_size - 1, op_size - 1, -1):
+        for _ in range(new_op_size - 1, op_size - 1, -1):
             op = dist_main_block.ops[new_op_size - 1]
             new_op_desc = dist_main_block.desc._prepend_op()
             new_op_desc.copy_from(op.desc)
             new_op = Operator(
                 dist_main_block, new_op_desc, type=new_op_desc.type())
             dist_main_block.ops.insert(0, new_op)
+            for in_name in new_op.input_arg_names:
+                if in_name == "lod_tensor_blocking_queue_0":
+                    continue
+                if in_name not in dist_main_block.vars:
+                    in_var = serial_main_block._var_recursive(in_name)
+                    dist_main_block._clone_variable(in_var, in_var.persistable)
+            for out_name in new_op.output_arg_names:
+                if out_name not in dist_main_block.vars:
+                    out_var = serial_main_block._var_recursive(out_name)
+                    dist_main_block._clone_variable(out_var,
+                                                    out_var.persistable)
             dist_op = DistributedOperator(new_op)
             dist_context.add_dist_op_for_program(dist_op)
         for _ in range(new_op_size - op_size):
diff --git a/python/paddle/distributed/auto_parallel/reshard.py b/python/paddle/distributed/auto_parallel/reshard.py
index 4cc710b226d8f84fadd249a148e754d5330fb564..c6afcfec8a0082bc2a20e88275d5ba428fa3aaf1 100644
--- a/python/paddle/distributed/auto_parallel/reshard.py
+++ b/python/paddle/distributed/auto_parallel/reshard.py
@@ -29,6 +29,7 @@ from .process_group import new_process_group, ProcessGroup, _g_process_group_map
 
 # NOTE: If op in _g_special_ops, it will not be resharded. 
 _g_special_ops = ['check_finite_and_unscale', 'update_loss_scaling']
+while_block_info = {}
 
 
 class AllGatherOpDesc:
@@ -280,8 +281,20 @@ def _is_overlapped(shape_x, shape_y):
     return overlapped
 
 
-def _need_reshard(dist_tensor, dist_op, op_input=True):
+def _need_reshard(dist_tensor,
+                  dist_op,
+                  actual_process_mesh,
+                  program,
+                  dist_context,
+                  op_input=True):
     """Judge the tensor whether needs to be resharded."""
+
+    def _is_unshard(dims_mapping):
+        for dim in dims_mapping:
+            if dim != -1:
+                return False
+        return True
+
     is_reshard = False
     tensor_dist_attr = dist_tensor.dist_attr
     tensor_name = dist_tensor.serial_tensor.name
@@ -289,32 +302,74 @@ def _need_reshard(dist_tensor, dist_op, op_input=True):
     tensor_process_mesh = tensor_dist_attr.process_mesh
     op_dist_attr = dist_op.dist_attr
     op_input_dims_mapping = op_dist_attr.get_input_dims_mapping(tensor_name)
-    op_process_mesh = op_dist_attr.process_mesh
+    op_process_mesh = actual_process_mesh
     if op_input:
         op_input_dims_mapping = op_dist_attr.get_input_dims_mapping(tensor_name)
-        op_process_mesh = op_dist_attr.process_mesh
         if all(
                 map(lambda x: x is not None, [
                     tensor_dims_mapping, tensor_process_mesh,
                     op_input_dims_mapping, op_process_mesh
                 ])):
-            if tensor_dims_mapping != op_input_dims_mapping or tensor_process_mesh != op_process_mesh:
-                is_reshard = True
+            # dims_mapping
+            if tensor_dims_mapping != op_input_dims_mapping:
+                if dist_op.serial_op.type == "while":
+                    sub_block = program.blocks[dist_op.serial_op.attr(
+                        "sub_block").id]
+                    for op in sub_block.ops:
+                        for var_name in op.input_arg_names:
+                            if var_name == tensor_name:
+                                dist_op_attr = dist_context.get_dist_op_for_program(
+                                    op).dist_attr
+                                var_dims_mapping = dist_op_attr.get_input_dims_mapping(
+                                    var_name)
+                                if var_dims_mapping != tensor_dims_mapping:
+                                    is_reshard = True
+                                    break
+                else:
+                    is_reshard = True
+            # process_mesh
+            if tensor_process_mesh != op_process_mesh:
+                # when processes length is not the same, the dims mapping must be replicative now
+                if len(tensor_process_mesh.processes) != len(
+                        op_process_mesh.processes):
+                    assert _is_unshard(tensor_dims_mapping)
+                    assert _is_unshard(op_input_dims_mapping)
+                else:
+                    if dist_tensor.serial_tensor.dtype == paddle.bool:
+                        raise ValueError("Bool var is not supported reshard.")
+
+                    # for while op, it should find the process mesh of op actually used the tensor as input
+                    if dist_op.serial_op.type == "while":
+                        sub_block = program.blocks[dist_op.serial_op.attr(
+                            "sub_block").id]
+                        for op in sub_block.ops:
+                            for var_name in op.input_arg_names:
+                                if var_name == tensor_name:
+                                    dist_op_attr = dist_context.get_dist_op_for_program(
+                                        op).dist_attr
+                                    process_mesh = dist_op_attr.process_mesh
+                                    if process_mesh == op_process_mesh:
+                                        is_reshard = True
+                                        break
+                    else:
+                        is_reshard = True
     else:
         op_output_dims_mapping = op_dist_attr.get_output_dims_mapping(
             tensor_name)
-        op_process_mesh = op_dist_attr.process_mesh
         if all(
                 map(lambda x: x is not None, [
                     tensor_dims_mapping, tensor_process_mesh,
                     op_output_dims_mapping, op_process_mesh
                 ])):
             if tensor_process_mesh != op_process_mesh:
+                if dist_tensor.serial_tensor.dtype == paddle.bool:
+                    raise ValueError("Bool var is not supported reshard.")
                 is_reshard = True
             if tensor_dims_mapping != op_output_dims_mapping:
                 raise ValueError(
                     "It is not supported that tensor dims mapping is different from op output dims mapping."
                 )
+
     return is_reshard
 
 
@@ -329,13 +384,14 @@ def _compute_complete_shape(slice_shape, process_shape, dims_mapping):
     return complete_shape
 
 
-def find_op_desc_seq(dist_tensor, dist_op):
+def find_op_desc_seq(dist_tensor, dist_op, actual_process_mesh, batch_size):
     """
     Find the op description sequence to reshard the source tensor for matching the op requirement.
 
     Args:
         dist_tensor (DistributedTensor): A distributed tensor.
         dist_op (DistributedOperator): A distributed operator.
+        actual_process_mesh (ProcessMesh): The actual op process mesh.
 
     Returns:
         Dict, the dict represents the required op description sequence corresponding to process, The key of dict is
@@ -350,11 +406,16 @@ def find_op_desc_seq(dist_tensor, dist_op):
     source_process_shape = source_process_mesh.topology
 
     op_dist_attr = dist_op.dist_attr
-    target_process_mesh = op_dist_attr.process_mesh
+    target_process_mesh = actual_process_mesh
     target_dims_mapping = op_dist_attr.get_input_dims_mapping(tensor_name)
     target_process_group = target_process_mesh.processes
     target_process_shape = target_process_mesh.topology
 
+    if source_tensor.shape[0] < 0:
+        new_shape = list(source_tensor.shape)
+        new_shape[0] = batch_size
+        source_tensor.desc.set_shape(new_shape)
+
     complete_shape = _compute_complete_shape(
         source_tensor.shape, source_process_shape, source_dims_mapping)
     op_desc_seq = {}
@@ -503,7 +564,7 @@ def find_op_desc_seq(dist_tensor, dist_op):
     return op_desc_seq
 
 
-def _insert_send_op(block, idx, tensor, dst):
+def _insert_send_op(block, idx, tensor, dst, op_role):
     """Insert send op into block at the given index."""
     op_type = 'send_v2'
     block._insert_op(
@@ -514,10 +575,11 @@ def _insert_send_op(block, idx, tensor, dst):
             'ring_id': 0,
             'peer': dst,
             'use_calc_stream': True,
+            'op_role': op_role
         })
 
 
-def _insert_recv_op(block, idx, tensor, src):
+def _insert_recv_op(block, idx, tensor, src, op_role):
     """Insert recv op into block at the given index."""
     op_type = 'recv_v2'
     block._insert_op(
@@ -531,14 +593,16 @@ def _insert_recv_op(block, idx, tensor, src):
             'out_shape': tensor.shape,
             'dtype': tensor.dtype,
             'use_calc_stream': True,
+            'op_role': op_role
         })
 
 
-def _insert_concat_op(block, idx, tensors, axis):
+def _insert_concat_op(block, idx, tensors, axis, op_role):
     """Insert concat op into block at the given block."""
     inputs = {'X': tensors}
     attrs = {}
     attrs['axis'] = axis
+    attrs['op_role'] = op_role
     helper = LayerHelper('concat', **locals())
     with paddle.static.program_guard(block.program):
         out = helper.create_variable_for_type_inference(
@@ -548,7 +612,8 @@ def _insert_concat_op(block, idx, tensors, axis):
     return out
 
 
-def _insert_slice_op(block, idx, tensor, starts, ends, axes, new_var_name):
+def _insert_slice_op(block, idx, tensor, starts, ends, axes, new_var_name,
+                     op_role):
     """Insert slice op into block at the given block."""
     inputs = {'Input': tensor}
     infer_flags = list(1 for i in range(len(axes)))
@@ -556,24 +621,23 @@ def _insert_slice_op(block, idx, tensor, starts, ends, axes, new_var_name):
         "axes": axes,
         "starts": starts,
         "ends": ends,
-        "infer_flags": infer_flags
+        "infer_flags": infer_flags,
+        'op_role': op_role
     }
     helper = LayerHelper('slice', **locals())
     out = block.create_var(
-        name=new_var_name,
-        dtype=tensor.dtype,
-        type=core.VarDesc.VarType.LOD_TENSOR)
+        name=new_var_name, dtype=tensor.dtype, type=tensor.type)
     block._insert_op(
         idx, type="slice", inputs=inputs, outputs={'Out': [out]}, attrs=attrs)
     return out
 
 
-def _insert_split_op(block, idx, tensor, num_or_sections):
+def _insert_split_op(block, idx, tensor, num_or_sections, op_role):
     """Insert split op into block at the given index."""
     helper = LayerHelper('split', **locals())
     input_shape = tensor.shape
     inputs = {'X': tensor}
-    attrs = {'num': num_or_sections, "axis": 0}
+    attrs = {'num': num_or_sections, 'axis': 0, 'op_role': op_role}
     with paddle.static.program_guard(block.program):
         outs = [
             helper.create_variable_for_type_inference(
@@ -584,7 +648,7 @@ def _insert_split_op(block, idx, tensor, num_or_sections):
     return outs
 
 
-def _insert_allgather_op(block, idx, tensor, ranks):
+def _insert_allgather_op(block, idx, tensor, ranks, op_role):
     """Insert allgather op into block at the given index."""
 
     def _insert_fill_constant_op(block, idx):
@@ -597,6 +661,7 @@ def _insert_allgather_op(block, idx, tensor, ranks):
         attrs['str_value'] = str(int("1"))
         attrs['value'] = int("1")
         attrs['dtype'] = out.dtype
+        attrs['op_role'] = op_role
         utils.get_shape_tensor_inputs(
             inputs=inputs, attrs=attrs, shape=[0], op_type='fill_constant')
         block._insert_op(
@@ -625,14 +690,16 @@ def _insert_allgather_op(block, idx, tensor, ranks):
             inputs={'X': [fill_constant_out]},
             outputs={'Out': [fill_constant_out]},
             attrs={'ring_id': 0,
-                   'use_calc_stream': True})
+                   'use_calc_stream': True,
+                   'op_role': op_role})
 
         # insert c_sync_calc_stream op
         block._insert_op(
             idx + 2,
             type="c_sync_calc_stream",
             inputs={'X': [fill_constant_out]},
-            outputs={'Out': [fill_constant_out]})
+            outputs={'Out': [fill_constant_out]},
+            attrs={'op_role': op_role})
         idx_offset = 3
 
     # insert c_allgather op
@@ -649,20 +716,21 @@ def _insert_allgather_op(block, idx, tensor, ranks):
         attrs={
             'ring_id': group.id,
             'use_calc_stream': True,
-            'nranks': group.nranks
+            'nranks': group.nranks,
+            'op_role': op_role
         })
     idx_offset += 1
 
     # insert split op
     split_out = _insert_split_op(block, idx + idx_offset, allgather_out,
-                                 group.nranks)
+                                 group.nranks, op_role)
     idx_offset += 1
     tensor_list.extend(split_out)
     return tensor_list, idx_offset
 
 
 def _concat_partitions_with_op(partition_tensor_list, tensor, partition_index,
-                               block, idx):
+                               block, idx, op_role):
     """Concat the tensors and insert concat op."""
     if not partition_tensor_list:
         partition_tensor_list.append((tensor, partition_index))
@@ -674,13 +742,13 @@ def _concat_partitions_with_op(partition_tensor_list, tensor, partition_index,
                 partition_tensor_list[i][1], partition_index)
             if concat_axis != -1:
                 has_concat = True
-                _ = _insert_concat_op(block, idx[0], [partition_tensor_list[i][0], tensor], concat_axis) \
+                _ = _insert_concat_op(block, idx[0], [partition_tensor_list[i][0], tensor], concat_axis, op_role) \
                     if first_order == 0 else \
-                    _insert_concat_op(block, idx[0], [tensor, partition_tensor_list[i][0]], concat_axis)
+                    _insert_concat_op(block, idx[0], [tensor, partition_tensor_list[i][0]], concat_axis, op_role)
                 partition_tensor_list.pop(i)
                 idx[0] += 1
                 _concat_partitions_with_op(partition_tensor_list, _,
-                                           new_partition, block, idx)
+                                           new_partition, block, idx, op_role)
                 break
             i += 1
         if not has_concat:
@@ -692,8 +760,47 @@ HAS_RECV = {}
 HAS_ALLGATHER = {}
 
 
-def parse_op_desc(program, rank_id, op_desc_seq, var_name, reshard_op,
-                  dist_context):
+def _get_while_op_actual_process_mesh(op, program, rank_id, dist_context):
+    """Get the while op actual Process mesh corresponding to rank"""
+    assert op.type == "while"
+    while_op_process_mesh = dist_context.get_dist_op_for_program(
+        op).dist_attr.process_mesh
+    sub_block = program.blocks[op.attr("sub_block").id]
+    ops = sub_block.ops
+    actual_process_mesh = None
+    for op in ops:
+        dist_op = dist_context.get_dist_op_for_program(op)
+        if not dist_op:
+            continue
+        process_mesh = dist_op.dist_attr.process_mesh
+        if process_mesh == while_op_process_mesh:
+            continue
+        if rank_id in process_mesh.processes:
+            raw_process_mesh = process_mesh
+            break
+
+    if actual_process_mesh is None and rank_id in while_op_process_mesh.processes:
+        actual_process_mesh = while_op_process_mesh
+
+    assert actual_process_mesh is not None
+    return actual_process_mesh
+
+
+def _get_var(var_name, block, program):
+    """Get var in the parent block if not found in the current block"""
+    var = None
+    if var_name in block.vars:
+        var = block.vars[var_name]
+    else:
+        parent_block = program.blocks[block.parent_idx]
+        if var_name in parent_block.vars:
+            var = parent_block.vars[var_name]
+    assert var is not None
+    return var
+
+
+def parse_op_desc(block, rank_id, op_desc_seq, var_name, reshard_op,
+                  dist_context, program, actual_process_mesh):
     """Parse op desc sequence and insert op in the block"""
     global HAS_SENT
     global HAS_RECV
@@ -703,9 +810,6 @@ def parse_op_desc(program, rank_id, op_desc_seq, var_name, reshard_op,
     if rank_id not in op_desc_seq.keys():
         return
     op_desc_list = op_desc_seq[rank_id]
-    block = program.global_block()
-    assert var_name in block.vars.keys(
-    ), "The {} cannot be found in the {} program.".format(var_name, rank_id)
 
     idx = None
     for index, op in list(enumerate(block.ops)):
@@ -716,7 +820,7 @@ def parse_op_desc(program, rank_id, op_desc_seq, var_name, reshard_op,
         rank_id)
 
     matched_op = block.ops[idx]
-    source_tensor = block.vars[var_name]
+    source_tensor = _get_var(var_name, block, program)
     for op_desc in op_desc_list:
         if isinstance(op_desc, AllGatherOpDesc):  # noqa: F401
             if var_name not in HAS_ALLGATHER.keys():
@@ -724,7 +828,8 @@ def parse_op_desc(program, rank_id, op_desc_seq, var_name, reshard_op,
             if not HAS_ALLGATHER[var_name] or op_desc.group not in list(
                     map(lambda x: x[0], HAS_ALLGATHER[var_name])):
                 tensor_list, idx_offset = _insert_allgather_op(
-                    block, idx, source_tensor, op_desc.group)
+                    block, idx, source_tensor, op_desc.group,
+                    reshard_op.attr('op_role'))
                 idx += idx_offset
                 tensor_name_list = [var.name for var in tensor_list]
                 HAS_ALLGATHER[var_name].append(
@@ -743,7 +848,8 @@ def parse_op_desc(program, rank_id, op_desc_seq, var_name, reshard_op,
             if var_name not in HAS_SENT.keys():
                 HAS_SENT[var_name] = []
             if op_desc.dst not in HAS_SENT[var_name]:
-                _insert_send_op(block, idx, source_tensor, op_desc.dst)
+                _insert_send_op(block, idx, source_tensor, op_desc.dst,
+                                reshard_op.attr('op_role'))
                 idx += 1
                 HAS_SENT[var_name].append(op_desc.dst)
 
@@ -758,8 +864,10 @@ def parse_op_desc(program, rank_id, op_desc_seq, var_name, reshard_op,
                 recv_tensor = block.create_var(
                     name=unique_name.generate(var_name + "@recv"),
                     shape=shape,
-                    dtype=source_tensor.dtype)
-                _insert_recv_op(block, idx, recv_tensor, op_desc.src)
+                    dtype=source_tensor.dtype,
+                    type=source_tensor.type)
+                _insert_recv_op(block, idx, recv_tensor, op_desc.src,
+                                reshard_op.attr('op_role'))
                 tensor_list.append(recv_tensor)
                 idx += 1
                 HAS_RECV[var_name][op_desc.src] = recv_tensor
@@ -772,7 +880,7 @@ def parse_op_desc(program, rank_id, op_desc_seq, var_name, reshard_op,
             for index, tensor in enumerate(tensor_list):
                 _concat_partitions_with_op(partition_tensor_list, tensor,
                                            partition_index_list[index], block,
-                                           idx_list)
+                                           idx_list, reshard_op.attr('op_role'))
             idx = idx_list[0]
 
         elif isinstance(op_desc, SliceOpDesc):
@@ -787,11 +895,11 @@ def parse_op_desc(program, rank_id, op_desc_seq, var_name, reshard_op,
                 starts=op_desc.starts,
                 ends=op_desc.ends,
                 axes=op_desc.axes,
-                new_var_name=new_name)
+                new_var_name=new_name,
+                op_role=reshard_op.attr('op_role'))
 
             tensor_attr = TensorDistributedAttribute()
-            process_mesh = dist_context.get_op_dist_attr_for_program(
-                matched_op).process_mesh
+            process_mesh = actual_process_mesh
             dims_mapping = dist_context.get_op_dist_attr_for_program(
                 matched_op).get_input_dims_mapping(var_name)
             tensor_attr.dims_mapping = dims_mapping
@@ -799,11 +907,29 @@ def parse_op_desc(program, rank_id, op_desc_seq, var_name, reshard_op,
             dist_context.set_tensor_dist_attr_for_program(target_tensor,
                                                           tensor_attr)
 
+            if op.type == "while":
+                global while_block_info
+                # var_reshard_mapping means the while op input need be changed to 
+                if "var_reshard_mapping" not in while_block_info[op.attr(
+                        "sub_block").id].keys():
+                    while_block_info[op.attr("sub_block").id][
+                        "var_reshard_mapping"] = {}
+                while_block_info[op.attr("sub_block").id][
+                    "var_reshard_mapping"][var_name] = target_tensor.name
+
             # rename op input name according to new name
             for op in block.ops:
                 for name in op.input_arg_names:
                     op_dist_attr = dist_context.get_op_dist_attr_for_program(op)
                     if name == var_name and op_dist_attr is not None:
+                        if op.desc.id() == matched_op.desc.id():
+                            op.desc._rename_input(name, target_tensor.name)
+                            op_dist_attr.set_input_dims_mapping(
+                                target_tensor.name, dims_mapping)
+                            op_dist_attr.set_input_dist_attr(name, None)
+                            continue
+
+                        # NOTE: For op whose process mesh is a union, its input will not be renamed by other op reshard result now which means that it will have more reshard operation.
                         op_process_mesh = op_dist_attr.process_mesh
                         op_input_dims_mapping = op_dist_attr.get_input_dims_mapping(
                             var_name)
@@ -819,102 +945,166 @@ def _remove_no_need_ops(auto_parallel_main_prog, dist_context, rank_id):
     not_remove_op_ref = [
         "create_py_reader", "create_double_buffer_reader", "read"
     ]
-    remove_op_idx = []
-    block = auto_parallel_main_prog.global_block()
-    ops = block.ops
-    vars = block.vars
-    for idx, op in enumerate(ops):
-        # handle read op in the pipeline scene specially, it will be removed in the future.
-        if op.type == "read":
-            dim_list = []
-            for var_name in op.output_arg_names:
-                dim_list.extend(vars[var_name].shape)
-            for i in range(idx, -1, -1):
-                if ops[i].type == "create_py_reader":
-                    ops[i]._set_attr("shape_concat", dim_list)
-                    break
-            continue
-
-        # replace the input and output of c_sync_comm_stream op when in pipeline scene.
-        if op.type == "c_sync_comm_stream":
-            need_save = []
-            for var_name in op.input_arg_names:
-                process_mesh = dist_context.get_tensor_dist_attr_for_program(
-                    vars[var_name]).process_mesh
-                if rank_id in process_mesh.processes:
-                    need_save.append(var_name)
-            if not need_save:
-                remove_op_idx.append(idx)
+    global while_block_info
+
+    # NOTE: The nested sub block is not be supported now.
+    remove_block_order = []
+    for block_idx in while_block_info:
+        remove_block_order.append(block_idx)
+
+    for block_idx, block in enumerate(auto_parallel_main_prog.blocks):
+        if block_idx not in remove_block_order:
+            remove_block_order.append(block_idx)
+
+    # the sub block should be removed first
+    for block_idx in remove_block_order:
+        remove_op_idx = []
+        block = auto_parallel_main_prog.blocks[block_idx]
+        ops = block.ops
+        vars = block.vars
+        for idx, op in enumerate(ops):
+            if op.type == "read":
+                dim_list = []
+                for var_name in op.output_arg_names:
+                    dim_list.extend(
+                        _get_var(var_name, block, auto_parallel_main_prog)
+                        .shape)
+                for i in range(idx, -1, -1):
+                    if ops[i].type == "create_py_reader":
+                        ops[i]._set_attr("shape_concat", dim_list)
+                        break
                 continue
 
-            proto = OpProtoHolder.instance().get_op_proto(op.type)
-            op.desc.set_input(proto.inputs[0].name, need_save)
-            op.desc.set_output(proto.outputs[0].name, need_save)
-            continue
+            # replace the input and output of c_sync_comm_stream op when in pipeline scene.
+            if op.type == "c_sync_comm_stream":
+                need_save = []
+                for var_name in op.input_arg_names:
+                    process_mesh = dist_context.get_tensor_dist_attr_for_program(
+                        _get_var(var_name, block,
+                                 auto_parallel_main_prog)).process_mesh
+                    if rank_id in process_mesh.processes:
+                        need_save.append(var_name)
+                if not need_save:
+                    remove_op_idx.append(idx)
+                    continue
 
-        # judge the other op whether should be removed.
-        op_dist_attr = dist_context.get_op_dist_attr_for_program(op)
-        if op_dist_attr is not None:
-            op_process_mesh = op_dist_attr.process_mesh
-            if rank_id not in op_process_mesh.processes and op.type not in not_remove_op_ref:
-                remove_op_idx.append(idx)
+                proto = OpProtoHolder.instance().get_op_proto(op.type)
+                op.desc.set_input(proto.inputs[0].name, need_save)
+                op.desc.set_output(proto.outputs[0].name, need_save)
+                continue
 
-    for idx in remove_op_idx[::-1]:
-        block._remove_op(idx)
+            # judge the other op whether should be removed.
+            op_dist_attr = dist_context.get_op_dist_attr_for_program(op)
+            if op_dist_attr is not None:
+                op_process_mesh = op_dist_attr.process_mesh
+                if rank_id not in op_process_mesh.processes and op.type not in not_remove_op_ref:
+                    remove_op_idx.append(idx)
+
+        for idx in remove_op_idx[::-1]:
+            block._remove_op(idx)
 
 
 def _remove_no_need_vars(auto_parallel_main_prog, dist_params_grads):
     """Remove no need vars in the main program"""
-    remove_vars = set()
-    block = auto_parallel_main_prog.global_block()
-    ops = block.ops
-    vars = block.vars
-    need_vars = set()
-    for op in ops:
-        for var_name in op.input_arg_names:
-            if var_name in vars:
-                need_vars.add(var_name)
-        for var_name in op.output_arg_names:
-            if var_name in vars:
-                need_vars.add(var_name)
-    for var in vars:
-        if var not in need_vars:
-            remove_vars.add(var)
-
-    # change dist_params_grads
-    param_grad_map = {}
-    for op in ops:
-        if int(op.attr('op_role')) == int(OpRole.Optimize):
-            if "Param" in op.input_names and "Grad" in op.input_names:
-                param_name = op.input("Param")[0]
-                grad_name = op.input("Grad")[0]
-                param_grad_map[param_name] = grad_name
-
-    need_remove_idx = []
-    for idx, item in enumerate(dist_params_grads):
-        if item[0].name not in param_grad_map.keys():
-            need_remove_idx.append(idx)
-
-    for idx in need_remove_idx[::-1]:
-        dist_params_grads.pop(idx)
-
-    idx = 0
-    while idx < len(dist_params_grads):
-        param_name = dist_params_grads[idx][0].name
-        grad_name = dist_params_grads[idx][1].name
-        if grad_name != param_grad_map[param_name]:
-            dist_params_grads[idx] = (vars[param_name],
-                                      vars[param_grad_map[param_name]])
-        idx += 1
+    for block_idx, block in enumerate(auto_parallel_main_prog.blocks):
+        remove_vars = set()
+        ops = block.ops
+        vars = block.vars
+        need_vars = set()
+        for op in ops:
+            for var_name in op.input_arg_names:
+                if var_name in vars:
+                    need_vars.add(var_name)
+            for var_name in op.output_arg_names:
+                if var_name in vars:
+                    need_vars.add(var_name)
+        for var in vars:
+            if var not in need_vars:
+                remove_vars.add(var)
+
+        # change dist_params_grads, the optimize op just in block 0.
+        if block_idx == 0:
+            param_grad_map = {}
+            for op in ops:
+                if int(op.attr('op_role')) == int(OpRole.Optimize):
+                    if "Param" in op.input_names and "Grad" in op.input_names:
+                        param_name = op.input("Param")[0]
+                        grad_name = op.input("Grad")[0]
+                        param_grad_map[param_name] = grad_name
+
+            need_remove_idx = []
+            for idx, item in enumerate(dist_params_grads):
+                if item[0].name not in param_grad_map.keys():
+                    need_remove_idx.append(idx)
+
+            for idx in need_remove_idx[::-1]:
+                dist_params_grads.pop(idx)
+
+            idx = 0
+            while idx < len(dist_params_grads):
+                param_name = dist_params_grads[idx][0].name
+                grad_name = dist_params_grads[idx][1].name
+                if grad_name != param_grad_map[param_name]:
+                    dist_params_grads[idx] = (vars[param_name],
+                                              vars[param_grad_map[param_name]])
+                idx += 1
 
-    for var in remove_vars:
-        block._remove_var(var)
+        for var in remove_vars:
+            block._remove_var(var)
+
+
+def _change_while_op_input_and_output(auto_parallel_main_prog, dist_context):
+    """Change while op input and output after the corresponding sub block ops removed"""
+    global while_block_info
+    for sub_block_idx in while_block_info:
+        sub_block = auto_parallel_main_prog.blocks[sub_block_idx]
+        parent_while_op_id = while_block_info[sub_block_idx]["op_id"]
+        parent_block = auto_parallel_main_prog.blocks[sub_block.parent_idx]
+
+        sub_block_op_inputs = set()
+        sub_block_op_outputs = []
+        for op in sub_block.ops:
+            # skip the input and output of operators inserted in the reshard phase
+            dist_op = dist_context.get_dist_op_for_program(op)
+            if dist_op:
+                for var_name in op.output_arg_names:
+                    if var_name not in sub_block_op_outputs:
+                        sub_block_op_outputs.append(var_name)
+                for var_name in op.input_arg_names:
+                    sub_block_op_inputs.add(var_name)
+
+        # find the while op
+        while_op = None
+        for op in parent_block.ops:
+            if op.desc.id() == parent_while_op_id and op.type == "while":
+                while_op = op
+                break
+
+        assert while_op is not None
+
+        # find the actual input and output of while op
+        proto = OpProtoHolder.instance().get_op_proto(while_op.type)
+        new_X = []
+        for var_name in while_op.input("X"):
+            if var_name in sub_block_op_inputs:
+                new_X.append(var_name)
+        assert new_X
+        while_op.desc.set_input(proto.inputs[0].name, new_X)
+
+        new_Out = []
+        for var_name in while_op.output("Out"):
+            for output_name in sub_block_op_outputs[::-1]:
+                if output_name.find(var_name) != -1:
+                    new_Out.append(output_name)
+        assert new_Out
+        while_op.desc.set_output(proto.outputs[0].name, new_Out)
 
 
 def remove_no_need_in_main(auto_parallel_main_prog, dist_context, rank_id,
                            dist_params_grads):
     """Remove no need vars and ops in the main program."""
     _remove_no_need_ops(auto_parallel_main_prog, dist_context, rank_id)
+    _change_while_op_input_and_output(auto_parallel_main_prog, dist_context)
     _remove_no_need_vars(auto_parallel_main_prog, dist_params_grads)
 
 
@@ -992,8 +1182,70 @@ def remove_no_need_in_startup(auto_parallel_main_prog,
         startup_block._remove_op(idx)
 
 
-def reshard(auto_parallel_main_prog, auto_parallel_startup_prog, rank_id,
-            dist_context, dist_params_grads):
+def _get_process_meshes(op, program, dist_context):
+    """Get all process meshes when op has sub block."""
+    assert op.has_attr("sub_block")
+    sub_block = program.blocks[op.attr("sub_block").id]
+    ops = sub_block.ops
+    op_process_mesh = dist_context.get_dist_op_for_program(
+        op).dist_attr.process_mesh
+    process_meshes = []
+    for op in ops:
+        dist_op = dist_context.get_dist_op_for_program(op)
+        if not dist_op:
+            continue
+        process_mesh = dist_op.dist_attr.process_mesh
+        if process_mesh not in process_meshes and process_mesh != op_process_mesh:
+            process_meshes.append(process_mesh)
+
+    if not process_meshes:
+        process_meshes.append(op_process_mesh)
+
+    return process_meshes
+
+
+def _is_condition_replicative(op, program, dist_context):
+    assert op.type == "while"
+    sub_block = program.blocks[op.attr("sub_block").id]
+    dist_op = dist_context.get_dist_op_for_program(op)
+    op_dist_attr = dist_op.dist_attr
+
+    # the dims mapping of condition tensor should be replicative
+    for var_name in op.input("Condition"):
+        var = _get_var(var_name, sub_block, program)
+        dist_tensor = dist_context.get_dist_tensor_for_program(var)
+        tensor_dist_attr = dist_tensor.dist_attr
+        var_dims_mapping = tensor_dist_attr.dims_mapping
+        for dim in var_dims_mapping:
+            if dim != -1:
+                return False
+
+    return True
+
+
+def _get_op_process_meshes(op, dist_context):
+    process_meshes = []
+    dist_op = dist_context.get_dist_op_for_program(op)
+    op_process_mesh = dist_op.dist_attr.process_mesh
+    for process_mesh in dist_context.process_meshes:
+        if set(process_mesh.processes) & (
+                set(op_process_mesh.processes)
+        ) and len(process_mesh.processes) <= len(op_process_mesh.processes):
+            process_meshes.append(process_mesh)
+
+    # it means the process mesh is not a union when process meshes is null
+    if not process_meshes:
+        process_meshes.append(op_process_mesh)
+
+    return process_meshes
+
+
+def reshard(auto_parallel_main_prog,
+            auto_parallel_startup_prog,
+            rank_id,
+            dist_context,
+            dist_params_grads,
+            batch_size=None):
     """
     Reshard tensor in the program according to its distributed attribute and corresponding op distributed attribute.
 
@@ -1019,65 +1271,137 @@ def reshard(auto_parallel_main_prog, auto_parallel_startup_prog, rank_id,
             return True
         return False
 
-    block = auto_parallel_main_prog.global_block()
-    idx = 0
-    while idx < len(block.ops):
-        pre_op_count = len(block.ops)
-        op = block.ops[idx]
+    global while_block_info
+    for block_idx, block in enumerate(auto_parallel_main_prog.blocks):
+        if block_idx in while_block_info:
+            if "var_reshard_mapping" in while_block_info[block_idx]:
+                var_reshard_mapping = while_block_info[block_idx][
+                    "var_reshard_mapping"]
+                for op in block.ops:
+                    for var_name in op.input_arg_names:
+                        if var_name in var_reshard_mapping:
+                            op.desc._rename_input(var_name,
+                                                  var_reshard_mapping[var_name])
+                            dist_op = dist_context.get_dist_op_for_program(op)
+                            op_dist_attr = dist_op.dist_attr
+                            if op_dist_attr.process_mesh == while_block_info[
+                                    block_idx]["actual_process_mesh"]:
+                                dims_mapping = op_dist_attr.get_input_dims_mapping(
+                                    var_name)
+                                op_dist_attr.set_input_dims_mapping(
+                                    var_reshard_mapping[var_name], dims_mapping)
+                                op_dist_attr.set_input_dist_attr(var_name, None)
+
+                    # the outputs also need to be renamed when the output name is the same with input name
+                    for var_name in op.output_arg_names:
+                        if var_name in var_reshard_mapping:
+                            op.desc._rename_output(
+                                var_name, var_reshard_mapping[var_name])
+                            dist_op = dist_context.get_dist_op_for_program(op)
+                            op_dist_attr = dist_op.dist_attr
+                            if op_dist_attr.process_mesh == while_block_info[
+                                    block_idx]["actual_process_mesh"]:
+                                dims_mapping = op_dist_attr.get_output_dims_mapping(
+                                    var_name)
+                                op_dist_attr.set_output_dims_mapping(
+                                    var_reshard_mapping[var_name], dims_mapping)
+                                op_dist_attr.set_output_dist_attr(var_name,
+                                                                  None)
+
+        idx = 0
+        while idx < len(block.ops):
+            pre_op_count = len(block.ops)
+            op = block.ops[idx]
+
+            if _is_special_op(op):
+                idx += 1
+                continue
 
-        if _is_special_op(op):
-            idx += 1
-            continue
+            dist_op = dist_context.get_dist_op_for_program(op)
+            if dist_op is not None:
+                process_meshes = []
+                if op.type == "while":
+                    if not _is_condition_replicative(
+                            op, auto_parallel_main_prog, dist_context):
+                        raise ValueError(
+                            "Please check the condition due to the dims mapping is not replicative."
+                        )
+                    process_meshes = _get_process_meshes(
+                        op, auto_parallel_main_prog, dist_context)
+                    assert process_meshes
+                    if op.attr("sub_block").id not in while_block_info:
+                        while_block_info[op.attr("sub_block").id] = {}
+                    while_block_info[op.attr("sub_block").id][
+                        "op_id"] = op.desc.id()
+                    while_block_info[op.attr("sub_block").id][
+                        "actual_process_mesh"] = _get_while_op_actual_process_mesh(
+                            op, auto_parallel_main_prog, rank_id, dist_context)
+                else:
+                    process_meshes = _get_op_process_meshes(op, dist_context)
+                input_vars = None
+                if op.type == "while":
+                    input_var_names = op.input("X")
+                else:
+                    input_var_names = op.input_arg_names
+                idx_offset = 0
+                for var_name in op.input_arg_names:
+                    # skip lod_tensor_blocking_queue_0
+                    if var_name == "lod_tensor_blocking_queue_0":
+                        continue
+                    var = _get_var(var_name, block, auto_parallel_main_prog)
+                    dist_tensor = dist_context.get_dist_tensor_for_program(var)
+                    for process_mesh in process_meshes:
+                        if dist_tensor is not None and _need_reshard(
+                                dist_tensor, dist_op, process_mesh,
+                                auto_parallel_main_prog, dist_context):
+                            reshard_op_desc = find_op_desc_seq(
+                                dist_tensor, dist_op, process_mesh, batch_size)
+                            parse_op_desc(block, rank_id, reshard_op_desc,
+                                          var_name, op, dist_context,
+                                          auto_parallel_main_prog, process_mesh)
+                            cur_op_count = len(block.ops)
+                            idx_offset = idx_offset + cur_op_count - pre_op_count
+                            pre_op_count = cur_op_count
+                idx = idx + idx_offset + 1
+            else:
+                idx += 1
 
-        dist_op = dist_context.get_dist_op_for_program(op)
-        if dist_op is not None:
-            idx_offset = 0
-            for var_name in op.input_arg_names:
-                # skip lod_tensor_blocking_queue_0
-                if var_name == "lod_tensor_blocking_queue_0":
-                    continue
-                var = block.vars[var_name]
-                dist_tensor = dist_context.get_dist_tensor_for_program(var)
-                if dist_tensor is not None and _need_reshard(dist_tensor,
-                                                             dist_op):
-                    reshard_op_desc = find_op_desc_seq(dist_tensor, dist_op)
-                    parse_op_desc(auto_parallel_main_prog, rank_id,
-                                  reshard_op_desc, var_name, op, dist_context)
-                    cur_op_count = len(block.ops)
-                    idx_offset = idx_offset + cur_op_count - pre_op_count
-                    pre_op_count = cur_op_count
-            idx = idx + idx_offset + 1
-        else:
-            idx += 1
-
-    # insert send and recv op if output process mesh is different from tensor process mesh
-    idx = 0
-    skip_ops = ["create_py_reader", "create_double_buffer_reader", "read"]
-    skip_ops += _g_special_ops
-    while idx < len(block.ops):
-        pre_op_count = len(block.ops)
-        op = block.ops[idx]
-        dist_op = dist_context.get_dist_op_for_program(op)
-        if dist_op is not None and op.type not in skip_ops:
-            for var_name in op.output_arg_names:
-                var = block.vars[var_name]
-                dist_tensor = dist_context.get_dist_tensor_for_program(var)
-                if dist_tensor is not None and _need_reshard(dist_tensor,
-                                                             dist_op, False):
-                    for index, item in enumerate(
-                            dist_op.dist_attr.process_mesh.processes):
-                        recv_rank = dist_tensor.dist_attr.process_mesh.processes[
-                            index]
-                        if rank_id == item:
-                            _insert_send_op(block, idx + 1, var, recv_rank)
-                        if rank_id == recv_rank:
-                            _insert_recv_op(block, idx + 1, var, item)
-                    cur_op_count = len(block.ops)
-                    idx_offset = idx_offset + cur_op_count - pre_op_count
-                    pre_op_count = cur_op_count
-            idx = idx + idx_offset + 1
-        else:
-            idx += 1
+        # insert send and recv op if output process mesh is different from tensor process mesh
+        idx = 0
+        # skip reader and ops whose process mesh is union
+        skip_ops = [
+            "create_py_reader", "create_double_buffer_reader", "read", "while",
+            "write_to_array", "read_from_array"
+        ]
+        skip_ops += _g_special_ops
+        while idx < len(block.ops):
+            pre_op_count = len(block.ops)
+            op = block.ops[idx]
+            dist_op = dist_context.get_dist_op_for_program(op)
+            if dist_op is not None and op.type not in skip_ops:
+                for var_name in op.output_arg_names:
+                    var = _get_var(var_name, block, auto_parallel_main_prog)
+                    dist_tensor = dist_context.get_dist_tensor_for_program(var)
+                    process_mesh = dist_op.dist_attr.process_mesh
+                    if dist_tensor is not None and _need_reshard(
+                            dist_tensor, dist_op, process_mesh,
+                            auto_parallel_main_prog, dist_context, False):
+                        for index, item in enumerate(
+                                dist_op.dist_attr.process_mesh.processes):
+                            recv_rank = dist_tensor.dist_attr.process_mesh.processes[
+                                index]
+                            if rank_id == item:
+                                _insert_send_op(block, idx + 1, var, recv_rank,
+                                                op.attr('op_role'))
+                            if rank_id == recv_rank:
+                                _insert_recv_op(block, idx + 1, var, item,
+                                                op.attr('op_role'))
+                        cur_op_count = len(block.ops)
+                        idx_offset = idx_offset + cur_op_count - pre_op_count
+                        pre_op_count = cur_op_count
+                idx = idx + idx_offset + 1
+            else:
+                idx += 1
 
     # remove no need vars and ops in the main program
     remove_no_need_in_main(auto_parallel_main_prog, dist_context, rank_id,
diff --git a/python/paddle/distributed/auto_parallel/tuner/__init__.py b/python/paddle/distributed/auto_parallel/tuner/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..513558501a0eb218b772a8c02142d3c320675710
--- /dev/null
+++ b/python/paddle/distributed/auto_parallel/tuner/__init__.py
@@ -0,0 +1,13 @@
+#   Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/python/paddle/distributed/auto_parallel/tuner/storable.py b/python/paddle/distributed/auto_parallel/tuner/storable.py
new file mode 100644
index 0000000000000000000000000000000000000000..d61e53a02724088c89f2e8cfafc91ca0047aa967
--- /dev/null
+++ b/python/paddle/distributed/auto_parallel/tuner/storable.py
@@ -0,0 +1,36 @@
+#   Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+
+
+class Storable(object):
+    def get_state(self):
+        raise NotImplementedError
+
+    def set_state(self, state):
+        raise NotImplementedError
+
+    def save(self, path):
+        state = self.get_state()
+        state_json = json.dumps(state)
+        with open(path, "w") as f:
+            f.write(state_json)
+        return str(path)
+
+    def load(self, path):
+        with open(path, "r") as f:
+            state_data = f.read()
+        state = json.loads(state_data)
+        self.set_state(state)
diff --git a/python/paddle/distributed/auto_parallel/tuner/tunable_space.py b/python/paddle/distributed/auto_parallel/tuner/tunable_space.py
new file mode 100644
index 0000000000000000000000000000000000000000..f63364c5b75ef03a81d8b293515f3bc5a55fce78
--- /dev/null
+++ b/python/paddle/distributed/auto_parallel/tuner/tunable_space.py
@@ -0,0 +1,151 @@
+#   Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import collections
+import contextlib
+import copy
+import math
+import random
+import numpy as np
+
+from .tunable_variable import Boolean
+from .tunable_variable import Fixed
+from .tunable_variable import Choice
+from .tunable_variable import IntRange
+from .tunable_variable import FloatRange
+
+
+class TunableSpace(object):
+    """
+    A TunableSpace is constructed by the tunable variables.
+    """
+
+    def __init__(self):
+        # Tunable variables for this tunable variables
+        self._variables = {}
+        # Specific values coresponding to each tunable variable
+        self._values = {}
+
+    @property
+    def variables(self):
+        return self._variables
+
+    @property
+    def values(self):
+        return self._values
+
+    def get_value(self, name):
+        if name in self.values:
+            return self.values[name]
+        else:
+            raise KeyError("{} does not exist.".format(name))
+
+    def set_value(self, name, value):
+        if name in self.values:
+            self.values[name] = value
+        else:
+            raise KeyError("{} does not exist.".format(name))
+
+    def _exists(self, name):
+        if name in self._variables:
+            return True
+        return False
+
+    def _retrieve(self, tv):
+        tv = tv.__class__.from_state(tv.get_state())
+        if self._exists(tv.name):
+            return self.get_value(tv.name)
+        return self._register(tv)
+
+    def _register(self, tv):
+        self._variables[tv.name] = tv
+        if tv.name not in self.values:
+            self.values[tv.name] = tv.default
+        return self.values[tv.name]
+
+    def __getitem__(self, name):
+        return self.get_value(name)
+
+    def __setitem__(self, name, value):
+        self.set_value(name, value)
+
+    def __contains__(self, name):
+        try:
+            self.get_value(name)
+            return True
+        except (KeyError, ValueError):
+            return False
+
+    def fixed(self, name, default):
+        tv = Fixed(name=name, default=default)
+        return self._retrieve(tv)
+
+    def boolean(self, name, default=False):
+        tv = Boolean(name=name, default=default)
+        return self._retrieve(tv)
+
+    def choice(self, name, values, default=None):
+        tv = Choice(name=name, values=values, default=default)
+        return self._retrieve(tv)
+
+    def int_range(self, name, start, stop, step=1, default=None):
+        tv = IntRange(
+            name=name, start=start, stop=stop, step=step, default=default)
+        return self._retrieve(tv)
+
+    def float_range(self, name, start, stop, step=None, default=None):
+        tv = FloatRange(
+            name=name, start=start, stop=stop, step=step, default=default)
+        return self._retrieve(tv)
+
+    def get_state(self):
+        return {
+            "variables": [{
+                "class_name": v.__class__.__name__,
+                "state": v.get_state()
+            } for v in self._variables.values()],
+            "values": dict((k, v) for (k, v) in self.values.items())
+        }
+
+    @classmethod
+    def from_state(cls, state):
+        ts = cls()
+        for v in state["variables"]:
+            v = _deserialize_tunable_variable(v)
+            ts._variables[v.name] = v
+        ts._values = dict((k, v) for (k, v) in state["values"].items())
+        return ts
+
+
+def _deserialize_tunable_variable(state):
+    classes = (Boolean, Fixed, Choice, IntRange, FloatRange)
+    cls_name_to_cls = {cls.__name__: cls for cls in classes}
+
+    if isinstance(state, classes):
+        return state
+
+    if (not isinstance(state, dict) or "class_name" not in state or
+            "state" not in state):
+        raise ValueError(
+            "Expect state to be a python dict containing class_name and state as keys, but found {}"
+            .format(state))
+
+    cls_name = state["class_name"]
+    cls = cls_name_to_cls[cls_name]
+    if cls is None:
+        raise ValueError("Unknown class name {}".format(cls_name))
+
+    cls_state = state["state"]
+    deserialized_object = cls.from_state(cls_state)
+    return deserialized_object
diff --git a/python/paddle/distributed/auto_parallel/tuner/tunable_variable.py b/python/paddle/distributed/auto_parallel/tuner/tunable_variable.py
new file mode 100644
index 0000000000000000000000000000000000000000..9549b44c48ecb0b04ac22fafa6dcf5b6ff9aa0ae
--- /dev/null
+++ b/python/paddle/distributed/auto_parallel/tuner/tunable_variable.py
@@ -0,0 +1,242 @@
+#   Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+
+
+class TunableVariable(object):
+    """
+    Tunablevariable base class.
+    """
+
+    def __init__(self, name, default=None):
+        self.name = name
+        self._default = default
+
+    @property
+    def default(self):
+        return self._default
+
+    def get_state(self):
+        return {"name": self.name, "default": self.default}
+
+    @classmethod
+    def from_state(cls, state):
+        return cls(**state)
+
+
+class Fixed(TunableVariable):
+    """
+    Fixed variable which cannot be changed.
+    """
+
+    def __init__(self, name, default):
+        super(Fixed, self).__init__(name=name, default=default)
+        self.name = name
+        if not isinstance(default, (str, int, float, bool)):
+            raise ValueError(
+                "Fixed must be an str, int, float or bool, but found {}"
+                .format(default))
+        self._default = default
+
+    def random(self, seed=None):
+        return self._default
+
+    def __repr__(self):
+        return "Fixed(name: {}, value: {})".format(self.name, self.default)
+
+
+class Boolean(TunableVariable):
+    """
+    Choice between True and False.
+    """
+
+    def __init__(self, name, default=False):
+        super(Boolean, self).__init__(name=name, default=default)
+        if default not in {True, False}:
+            raise ValueError(
+                "default must be a Python boolean, but got {}".format(default))
+
+    def random(self, seed=None):
+        rng = np.random.default_rng(seed)
+        return rng.choice((True, False))
+
+    def __repr__(self):
+        return 'Boolean(name: "{}", default: {})'.format(self.name,
+                                                         self.default)
+
+
+class Choice(TunableVariable):
+    def __init__(self, name, values, default=None):
+        super(Choice, self).__init__(name=name, default=default)
+
+        types = set(type(v) for v in values)
+        if len(types) > 1:
+            raise TypeError(
+                "Choice can contain only one type of value, but found values: {} with types: {}."
+                .format(str(values), str(types)))
+
+        if isinstance(values[0], str):
+            values = [str(v) for v in values]
+            if default is not None:
+                default = str(default)
+        elif isinstance(values[0], int):
+            values = [int(v) for v in values]
+            if default is not None:
+                default = int(default)
+        elif isinstance(values[0], float):
+            values = [float(v) for v in values]
+            if default is not None:
+                default = float(default)
+        elif isinstance(values[0], bool):
+            values = [bool(v) for v in values]
+            if default is not None:
+                default = bool(default)
+        else:
+            raise TypeError(
+                "Choice can only contain str, int, float, or boll, but found: {} "
+                .format(str(values)))
+        self.values = values
+
+        if default is not None and default not in values:
+            raise ValueError(
+                "The default value should be one of the choices {}, but found {}".
+                format(values, default))
+        self._default = default
+
+    @property
+    def default(self):
+        if self._default is None:
+            if None in self.values:
+                return None
+            return self.values[0]
+        return self._default
+
+    def random(self, seed=None):
+        rng = np.random.default_rng(seed)
+        return rng.choice(self.values)
+
+    def get_state(self):
+        state = super(Choice, self).get_state()
+        state["values"] = self.values
+        return state
+
+    def __repr__(self):
+        return 'Choice(name: "{}", values: {}, default: {})'.format(
+            self.name, self.values, self.default)
+
+
+class IntRange(TunableVariable):
+    """
+    Integer range.
+    """
+
+    def __init__(self, name, start, stop, step=1, default=None, endpoint=False):
+        super(IntRange, self).__init__(name=name, default=default)
+        self.start = self._check_int(start)
+        self.stop = self._check_int(stop)
+        self.step = self._check_int(step)
+        self._default = default
+        self.endpoint = endpoint
+
+    @property
+    def default(self):
+        if self._default is not None:
+            return self._default
+        return self.start
+
+    def random(self, seed=None):
+        rng = np.random.default_rng(seed)
+        value = (self.stop - self.start) * rng.random() + self.start
+        if self.step is not None:
+            if self.endpoint:
+                values = np.arange(self.start, self.stop + 1e-7, step=self.step)
+            else:
+                values = np.arange(self.start, self.stop, step=self.step)
+            closest_index = np.abs(values - value).argmin()
+            value = values[closest_index]
+        return int(value)
+
+    def get_state(self):
+        state = super(IntRange, self).get_state()
+        state["start"] = self.start
+        state["stop"] = self.stop
+        state["step"] = self.step
+        state["default"] = self._default
+        return state
+
+    def _check_int(self, val):
+        int_val = int(val)
+        if int_val != val:
+            raise ValueError("Expects val is an int, but found: {}.".format(
+                str(val)))
+        return int_val
+
+    def __repr__(self):
+        return "IntRange(name: {}, start: {}, stop: {}, step: {}, default: {})".format(
+            self.name, self.start, self.stop, self.step, self.default)
+
+
+class FloatRange(TunableVariable):
+    """
+    Float range.
+    """
+
+    def __init__(self,
+                 name,
+                 start,
+                 stop,
+                 step=None,
+                 default=None,
+                 endpoint=False):
+        super(FloatRange, self).__init__(name=name, default=default)
+        self.stop = float(stop)
+        self.start = float(start)
+        if step is not None:
+            self.step = float(step)
+        else:
+            self.step = None
+        self._default = default
+        self.endpoint = endpoint
+
+    @property
+    def default(self):
+        if self._default is not None:
+            return self._default
+        return self.start
+
+    def random(self, seed=None):
+        rng = np.random.default_rng(seed)
+        value = (self.stop - self.start) * rng.random() + self.start
+        if self.step is not None:
+            if self.endpoint:
+                values = np.arange(self.start, self.stop + 1e-7, step=self.step)
+            else:
+                values = np.arange(self.start, self.stop, step=self.step)
+            closest_index = np.abs(values - value).argmin()
+            value = values[closest_index]
+        return value
+
+    def get_state(self):
+        state = super(FloatRange, self).get_state()
+        state["start"] = self.start
+        state["stop"] = self.stop
+        state["step"] = self.step
+        state["endpoint"] = self.endpoint
+        return state
+
+    def __repr__(self):
+        return "FloatRange(name: {}, start: {}, stop: {}, step: {}, default: {}, endpoint: {})".format(
+            self.name, self.start, self.stop, self.step, self.default,
+            self.endpoint)
diff --git a/python/paddle/distributed/auto_parallel/utils.py b/python/paddle/distributed/auto_parallel/utils.py
index 75e0ae251ef05e379dd1c4c330ec688e8ab6df7a..241eadcbace22cf36504e2c0ed36566fa94b9e4b 100644
--- a/python/paddle/distributed/auto_parallel/utils.py
+++ b/python/paddle/distributed/auto_parallel/utils.py
@@ -22,7 +22,6 @@ import logging
 from functools import reduce
 
 import paddle.fluid.core as core
-from paddle.framework.io import _to_LodTensor
 from paddle.distributed.fleet.meta_optimizers.common import OpRole
 from paddle.fluid.io import is_parameter, is_belong_to_optimizer
 from paddle.distributed.auto_parallel.dist_attribute import TensorDistributedAttribute, OperatorDistributedAttribute
@@ -739,7 +738,7 @@ def merge_and_slice_parameter(dist_param_dict, pre_dist_attr, cur_dist_attr):
             rank_id = paddle.distributed.get_rank()
             index = cur_attr["process_group"].index(rank_id)
             param = dist_param_dict[var_name][index]
-            dist_param_dict[var_name] = _to_LodTensor(param)
+            dist_param_dict[var_name] = param
             continue
 
         pre_param = dist_param_dict[var_name]
@@ -751,7 +750,7 @@ def merge_and_slice_parameter(dist_param_dict, pre_dist_attr, cur_dist_attr):
             dist_param_dict[var_name] = complete_param
         else:
             complete_param = pre_param[0]
-            dist_param_dict[var_name] = _to_LodTensor(complete_param)
+            dist_param_dict[var_name] = complete_param
 
         if len(set(cur_dims_mapping)) > 1 or -1 not in cur_dims_mapping:
             sliced_param = _slice_parameter_with_dist_attr(complete_param,
@@ -798,7 +797,7 @@ def _merge_parameter_with_dist_attr(param_list, dist_attr):
 
     assert len(partition_param_list) == 1 or not partition_param_list, \
         "Fail to merge parameter"
-    complete_param = _to_LodTensor(partition_param_list[0][0])
+    complete_param = partition_param_list[0][0]
     return complete_param
 
 
@@ -818,7 +817,7 @@ def _slice_parameter_with_dist_attr(param, dist_attr):
     rank_id = paddle.distributed.get_rank()
     sliced_param_index = _get_sliced_param_index(
         rank_id, param.shape, dims_mapping, process_shape, process_group)
-    sliced_param = _to_LodTensor(sliced_param_list[sliced_param_index])
+    sliced_param = sliced_param_list[sliced_param_index]
     return sliced_param
 
 
diff --git a/python/paddle/distributed/collective.py b/python/paddle/distributed/collective.py
index 3731332d1e7774438d2b0369289a14f579339b07..bf6556d21e9fc78bc4bdc9d496b60dcb799b3d29 100644
--- a/python/paddle/distributed/collective.py
+++ b/python/paddle/distributed/collective.py
@@ -29,7 +29,6 @@ from ..fluid.layers import utils
 from ..fluid.dygraph import layers
 from ..fluid.dygraph.parallel import prepare_context
 import paddle
-from .fleet import fleet
 import paddle.fluid as fluid
 import paddle.fluid.core as core
 from paddle import _C_ops
@@ -268,6 +267,10 @@ def new_group(ranks=None, backend=None):
                 place = core.NPUPlace(genv.device_id)
                 core.HCCLParallelContext(strategy,
                                          place).init_with_ring_id(ring_id)
+            elif core.is_compiled_with_mlu():
+                place = core.MLUPlace(genv.device_id)
+                core.CNCLParallelContext(strategy,
+                                         place).init_with_ring_id(ring_id)
             else:
                 assert False, ("no cuda device found")
         else:
@@ -1422,6 +1425,7 @@ def split(x,
             "graph mode, plese use ParallelEmbedding, ParallelRowLinear, "
             "ParallelColumnLinear instead.")
     else:
+        from .fleet import fleet
         assert fleet._role_maker, ("To use paddle.distributed.split, "
                                    "you must call fleet.init() firstly.")
         rank = fleet.worker_index()
diff --git a/python/paddle/distributed/fleet/base/fleet_base.py b/python/paddle/distributed/fleet/base/fleet_base.py
index bc59b87e2ffa5c653e89c759f951de5f520773ba..236322ccfca6aad442e76af6f57c6c5f83ca59bb 100755
--- a/python/paddle/distributed/fleet/base/fleet_base.py
+++ b/python/paddle/distributed/fleet/base/fleet_base.py
@@ -1430,6 +1430,22 @@ class Fleet(object):
 
         # cache original feed forward program
         self.origin_main_program = loss.block.program
+        # add distributed attr
+        if not hasattr(self.origin_main_program, "distributed_info_"):
+            setattr(self.origin_main_program, "distributed_info_", dict())
+            self.origin_main_program.distributed_info_[
+                "dp_degree"] = self._user_defined_strategy.sharding_configs[
+                    "dp_degree"]
+            self.origin_main_program.distributed_info_[
+                "mp_degree"] = self._user_defined_strategy.sharding_configs[
+                    "mp_degree"]
+            self.origin_main_program.distributed_info_[
+                "pp_degree"] = self._user_defined_strategy.sharding_configs[
+                    "pp_degree"]
+            self.origin_main_program.distributed_info_[
+                "sharding_degree"] = self._user_defined_strategy.sharding_configs[
+                    "sharding_degree"]
+
         context["origin_main_program"] = self.origin_main_program
         context["loss"] = loss
         if startup_program == None:
diff --git a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/sharding_optimizer_stage2.py b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/sharding_optimizer_stage2.py
index 112c3887fcfa5d383d61a0dd32f6a0a73e5aea92..a2c741667ed77f7ffbd2e650ff162c2762a1ced6 100644
--- a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/sharding_optimizer_stage2.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/sharding_optimizer_stage2.py
@@ -25,10 +25,9 @@ from collections import OrderedDict
 import paddle
 import paddle.fluid as fluid
 from paddle.fluid import core
-import paddle.distributed as dist
 from paddle.optimizer import Optimizer
 from paddle.fluid.clip import ClipGradByGlobalNorm
-from paddle.distributed.collective import _get_global_group
+from paddle.distributed.collective import _get_global_group, new_group, broadcast, wait
 
 from ...utils.internal_storage import ParamStorage, GradStorage
 from ...meta_parallel.sharding.sharding_utils import Type, device_guard, ShardingClipGrad
@@ -40,8 +39,6 @@ align = {
     Type.fp32.value: 4,
 }
 
-__all__ = ["ShardingOptimizerStage2"]
-
 
 class ShardingOptimizerStage2(Optimizer):
     """
@@ -93,8 +90,8 @@ class ShardingOptimizerStage2(Optimizer):
                 filter(lambda x: x.trainable and x.dtype == Type.fp16.value,
                        self._local_params))) > 0
 
-        self.group = dist.new_group(_get_global_group()
-                                    .ranks) if group is None else group
+        self.group = new_group(_get_global_group()
+                               .ranks) if group is None else group
 
         self.world_size = self.group.nranks
         self.rank = self.group.rank
@@ -136,21 +133,21 @@ class ShardingOptimizerStage2(Optimizer):
         # Update optimizer parameters and adjust parameter storage and use according to rank.
         self._update_opt_status()
 
-    @paddle.no_grad()
+    @paddle.autograd.no_grad()
     def _sync_params_and_buffers(self):
         """
         Sync all model states for all ranks
         """
 
         for p in self._local_params:
-            dist.broadcast(
+            broadcast(
                 p,
                 src=self._global_root_rank,
                 group=self.group,
                 use_calc_stream=True)
 
         # Multi stream operation will be supported later
-        dist.wait(tensor=p, group=self.group, use_calc_stream=True)
+        wait(tensor=p, group=self.group, use_calc_stream=True)
 
     def _generate_master_params(self, trainable_params):
         if self.offload:
@@ -387,12 +384,18 @@ class ShardingOptimizerStage2(Optimizer):
         raise RuntimeError(
             "optimizer.minimize() not support now, please use optimizer.step()")
 
+    def set_state_dict(self, state_dict):
+        self._optim.set_state_dict(state_dict)
+
+    def state_dict(self):
+        return self._optim.state_dict()
+
     def _clear_cache(self):
         self.__segment_params.clear()
         self._dtype_rank_params.clear()
         self._param2rank.clear()
 
-    @fluid.dygraph.no_grad
+    @paddle.autograd.no_grad()
     def _broadcast_params(self):
         """Broadcast the parameters of the current rank to each rank"""
 
@@ -401,14 +404,14 @@ class ShardingOptimizerStage2(Optimizer):
         # Exchange all the shards with the other ranks
         for dtype_per_rank in self.param_storages.values():
             for dst_rank, internal_storage in dtype_per_rank.items():
-                dist.broadcast(
+                broadcast(
                     tensor=internal_storage.buffer,
                     src=self.group.ranks[dst_rank],
                     group=self.group,
                     use_calc_stream=True)
 
             # Multi stream operation will be supported later
-            dist.wait(
+            wait(
                 tensor=internal_storage.buffer,
                 group=self.group,
                 use_calc_stream=True)
diff --git a/python/paddle/distributed/fleet/meta_optimizers/ps_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/ps_optimizer.py
index 00937dbe7a43294a8415ea6773b915e702859b42..f786f665ad438c80988455824d6a206e3e240120 100755
--- a/python/paddle/distributed/fleet/meta_optimizers/ps_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/ps_optimizer.py
@@ -31,14 +31,19 @@ class ParameterServerOptimizer(MetaOptimizerBase):
         self.inner_opt = optimizer
         # we do not allow meta optimizer to be inner optimizer currently
         self.meta_optimizers_white_list = []
-        self.pass_ctx = PassContext()
 
     def _set_basic_info(self, loss, role_maker, user_defined_optimizer,
                         user_defined_strategy):
         super(ParameterServerOptimizer, self)._set_basic_info(
             loss, role_maker, user_defined_optimizer, user_defined_strategy)
 
+    def _set_origin_programs(self, losses):
+        self.origin_main_programs = []
+        for loss in losses:
+            self.origin_main_programs.append(loss.block.program)
+
     def _init_ps_pass_context(self, loss, startup_program):
+        self.pass_ctx = PassContext()
         attrs = {}
         # trainer
         attrs["env"] = get_dist_env()
@@ -46,9 +51,9 @@ class ParameterServerOptimizer(MetaOptimizerBase):
         attrs['loss'] = loss
         attrs['min_block_size'] = 81920
         attrs['origin_main_program'] = loss.block.program
-        attrs['origin_main_programs'] = [loss.block.program]
         attrs['origin_startup_program'] = startup_program
-        attrs['origin_startup_programs'] = [startup_program]
+
+        attrs['origin_main_programs'] = self.origin_main_programs
 
         attrs['cloned_main'] = attrs['origin_main_program'].clone()
         attrs['cloned_startup'] = attrs['origin_startup_program'].clone()
@@ -90,10 +95,11 @@ class ParameterServerOptimizer(MetaOptimizerBase):
         return False
 
     def _can_apply(self):
-        if self._attrs['role_maker']._is_collective or self._attrs[
-                'k_steps'] < 0:
+        if self.role_maker._is_collective:
             return False
-        return True
+
+        k_steps = self.user_defined_strategy.a_sync_configs["k_steps"]
+        return True if k_steps >= 0 else False
 
     def minimize_impl(self,
                       loss,
@@ -104,12 +110,37 @@ class ParameterServerOptimizer(MetaOptimizerBase):
                                 no_grad_set)
         if startup_program == None:
             startup_program = paddle.static.default_startup_program()
+        print("program after inner optimizer minimize:",
+              str(loss.block.program))
+        self._set_origin_programs([loss])
         self._init_ps_pass_context(loss, startup_program)
         ps_builder = PsProgramBuilderFactory()._create_ps_program_builder(
             self.pass_ctx)
         ps_builder._build_programs()
         return None, None
 
+    def minimize_losses_impl(self,
+                             losses,
+                             startup_program=None,
+                             parameter_list=None,
+                             no_grad_set=None):
+        if parameter_list is None:
+            parameter_list = [None] * len(losses)
+        for idx, loss in enumerate(losses):
+            startup_prog = startup_program[idx]
+            parameters = parameter_list[idx]
+            self.inner_opt.minimize(loss, startup_prog, parameters, no_grad_set)
+        self._set_origin_programs(losses)
+        for idx, loss in enumerate(losses):
+            print("ps_optimizer idx loss:", idx, loss)
+            startup_prog = startup_program[idx]
+            self._init_ps_pass_context(loss, startup_prog)
+            ps_builder = PsProgramBuilderFactory()._create_ps_program_builder(
+                self.pass_ctx)
+            ps_builder._build_programs()
+            startup_program[idx] = self.pass_ctx._attrs['cloned_startup']
+        return None, None
+
     def _can_apply_geo(self, program):
         def get_sys_free_mem():
             plat = platform.system()
diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py b/python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py
index d04a3a53db3e2b6b0bc64e32dfde8a85732d6ec9..1a3a8a4883d8beb84181609740d2b836f548bc2c 100755
--- a/python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py
@@ -900,10 +900,12 @@ def save_persistables(exe, dirname, main_program, filename=None):
 
     def is_opt_vars(var):
         # NOTE(JZ-LIANG): The checks should be updated when add new compatible optimizer
-        # now only Momentum and adam are compatible with sharding
+        # now only Momentum and adam are compatible with sharding,
+        # support EMA optimizer with '_ema_0',
+        # support offload with '@offload_0' and '.cast_fp16'
         checks = [
             "_moment1_0", "_moment2_0", "_beta1_pow_acc_0", "_beta2_pow_acc_0",
-            "_velocity_0"
+            "_velocity_0", "_ema_0", "@offload_0", ".cast_fp16"
         ]
         for check in checks:
             if var.name.endswith(check) and var.persistable:
diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage2.py b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage2.py
index 392a7f3ac5d8fe01ec2b5fdf0b36030d79124be4..c6f05023e6138597dfd906cd53854b70231d6130 100644
--- a/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage2.py
+++ b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage2.py
@@ -28,7 +28,7 @@ from types import MethodType
 
 import paddle
 from paddle import nn
-import paddle.distributed as dist
+from paddle.distributed import collective as dist
 from paddle.distributed.collective import _get_global_group
 
 from ...utils.internal_storage import GradStorage
@@ -63,8 +63,7 @@ class ShardingStage2(nn.Layer):
             sync_buffers=False,
             buffer_max_size=2**23,  #8MB
             auto_refresh_trainable=True,
-            device="gpu",
-            use_grad_storage=True):
+            device="gpu"):
         super().__init__()
 
         # training options
@@ -102,9 +101,10 @@ class ShardingStage2(nn.Layer):
         # Set grad storage size & Display param sizes and model sizes
         model_size = sum(
             [np.prod(p.shape) for p in self._layer.parameters()]).item()
+        assert buffer_max_size >= 0, "buffer_max_size must be GE than 0."
         self._buffer_max_size = self._rank_buffer_size(buffer_max_size,
                                                        model_size)
-        self._use_grad_storage = use_grad_storage
+        self._use_grad_storage = buffer_max_size > 0
         self._grad_storages = {}  # {dtype: {rank: GradStorage}}
         self._has_grad_storage = []
         self._grad_storage_list = []
@@ -158,6 +158,17 @@ class ShardingStage2(nn.Layer):
 
         return fw
 
+    def set_state_dict(self, state_dict, use_structured_name=True):
+        self._layer.set_state_dict(
+            state_dict, use_structured_name=use_structured_name)
+
+    def state_dict(self,
+                   destination=None,
+                   include_sublayers=True,
+                   structured_name_prefix=""):
+        return self._layer.state_dict(
+            destination=None, include_sublayers=True, structured_name_prefix="")
+
     def _clear_gradients(self):
         """
         Set zero to the gradient of the optimizer's current rank trainable parameters.
@@ -255,7 +266,7 @@ class ShardingStage2(nn.Layer):
         # wait next func hook support
         self._setup_backward_hooks()
 
-    @paddle.no_grad()
+    @paddle.autograd.no_grad()
     def __sync_buffers(self):
         """
         Sync all the param buffers from all ranks (exp: batch norm statistics).
@@ -277,7 +288,7 @@ class ShardingStage2(nn.Layer):
         except AttributeError:
             return getattr(self._layer, name)
 
-    @paddle.no_grad()
+    @paddle.autograd.no_grad()
     def _clear_counters(self):
         """Reset all the grad reduce and call counters."""
         if self.training:
@@ -290,13 +301,13 @@ class ShardingStage2(nn.Layer):
     def _get_reduce_fn(self, index, param, dst_rank):
         """
         There are two ways to reduce gradient.
-        - 1. Do not use use_grad_storage or exceeded buffer_max_size will be reduced separately.
+        - 1. Do not use self._use_grad_storage or exceeded buffer_max_size will be reduced separately.
         - 2. Use grad_storage Reduce the storage to get the full gradient from different ranks.
         """
 
         if not self._use_grad_storage or not self._has_grad_storage[index]:
             # Direct reduction
-            @paddle.no_grad()
+            @paddle.autograd.no_grad()
             def reduce(*_):
                 # Skip gradient reduction, do not change status information
                 if self._grad_reduced[index]:
@@ -336,7 +347,7 @@ class ShardingStage2(nn.Layer):
 
         else:
             # Buffer reduction
-            @paddle.no_grad()
+            @paddle.autograd.no_grad()
             def reduce(*_):
                 # Skip gradient reduction, do not change status information
                 if self._grad_reduced[index]:
@@ -421,9 +432,6 @@ class ShardingStage2(nn.Layer):
         Integrate the parameters gradient into a continuous memory according to rank, and support the update of training parameters.
         """
 
-        if not self._use_grad_storage:
-            return
-
         # According to parameters's numel sort, allocate memory of parameter gradient to continuous memory according to rank
         self._grad_storages = {}
         self._has_grad_storage = [False for _ in self._trainable_params]
diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage3.py b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage3.py
index de69836fdba14da14c179c5e00fa4cc14481e534..f96273cc84caf46f4f02c62e648ce70445b52d28 100644
--- a/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage3.py
+++ b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage3.py
@@ -20,7 +20,6 @@ import logging
 import functools
 import numpy as np
 from itertools import chain
-from functools import reduce
 from types import MethodType
 from collections import deque, OrderedDict
 
@@ -28,9 +27,9 @@ import paddle
 from paddle import nn
 from paddle.autograd import PyLayer
 import paddle.fluid.core as core
-import paddle.distributed as dist
 from paddle.fluid.framework import ParamBase
 from paddle.fluid.clip import ClipGradByGlobalNorm
+from paddle.distributed import collective as dist
 from paddle.distributed.collective import _get_global_group
 
 from .sharding_utils import Type, ShardingClipGrad, device_guard
@@ -84,6 +83,7 @@ class ShardingStage3(nn.Layer):
         self._offload = offload
         self._sync_comm = sync_comm
         # segmentation size
+        assert segment_size >= 0, "segment_size must be GE than 0."
         self._segment_size = segment_size
 
         global DEV
@@ -158,7 +158,7 @@ class ShardingStage3(nn.Layer):
         self._redefine_opt_step()
         self._redefine_opt_clear()
 
-    @paddle.no_grad()
+    @paddle.autograd.no_grad()
     def _sync_params_and_buffers(self):
         """
         Sync all model states for all ranks
@@ -248,6 +248,17 @@ class ShardingStage3(nn.Layer):
 
         return fw
 
+    def set_state_dict(self, state_dict, use_structured_name=True):
+        self._layer.set_state_dict(
+            state_dict, use_structured_name=use_structured_name)
+
+    def state_dict(self,
+                   destination=None,
+                   include_sublayers=True,
+                   structured_name_prefix=""):
+        return self._layer.state_dict(
+            destination=None, include_sublayers=True, structured_name_prefix="")
+
     def _handle_unslice_params(self):
         buffer_size = dict()
         buffer_size[Type.fp32.value] = 0
@@ -408,7 +419,7 @@ class ShardingStage3(nn.Layer):
         # register post forward hooks
         sub_layer.register_forward_post_hook(_forward_post_hook)
 
-    @paddle.no_grad()
+    @paddle.autograd.no_grad()
     def _sync_buffers(self):
         """
         Sync all the param buffers from all ranks (exp: batch norm statistics).
@@ -521,8 +532,8 @@ class ShardingStage3(nn.Layer):
             param._register_backward_hook(allreduce_function)
 
     def _get_allreduce_fn(self, param):
-        @paddle.no_grad()
-        def reduce(*_):
+        @paddle.autograd.no_grad()
+        def allreduce_(*_):
             if param.name in self._task_flow.full_grad.keys():
                 full_grad = self._task_flow.full_grad[param.name]
                 # Only support sync allreduce current rank's layer now
@@ -572,7 +583,7 @@ class ShardingStage3(nn.Layer):
                     if self._offload:
                         param.fw_storage = _device2cpu(param.fw_storage, True)
 
-        return reduce
+        return allreduce_
 
     def _param2align(self, param):
         # CUDA alignment 256 bytes
@@ -840,7 +851,7 @@ def _allgather_buffer(trainable_params,
     return task_flow
 
 
-@paddle.no_grad()
+@paddle.autograd.no_grad()
 def _create_params_grad(trainable_params, param2buffer_size, task_flow):
     for param in trainable_params:
         if param.name in task_flow.full_grad.keys():
@@ -901,7 +912,6 @@ def _device2cpu(trans_param, convert_dtype=False):
 
 def _cpu2device(param):
     tmp_p = param.fw_storage.cuda(DEV_ID)
-    param.fw_storage._clear()
     if tmp_p.dtype == Type.fp32.value and param2dtype[
             param.name] == Type.fp16.value:
         tmp_p = paddle.cast(tmp_p, Type.fp16.value)
diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py
index 0a42b993d5bf2387d1110ae5478b6162ce175483..89b59254e5b9105a55c68f3ef871396de1bd9199 100644
--- a/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py
+++ b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py
@@ -21,7 +21,6 @@ import numpy as np
 from types import MethodType
 
 import paddle
-import paddle.distributed as dist
 from paddle import _C_ops
 from paddle.fluid import core
 from paddle.fluid import layers
diff --git a/python/paddle/distributed/parallel.py b/python/paddle/distributed/parallel.py
index 177e19194a52277d99c7f3b7904bb00a3962f4c6..16ed528b64f0c87c594a9e302bf2ae398c0500c5 100644
--- a/python/paddle/distributed/parallel.py
+++ b/python/paddle/distributed/parallel.py
@@ -58,9 +58,9 @@ def _start_kv_server(port, http_server_d, size):
 
 def _is_cpuonly(backend):
     check_backend(backend)
-    if backend in ['auto', 'nccl', 'bkcl', 'hccl', 'heter'] and (
+    if backend in ['auto', 'nccl', 'bkcl', 'hccl', 'heter', 'cncl'] and (
             core.is_compiled_with_cuda() or core.is_compiled_with_xpu() or
-            core.is_compiled_with_npu()):
+            core.is_compiled_with_npu() or core.is_compiled_with_mlu()):
 
         # passes 'auto' and can use cuda or xpu, use the default logics. so return False
         return False
@@ -152,7 +152,8 @@ def init_parallel_env():
     is_cpu_only = _is_cpuonly(backend)
     # 1. gpu xpu check, must be gpu or xpu, 
     if not (is_cpu_only or core.is_compiled_with_cuda() or
-            core.is_compiled_with_xpu() or core.is_compiled_with_npu()):
+            core.is_compiled_with_xpu() or core.is_compiled_with_npu() or
+            core.is_compiled_with_mlu()):
         raise NotImplementedError(
             "If you want to use CPU-only version, please use 'gloo' as backend")
 
@@ -162,6 +163,8 @@ def init_parallel_env():
         _check_var_exists('FLAGS_selected_xpus')
     elif not is_cpu_only and core.is_compiled_with_npu():
         _check_var_exists('FLAGS_selected_npus')
+    elif not is_cpu_only and core.is_compiled_with_mlu():
+        _check_var_exists('FLAGS_selected_mlus')
 
     _check_var_exists("PADDLE_TRAINER_ID")
     _check_var_exists("PADDLE_CURRENT_ENDPOINT")
@@ -213,6 +216,8 @@ def init_parallel_env():
         place = core.XPUPlace(parallel_env.device_id)
     elif core.is_compiled_with_npu():
         place = core.NPUPlace(parallel_env.device_id)
+    elif core.is_compiled_with_mlu():
+        place = core.MLUPlace(parallel_env.device_id)
 
     _set_expected_place(place)
     # init nccl or hccl or bkcl or heter context
@@ -231,6 +236,9 @@ def init_parallel_env():
     elif core.is_compiled_with_npu():
         parallel_helper._set_parallel_ctx(
             core.HCCLParallelContext(strategy, place))
+    elif core.is_compiled_with_mlu():
+        parallel_helper._set_parallel_ctx(
+            core.CNCLParallelContext(strategy, place))
 
     if backend != "heter":
         other_endpoints = strategy.trainer_endpoints[:]
diff --git a/python/paddle/distributed/passes/ps_trainer_pass.py b/python/paddle/distributed/passes/ps_trainer_pass.py
index 284365ce06651ae6ca8e9ff5ad27fd92d33e8dae..6f72cf1b1597092b28f5762f1b3330b396ea6401 100755
--- a/python/paddle/distributed/passes/ps_trainer_pass.py
+++ b/python/paddle/distributed/passes/ps_trainer_pass.py
@@ -74,6 +74,8 @@ class AppendSendOpsPass(PassBase):  # 该 pass 被多种模式复用
 
     def _apply_single_impl(self, main_program, startup_program, pass_ctx):
         attrs = pass_ctx._attrs
+        print("pass loss program id:", id(attrs['loss'].block.program))
+        print("pass main program id:", id(main_program))
         ps_mode = attrs['ps_mode']
         if ps_mode == DistributedMode.GEO:
             send_ctx = get_geo_trainer_send_context(attrs)  # geo 模式
@@ -84,6 +86,8 @@ class AppendSendOpsPass(PassBase):  # 该 pass 被多种模式复用
         for merged_name, send in send_ctx.items():
             if send.is_sparse() and ps_mode != DistributedMode.GEO:
                 continue
+            if send.program_id() != id(attrs['loss'].block.program):
+                continue
             logger.info('merged_name, send: {}, {}'.format(merged_name, send))
             is_sparse = 1 if send.is_sparse() else 0
             is_sparse = 2 if send.is_distributed() else is_sparse
@@ -496,6 +500,7 @@ class DeleteOptimizesPass(PassBase):
             persistable=True)
 
     def _apply_single_impl(self, main_program, startup_program, pass_ctx):
+        print("delete_optimizer_pass")
         attrs = pass_ctx._attrs
         optimizer_ops = get_optimize_ops(main_program)
         lr_ops = get_lr_ops(main_program)
diff --git a/python/paddle/distributed/ps/the_one_ps.py b/python/paddle/distributed/ps/the_one_ps.py
index cc744bc9d9edbdaa09b6857a069732f83e4b59ac..5170684b4325c1d2ab6723a0b8d8989cf9c21aa2 100755
--- a/python/paddle/distributed/ps/the_one_ps.py
+++ b/python/paddle/distributed/ps/the_one_ps.py
@@ -40,12 +40,12 @@ def get_program_by_id(context, program_id):
     programs = context["origin_main_programs"]
     for i, program in enumerate(programs):
         if id(program) == program_id:
-            return program, context["origin_startup_programs"][i]
-    return None, None
+            return program, context["origin_startup_programs"][i], i
+    return None, None, None
 
 
 def parse_table_class(varname, program_id, context):
-    main_program, startup_program = get_program_by_id(context, program_id)
+    main_program, startup_program, idx = get_program_by_id(context, program_id)
     for op in main_program.global_block().ops:
         if not is_distributed_sparse_op(op) and not is_sparse_op(op):
             continue
@@ -60,7 +60,7 @@ def parse_table_class(varname, program_id, context):
 
 
 def check_embedding_dim(accessor_proto, varname, program_id, context):
-    main_program, startup_program = get_program_by_id(context, program_id)
+    main_program, startup_program, idx = get_program_by_id(context, program_id)
     embedding_dim = 0
     for var in main_program.list_vars():
         if var.name == varname:
@@ -94,10 +94,9 @@ class Service:
 
 class GpuService(Service):
     def __init__(self):
-        super(GpuService).__init__(self)
+        super(GpuService, self).__init__()
 
     def _set(self, service_proto):
-        super(GpuService)._set(service_proto)
         service_proto.server_class = 'PsLocalServer'
         service_proto.client_class = 'PsLocalClient'
 
@@ -111,7 +110,8 @@ class Accessor:
 
     # TableAccessorParameter accessor
     def _set(self, accessor_proto, varname, program_id, context):
-        main_program, startup_program = get_program_by_id(context, program_id)
+        main_program, startup_program, idx = get_program_by_id(context,
+                                                               program_id)
         embedding_dim = 0
         for var in main_program.list_vars():
             if var.name == varname:
@@ -236,7 +236,8 @@ class CommonAccessor(Accessor):
         self.opt_init_map = opt_init_map
 
     def parse_entry(self, varname, program_id, context):
-        main_program, startup_program = get_program_by_id(context, program_id)
+        main_program, startup_program, idx = get_program_by_id(context,
+                                                               program_id)
         for op in main_program.global_block().ops:
             if not is_distributed_sparse_op(op) and not is_sparse_op(op):
                 continue
@@ -290,8 +291,8 @@ class CommonAccessor(Accessor):
         print("parse_by_optimizer table_id:{} is_datanorm:{}".format(
             ctx.table_id(), ctx.is_datanorm_table()))
 
-        main_program, startup_program = get_program_by_id(context,
-                                                          ctx.program_id())
+        main_program, startup_program, idx = get_program_by_id(context,
+                                                               ctx.program_id())
         pserver_id = get_role_id(context['role_maker'])
         pserver_num = len(get_ps_endpoints(context['role_maker']))
         optimizer_ops = get_optimize_ops(main_program)
@@ -359,10 +360,11 @@ class CommonAccessor(Accessor):
                     param = main_program.global_block().vars[oop.input(
                         formal_name)[0]]
                     #TODO: for dense learning_rate, can be different from sparse lr
-                    if formal_name == "LearningRate" and param.name != "learning_rate_0":
+                    if formal_name == "LearningRate" and param.name != "learning_rate_" + str(
+                            idx):
                         warnings.warn("will support decay soon")
                         param = main_program.global_block().vars[
-                            "learning_rate_0"]
+                            "learning_rate_" + str(idx)]
 
                     initializer = self.get_initializer_attr(param.name,
                                                             startup_program)
@@ -404,10 +406,11 @@ class CommonAccessor(Accessor):
                 else:
                     param = main_program.global_block().vars[oop.input(
                         formal_name)[0]]
-                    if formal_name == "LearningRate" and param.name != "learning_rate_0":
+                    if formal_name == "LearningRate" and param.name != "learning_rate_" + str(
+                            idx):
                         warnings.warn("will support decay soon")
                         param = main_program.global_block().vars[
-                            "learning_rate_0"]
+                            "learning_rate_" + str(idx)]
 
                     if shape is None:
                         if is_sparse:
@@ -707,6 +710,7 @@ class PsDescBuilder(object):
         self.ps_mode = context['ps_mode']
         self.is_heter_ps_mode = context['is_heter_ps_mode']
         self.use_ps_gpu = context['use_ps_gpu']
+        self.barrier_table_id = None
         self.send_ctx = get_the_one_send_context(
             self.context,
             use_origin_program=True,
@@ -767,6 +771,8 @@ class PsDescBuilder(object):
             table_proto = self.ps_desc.server_param.downpour_server_param.downpour_table_param.add(
             )
             table._set(table_proto)
+            if type(table) == BarrierTable and self.barrier_table_id is None:
+                self.barrier_table_id = table.idx
         self.service._set(
             self.ps_desc.server_param.downpour_server_param.service_param)
         return text_format.MessageToString(self.ps_desc)
@@ -820,9 +826,9 @@ class TheOnePSRuntime(RuntimeBase):
         self.context['tensor_table'] = {}
         build_var_distributed(self.context)
 
-        endpoints = get_ps_endpoints(self.role_maker)
+        self.endpoints = get_ps_endpoints(self.role_maker)
         self.string_hosts = []
-        for idx, ep in enumerate(endpoints):
+        for idx, ep in enumerate(self.endpoints):
             host, port = ep.split(":")
             pshost = fluid.core.PSHost(host, int(port), idx)
             self.string_hosts.append(pshost.serialize_to_string())
@@ -848,7 +854,7 @@ class TheOnePSRuntime(RuntimeBase):
             kwargs["trainer_id"] = self.role_maker._worker_index()
             return kwargs
 
-        proto_txt = worker_desc + "\n" + server_desc
+        proto_txt = worker_desc
         debug = bool(int(os.getenv("PSERVER_DEBUG", "0")))
         if debug:
             print("worker: \n{}".format(proto_txt))
@@ -859,7 +865,7 @@ class TheOnePSRuntime(RuntimeBase):
             self.context,
             split_dense_table=self.is_heter_ps_mode,
             use_origin_program=self.is_heter_ps_mode,
-            ep_list=endpoints)
+            ep_list=self.endpoints)
         trainer_config = self.context['trainer']
 
         debug = bool(int(os.getenv("PSERVER_DEBUG", "0")))
@@ -876,10 +882,7 @@ class TheOnePSRuntime(RuntimeBase):
         kwargs["trainer_id"] = self.role_maker._role_id()
         kwargs["trainers"] = self.role_maker._worker_num()
 
-        for table in server.servers[0].tables:  #TODO
-            if table.table_class == "BarrierTable":
-                kwargs["barrier_table_id"] = table.id
-                break
+        kwargs["barrier_table_id"] = self.ps_desc_builder.barrier_table_id
 
         if self.context['ps_mode'] == DistributedMode.SYNC:
             sync_kwargs = sync_strategy_envs()
@@ -1009,7 +1012,7 @@ class TheOnePSRuntime(RuntimeBase):
             if origin_varname.endswith("@GRAD"):
                 return False
 
-            if origin_varname == "learning_rate_0":
+            if origin_varname.startswith("learning_rate_"):
                 return False
 
             if var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH or \
@@ -1113,7 +1116,7 @@ class TheOnePSRuntime(RuntimeBase):
                 "in fleet.save() function, executor must be as Executor type")
 
         if main_program is None:
-            main_program = self.context['origin_ps_main_program']
+            main_program = self.context['origin_main_program']
 
         if isinstance(main_program, CompiledProgram):
             raise TypeError(
diff --git a/python/paddle/distributed/ps/utils/ps_program_builder.py b/python/paddle/distributed/ps/utils/ps_program_builder.py
index d737542f3234480717826c9a8902fd69f178cdea..ff99f9d071e2f4321df83a30bded1fb1678355d3 100755
--- a/python/paddle/distributed/ps/utils/ps_program_builder.py
+++ b/python/paddle/distributed/ps/utils/ps_program_builder.py
@@ -88,7 +88,7 @@ class GeoPsProgramBuilder(PsProgramBuilder):  # 仅 CPU 模式
         self.attrs['origin_main_program'] = self.cloned_main
 
         if self.launch_barrier and self.launch_barrier_flag:
-            wait_server_ready(server_endpoints)
+            wait_server_ready(self.server_endpoints)
 
         return
 
@@ -103,10 +103,13 @@ class CpuSyncPsProgramBuilder(PsProgramBuilder):
                              format(self.ps_mode, "PsProgramBuilder"))
 
     def _build_trainer_programs(self):
+        print("build trainer program entry")
+        print("before ps program builder program:", self.cloned_main)
         add_lr_decay_table_pass = new_pass("add_lr_decay_table_pass",
                                            self.attrs)
         add_lr_decay_table_pass.apply([], [], self.pass_ctx)
 
+        print("before distributed op pass")
         distributed_ops_pass = new_pass("distributed_ops_pass", self.attrs)
         distributed_ops_pass.apply([self.cloned_main], [None], self.pass_ctx)
 
@@ -126,9 +129,10 @@ class CpuSyncPsProgramBuilder(PsProgramBuilder):
 
         self.attrs['origin_main_program'] = self.cloned_main
         self.attrs['origin_startup_program'] = self.cloned_startup
+        print("after ps program builder program:", self.cloned_main)
 
         if self.launch_barrier and self.launch_barrier_flag:
-            wait_server_ready(server_endpoints)
+            wait_server_ready(self.server_endpoints)
 
         return
 
@@ -167,7 +171,7 @@ class GpuPsProgramBuilder(PsProgramBuilder):
         self.attrs['origin_startup_program'] = self.cloned_startup
 
         if self.launch_barrier and self.launch_barrier_flag:
-            wait_server_ready(server_endpoints)
+            wait_server_ready(self.server_endpoints)
 
         return
 
@@ -220,7 +224,7 @@ class HeterAsyncPsProgramBuilder(PsProgramBuilder):
                                           [self.cloned_startup], self.pass_ctx)
 
         if self.launch_barrier and self.launch_barrier_flag:
-            wait_server_ready(server_endpoints)
+            wait_server_ready(self.server_endpoints)
 
         return
 
diff --git a/python/paddle/distributed/ps/utils/public.py b/python/paddle/distributed/ps/utils/public.py
index ab5bd7da09dfc16035f6c6d62ad2dc7f8c5fea40..7839c8520c68ff16497945f49fed99f2a1d9018e 100755
--- a/python/paddle/distributed/ps/utils/public.py
+++ b/python/paddle/distributed/ps/utils/public.py
@@ -450,9 +450,8 @@ def get_the_one_send_context(context,
     idx = 0
     for i, program in enumerate(origin_programs):
         merged_dense_pairs = context['merged_dense_pairs'][i]
-        idx += get_dense_send_context(program, send_ctx, idx,
-                                      merged_dense_pairs, trainer_id,
-                                      split_dense_table)
+        idx = get_dense_send_context(program, send_ctx, idx, merged_dense_pairs,
+                                     trainer_id, split_dense_table)
     distibuted_varnames = get_sparse_tablenames(origin_programs, True)
     print("public distibuted_varnames:", distibuted_varnames)
     for i, program in enumerate(origin_programs):
diff --git a/python/paddle/distributed/run/__init__.py b/python/paddle/distributed/run/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..f25ddb794cc4d573429ac960e646bd8125c48d16
--- /dev/null
+++ b/python/paddle/distributed/run/__init__.py
@@ -0,0 +1,86 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .job.container import Container
+from .job.pod import Pod
+from .job.job import Job
+from . import plugins
+
+#__all__ = [Container, Pod, Job]
+'''
+Paddle distribution training entry ``python -m paddle.distributed.run``.
+
+Help
+
+# for arg usage and explanation, try the following command
+# python -m paddle.distributed.run -h
+
+Collective Mode
+
+Case 1: 1 node
+
+use all visible devices
+# python -m paddle.distributed.run train.py
+
+use specified devices
+# python -m paddle.distributed.run --devices=0,1,2,3 train.py
+
+Case 2: multi-node, auto detect ip/port
+
+# python -m paddle.distributed.run --np 2 train.py
+# auto print following command
+# python -m paddle.distributed.run --master 10.0.0.1:13538 --np 2 demo.py
+# then copy and paste above command to other nodes
+
+Case 3: multi-node, specified master/rendezvous server
+
+# python -m paddle.distributed.run --np 2 --master 10.0.0.1:2379 train.py
+# the master ip must be one of the node and the port must available
+
+Parameter Server Mode
+
+Case 1.1: 1 node, 1 ps, 1 trainer
+
+# python -m paddle.distributed.run --mode ps train.py
+# python -m paddle.distributed.run --server_num=1 --trainer_num=1 train.py
+
+Case 1.2: 1 node, 2 ps, 2 trainer
+
+# python -m paddle.distributed.run --server_num=2 --trainer_num=2 train.py
+
+Case 2: 2 node, 2 ps, 2 trainer per node
+
+# python -m paddle.distributed.run --server_num=2 --trainer_num=2 --np 2 train.py
+# auto print following command
+# python -m paddle.distributed.run --master 10.0.0.1:13538 --server_num=2 --trainer_num=2 --np 2 train.py
+# then copy and paste above command to other nodes
+
+Case 3: multi-node, specified master/rendezvous server
+
+# python -m paddle.distributed.run --master 10.0.0.1:13538 --server_num=2 --trainer_num=2 --np 2 train.py
+# the master ip must be one of the node and the port must available
+
+Case 4: specified servers and trainers in each node
+
+python -m paddle.distributed.run --servers 127.0.0.1:8900,127.0.0.1:8901 --trainers 127.0.0.1:8902,127.0.0.1:8903 train.py
+
+
+Elastic Mode
+
+# run following command in 3 node to run immediately, or in 2 node to run after elastic_timeout
+# python -m paddle.distributed.run --master etcd://10.0.0.1:2379 --np 2:3 train.py
+
+# once the peer number changes between 2:3, the strategy holds
+
+'''
diff --git a/python/paddle/distributed/run/__main__.py b/python/paddle/distributed/run/__main__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e32df59a328081e33aa86b42ed9b8e489ac399e8
--- /dev/null
+++ b/python/paddle/distributed/run/__main__.py
@@ -0,0 +1,28 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .context import Context
+from . import controllers
+
+# initialize the context to run
+ctx = Context()
+
+# initialize the selected controller
+c = controllers.init(ctx)
+
+# run the pods
+c.run()
+
+# manager or just wait pod
+c.finalize()
diff --git a/python/paddle/distributed/run/context/__init__.py b/python/paddle/distributed/run/context/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..86dff0f1f8056e784268a6ef3a3ebabb44aa9c6d
--- /dev/null
+++ b/python/paddle/distributed/run/context/__init__.py
@@ -0,0 +1,219 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from argparse import ArgumentParser, REMAINDER
+import os, copy
+
+from paddle.distributed.run import plugins
+
+from .node import Node
+from .status import Status
+
+import logging
+
+
+class Context(object):
+    def __init__(self, enable_plugin=True):
+        os.environ.pop('http_proxy', None)
+        os.environ.pop('https_proxy', None)
+
+        self.args = self.parse_args()
+        self.envs = self.fetch_envs()
+        self.logger = self.get_logger()
+
+        self.node = Node()
+        self.status = Status()
+
+        self.set_env_in_args()
+
+        # design for event queue, later
+        self.events = []
+
+        if enable_plugin:
+            self._enable_plugin()
+
+    def get_envs(self):
+        return self.envs.copy()
+
+    def _enable_plugin(self):
+        for pl in plugins.enabled_plugins:
+            pl(self)
+
+    def parse_args(self):
+        parser = ArgumentParser()
+
+        base_group = parser.add_argument_group("Base Parameters")
+
+        base_group.add_argument(
+            "--master",
+            type=str,
+            default=None,
+            help="the master/rendezvous server, ip:port")
+
+        base_group.add_argument(
+            "--rank", type=int, default=-1, help="the peer rank")
+
+        base_group.add_argument(
+            "--log", type=str, default="INFO", help="log level. Default INFO")
+
+        base_group.add_argument(
+            "--np",
+            type=str,
+            default="1",
+            help="the number of peers, i.e. pod/node number")
+
+        base_group.add_argument(
+            "--nproc_per_node",
+            type=int,
+            default=None,
+            help="the number of processes in a pod")
+
+        base_group.add_argument(
+            "--log_dir",
+            type=str,
+            default="log",
+            help="the path for each process's log. Default ./log")
+        base_group.add_argument(
+            "--mode",
+            type=str,
+            default="collective",
+            help="run mode of the job, collective/ps/ps-heter")
+
+        base_group.add_argument(
+            "--id",
+            type=str,
+            default="default",
+            help="unique id of the job. Default default")
+
+        base_group.add_argument(
+            "--devices",
+            type=str,
+            default=None,
+            help="accelerate devices. as --gpus,npus,xps")
+
+        base_group.add_argument(
+            "--host", type=str, default=None, help="host ip")
+
+        base_group.add_argument(
+            "training_script",
+            type=str,
+            help="the full path of py script,"
+            "followed by arguments for the "
+            "training script")
+
+        base_group.add_argument('training_script_args', nargs=REMAINDER)
+
+        ps_group = parser.add_argument_group("Parameter-Server Parameters")
+        # for parameter server
+        ps_group.add_argument(
+            "--servers",
+            type=str,
+            default='',
+            help="servers endpoints full list")
+        ps_group.add_argument(
+            "--trainers",
+            type=str,
+            default='',
+            help="trainers endpoints full list")
+
+        ps_group.add_argument(
+            "--trainer_num", type=int, default=None, help="number of trainers")
+        ps_group.add_argument(
+            "--server_num", type=int, default=None, help="number of servers")
+        ps_group.add_argument(
+            "--gloo_port", type=int, default=6767, help="gloo http port")
+        ps_group.add_argument(
+            "--with_gloo", type=str, default="0", help="use gloo or not")
+
+        # parameter elastic mode
+        elastic_group = parser.add_argument_group("Elastic Parameters")
+        elastic_group.add_argument(
+            "--max_restart",
+            type=int,
+            default=3,
+            help="the times can restart. Default 3")
+
+        elastic_group.add_argument(
+            "--elastic_level",
+            type=int,
+            default=-1,
+            help="elastic level: -1 disable, 0 failed exit, peers hold, 1 internal restart"
+        )
+
+        elastic_group.add_argument(
+            "--elastic_timeout",
+            type=int,
+            default=30,
+            help="seconds to wait before elastic perform training")
+        return parser.parse_args()
+
+    def _valide_env(self, key):
+        if key in ['POD_IP']:
+            return True
+        if key.endswith('_VISIBLE_DEVICES'):
+            return True
+        if key.startswith('PADDLE_'):
+            return True
+
+        return False
+
+    def fetch_envs(self):
+        ge = os.environ.copy()
+
+        black_env_list = ['http_proxy', 'https_proxy']
+        for key in black_env_list:
+            ge.pop(key, None)
+
+        return ge
+        '''
+        # use black list instead white list
+        return {k: ge[k] for k in ge if self._valide_env(k)}
+        '''
+
+    def get_logger(self, level=logging.INFO):
+        logger = logging.getLogger("PADDLERUN")
+        logger.setLevel(self.args.log.upper() or level)
+        formatter = logging.Formatter(
+            fmt='%(name)s %(levelname)s %(asctime)s %(message)s')
+        ch = logging.StreamHandler()
+        ch.setFormatter(formatter)
+        logger.addHandler(ch)
+        return logger
+
+    def set_env_in_args(self):
+        env_args = {
+            'POD_IP': 'host',
+            'PADDLE_MASTER': 'master',
+            'PADDLE_DEVICES': 'devices',
+            'PADDLE_NP': 'np',
+            'PADDLE_MODE': 'mode',
+            'PADDLE_LOG': 'log',
+            'PADDLE_NPROC_PER_NODE': 'nproc_per_node',
+            'PADDLE_JOB_ID': 'id',
+            'PADDLE_RANK': 'rank',
+            'PADDLE_LOG_DIR': 'log_dir',
+            'PADDLE_MAX_RESTlRT': 'max_restart',
+            'PADDLE_ELASTIC_LEVEL': 'elastic_level',
+            'PADDLE_ELASTIC_TIMEOUT': 'elastic_timeout',
+            'PADDLE_SERVER_NUM': 'server_num',
+            'PADDLE_TRAINER_NUM': 'trainer_num',
+            'PADDLE_SERVERS_ENDPOINTS': 'servers',
+            'PADDLE_TRAINERS_ENDPOINTS': 'trainers',
+            'PADDLE_GLOO_PORT': 'gloo_port',
+            'PADDLE_WITH_GLOO': 'with_gloo',
+        }
+
+        for k, v in env_args.items():
+            if k in self.envs:
+                setattr(self.args, v, self.envs[k])
diff --git a/python/paddle/distributed/run/context/device.py b/python/paddle/distributed/run/context/device.py
new file mode 100644
index 0000000000000000000000000000000000000000..d8bbd851ccf83a1ebfac60758576384bbe1aa4f4
--- /dev/null
+++ b/python/paddle/distributed/run/context/device.py
@@ -0,0 +1,88 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+
+class DeviceType:
+    CPU = 'cpu'
+    GPU = 'gpu'
+    XPU = 'xpu'
+    NPU = 'npu'
+
+
+class Device(object):
+    def __init__(self, dtype=None, count=1, memory="", labels=""):
+        self.dtype = dtype
+        self.count = count
+        self.memory = memory
+        self.labels = labels
+
+    def __str__(self):
+        return ",".join(self.labels)
+
+    @classmethod
+    def parse_device(self):
+        dev = Device()
+        visible_devices = None
+        if 'CUDA_VISIBLE_DEVICES' in os.environ or 'NVIDIA_VISIBLE_DEVICES' in os.environ:
+            dev.dtype = DeviceType.GPU
+            visible_devices = os.getenv("CUDA_VISIBLE_DEVICES") or os.getenv(
+                "NVIDIA_VISIBLE_DEVICES")
+        elif 'XPU_VISIBLE_DEVICES' in os.environ:
+            dev.dtype = DeviceType.XPU
+            visible_devices = os.getenv("XPU_VISIBLE_DEVICES")
+        elif 'ASCEND_VISIBLE_DEVICES' in os.environ:
+            dev.dtype = DeviceType.NPU
+            visible_devices = os.getenv("ASCEND_VISIBLE_DEVICES")
+
+        if visible_devices and visible_devices != 'all':
+            dev.labels = visible_devices.split(',')
+            dev.count = len(dev.labels)
+        else:
+            return self.detect_device()
+
+        return dev
+
+    @classmethod
+    def detect_device(self):
+        import paddle.fluid as fluid
+
+        dev = Device()
+        num = 0
+        visible_devices = None
+        if fluid.core.is_compiled_with_cuda():
+            dev.dtype = DeviceType.GPU
+            num = fluid.core.get_cuda_device_count()
+            visible_devices = os.getenv("CUDA_VISIBLE_DEVICES") or os.getenv(
+                "NVIDIA_VISIBLE_DEVICES")
+        elif fluid.core.is_compiled_with_xpu():
+            dev.dtype = DeviceType.XPU
+            num = fluid.core.get_xpu_device_count()
+            visible_devices = os.getenv("XPU_VISIBLE_DEVICES")
+        elif fluid.core.is_compiled_with_npu():
+            dev.dtype = DeviceType.NPU
+            num = fluid.core.get_npu_device_count()
+            visible_devices = os.getenv("ASCEND_VISIBLE_DEVICES")
+
+        if num == 0:
+            dev.dtype = DeviceType.CPU
+        elif visible_devices is None or visible_devices == "all" or visible_devices == "":
+            dev.labels = [str(x) for x in range(0, num)]
+            dev.count = num
+        else:
+            dev.labels = visible_devices.split(',')
+            dev.count = len(dev.labels)
+
+        return dev
diff --git a/python/paddle/distributed/run/context/event.py b/python/paddle/distributed/run/context/event.py
new file mode 100644
index 0000000000000000000000000000000000000000..23e8e7a5014002b480b717623dec2d5ee62eb743
--- /dev/null
+++ b/python/paddle/distributed/run/context/event.py
@@ -0,0 +1,20 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+class Event(object):
+    def __init__(self, kind="status", message="", fatal=False):
+        self.kind = kind
+        self.message = message
+        self.fatal = fatal
diff --git a/python/paddle/distributed/run/context/node.py b/python/paddle/distributed/run/context/node.py
new file mode 100644
index 0000000000000000000000000000000000000000..1ece4db0fbbeed379c2cda343022dd371a9e7540
--- /dev/null
+++ b/python/paddle/distributed/run/context/node.py
@@ -0,0 +1,64 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .device import Device
+
+import socket
+import struct
+from contextlib import closing
+
+
+class Node(object):
+    def __init__(self):
+        # self.device = Device.detect_device()
+        self.device = Device.parse_device()
+        self.ip = self.get_host_ip()
+        self.free_ports = []
+
+    def get_host_ip(self):
+        try:
+            self.hostname = socket.gethostname()
+            self.ip = socket.gethostbyname(socket.getfqdn(self.hostname))
+            return self.ip
+        except:
+            return '127.0.0.1'
+
+    def get_free_ports(self, n=1):
+        free_ports = [self.get_free_port() for i in range(n)]
+        self.free_ports += free_ports
+        return free_ports
+
+    def get_ports_occupied(self):
+        return self.free_ports
+
+    @classmethod
+    def get_free_port(self):
+        with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
+            s.setsockopt(socket.SOL_SOCKET, socket.SO_LINGER,
+                         struct.pack('ii', 1, 0))
+            s.bind(('', 0))
+            return s.getsockname()[1]
+
+    @classmethod
+    def is_server_ready(self, ip, port):
+        with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock:
+            #sock.settimeout(0.01)
+            #sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
+            if hasattr(socket, 'SO_REUSEPORT'):
+                sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEPORT, 1)
+            result = sock.connect_ex((ip, int(port)))
+            if result == 0:
+                return True
+            else:
+                return False
diff --git a/python/paddle/distributed/run/context/resource.py b/python/paddle/distributed/run/context/resource.py
new file mode 100644
index 0000000000000000000000000000000000000000..faffed704c1f078f9fed131ef1ade98add60b5d9
--- /dev/null
+++ b/python/paddle/distributed/run/context/resource.py
@@ -0,0 +1,18 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+class Resource(object):
+    def __init__(self):
+        self.devices = []
diff --git a/python/paddle/distributed/run/context/status.py b/python/paddle/distributed/run/context/status.py
new file mode 100644
index 0000000000000000000000000000000000000000..cfbf3623ec22ed56b5ce136d8a6813291be69e8f
--- /dev/null
+++ b/python/paddle/distributed/run/context/status.py
@@ -0,0 +1,58 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+class Status(object):
+    UNINIT = "uninit"
+    READY = "ready"
+    RUNNING = "running"
+    FAILED = "failed"
+    TERMINATING = "terminating"
+    RESTARTING = "restarting"
+    UNKNOWN = "unknown"
+    COMPLETED = "completed"
+    DONE = "done"  # should exit whatever status
+
+    def __init__(self):
+        self._current_status = None
+
+    def current(self):
+        return self._current_status
+
+    def is_running(self):
+        return self._current_status == self.RUNNING
+
+    def is_restarting(self):
+        return self._current_status == self.RESTARTING
+
+    def is_done(self):
+        if self._current_status in [self.DONE, self.COMPLETED, self.FAILED]:
+            return True
+        else:
+            return False
+
+    def run(self):
+        self._current_status = self.RUNNING
+
+    def fail(self):
+        self._current_status = self.FAILED
+
+    def complete(self):
+        self._current_status = self.COMPLETED
+
+    def restart(self):
+        self._current_status = self.RESTARTING
+
+    def done(self):
+        self._current_status = self.DONE
diff --git a/python/paddle/distributed/run/controllers/__init__.py b/python/paddle/distributed/run/controllers/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e5557151ad5489cb4af0c34b3ad47c31774b3326
--- /dev/null
+++ b/python/paddle/distributed/run/controllers/__init__.py
@@ -0,0 +1,32 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+__all__ = ["init"]
+
+from .collective import CollectiveController
+from .collective import CollectiveElasticController
+from .ps import PSController
+
+# the order is extremely important
+_controllers = [
+    CollectiveElasticController,
+    PSController,
+    CollectiveController,
+]
+
+
+def init(ctx):
+    for c in _controllers:
+        if c.enable(ctx):
+            return c(ctx)
diff --git a/python/paddle/distributed/run/controllers/collective.py b/python/paddle/distributed/run/controllers/collective.py
new file mode 100644
index 0000000000000000000000000000000000000000..c4feb54428a07265693c0969e6e385a380e22f3d
--- /dev/null
+++ b/python/paddle/distributed/run/controllers/collective.py
@@ -0,0 +1,185 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .controller import Controller
+
+import json
+import os
+import six
+import time
+
+
+class CollectiveController(Controller):
+    @classmethod
+    def enable(cls, ctx):
+        if ctx:
+            ctx.logger.debug("{} enabled".format(cls.__name__))
+            return True
+        else:
+            return False
+
+    def build_pod(self):
+        self.pod.replicas = self.pod_replicas()
+
+        # rank will be reset when restart
+        self.pod.rank = self.ctx.args.rank
+
+        port = self.ctx.node.get_free_port()
+
+        # compatible
+        endpoints = [
+            "{}:{}".format(self.ctx.node.ip, p)
+            for p in self.ctx.node.get_free_ports(self.pod.replicas)
+        ]
+
+        data = json.dumps({
+            'name': self.pod.name,
+            'rank': self.pod.rank,
+            'replicas': self.pod.replicas,
+            'dtype': self.ctx.node.device.dtype,
+            'candidate': '{}:{}'.format(self.ctx.node.ip, port),
+            'endpoints': ",".join(endpoints),
+        })
+
+        peer_list, rank = self.master.sync_peers(
+            '/{}/info'.format(self.job.id), self.pod.name, data,
+            self.job.replicas, self.pod.rank)
+        self.pod.rank = rank
+
+        if len(peer_list) < 1:
+            return False
+
+        peer_list = [json.loads(i) for i in peer_list]
+
+        self.ctx.logger.debug("sync peers done {}".format(peer_list))
+        self.save_pod_log(peer_list)
+
+        global_size = sum([i['replicas'] for i in peer_list])
+        rank_offset = sum([i['replicas'] for i in peer_list[:rank]])
+        '''
+        The new designed collective need nothing but a master endpoint
+        '''
+        collective_master = peer_list[0]['candidate']
+
+        job_endpoints = [i['endpoints'] for i in peer_list]
+
+        self.pod.reset()
+        for i in range(self.pod.replicas):
+            e = {
+                "PADDLE_MASTER": collective_master,
+                "PADDLE_GLOBAL_SIZE": "{}".format(global_size),
+                "PADDLE_LOCAL_SIZE": "{}".format(self.pod.replicas),
+                "PADDLE_GLOBAL_RANK": "{}".format(i + rank_offset),
+                "PADDLE_LOCAL_RANK": "{}".format(i),
+                ## compatible env
+                "PADDLE_TRAINER_ENDPOINTS": ",".join(job_endpoints),
+                "PADDLE_CURRENT_ENDPOINT": endpoints[i],
+                "PADDLE_TRAINER_ID": "{}".format(i + rank_offset),
+                "PADDLE_TRAINERS_NUM": "{}".format(global_size),
+                "PADDLE_RANK_IN_NODE": str(i),
+            }
+            self.add_container(envs=e, log_tag=i)
+
+        return True
+
+
+class CollectiveElasticController(CollectiveController):
+    @classmethod
+    def enable(cls, ctx):
+        if ctx.args.master and ctx.args.master.startswith("etcd://"):
+            ctx.logger.debug("{} enabled".format(cls.__name__))
+            return True
+        else:
+            return False
+
+    def register(self):
+        if self.job.id == 'default':
+            self.ctx.logger.warning(
+                'Using default job name may cause conflict, add --id in args')
+
+        self.master.register_heartbeat(self.job.id, self.pod.name)
+
+    def watch(self) -> bool:
+        '''
+        watch self and peer status, return true to exit
+        '''
+        while not self.ctx.status.is_done():
+            # self status
+            status = self.pod.watch(timeout=2)
+            self.ctx.logger.debug("Pod status {}, Ctx status {}".format(
+                status, self.ctx.status.current()))
+
+            # completed
+            if status == self.ctx.status.COMPLETED:
+                self.master.set_status(status)
+                self.ctx.status.complete()
+                self.ctx.logger.info("Pod complete {}".format(status))
+                return True
+
+            # self failure
+            elif status == self.ctx.status.FAILED:
+                self.master.set_status(status)
+                self.master.restart_peer()
+                self.ctx.logger.info("Pod failed {}".format(status))
+                self.pod.stop()
+
+                if self.ctx.args.elastic_level <= 0:
+                    return True
+                else:
+                    return False
+
+            # peer failure
+            if self.ctx.status.is_restarting() and self.master.get_status(
+            ) != self.ctx.status.COMPLETED:
+                self.pod.stop()
+                return False
+
+            #peers = self.master.fetch_peer_alive()
+            #print("peers {}".format(peers))
+
+    def run(self):
+
+        timeout = self.ctx.args.elastic_timeout if self.job.elastic else self.ctx.args.elastic_timeout * 10
+        self.register()
+
+        while self.pod.restart <= self.ctx.args.max_restart:
+
+            self.build_job()
+
+            ok, replicas = self.master.wait_peer_ready(
+                self.job.replicas_min, self.job.replicas_max, timeout)
+            if ok:
+                self.job.replicas = replicas
+            else:
+                self.ctx.logger.warnning("peer not ready {}".format(self.job))
+                break
+
+            self.ctx.logger.debug("Run {}".format(self.job))
+
+            if not self.build_pod():
+                continue
+
+            self.master.set_status(self.ctx.status.RUNNING)
+            self.ctx.status.run()
+
+            assert len(self.pod.containers) > 0, "No container in the pod"
+            self.ctx.logger.debug("Run {}".format(self.pod))
+            self.ctx.logger.debug("Run {}".format(self.pod.containers[0]))
+
+            self.pod.deploy()
+
+            if self.watch():
+                break
+
+        self.ctx.logger.debug("Job done {}".format(self.job))
diff --git a/python/paddle/distributed/run/controllers/controller.py b/python/paddle/distributed/run/controllers/controller.py
new file mode 100644
index 0000000000000000000000000000000000000000..2d904cf2a2cca5b9abaab06d1545c03c160e3d93
--- /dev/null
+++ b/python/paddle/distributed/run/controllers/controller.py
@@ -0,0 +1,192 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import sys
+import os
+import signal
+
+from paddle.distributed.run.job import Job
+from paddle.distributed.run.job import Pod
+from paddle.distributed.run.job import Container
+
+from .master import Master
+
+import time
+
+
+class ControleMode:
+    COLLECTIVE = "collective"
+    PS = "ps"
+
+
+class ControllerBase(object):
+    def __init__(self, ctx):
+        signal.signal(signal.SIGTERM, self.signal_handler)
+        signal.signal(signal.SIGABRT, self.signal_handler)
+        signal.signal(signal.SIGINT, self.signal_handler)
+
+        self.ctx = ctx
+        self.master = Master.factory(self.ctx)
+
+        self.job = Job(np=self.ctx.args.np,
+                       mode=self.ctx.args.mode,
+                       id=self.ctx.args.id)
+        self.pod = Pod()
+
+        self.join_server = None
+
+    def run(self):
+        self.build_job()
+        self.build_pod()
+
+        if len(self.pod.containers) < 1:
+            self.ctx.logger.error("No container in the pod {}".format(self.pod))
+            return
+
+        self.ctx.logger.info("Run {}".format(self.pod))
+        self.ctx.logger.debug(self.pod.containers[0])
+
+        self.pod.deploy()
+
+        self.watch()
+
+    def watch(self) -> bool:
+        status = self.pod.watch()
+
+        if status == self.ctx.status.COMPLETED:
+            self.ctx.logger.info("Pod {}".format(status))
+        elif status == self.ctx.status.FAILED:
+            self.ctx.logger.info("Pod {}".format(status))
+            self.ctx.logger.error("Container failed !!!\n{}".format(
+                self.pod.failed_container()))
+            self.pod.tail()
+            self.pod.stop()
+
+    def stop(self, sigint=None):
+        self.ctx.logger.debug("Controller stop")
+        self.master.stop()
+        self.pod.stop(sigint)
+
+    def finalize(self):
+        self.pod.join()
+        self.master.stop()
+
+        self.ctx.logger.info("Exit code {}".format(self.pod.exit_code))
+        sys.exit(self.pod.exit_code)
+
+    def signal_handler(self, sigint, frame):
+        self.ctx.logger.info("Terminating with signal {}".format(sigint))
+
+        if hasattr(self, 'sigint'):
+            time.sleep(5)
+            sys.exit(sigint)
+
+        self.sigint = sigint
+        self.ctx.status.done()
+        self.stop(sigint)
+        time.sleep(1)
+        self.ctx.logger.debug("Exit with signal {}".format(sigint))
+        sys.exit(sigint)
+
+
+class Controller(ControllerBase):
+    '''
+    Controller API for customization
+    '''
+
+    def build_job(self):
+        '''
+        build job fill the job info.
+        '''
+        self.ctx.logger.info(self.job)
+
+    def build_pod(self) -> bool:
+        '''
+        build pod includes creating containers etc.
+
+        Return True if succeed
+        '''
+        raise NotImplementedError
+
+    def _get_entrypoint(self):
+        entrypoint = [sys.executable, "-u", self.ctx.args.training_script]
+        entrypoint.extend(self.ctx.args.training_script_args)
+        return entrypoint
+
+    def _get_out_err_file(self, out=None, err=None):
+        if out and self.ctx.args.log_dir != "":
+            out = os.path.join(self.ctx.args.log_dir, out)
+        if err and self.ctx.args.log_dir != "":
+            err = os.path.join(self.ctx.args.log_dir, err)
+        return out, (err or out)
+
+    def new_container(self,
+                      entrypoint=None,
+                      envs={},
+                      use_ctx_env=True,
+                      out=None,
+                      err=None):
+        c = Container(
+            entrypoint=(entrypoint or self._get_entrypoint()),
+            env=(self.ctx.get_envs() if use_ctx_env else {}), )
+        c.outfile, c.errfile = self._get_out_err_file(out, err)
+        c.update_env(envs)
+        return c
+
+    def add_container(self,
+                      container=None,
+                      entrypoint=None,
+                      envs={},
+                      log_tag=None,
+                      is_init=False):
+        if not is_init and log_tag is not None:
+            log_file = "{}.{}.{}.log".format(self.job.id, self.pod.name,
+                                             log_tag)
+        else:
+            log_file = None
+
+        if not container:
+            container = self.new_container(
+                entrypoint=entrypoint, envs=envs, out=log_file, err=log_file)
+
+        if is_init:
+            self.pod.add_init_container(container)
+        else:
+            self.pod.add_container(container)
+
+    def pod_replicas(self):
+        '''
+        how many process/container should be run in pod
+        '''
+
+        if self.ctx.args.nproc_per_node:
+            return int(self.ctx.args.nproc_per_node)
+        else:
+            return self.ctx.node.device.count
+
+    def save_pod_log(self, info):
+        '''
+        save_pod_log append *info* to the log file of pod.name
+        '''
+        if not self.ctx.args.log_dir:
+            return
+
+        f = os.path.join(self.ctx.args.log_dir,
+                         '{}.{}.log'.format(self.job.id, self.pod.name))
+        try:
+            os.makedirs(os.path.dirname(f), exist_ok=True)
+            with open(f, 'a+') as fd:
+                fd.write(str(info))
+        except Exception as e:
+            self.ctx.logger.error("save log failed because {}".format(e))
diff --git a/python/paddle/distributed/run/controllers/master.py b/python/paddle/distributed/run/controllers/master.py
new file mode 100644
index 0000000000000000000000000000000000000000..257ba3bad8da3c331ac303b7a3ee415461fd13b8
--- /dev/null
+++ b/python/paddle/distributed/run/controllers/master.py
@@ -0,0 +1,289 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.distributed.run.utils.kv_client import KVClient
+from paddle.distributed.run.utils.kv_server import KVServer
+
+import time
+import sys
+import six
+import threading
+import copy
+import random
+
+ETCD_PROTOCAL = 'etcd://'
+
+
+class Master(object):
+    '''
+    Master is a distributed store design to exchange info among nodes
+    '''
+
+    MAIN = "main"
+    STANDBY = "standby"
+    PATICIPANT = "participant"
+
+    def __init__(self, ctx):
+        self.ctx = ctx
+        self.server = None
+        self.initialized = False
+        self.endpoint = None
+
+    def stop(self):
+        raise NotImplementedError
+
+    def sync_peers(self, prefix, key, value, size, rank=-1) -> (list, int):
+        raise NotImplementedError
+
+    @classmethod
+    def factory(cls, ctx):
+        if ctx.args.master and ctx.args.master.startswith(ETCD_PROTOCAL):
+            return ETCDMaster(ctx)
+        else:
+            return HTTPMaster(ctx)
+
+
+class HTTPMaster(Master):
+    def lazy_init(self):
+        if self.initialized:
+            return
+
+        self.role = Master.PATICIPANT
+
+        if self.ctx.args.master:
+            self.endpoint = self.ctx.args.master
+            ip, port = self.endpoint.split(':')
+            if ip in ['127.0.0.1', self.ctx.node.ip]:
+                time.sleep(2 * random.random())
+                while not self.ctx.node.is_server_ready(ip, int(port)):
+                    try:
+                        self.server = KVServer(int(port))
+                        self.role = Master.MAIN
+                        break
+                    except Exception as e:
+                        self.ctx.logger.warning("start master failed {}".format(
+                            e))
+                        time.sleep(0.1)
+                        continue
+        else:
+            port = self.ctx.node.get_free_port()
+            self.endpoint = "{}:{}".format(self.ctx.node.ip, port)
+            self.server = KVServer(port)
+            self.role = Master.MAIN
+
+            print("Copy the following command to other nodes to run.")
+            cmd = [
+                sys.executable.split('/')[-1], "-m", "paddle.distributed.run"
+            ]
+            cmd.extend(["--master", self.endpoint])
+            cmd.extend(sys.argv[1:])
+            print("-" * 80)
+            print(" ".join(cmd))
+            print("-" * 80)
+
+            if self.ctx.args.rank >= 0:
+                self.ctx.logger.warning(
+                    "--rank set in the command may not compatible in auto mode")
+
+        if '127.0.0.1' in self.endpoint:
+            self.endpoint = self.endpoint.replace('127.0.0.1', self.ctx.node.ip)
+        self.client = KVClient(self.endpoint)
+
+        self.initialized = True
+
+        self._start_server()
+
+    def _start_server(self):
+        if self.server and not self.server.started:
+            self.server.start()
+            self.ctx.logger.debug("KV server start at {}".format(self.endpoint))
+
+    def _stop_server(self):
+        if self.server and not self.server.stopped:
+            self.server.stop()
+            self.ctx.logger.debug("KV server stopped")
+
+    def stop(self):
+        self._stop_server()
+
+    def sync_peers(self, prefix, key, value, size, rank=-1) -> (list, int):
+        if size < 2:
+            return [value], 0
+
+        self.lazy_init()
+
+        while not self.ctx.status.is_done():
+            if self.client.wait_server_ready(timeout=5):
+                break
+            else:
+                self.ctx.logger.warning("master not ready")
+                time.sleep(0.1)
+
+        # 'aaaaaa' make suer main pod (master server) as rank 0
+        ky = 'aaaaaa' if rank < 0 and self.role == Master.MAIN else key
+        k = "{}/{}/{}".format(prefix, ky, rank)
+
+        while not self.ctx.status.is_done():
+            if not self.client.put(k, value):
+                self.ctx.logger.warning("put value failed")
+                time.sleep(0.1)
+                continue
+
+            rjson = self.client.get_prefix(prefix)
+            self.ctx.logger.debug("sync peers {}".format(rjson))
+            if rjson and len(rjson) == size:
+                if rank < 0:
+                    keys = list(rjson.keys())
+                    keys.sort()
+                    ret = [rjson[k] for k in keys]
+                    idx = ret.index(value)
+                    return ret, idx
+                else:
+                    ret = [None] * size
+                    for k, v in rjson.items():
+                        ret[int(k.split('/')[-1])] = v
+                    return ret, rank
+            else:
+                time.sleep(0.5)
+        return [], 0
+
+
+class ETCDMaster(Master):
+    def __init__(self, ctx):
+        super().__init__(ctx)
+
+        if self.ctx.args.master:
+            # etcd://localhost:2379
+            self.endpoint = self.ctx.args.master.strip("etcd://")
+
+        import etcd3
+
+        host, port = self.endpoint.split(':')
+        self.client = etcd3.client(host=host, port=port)
+
+    def sync_peers(self, prefix, key, value, size, rank=-1) -> (list, int):
+        '''
+        sync_peers gather all value for key under scope prefix
+        result always be sorted either by rank or alphabet of pod.name
+        '''
+        path = "{}/{}/{}".format(prefix, key, rank)
+
+        self.client.delete_prefix(prefix)
+
+        self.ctx.logger.debug("sync path {} value {}".format(path, value))
+
+        while not self.ctx.status.is_done():
+            self.client.put(path, six.b(value))
+
+            result = [i for i in self.client.get_prefix(prefix)]
+            result = copy.deepcopy(result)
+            self.ctx.logger.debug("sync peers {}".format(result))
+
+            if len(result) == size:
+                if rank < 0:
+                    keys = [six.ensure_str(i[1].key) for i in result]
+                    sorted_keys = [six.ensure_str(i[1].key) for i in result]
+                    sorted_keys.sort()
+                    values = [six.ensure_str(i[0]) for i in result]
+                    ret = [values[keys.index(k)] for k in sorted_keys]
+                    idx = ret.index(value)
+                    return ret, idx
+                else:
+                    ret = [None] * size
+                    for v, k in result:
+                        ii = int(six.ensure_str(k.key).split('/')[-1])
+                        if ii < 0:
+                            self.ctx.logger.error(
+                                "rank {} error in sync".format(ii))
+                        ret[ii] = six.ensure_str(v)
+                    return ret, rank
+            else:
+                time.sleep(0.5)
+
+    def register_heartbeat(self, job_id, pod_id, ttl=10):
+        if hasattr(self, 'heartbeat_prefix'):
+            self.ctx.logger.warning("Heartbeat already done")
+            return
+
+        self.job_prefix = '/paddle/{}'.format(job_id)
+        self.heartbeat_prefix = '{}/heartbeat'.format(self.job_prefix)
+
+        lease = self.client.lease(ttl)
+
+        #self.client.delete_prefix(self.job_prefix)
+
+        beat_path = "{}/{}".format(self.heartbeat_prefix, pod_id)
+        self.client.put(beat_path, six.b(pod_id), lease=lease)
+
+        def _beat_watch(event):
+            self.ctx.status.restart()
+
+        beat_watch = self.client.add_watch_prefix_callback(
+            self.heartbeat_prefix, _beat_watch)
+
+        def _heartbeat():
+            while not self.ctx.status.is_done():
+                try:
+                    lease.refresh()
+                    if pod_id not in self.fetch_peer_alive():
+                        self.client.put(beat_path, six.b(pod_id), lease=lease)
+                        self.ctx.logger.debug("Heartbeat register again")
+                except Exception as e:
+                    self.ctx.logger.error("Heartbeat error {}".format(e))
+                time.sleep(ttl / 2)
+            self.ctx.logger.debug("Heartbeat done")
+            self.client.cancel_watch(beat_watch)
+
+        self.beat_thread = threading.Thread(
+            name='heartbeat', target=_heartbeat, daemon=True)
+        self.beat_thread.start()
+
+    def fetch_peer_alive(self):
+        peer_alive = [
+            six.ensure_str(i[0])
+            for i in self.client.get_prefix(self.heartbeat_prefix)
+        ]
+        self.ctx.logger.debug("peer alive {}".format(peer_alive))
+        return peer_alive
+
+    def wait_peer_ready(self, replicas_min, replicas_max, timeout):
+        end = time.time() + timeout
+        while not self.ctx.status.is_done() and time.time() < end:
+            if len(self.fetch_peer_alive()) == replicas_max:
+                return (True, replicas_max)
+            else:
+                time.sleep(0.5)
+
+        np = len(self.fetch_peer_alive())
+        if np >= replicas_min and np <= replicas_max:
+            return (True, np)
+        else:
+            return (False, np)
+
+    def restart_peer(self):
+        self.client.delete_prefix(self.heartbeat_prefix)
+
+    def set_status(self, status):
+        assert self.client.put(
+            self.job_prefix, six.b(status),
+            lease=self.client.lease(600)), "set status failed {}".format(status)
+
+    def get_status(self):
+        return six.ensure_str(self.client.get(self.job_prefix)[0] or '')
+
+    def stop(self):
+        if hasattr(self, 'beat_thread'):
+            self.ctx.status.done()
+            # TODO(kuizhiqing) thread should exit
+            #self.beat_thread.join()
diff --git a/python/paddle/distributed/run/controllers/ps.py b/python/paddle/distributed/run/controllers/ps.py
new file mode 100644
index 0000000000000000000000000000000000000000..cc43c336cf1862fe075e5a4463b1f5b666a5005c
--- /dev/null
+++ b/python/paddle/distributed/run/controllers/ps.py
@@ -0,0 +1,221 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .controller import Controller, ControleMode
+
+import json
+import os, shutil
+
+
+class PSController(Controller):
+    @classmethod
+    def enable(cls, ctx):
+        if ctx.args.mode == ControleMode.PS or ctx.args.server_num or len(
+                ctx.args.servers) > 0:
+            ctx.logger.debug("{} enabled".format(cls.__name__))
+            ctx.args.mode = ControleMode.PS
+            return True
+        else:
+            return False
+
+    def build_pod(self):
+        if self.ctx.args.servers and self.ctx.args.trainers:
+            self._build_pod_with_args()
+        else:
+            self._build_pod_with_master()
+
+    def _build_pod_with_args(self):
+        if '127.0.0.1' in self.ctx.args.servers:
+            host = '127.0.0.1'
+        else:
+            host = self.ctx.node.ip
+
+        server_endpoints = [s for s in self.ctx.args.servers.split(",")]
+        trainer_endpoints = [s for s in self.ctx.args.trainers.split(",")]
+        servers = [
+            s for s in self.ctx.args.servers.split(",") if s.startswith(host)
+        ]
+        trainers = [
+            s for s in self.ctx.args.trainers.split(",") if s.startswith(host)
+        ]
+        server_num = len(servers)
+        trainer_num = len(trainers)
+
+        self.pod.replicas = server_num + trainer_num
+
+        self.save_pod_log([server_endpoints, trainer_endpoints])
+
+        import tempfile
+        gloo_rendezvous_dir = tempfile.mkdtemp()
+        if os.path.exists(gloo_rendezvous_dir):
+            shutil.rmtree(gloo_rendezvous_dir)
+
+        gloo_port = self.ctx.args.gloo_port
+        gloo_http = "{}:{}".format(server_endpoints[0].split(":")[0], gloo_port)
+
+        _gloo_envs = {
+            "PADDLE_GLOO_RENDEZVOUS": "3",
+            "PADDLE_GLOO_FS_PATH": gloo_rendezvous_dir,
+            "PADDLE_GLOO_HTTP_ENDPOINT": gloo_http,
+            "PADDLE_WITH_GLOO": self.ctx.args.with_gloo
+        }
+
+        for i in range(server_num):
+            e = {
+                "PADDLE_PSERVERS_IP_PORT_LIST": self.ctx.args.servers,
+                "PADDLE_TRAINER_ENDPOINTS": self.ctx.args.trainers,
+                "PADDLE_PORT": servers[i].split(":")[1],
+                "PADDLE_ROLE": "PSERVER",
+                "TRAINING_ROLE": "PSERVER",
+                "PADDLE_TRAINERS_NUM": "{}".format(len(trainer_endpoints)),
+                "POD_IP": self.ctx.node.ip,
+            }
+            e.update(_gloo_envs)
+            log_tag = "ps.{}".format(i)
+            self.add_container(envs=e, log_tag=log_tag)
+
+        trainer_rank_offset = 0
+        for s in trainer_endpoints:
+            if s.startswith(host):
+                break
+            else:
+                trainer_rank_offset += 1
+
+        for i in range(trainer_num):
+            e = {
+                "PADDLE_PSERVERS_IP_PORT_LIST": ",".join(server_endpoints),
+                "PADDLE_TRAINER_ENDPOINTS": ",".join(trainer_endpoints),
+                "PADDLE_PORT": trainers[i].split(":")[1],
+                "PADDLE_ROLE": "TRAINER",
+                "TRAINING_ROLE": "TRAINER",
+                "PADDLE_TRAINER_ID": "{}".format(i + trainer_rank_offset),
+                "PADDLE_TRAINERS_NUM": "{}".format(len(trainer_endpoints)),
+                "POD_IP": self.ctx.node.ip,
+            }
+            e.update(_gloo_envs)
+            log_tag = "trainer.{}".format(i)
+            self.add_container(envs=e, log_tag=log_tag)
+
+    def _build_pod_with_master(self):
+
+        self.pod.rank = self.ctx.args.rank
+
+        server_num = self.ctx.args.server_num or 1
+        servers = [
+            "{}:{}".format(self.ctx.node.ip, p)
+            for p in self.ctx.node.get_free_ports(server_num)
+        ]
+        trainer_num = self.ctx.args.trainer_num or 1
+        trainers = [
+            "{}:{}".format(self.ctx.node.ip, p)
+            for p in self.ctx.node.get_free_ports(trainer_num)
+        ]
+
+        data = json.dumps({
+            'name': self.pod.name,
+            'rank': self.pod.rank,
+            'servers': servers,
+            'trainers': trainers,
+            'dtype': self.ctx.node.device.dtype,
+            'gloo_port': self.ctx.node.get_free_port(),
+        })
+
+        peer_list, rank = self.master.sync_peers(
+            '/{}/info'.format(self.job.id), self.pod.name, data,
+            self.job.replicas, self.pod.rank)
+
+        self.ctx.logger.debug("sync peers done {}".format(peer_list))
+
+        peer_list = [json.loads(i) for i in peer_list]
+
+        self.save_pod_log(peer_list)
+
+        server_endpoints = [j for i in peer_list for j in i['servers']]
+        trainer_endpoints = [j for i in peer_list for j in i['trainers']]
+        #rank_offset = sum([i['replicas'] for i in peer_list[:rank]])
+
+        server_rank_offset = sum([len(i['servers']) for i in peer_list[:rank]])
+        trainer_rank_offset = sum(
+            [len(i['trainers']) for i in peer_list[:rank]])
+
+        self.pod.rank = rank
+
+        self.pod.replicas = server_num + trainer_num
+
+        import tempfile
+        gloo_rendezvous_dir = tempfile.mkdtemp()
+        if os.path.exists(gloo_rendezvous_dir):
+            shutil.rmtree(gloo_rendezvous_dir)
+
+        gloo_port = peer_list[0]['gloo_port']
+        gloo_http = "{}:{}".format(server_endpoints[0].split(":")[0], gloo_port)
+
+        _gloo_envs = {
+            "PADDLE_GLOO_RENDEZVOUS": "3",
+            "PADDLE_GLOO_FS_PATH": gloo_rendezvous_dir,
+            "PADDLE_GLOO_HTTP_ENDPOINT": gloo_http,
+            "PADDLE_WITH_GLOO": self.ctx.args.with_gloo
+        }
+
+        for i in range(server_num):
+            e = {
+                "PADDLE_PSERVERS_IP_PORT_LIST": ",".join(server_endpoints),
+                "PADDLE_TRAINER_ENDPOINTS": ",".join(trainer_endpoints),
+                "PADDLE_PORT":
+                server_endpoints[i + server_rank_offset].split(":")[1],
+                "PADDLE_ROLE": "PSERVER",
+                "TRAINING_ROLE": "PSERVER",
+                "PADDLE_TRAINERS_NUM": "{}".format(len(trainer_endpoints)),
+                "POD_IP": self.ctx.node.ip,
+            }
+            e.update(_gloo_envs)
+            log_tag = "ps.{}".format(i)
+            self.add_container(envs=e, log_tag=log_tag)
+
+        for i in range(trainer_num):
+            e = {
+                "PADDLE_PSERVERS_IP_PORT_LIST": ",".join(server_endpoints),
+                "PADDLE_TRAINER_ENDPOINTS": ",".join(trainer_endpoints),
+                "PADDLE_PORT":
+                trainer_endpoints[i + trainer_rank_offset].split(":")[1],
+                "PADDLE_ROLE": "TRAINER",
+                "TRAINING_ROLE": "TRAINER",
+                "PADDLE_TRAINER_ID": "{}".format(i + trainer_rank_offset),
+                "PADDLE_TRAINERS_NUM": "{}".format(len(trainer_endpoints)),
+                "POD_IP": self.ctx.node.ip,
+            }
+            e.update(_gloo_envs)
+            log_tag = "trainer.{}".format(i)
+            self.add_container(envs=e, log_tag=log_tag)
+        ''' NEW VERSION
+        for i in range(server_num):
+            e = {
+                "PADDLE_PSERVER_ENDPOINTS": ",".join(server_endpoints),
+                "PADDLE_TRAINER_ENDPOINTS": ",".join(trainer_endpoints),
+                "PADDLE_ROLE": "PSERVER",
+                "PADDLE_RANK": "{}".format(i + server_rank_offset),
+            }
+            log_tag = "ps.{}".format(i)
+            self.add_container(envs=e, log_tag=log_tag)
+
+        for i in range(trainer_num):
+            e = {
+                "PADDLE_PSERVER_ENDPOINTS": ",".join(server_endpoints),
+                "PADDLE_TRAINER_ENDPOINTS": ",".join(trainer_endpoints),
+                "PADDLE_ROLE": "TRAINER_CPU",
+                "PADDLE_RANK": "{}".format(i + trainer_rank_offset),
+            }
+            log_tag = "trainer.{}".format(i)
+            self.add_container(envs=e, log_tag=log_tag)
+        '''
diff --git a/python/paddle/distributed/run/job/__init__.py b/python/paddle/distributed/run/job/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..66d2abbce21ebc72cda5373a3d3a242c077beaa8
--- /dev/null
+++ b/python/paddle/distributed/run/job/__init__.py
@@ -0,0 +1,25 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .pod import Pod
+from .job import Job
+from .container import Container
+from .status import Status
+
+__all__ = [
+    'Pod',
+    'Job',
+    'Container',
+    'Status',
+]
diff --git a/python/paddle/distributed/run/job/container.py b/python/paddle/distributed/run/job/container.py
new file mode 100644
index 0000000000000000000000000000000000000000..651932d6c88378034d7ab9cb05bac00ee3ea7ddf
--- /dev/null
+++ b/python/paddle/distributed/run/job/container.py
@@ -0,0 +1,179 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from collections import OrderedDict
+from paddle.distributed.run.utils.process_context import ProcessContext
+
+from .status import Status
+
+import os, copy, sys
+import time
+
+
+class Container(object):
+    '''
+    TODO(kuizhiqing) A container can be run by process/thread or just a callable function
+    '''
+
+    def __init__(self, entrypoint=[], rank=-1, env={}):
+        self._entrypoint = entrypoint
+        self._rank = rank
+        self._out = None
+        self._err = None
+        self._env = env
+        self._proc = None
+
+        self._retry: int = 3
+        self._grace_period = 10
+
+        self._log_handler = None
+
+    @property
+    def entrypoint(self):
+        return self._entrypoint
+
+    @entrypoint.setter
+    def entrypoint(self, entry):
+        self._entrypoint = entry
+
+    @property
+    def rank(self):
+        return self._rank
+
+    @rank.setter
+    def rank(self, r):
+        self._rank = r
+
+    @property
+    def outfile(self):
+        return self._out
+
+    @outfile.setter
+    def outfile(self, out):
+        self._out = out
+
+    @property
+    def errfile(self):
+        return self._err
+
+    @errfile.setter
+    def errfile(self, err):
+        self._err = err
+
+    def update_env(self, env={}, **kwargs):
+        env = {k: v for k, v in env.items() if isinstance(v, str)}
+        self._env.update(env)
+
+        kwargs = {k: v for k, v in kwargs.items() if isinstance(v, str)}
+        self._env.update(kwargs)
+
+    def _get_fd(self, pth):
+        if not pth:
+            return None
+
+        try:
+            d = os.path.dirname(pth)
+            if not os.path.isdir(d):
+                os.makedirs(d, exist_ok=True)
+            return open(pth, 'w')
+        except:
+            return None
+
+    def start(self, timeout=-1):
+        end = time.time() + timeout
+
+        if self._proc and self._proc.alive():
+            return True
+
+        self._stdout = self._get_fd(self._out) or sys.stdout
+        if self._out == self._err:
+            self._stderr = self._stdout
+        elif self._err:
+            self._stderr = self._get_fd(self._err) or sys.stderr
+
+        self._proc = ProcessContext(
+            self._entrypoint, env=self._env, out=self._stdout, err=self._stderr)
+        self._proc.start()
+
+        while timeout > 0 and time.time() < end:
+            if self._proc.alive():
+                time.sleep(0.1)
+                continue
+            if self._proc.exit_code() == 0:
+                return True
+            return False
+
+    def terminate(self, force=False):
+        if self._log_handler:
+            self._log_handler.close()
+            self._log_handler = None
+
+        if self._proc and self._proc.alive():
+            return self._proc.terminate(force)
+
+    def wait(self, timeout=None):
+        self._proc.wait(timeout)
+
+    def exit_code(self):
+        return self._proc.exit_code() if self._proc else -1
+
+    def status(self):
+        if not self._proc:
+            return Status.UNINIT
+        if self._proc.alive():
+            return Status.RUNNING
+        elif self._proc.exit_code() == 0:
+            return Status.COMPLETED
+        else:
+            return Status.FAILED
+
+    def __str__(self):
+        return 'Container rank {} status {} cmd {} code {} log {} \nenv {}'.format(
+            self._rank,
+            self.status(),
+            self._entrypoint,
+            self.exit_code(),
+            self.errfile,
+            self._env, )
+
+    def logs(self, fn=None, offset=0, whence=1, lines=1000):
+        if not self._log_handler:
+            self._log_handler = open(self._out)
+
+        if fn is None:
+            fn = sys.stdout
+
+        self._log_handler.seek(offset, whence)
+
+        try:
+            idx = 0
+            for line in self._log_handler:
+                fn.write(line)
+                idx += 1
+                if idx > lines:
+                    break
+        finally:
+            return self._log_handler.tell()
+
+    def tail(self, length=3000):
+        if not self._log_handler:
+            self._log_handler = open(self._out)
+
+        self._log_handler.seek(0, 2)
+        ed = self._log_handler.tell()
+
+        if ed > length:
+            self.logs(offset=ed - length, whence=0)
+        else:
+            self.logs(offset=0, whence=0)
diff --git a/python/paddle/distributed/run/job/job.py b/python/paddle/distributed/run/job/job.py
new file mode 100644
index 0000000000000000000000000000000000000000..3469ed862576faed3bd7546710927f638b8fe0d5
--- /dev/null
+++ b/python/paddle/distributed/run/job/job.py
@@ -0,0 +1,80 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+class JobMode:
+    COLLECTIVE = 'collective'
+    PS = 'ps'
+    HETER = 'heter'
+
+
+class Job(object):
+    def __init__(self, id='default', mode=JobMode.COLLECTIVE, np="1"):
+        self._mode = mode
+        self._id = id
+
+        self._replicas = 0
+        self._replicas_min = self._replicas
+        self._replicas_max = self._replicas
+        self._elastic = False
+
+        self.set_replicas(str(np))
+
+    def __str__(self):
+        return "Job: {}, mode {}, replicas {}[{}:{}], elastic {}".format(
+            self.id, self.mode, self._replicas, self._replicas_min,
+            self._replicas_max, self.elastic)
+
+    @property
+    def mode(self):
+        return self._mode
+
+    @property
+    def id(self):
+        return self._id
+
+    @property
+    def elastic(self):
+        return self._elastic
+
+    @property
+    def replicas(self):
+        return self._replicas
+
+    @property
+    def replicas_min(self):
+        return self._replicas_min
+
+    @property
+    def replicas_max(self):
+        return self._replicas_max
+
+    @replicas.setter
+    def replicas(self, replicas):
+        self._replicas = replicas
+
+    def set_replicas(self, np: str):
+        np = str(np) if np else '1'
+
+        if ':' in np:
+            nps = np.split(':')
+            self._replicas_min, self._replicas_max = int(nps[0]), int(nps[1])
+            self._replicas = self._replicas_max  # default to max
+
+            self._elastic = True
+        else:
+            self._replicas = int(np)
+            self._replicas_min, self._replicas_max = self._replicas, self._replicas
+
+            self._elastic = False
diff --git a/python/paddle/distributed/run/job/pod.py b/python/paddle/distributed/run/job/pod.py
new file mode 100644
index 0000000000000000000000000000000000000000..f7c31edce1d552befac3a6f54e5e79c326b31c67
--- /dev/null
+++ b/python/paddle/distributed/run/job/pod.py
@@ -0,0 +1,185 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from collections import OrderedDict
+from .container import Container
+
+from .status import Status
+
+import random
+import time
+
+
+class PodSepc(object):
+    def __init__(self):
+        self._name = ''.join(
+            random.choice('abcdefghijklmnopqrstuvwxyz') for _ in range(6))
+
+        # by controller
+        self._init_containers: List[Container] = []
+        self._containers: List[Container] = []
+
+        #self.resource: Resource = None
+        #self.status: Status = None
+
+        self._rank = -1
+        self._init_timeout = 120  # 2 min timeout for each init container
+        self._restart = -1
+        self._replicas = 0  # number of containers
+        self._exit_code = 0
+
+
+class Pod(PodSepc):
+    def __init__(self):
+        super().__init__()
+
+    def __str__(self):
+        return "Pod: {}, replicas {}, status {}".format(self.name,
+                                                        self.replicas,
+                                                        self.status())
+
+    def failed_container(self):
+        for c in self._containers:
+            if c.status() == Status.FAILED:
+                return c
+        return None
+
+    @property
+    def name(self):
+        return self._name
+
+    @property
+    def replicas(self):
+        return self._replicas
+
+    @replicas.setter
+    def replicas(self, r):
+        self._replicas = r
+
+    @property
+    def rank(self):
+        return self._rank
+
+    @rank.setter
+    def rank(self, r):
+        self._rank = r
+
+    @property
+    def restart(self):
+        return self._restart
+
+    @property
+    def containers(self):
+        return self._containers
+
+    def add_container(self, c):
+        c.rank = len(self._containers)
+        self._containers.append(c)
+
+    @property
+    def init_containers(self):
+        return self._init_containers
+
+    def add_init_container(self, c):
+        c.rank = len(self._init_containers)
+        self._init_containers.append(c)
+
+    @property
+    def exit_code(self):
+        for c in self._containers:
+            if c.exit_code() != 0:
+                return c.exit_code()
+        return 0
+
+    def deploy(self):
+        for i in self._init_containers:
+            i.start(self._init_timeout)
+
+        for c in self._containers:
+            c.start()
+
+        self._restart += 1
+
+    def stop(self, sigint=0):
+        for c in self._containers:
+            force = True if sigint == 9 else False
+            c.terminate(force)
+
+    def join(self):
+        for c in self._containers:
+            c.wait(None)
+
+    def status(self):
+        if self.is_failed():
+            return Status.FAILED
+
+        if self.is_completed():
+            return Status.COMPLETED
+
+        return Status.READY
+
+    def reset(self):
+        self._init_containers = []
+        self._containers = []
+
+    def is_failed(self):
+        for c in self._containers:
+            if c.status() == Status.FAILED:
+                return True
+        return False
+
+    def is_completed(self):
+        for c in self._containers:
+            if c.status() != Status.COMPLETED:
+                return False
+        return True
+
+    def logs(self, idx=None):
+        if idx is None:
+            if self.failed_container():
+                self.failed_container().logs()
+            else:
+                self._containers[0].logs()
+        else:
+            self._containers[idx].logs()
+
+    def tail(self, idx=None):
+        if idx is None:
+            if self.failed_container():
+                self.failed_container().tail()
+            else:
+                self._containers[0].tail()
+        else:
+            self._containers[idx].tail()
+
+    def watch(self,
+              all_list=[Status.COMPLETED],
+              any_list=[Status.FAILED],
+              interval=1,
+              timeout=-1):
+        '''
+        watch return if any container status in any_list
+        or all container status in all_list
+        '''
+        end = time.time() + timeout
+        while timeout < 0 or time.time() < end:
+            for c in self._containers:
+                if c.status() in any_list:
+                    return c.status()
+
+            s = [c.status() for c in self._containers]
+            if len(set(s)) == 1 and s[0] in all_list:
+                return s[0]
+
+            time.sleep(interval)
diff --git a/python/paddle/distributed/run/job/status.py b/python/paddle/distributed/run/job/status.py
new file mode 100644
index 0000000000000000000000000000000000000000..ae10c5adb6cbfe4713370a2f01c74569bfe98182
--- /dev/null
+++ b/python/paddle/distributed/run/job/status.py
@@ -0,0 +1,24 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+class Status(object):
+    UNINIT = "uninit"
+    READY = "ready"
+    RUNNING = "running"
+    FAILED = "failed"
+    TERMINATING = "terminating"
+    RESTARTING = "restarting"
+    UNKNOWN = "unknown"
+    COMPLETED = "completed"
diff --git a/python/paddle/distributed/run/plugins/__init__.py b/python/paddle/distributed/run/plugins/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..ec91402a7aad359c9860cf737a78cb7c1f1375d1
--- /dev/null
+++ b/python/paddle/distributed/run/plugins/__init__.py
@@ -0,0 +1,50 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import six
+
+__all__ = []
+
+
+def log(ctx):
+    ctx.logger.info("-----------  Configuration  ----------------------")
+    for arg, value in sorted(six.iteritems(vars(ctx.args))):
+        ctx.logger.info("%s: %s" % (arg, value))
+    ctx.logger.info("--------------------------------------------------")
+
+
+def process_args(ctx):
+    # reset device by args
+    #argdev = ctx.args.gpus or ctx.args.xpus or ctx.args.npus
+    argdev = ctx.args.devices
+    if argdev:
+        ctx.node.device.labels = argdev.split(',')
+        ctx.node.device.count = len(ctx.node.device.labels)
+        ctx.logger.debug('Device reset by args {}'.format(argdev))
+
+
+def collective_compatible(ctx):
+    if 'PADDLE_TRAINER_ENDPOINTS' in ctx.envs:
+        ctx.master = ctx.envs['PADDLE_TRAINER_ENDPOINTS'].split(',')[0]
+    if 'DISTRIBUTED_TRAINER_ENDPOINTS' in ctx.envs:
+        ctx.master = ctx.envs['DISTRIBUTED_TRAINER_ENDPOINTS'].split(',')[0]
+
+
+def rewrite_host_ip(ctx):
+    if ctx.args.host is not None and "." in ctx.args.host:
+        ctx.logger.warning('Host ip reset to {}'.format(ctx.args.host))
+        ctx.node.ip = ctx.args.host
+
+
+enabled_plugins = [collective_compatible, rewrite_host_ip, process_args, log]
diff --git a/python/paddle/distributed/run/plugins/ip.py b/python/paddle/distributed/run/plugins/ip.py
new file mode 100644
index 0000000000000000000000000000000000000000..0809ed5864da9f3bea29235621e7c29b75823391
--- /dev/null
+++ b/python/paddle/distributed/run/plugins/ip.py
@@ -0,0 +1,30 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import socket
+
+
+def get_local_ip(ctx):
+    _, ip = _get_host_name_ip()
+    ctx.args.host = ip
+    ctx.envs["POD_IP"] = ip
+
+
+def _get_host_name_ip():
+    try:
+        host_name = socket.gethostname()
+        host_ip = socket.gethostbyname(host_name)
+        return host_name, host_ip
+    except:
+        return None
diff --git a/python/paddle/distributed/run/utils/kv_client.py b/python/paddle/distributed/run/utils/kv_client.py
new file mode 100644
index 0000000000000000000000000000000000000000..e19195412268a5c309d65a4a61e005ea512d685b
--- /dev/null
+++ b/python/paddle/distributed/run/utils/kv_client.py
@@ -0,0 +1,94 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import requests
+import time
+
+
+class KVClient(object):
+    def __init__(self, endpoint='localhost:2379'):
+        self.endpoint = endpoint if endpoint.startswith(
+            "http://") else "http://{}".format(endpoint)
+
+    def put(self, key, value):
+        key = key if key.startswith('/') else "/{}".format(key)
+        u = "{}{}".format(self.endpoint, key)
+        try:
+            r = requests.post(u, data=value, timeout=3)
+            if r.status_code == 200:
+                return True
+            else:
+                return False
+        except:
+            return False
+
+    def get(self, key):
+        key = key if key.startswith('/') else "/{}".format(key)
+        u = "{}{}".format(self.endpoint, key)
+        try:
+            r = requests.get(u, timeout=3)
+            if r.status_code == 200:
+                ret = r.json()
+                return ret.get(key, '')
+            else:
+                return "error"
+        except:
+            return ""
+
+    def get_prefix(self, key):
+        key = key if key.startswith('/') else "/{}".format(key)
+        u = "{}{}".format(self.endpoint, key)
+        try:
+            r = requests.get(u, timeout=3)
+            if r.status_code == 200:
+                return r.json()
+        except:
+            return ""
+
+    def delete(self, key):
+        key = key if key.startswith('/') else "/{}".format(key)
+        u = "{}{}".format(self.endpoint, key)
+        try:
+            r = requests.delete(u, timeout=3)
+            if r.status_code == 200:
+                return True
+            else:
+                return False
+        except:
+            return False
+
+    def wait_server_ready(self, timeout=3):
+        end = time.time() + timeout
+        while time.time() < end:
+            if self.get("/healthy") == "ok":
+                return True
+
+
+if __name__ == '__main__':
+    cli = PKVClient("http://localhost:8090")
+    data = {"/workers/1": "rank1", "/workers/2": "rank2"}
+    for k, v in data.items():
+        cli.put(k, v)
+    x = cli.get_prefix("/workers")
+    print(x)
+    for k, v in data.items():
+        assert x[k] == v
+
+    cli.put("key", "value")
+    print(cli.get("key"))
+    assert cli.get("key") == "value"
+    cli.delete("key")
+    print(cli.get("/key"))
+    print(cli.get("/healthy"))
+    assert cli.get("/healthy") == "ok"
diff --git a/python/paddle/distributed/run/utils/kv_server.py b/python/paddle/distributed/run/utils/kv_server.py
new file mode 100644
index 0000000000000000000000000000000000000000..2d7ae15f13d636f05536dbdd8f35434bb7c3bf97
--- /dev/null
+++ b/python/paddle/distributed/run/utils/kv_server.py
@@ -0,0 +1,121 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from http.server import HTTPServer
+import http.server as SimpleHTTPServer
+
+from multiprocessing import Process
+
+import threading
+import json
+
+
+class KVHandler(SimpleHTTPServer.SimpleHTTPRequestHandler):
+    def do_GET(self):
+        with self.server.kv_lock:
+            ret = {}
+            for k, v in self.server.kv.items():
+                if k.startswith(self.path):
+                    ret[k] = v.decode(encoding="utf-8")
+            if ret:
+                self.output(200, json.dumps(ret).encode("utf-8"))
+            else:
+                self.output(404)
+
+    def do_PUT(self):
+        self.do_POST()
+
+    def do_POST(self):
+        content_length = int(self.headers['Content-Length'] or 0)
+        try:
+            value = self.rfile.read(content_length)
+            with self.server.kv_lock:
+                self.server.kv[self.path] = value
+                self.output(200)
+                return
+        except:
+            self.output(500)
+
+    def do_DELETE(self):
+        with self.server.kv_lock:
+            if self.path in self.server.kv:
+                del self.server.kv[self.path]
+                self.output(200)
+            else:
+                self.output(404)
+
+    def output(self, code, value=''):
+        self.send_response(code)
+        self.send_header("Content-Length", len(value))
+        self.send_header("Content-Type", "application/json; charset=utf8")
+        self.end_headers()
+        if value:
+            self.wfile.write(value)
+
+    def log_message(self, format, *args):
+        return
+
+
+class KVServer(HTTPServer, object):
+    def __init__(self, port):
+        super(KVServer, self).__init__(('', port), KVHandler)
+        self.kv_lock = threading.Lock()
+        self.kv = {'/healthy': b'ok'}
+        self.port = port
+        self.stopped = False
+        self.started = False
+
+    def start(self):
+        self.listen_thread = threading.Thread(target=self.serve_forever)
+        self.listen_thread.start()
+        self.started = True
+
+    def stop(self):
+        self.shutdown()
+        self.listen_thread.join()
+        self.server_close()
+        self.stopped = True
+
+
+class PKVServer():
+    def __init__(self, port):
+        self._server = KVServer(port)
+
+    def start(self):
+        self.proc = Process(target=self._server.start)
+        self.proc.daemon = True
+        self.proc.start()
+
+    def stop(self):
+        self._server.stop()
+        self.proc.join()
+
+    @property
+    def started(self):
+        return self._server.started
+
+    @property
+    def stopped(self):
+        return self._server.stopped
+
+
+if __name__ == '__main__':
+    #kv = PKVServer(8090)
+    kv = KVServer(8090)
+    kv.start()
+    import time
+
+    #print("serve at 8090 for 600 s")
+
+    time.sleep(600)
diff --git a/python/paddle/distributed/run/utils/process_context.py b/python/paddle/distributed/run/utils/process_context.py
new file mode 100644
index 0000000000000000000000000000000000000000..4d6fa8de794ff07874cc788f0abf0a283c066ae7
--- /dev/null
+++ b/python/paddle/distributed/run/utils/process_context.py
@@ -0,0 +1,83 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import subprocess
+import os, sys, signal, time
+
+
+class ProcessContext(object):
+    def __init__(self,
+                 cmd,
+                 env=os.environ,
+                 out=sys.stdout,
+                 err=sys.stderr,
+                 group=True,
+                 preexec_fn=None):
+        self._cmd = cmd
+        self._env = env
+        self._preexec_fn = preexec_fn
+        self._stdout = out
+        self._stderr = err
+        self._group = group if os.name != 'nt' else False
+        self._proc = None
+        self._code = None
+
+    def _start(self):
+        pre_fn = os.setsid if self._group else None
+        self._proc = subprocess.Popen(
+            self._cmd,
+            env=self._env,
+            stdout=self._stdout,
+            stderr=self._stderr,
+            preexec_fn=self._preexec_fn or pre_fn)
+
+    def _close_std(self):
+        try:
+            if not self._stdout.isatty():
+                self._stdout.close()
+
+            if not self._stderr.isatty():
+                self._stderr.close()
+        except:
+            pass
+
+    def alive(self):
+        return self._proc and self._proc.poll() is None
+
+    def exit_code(self):
+        return self._proc.poll() if self._proc else None
+
+    def start(self):
+        self._start()
+
+    def terminate(self, force=False, max_retry=3):
+        for i in range(max_retry):
+            if self.alive():
+                if self._group:
+                    os.killpg(os.getpgid(self._proc.pid), signal.SIGTERM)
+                else:
+                    self._proc.terminate()
+                time.sleep(0.2)
+            else:
+                break
+
+        if force and self.alive():
+            self._proc.kill()
+
+        self._close_std()
+
+        return self.alive()
+
+    def wait(self, timeout=None):
+        self._proc.wait(timeout)
diff --git a/python/paddle/distributed/sharding/__init__.py b/python/paddle/distributed/sharding/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..d14e3dd099ffe2e23e4d1d7ca3195e6a06ee3dce
--- /dev/null
+++ b/python/paddle/distributed/sharding/__init__.py
@@ -0,0 +1,17 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .group_sharded import group_sharded_parallel, save_group_sharded_model  # noqa: F401
+
+__all__ = ['group_sharded_parallel', 'save_group_sharded_model']
diff --git a/python/paddle/distributed/sharding/group_sharded.py b/python/paddle/distributed/sharding/group_sharded.py
new file mode 100644
index 0000000000000000000000000000000000000000..6fd4caa7b4a5c41e73fcf95ac50d0253bb3e7c79
--- /dev/null
+++ b/python/paddle/distributed/sharding/group_sharded.py
@@ -0,0 +1,215 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import logging
+from enum import Enum
+
+import paddle
+
+from paddle.optimizer import Optimizer
+from paddle.distributed.utils import get_logger
+from paddle.distributed.fleet.meta_optimizers.dygraph_optimizer.sharding_optimizer_stage2 import ShardingOptimizerStage2
+from paddle.distributed.fleet.meta_parallel.sharding.sharding_stage2 import ShardingStage2
+from paddle.distributed.fleet.meta_parallel.sharding.sharding_stage3 import ShardingStage3
+from paddle.distributed.fleet.meta_parallel.sharding.sharding_utils import ShardingScaler
+
+logger_ = get_logger(logging.INFO)
+
+
+def group_sharded_parallel(model,
+                           optimizer,
+                           level,
+                           scaler=None,
+                           group=None,
+                           offload=False,
+                           sync_buffers=False,
+                           buffer_max_size=2**23,
+                           segment_size=2**20,
+                           sync_comm=False):
+    """
+    Use group_sharded_parallel can perform group shared configuration on the model, optimizer and GradScaler. Level has three string options, 'os', 'os_g' and 'p_g_os' corresponds to three different usage scenarios: optimizer state segmentation, optimizer state + gradient segmentation, and parameter + gradient + optimizer state segmentation.
+    Usually, optimizer state + gradient segmentation is actually a re optimization of optimizer state segmentation, so optimizer state + gradient segmentation can be used to realize optimizer state segmentation.
+
+    Args:
+        model (Layer): The layer to be wrapped with group_sharded_parallel.
+        optimizer (Optimizer): The optimizer to be wrapped with group_sharded_parallel.
+        level (str): The different level of the group sharded. Such as `os`, `os_g`, `p_g_os`.
+        scaler (GradScaler, optional): If AMP is used, you need to pass GradScaler. Defaults to None, indicating that GradScaler is not used.
+        group (Group, optional): The group instance. Defaults to None, indicating that the default environment group is used.
+        offload (bool, optional): Whether to use the offload function. Defaults to False, which means that the offload function is not used.
+        sync_buffers (bool, optional): Whether to broadcast model buffers. It is generally used when there are registered model buffers. Defaults to False, indicating that model buffers are not used.
+        buffer_max_size (int, optional): The max size of the buffer used to integrate gradient in `os_g`. The larger the size, the more GPU memory will be used. Defaults to 2**23, which means that the dimension of the buffer is 2**23.
+        segment_size (int, optional): The smallest size of parameter to be sharded in `p_g_os`. Defaults to 2**20, indicating that the dimension of the minimum segmented parameter is 2**20.
+        sync_comm (bool, optional): Whether to use synchronous communication, only in `p_g_os` used. Defaults to False, indicating that asynchronous communication is used.
+    
+    Returns:
+        model: A wrapper for group sharded given model.
+        optimizer: A wrapper for group sharded given optimizer.
+        scaler: A wrapper for group sharded given scaler.
+    
+    Examples:
+        .. code-block:: python
+
+            # required: distributed
+            import paddle
+            from paddle.fluid.dygraph.nn import Linear
+            from paddle.distributed import fleet
+            from paddle.distributed.sharding import group_sharded_parallel
+
+            fleet.init(is_collective=True)
+            group = paddle.distributed.new_group([0, 1])
+            model = Linear(1000, 1000)
+
+            clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0)
+            optimizer = paddle.optimizer.AdamW(learning_rate=0.001, parameters=model.parameters(), weight_decay=0.00001, grad_clip=clip)
+
+            # wrap sharding model, optimizer and scaler
+            model, optimizer, scaler = group_sharded_parallel(model, optimizer, "p_g", scaler=scaler)
+
+            img, label = data
+            label.stop_gradient = True
+            img.stop_gradient = True
+
+            out = model(img)
+            loss = paddle.nn.functional.cross_entropy(input=out, label=label)
+
+            loss.backward()
+            optimizer.step()
+            optimizer.clear_grad()
+    """
+    # check optition type
+    assert isinstance(
+        model,
+        paddle.nn.Layer), "The model must be the instance of paddle.nn.Layer."
+    assert isinstance(
+        optimizer, Optimizer
+    ), "The optimizer must be the instance of paddle.optimizer.Optimizer."
+    assert level in ['os', 'os_g', 'p_g_os'
+                     ], "The level must be os, os_g or p_g_os."
+
+    def check_dtype(param):
+        return param.dtype == paddle.float16
+
+    params_fp16 = list(filter(check_dtype, model.parameters()))
+    if scaler is None and len(params_fp16) > 0:
+        raise ValueError("Please enter the correct scaler.")
+    # convert model/optimizer/scaler
+    if level in ['os', 'os_g']:
+        logger_.info("*" * 30)
+        logger_.info("Sharded level os uses sharded level os_g achieved now.")
+        logger_.info("*" * 30)
+        optimizer = ShardingOptimizerStage2(
+            params=model.parameters(),
+            optim=optimizer,
+            group=group,
+            offload=offload)
+        model = ShardingStage2(
+            model,
+            optimizer,
+            group=group,
+            sync_buffers=sync_buffers,
+            buffer_max_size=buffer_max_size)
+    elif level == 'p_g_os':
+        model = ShardingStage3(
+            model,
+            optimizer=optimizer,
+            group=group,
+            sync_buffers=sync_buffers,
+            segment_size=segment_size,
+            offload=offload,
+            sync_comm=sync_comm)
+    else:
+        raise ValueError("Please enter the correct level.")
+    if params_fp16 and isinstance(scaler, paddle.amp.GradScaler):
+        scaler = ShardingScaler(scaler)
+    logger_.info("*" * 30)
+    logger_.info(
+        "If there is a communication hang using group sharded, please check whether the communication operations of each process are unified."
+    )
+    logger_.info("*" * 30)
+
+    return model, optimizer, scaler
+
+
+def save_group_sharded_model(model, output, optimizer=None):
+    """
+    Group sharded encapsulated model and optimizer state saving module.
+
+    .. note::
+        If using save_group_sharded_model saves the model. When loading again, you need to set the model or optimizer state before using group_sharded_parallel.
+
+    Args:
+        model (Layer): A wrapper for group sharded given model.
+        output (str): Save directory.
+        optimizer (Optimizer, optional): Group sharded encapsulated optimizer. Defaults to None, indicating that the optimizer state is not saved.
+    
+    Examples:
+        .. code-block:: python
+
+            # required: distributed
+            import paddle
+            from paddle.fluid.dygraph.nn import Linear
+            from paddle.distributed import fleet
+            from paddle.distributed.sharding import group_sharded_parallel, save_group_sharded_model
+
+            fleet.init(is_collective=True)
+            group = paddle.distributed.new_group([0, 1])
+            model = Linear(1000, 1000)
+
+            clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0)
+            optimizer = paddle.optimizer.AdamW(learning_rate=0.001, parameters=model.parameters(), weight_decay=0.00001, grad_clip=clip)
+
+            # wrap sharding model, optimizer and scaler
+            model, optimizer, scaler = group_sharded_parallel(model, optimizer, "p_g", scaler=scaler)
+
+            img, label = data
+            label.stop_gradient = True
+            img.stop_gradient = True
+
+            out = model(img)
+            loss = paddle.nn.functional.cross_entropy(input=out, label=label)
+
+            loss.backward()
+            optimizer.step()
+            optimizer.clear_grad()
+
+            # save model and optimizer state_dict
+            save_group_sharded_model(model, optimizer, output=output_dir)
+    """
+    logger_.info(
+        "==========Begin to save group sharded model and optimizer==========")
+    assert not os.path.isfile(
+        output
+    ), "Saving directory ({}) should be a directory, not a file".format(output)
+    os.makedirs(output, exist_ok=True)
+    output_model = os.path.join(output, "model.pdmodel")
+    if isinstance(model, ShardingStage2):
+        paddle.save(model._layer.state_dict(), output_model)
+    elif isinstance(model, ShardingStage3):
+        convert2cpu = True if model._offload else False
+        model.get_all_parameters(convert2cpu=convert2cpu)
+        paddle.save(model._layer.state_dict(), output_model)
+    else:
+        raise ValueError(
+            "Please use the layer which is wrapped with group_sharded_parallel.")
+
+    if optimizer is not None:
+        assert hasattr(
+            optimizer, "_optim"
+        ), "Please use the optimizer which is wrapped with group_sharded_parallel."
+        output_opt = os.path.join(output, "model.pdopt")
+        paddle.save(optimizer._optim.state_dict(), output_opt)
+    logger_.info(
+        "==========End to save group sharded model and optimizer==========")
diff --git a/python/paddle/distributed/utils.py b/python/paddle/distributed/utils.py
index 53f4a93f6480e84c2df7138e0d71d1832f7360d1..ae40a42e9d5074d112ebfcbf87ce3f272928d637 100644
--- a/python/paddle/distributed/utils.py
+++ b/python/paddle/distributed/utils.py
@@ -546,13 +546,15 @@ class Pod(object):
 
 def get_logger(log_level, name="root"):
     logger = logging.getLogger(name)
-    logger.setLevel(log_level)
-
-    log_handler = logging.StreamHandler()
-    log_format = logging.Formatter(
-        '%(levelname)s %(asctime)s %(filename)s:%(lineno)d] %(message)s')
-    log_handler.setFormatter(log_format)
-    logger.addHandler(log_handler)
+    # Avoid printing multiple logs
+    if not logger.handlers:
+        logger.setLevel(log_level)
+
+        log_handler = logging.StreamHandler()
+        log_format = logging.Formatter(
+            '%(levelname)s %(asctime)s %(filename)s:%(lineno)d] %(message)s')
+        log_handler.setFormatter(log_format)
+        logger.addHandler(log_handler)
 
     return logger
 
diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py
index 997075590e5cf97241188b847c0c5b5036ecee59..7480909a2d88dda51971d0ef66ae6c88a56cd79c 100644
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -226,3 +226,5 @@ if core.is_compiled_with_npu():
     atexit.register(core.npu_finalize)
 # NOTE(Aurelius84): clean up ExecutorCacheInfo in advance manually.
 atexit.register(core.clear_executor_cache)
+# NOTE(Aganlengzi): clean up KernelFactory in advance manually.
+atexit.register(core.clear_kernel_factory)
diff --git a/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py b/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py
index 9da798375af258633ea346d016fe99591f88e32d..97b4116826a2a0e809af4a4129a0e953fd607b07 100644
--- a/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py
+++ b/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py
@@ -272,7 +272,7 @@ class PostTrainingQuantization(object):
         ]
         self._support_weight_quantize_type = ['abs_max', 'channel_wise_abs_max']
         self._support_algo_type = [
-            'KL', 'hist', 'avg', 'mse', 'abs_max', 'min_max'
+            'KL', 'hist', 'avg', 'mse', 'emd', 'abs_max', 'min_max'
         ]
         self._dynamic_quantize_op_type = ['lstm']
         self._support_quantize_op_type = \
@@ -349,7 +349,7 @@ class PostTrainingQuantization(object):
         # The vars for algo = avg
         self._quantized_var_avg = {}
         # The best loss of algo = mse
-        self._best_mse_loss = {}
+        self._best_calibration_loss = {}
         # The threshold for algo = abs_max, mse or avg
         self._quantized_threshold = {}
 
@@ -408,7 +408,7 @@ class PostTrainingQuantization(object):
                 np.array(self._quantized_var_avg[var_name]).mean()
         if self._algo in ["KL", "hist"]:
             self._calculate_kl_hist_threshold()
-        if self._algo in ["KL", "abs_max", "hist", "avg", "mse"]:
+        if self._algo in ["KL", "abs_max", "hist", "avg", "mse", "emd"]:
             self._update_program()
         else:
             self._save_input_threhold()
@@ -582,6 +582,8 @@ class PostTrainingQuantization(object):
             self._sample_min_max()
         elif self._algo == "mse":
             self._sample_mse()
+        elif self._algo == "emd":
+            self._sample_emd()
         elif self._algo in ["KL", "hist"]:
             self._sample_histogram()
 
@@ -610,8 +612,8 @@ class PostTrainingQuantization(object):
             abs_max_value = float(np.max(np.abs(var_tensor)))
             abs_max_value = 1e-8 if abs_max_value == 0.0 else abs_max_value
             s = 0.3
-            if var_name not in self._best_mse_loss:
-                self._best_mse_loss[var_name] = float('inf')
+            if var_name not in self._best_calibration_loss:
+                self._best_calibration_loss[var_name] = float('inf')
             while s <= 1.0:
                 scale = s * abs_max_value
                 s += 0.02
@@ -620,8 +622,49 @@ class PostTrainingQuantization(object):
                     np.clip(var_tensor, 0.0, scale) / scale *
                     bins) / bins * scale
                 mse_loss = ((var_tensor - quant_dequant_var)**2).mean()
-                if mse_loss <= self._best_mse_loss[var_name]:
-                    self._best_mse_loss[var_name] = mse_loss
+                if mse_loss <= self._best_calibration_loss[var_name]:
+                    self._best_calibration_loss[var_name] = mse_loss
+                    self._quantized_threshold[var_name] = scale
+
+    def _sample_emd(self):
+        if self._quantized_threshold == {}:
+            for var_name in self._quantized_weight_var_name:
+                var_tensor = _load_variable_data(self._scope, var_name)
+                if self._weight_quantize_type == "abs_max":
+                    abs_max_value = float(np.max(np.abs(var_tensor)))
+                elif self._weight_quantize_type == "channel_wise_abs_max":
+                    abs_max_value = []
+                    if self._weight_op_pairs[
+                            var_name] in _channelwise_quant_axis1_ops:
+                        for i in range(var_tensor.shape[1]):
+                            abs_max_value.append(
+                                float(np.max(np.abs(var_tensor[:, i]))))
+                    else:
+                        for i in range(var_tensor.shape[0]):
+                            abs_max_value.append(
+                                float(np.max(np.abs(var_tensor[i]))))
+                self._quantized_threshold[var_name] = abs_max_value
+        _logger.info("EMD searching stage ...")
+        for var_name in self._quantized_act_var_name:
+            var_tensor = _load_variable_data(self._scope, var_name)
+            var_tensor = var_tensor.flatten()
+            abs_max_value = float(np.max(np.abs(var_tensor)))
+            abs_max_value = 1e-8 if abs_max_value == 0.0 else abs_max_value
+            s = 0.3
+            if var_name not in self._best_calibration_loss:
+                self._best_calibration_loss[var_name] = float('inf')
+            while s <= 1.0:
+                scale = s * abs_max_value
+                s += 0.02
+                bins = 2**(self._activation_bits - 1) - 1
+                quant_dequant_var = np.round(
+                    np.clip(var_tensor, 0.0, scale) / scale *
+                    bins) / bins * scale
+                emd_loss = np.abs(
+                    np.mean(var_tensor) - np.mean(quant_dequant_var)) + np.abs(
+                        np.std(var_tensor) - np.std(quant_dequant_var))
+                if emd_loss <= self._best_calibration_loss[var_name]:
+                    self._best_calibration_loss[var_name] = emd_loss
                     self._quantized_threshold[var_name] = scale
 
     def _sample_avg(self):
diff --git a/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt b/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt
index f75a0fa50a59c1dd570f3b35ff5b3c9108564e78..49ae8f5fd56d375dd7d1b4b4d772892b7724b9b6 100644
--- a/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt
+++ b/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt
@@ -351,10 +351,12 @@ endif()
 
 set_tests_properties(test_graph PROPERTIES TIMEOUT 120)
 set_tests_properties(test_quantization_pass PROPERTIES TIMEOUT 120)
-set_tests_properties(test_imperative_qat_channelwise PROPERTIES TIMEOUT 120)
-set_tests_properties(test_user_defined_quantization PROPERTIES TIMEOUT 120)
-set_tests_properties(test_imperative_qat PROPERTIES TIMEOUT 120)
-set_tests_properties(test_imperative_out_scale PROPERTIES TIMEOUT 120)
+set_tests_properties(test_imperative_qat_channelwise PROPERTIES TIMEOUT 200)
+set_tests_properties(test_user_defined_quantization PROPERTIES TIMEOUT 200)
+set_tests_properties(test_imperative_qat PROPERTIES TIMEOUT 200)
+set_tests_properties(test_imperative_out_scale PROPERTIES TIMEOUT 200)
+set_tests_properties(test_imperative_qat_user_defined PROPERTIES TIMEOUT 200)
+
 if(LINUX AND WITH_MKLDNN)
     set_tests_properties(test_quant2_int8_mobilenetv1_mkldnn PROPERTIES TIMEOUT 120)
     set_tests_properties(convert_model2dot_ernie PROPERTIES TIMEOUT 120)
diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_out_scale.py b/python/paddle/fluid/contrib/slim/tests/test_imperative_out_scale.py
index c4318b8bf8ef629d4bcba1f43be350298beff7da..7b9cd7958b2d3d0704a3770156d86f9cae44592a 100644
--- a/python/paddle/fluid/contrib/slim/tests/test_imperative_out_scale.py
+++ b/python/paddle/fluid/contrib/slim/tests/test_imperative_out_scale.py
@@ -26,7 +26,7 @@ import paddle.fluid as fluid
 import paddle.fluid.layers as layers
 from paddle.fluid import core
 from paddle.fluid.optimizer import AdamOptimizer
-from paddle.fluid.framework import IrGraph
+from paddle.fluid.framework import IrGraph, _test_eager_guard
 from paddle.fluid.contrib.slim.quantization import ImperativeQuantAware
 from paddle.fluid.dygraph.container import Sequential
 from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX
@@ -122,7 +122,7 @@ class ImperativeLenet(fluid.dygraph.Layer):
 
 
 class TestImperativeOutSclae(unittest.TestCase):
-    def test_out_scale_acc(self):
+    def func_out_scale_acc(self):
         seed = 1000
         lr = 0.001
 
@@ -166,9 +166,14 @@ class TestImperativeOutSclae(unittest.TestCase):
                 loss_list[i] > loss_list[i + 1],
                 msg='Failed to do the imperative qat.')
 
+    def test_out_scale_acc(self):
+        with _test_eager_guard():
+            self.func_out_scale_acc()
+        self.func_out_scale_acc()
+
 
 class TestSaveQuanztizedModelFromCheckPoint(unittest.TestCase):
-    def test_save_quantized_model(self):
+    def func_save_quantized_model(self):
         lr = 0.001
 
         load_param_path = "test_save_quantized_model/lenet.pdparams"
@@ -206,6 +211,11 @@ class TestSaveQuanztizedModelFromCheckPoint(unittest.TestCase):
                 loss_list[i] > loss_list[i + 1],
                 msg='Failed to do the imperative qat.')
 
+    def test_save_quantized_model(self):
+        with _test_eager_guard():
+            self.func_save_quantized_model()
+        self.func_save_quantized_model()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_ptq.py b/python/paddle/fluid/contrib/slim/tests/test_imperative_ptq.py
index fb92b12cb0d870d185b4c31a7bcdb1bebfe5b38d..fad4c8f9d580b389b861c4c9a992af0c48cd892d 100644
--- a/python/paddle/fluid/contrib/slim/tests/test_imperative_ptq.py
+++ b/python/paddle/fluid/contrib/slim/tests/test_imperative_ptq.py
@@ -29,6 +29,7 @@ import paddle.fluid as fluid
 from paddle.fluid.contrib.slim.quantization import *
 from paddle.fluid.log_helper import get_logger
 from paddle.dataset.common import download
+from paddle.fluid.framework import _test_eager_guard
 
 from imperative_test_utils import fix_model_dict, ImperativeLenet, ImperativeLinearBn
 from imperative_test_utils import ImperativeLinearBn_hook
@@ -194,7 +195,7 @@ class TestImperativePTQ(unittest.TestCase):
                 break
         return top1_correct_num / total_num
 
-    def test_ptq(self):
+    def func_ptq(self):
         start_time = time.time()
 
         self.set_vars()
@@ -244,9 +245,14 @@ class TestImperativePTQ(unittest.TestCase):
         end_time = time.time()
         print("total time: %ss \n" % (end_time - start_time))
 
+    def test_ptq(self):
+        with _test_eager_guard():
+            self.func_ptq()
+        self.func_ptq()
+
 
 class TestImperativePTQfuse(TestImperativePTQ):
-    def test_ptq(self):
+    def func_ptq(self):
         start_time = time.time()
 
         self.set_vars()
@@ -305,6 +311,11 @@ class TestImperativePTQfuse(TestImperativePTQ):
         end_time = time.time()
         print("total time: %ss \n" % (end_time - start_time))
 
+    def test_ptq(self):
+        with _test_eager_guard():
+            self.func_ptq()
+        self.func_ptq()
+
 
 class TestImperativePTQHist(TestImperativePTQ):
     def set_vars(self):
diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py
index 677ccb52e242cf0c95a7b03acaefbf4d424ac401..5db720b028ffec8c1c02731a44c0a4466e827cc8 100644
--- a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py
+++ b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py
@@ -32,7 +32,7 @@ from paddle.nn import Linear, Conv2D, Softmax, Conv2DTranspose
 from paddle.fluid.log_helper import get_logger
 from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX
 from paddle.nn.quant.quant_layers import QuantizedConv2D, QuantizedConv2DTranspose
-
+from paddle.fluid.framework import _test_eager_guard
 from imperative_test_utils import fix_model_dict, ImperativeLenet
 
 paddle.enable_static()
@@ -55,7 +55,7 @@ class TestImperativeQat(unittest.TestCase):
         self.activation_quantize_type = 'moving_average_abs_max'
         print('weight_quantize_type', self.weight_quantize_type)
 
-    def test_qat(self):
+    def func_qat(self):
         self.set_vars()
 
         imperative_qat = ImperativeQuantAware(
@@ -193,6 +193,11 @@ class TestImperativeQat(unittest.TestCase):
                 np.allclose(after_save, before_save.numpy()),
                 msg='Failed to save the inference quantized model.')
 
+    def test_qat(self):
+        with _test_eager_guard():
+            self.func_qat()
+        self.func_qat()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_amp.py b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_amp.py
index d1bf76f472465f2803cd95ad1dc468bbc5289051..2dcf7a6f168e20393a3e1a3432ac75b652e2a063 100644
--- a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_amp.py
+++ b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_amp.py
@@ -27,7 +27,7 @@ import paddle.fluid as fluid
 from paddle.fluid.contrib.slim.quantization import ImperativeQuantAware
 from paddle.fluid.log_helper import get_logger
 from paddle.dataset.common import download
-
+from paddle.fluid.framework import _test_eager_guard
 from imperative_test_utils import fix_model_dict, ImperativeLenet
 
 os.environ["CPU_NUM"] = "1"
diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_user_defined.py b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_user_defined.py
index 270e8ee566ab57c4e244e1bbb52a8c3d9a41db52..0bc80694a12cb6a9ff2f857444a06e7764038e36 100644
--- a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_user_defined.py
+++ b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_user_defined.py
@@ -30,7 +30,7 @@ from paddle.fluid.dygraph import Pool2D
 from paddle.fluid.dygraph import Linear
 from paddle.nn.quant.quant_layers import QuantizedConv2DTranspose
 from paddle.fluid.log_helper import get_logger
-
+from paddle.fluid.framework import _test_eager_guard
 os.environ["CPU_NUM"] = "1"
 
 _logger = get_logger(
@@ -157,7 +157,7 @@ class TestUserDefinedActPreprocess(unittest.TestCase):
         _logger.info("test act_preprocess")
         self.imperative_qat = ImperativeQuantAware(act_preprocess_layer=PACT)
 
-    def test_quant_aware_training(self):
+    def func_quant_aware_training(self):
         imperative_qat = self.imperative_qat
         seed = 1
         np.random.seed(seed)
@@ -243,6 +243,11 @@ class TestUserDefinedActPreprocess(unittest.TestCase):
         train(lenet)
         test(lenet)
 
+    def test_quant_aware_training(self):
+        with _test_eager_guard():
+            self.func_quant_aware_training()
+        self.func_quant_aware_training()
+
 
 class TestUserDefinedWeightPreprocess(TestUserDefinedActPreprocess):
     def setUp(self):
diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_skip_op.py b/python/paddle/fluid/contrib/slim/tests/test_imperative_skip_op.py
index 8d2e0f753c0167295dd18c0266b613e963de5d8a..d77134d72a9596956f063612bc16e6a37f81037a 100644
--- a/python/paddle/fluid/contrib/slim/tests/test_imperative_skip_op.py
+++ b/python/paddle/fluid/contrib/slim/tests/test_imperative_skip_op.py
@@ -32,6 +32,7 @@ from paddle.fluid.dygraph.nn import Pool2D
 from paddle.fluid.log_helper import get_logger
 
 from imperative_test_utils import fix_model_dict, train_lenet, ImperativeLenetWithSkipQuant
+from paddle.fluid.framework import _test_eager_guard
 
 os.environ["CPU_NUM"] = "1"
 if core.is_compiled_with_cuda():
@@ -42,7 +43,8 @@ _logger = get_logger(
 
 
 class TestImperativeOutSclae(unittest.TestCase):
-    def test_out_scale_acc(self):
+    def func_out_scale_acc(self):
+        paddle.disable_static()
         seed = 1000
         lr = 0.1
 
@@ -125,6 +127,11 @@ class TestImperativeOutSclae(unittest.TestCase):
         if find_matmul:
             self.assertTrue(matmul_skip_count == 1)
 
+    def test_out_scale_acc(self):
+        with _test_eager_guard():
+            self.func_out_scale_acc()
+        self.func_out_scale_acc()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_mnist.py b/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_mnist.py
index da5c5d6dc9441bbddd5104a1b9b2aa94880fda21..4b70f5b103778baa631cb55e4277fa1bc0fdca65 100644
--- a/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_mnist.py
+++ b/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_mnist.py
@@ -244,6 +244,26 @@ class TestPostTrainingmseForMnist(TestPostTrainingQuantization):
                       quant_iterations)
 
 
+class TestPostTrainingemdForMnist(TestPostTrainingQuantization):
+    def test_post_training_mse(self):
+        model_name = "mnist_model"
+        data_url = "http://paddle-inference-dist.bj.bcebos.com/int8/mnist_model.tar.gz"
+        data_md5 = "be71d3997ec35ac2a65ae8a145e2887c"
+        algo = "emd"
+        quantizable_op_type = ["conv2d", "depthwise_conv2d", "mul"]
+        is_full_quantize = False
+        is_use_cache_file = False
+        is_optimize_model = True
+        diff_threshold = 0.01
+        batch_size = 10
+        infer_iterations = 50
+        quant_iterations = 5
+        self.run_test(model_name, data_url, data_md5, algo, quantizable_op_type,
+                      is_full_quantize, is_use_cache_file, is_optimize_model,
+                      diff_threshold, batch_size, infer_iterations,
+                      quant_iterations)
+
+
 class TestPostTrainingavgForMnist(TestPostTrainingQuantization):
     def test_post_training_avg(self):
         model_name = "mnist_model"
diff --git a/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_mobilenetv1.py b/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_mobilenetv1.py
index 71611048610060df736dd11bc6da7f6816cfd4b6..f83306aca1dc018838cf10f7084141682c8fff38 100644
--- a/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_mobilenetv1.py
+++ b/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_mobilenetv1.py
@@ -394,5 +394,27 @@ class TestPostTrainingAbsMaxForMobilenetv1(TestPostTrainingQuantization):
                       diff_threshold)
 
 
+class TestPostTrainingEMDForMobilenetv1(TestPostTrainingQuantization):
+    def test_post_training_avg_mobilenetv1(self):
+        model = "MobileNet-V1"
+        algo = "emd"
+        data_urls = [
+            'http://paddle-inference-dist.bj.bcebos.com/int8/mobilenetv1_int8_model.tar.gz'
+        ]
+        data_md5s = ['13892b0716d26443a8cdea15b3c6438b']
+        quantizable_op_type = [
+            "conv2d",
+            "depthwise_conv2d",
+            "mul",
+        ]
+        is_full_quantize = False
+        is_use_cache_file = False
+        is_optimize_model = True
+        diff_threshold = 0.025
+        self.run_test(model, algo, data_urls, data_md5s, quantizable_op_type,
+                      is_full_quantize, is_use_cache_file, is_optimize_model,
+                      diff_threshold)
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/contrib/sparsity/asp.py b/python/paddle/fluid/contrib/sparsity/asp.py
index 937fcdf0463beed7d9116be1a4800a0d02238e7d..ffa12ac70460084fd49a14d0193be6e913495b9a 100644
--- a/python/paddle/fluid/contrib/sparsity/asp.py
+++ b/python/paddle/fluid/contrib/sparsity/asp.py
@@ -155,8 +155,7 @@ def prune_model(main_program=None,
                 n=2,
                 m=4,
                 mask_algo='mask_1d',
-                with_mask=True,
-                sharding=False):
+                with_mask=True):
     r"""
     Pruning parameters of supported layers in :attr:`main_program` via 
     specified mask generation function given by :attr:`mask_algo`. This 
@@ -179,7 +178,6 @@ def prune_model(main_program=None,
         mask_algo (string, optional): The function name to generate spase mask. Default is `mask_1d`.
                                       The vaild inputs should be one of 'mask_1d', 'mask_2d_greedy' and 'mask_2d_best'.
         with_mask (bool, optional): To prune mask Variables related to parameters or not. Ture is purning also, False is not. Defalut is True.
-        sharding (bool, optional): Whether to turn on sharding (model parallel) during training. Please consider turning it ON when encountering OOM using sharding. Default is False.
     Returns:
         dictionary: A dictionary with key: `parameter name` (string) and value: its corresponding mask Variable.
     Examples:
@@ -221,7 +219,10 @@ def prune_model(main_program=None,
             # Must call `exe.run(startup_program)` first before calling `sparsity.prune_model`
             sparsity.prune_model(main_program, mask_algo='mask_2d_best')
     """
-    if sharding:
+    if main_program is not None and hasattr(
+            main_program,
+            "distributed_info_") and main_program.distributed_info_[
+                "sharding_degree"] > 1 and paddle.fluid.is_compiled_with_cuda():
         gpu_id = int(os.environ.get('FLAGS_selected_gpus', 0))
         place = paddle.CUDAPlace(gpu_id)
     else:
diff --git a/python/paddle/fluid/core.py b/python/paddle/fluid/core.py
index 5e023e9248cab5cc00b7912f23188f58ccbc8eb0..617ab6305289f9b07a8131b54a8d588abd42a911 100644
--- a/python/paddle/fluid/core.py
+++ b/python/paddle/fluid/core.py
@@ -283,6 +283,7 @@ if avx_supported():
         from .core_avx import _set_cached_executor_build_strategy
         from .core_avx import _device_synchronize
         from .core_avx import _get_current_stream
+        from .core_avx import _Profiler, _ProfilerResult, _RecordEvent
         from .core_avx import _set_current_stream
         if sys.platform != 'win32':
             from .core_avx import _set_process_pids
@@ -344,6 +345,7 @@ if load_noavx:
         from .core_noavx import _device_synchronize
         from .core_noavx import _get_current_stream
         from .core_noavx import _set_current_stream
+        from .core_noavx import _Profiler, _ProfilerResult, _RecordEvent
         if sys.platform != 'win32':
             from .core_noavx import _set_process_pids
             from .core_noavx import _erase_process_pids
diff --git a/python/paddle/fluid/dataloader/batch_sampler.py b/python/paddle/fluid/dataloader/batch_sampler.py
index 3debeecfe4f38209ea9080a1e9899d632ee55bfe..3a23c852563daacb54ebbf2fa642b9126e53fb3b 100644
--- a/python/paddle/fluid/dataloader/batch_sampler.py
+++ b/python/paddle/fluid/dataloader/batch_sampler.py
@@ -113,8 +113,6 @@ class BatchSampler(Sampler):
             assert not shuffle, "shuffle should be False when sampler is set"
             self.sampler = sampler
         else:
-            assert isinstance(dataset, Dataset), \
-                "dataset should be a paddle.io.Dataset"
             assert not isinstance(dataset, IterableDataset), \
                 "dataset should not be a paddle.io.IterableDataset"
             assert sampler is None, \
diff --git a/python/paddle/fluid/dygraph/amp/auto_cast.py b/python/paddle/fluid/dygraph/amp/auto_cast.py
index f43a51063b00ac0439aacfbf46ff593e7b1b4f43..191661b7bf9d5a5b2877f22ebe6d2ec9124f2f96 100644
--- a/python/paddle/fluid/dygraph/amp/auto_cast.py
+++ b/python/paddle/fluid/dygraph/amp/auto_cast.py
@@ -411,9 +411,9 @@ def amp_decorate(models,
         import paddle
 
         model = paddle.nn.Conv2D(3, 2, 3, bias_attr=False)
-        optimzier = paddle.optimizer.SGD(parameters=model.parameters())
+        optimizer = paddle.optimizer.SGD(parameters=model.parameters())
 
-        model, optimizer = paddle.fluid.dygraph.amp_decorate(models=model, optimizers=optimzier, level='O2')
+        model, optimizer = paddle.fluid.dygraph.amp_decorate(models=model, optimizers=optimizer, level='O2')
 
         data = paddle.rand([10, 3, 32, 32])
 
@@ -426,7 +426,7 @@ def amp_decorate(models,
         model2 = paddle.nn.Conv2D(3, 2, 3, bias_attr=False)
         optimizer2 = paddle.optimizer.Adam(parameters=model2.parameters())
 
-        models, optimizers = paddle.fluid.dygraph.amp_decorate(models=[model, model2], optimizers=[optimzier, optimizer2], level='O2')
+        models, optimizers = paddle.fluid.dygraph.amp_decorate(models=[model, model2], optimizers=[optimizer, optimizer2], level='O2')
 
         data = paddle.rand([10, 3, 32, 32])
 
diff --git a/python/paddle/fluid/dygraph/base.py b/python/paddle/fluid/dygraph/base.py
index 8c2ff140ea4d5531a0ab6e284b1661573d9a2670..8149d69d36a27fadcefa8dc6b6ff1dd89792e29e 100644
--- a/python/paddle/fluid/dygraph/base.py
+++ b/python/paddle/fluid/dygraph/base.py
@@ -99,18 +99,19 @@ def param_guard(parameters):
         yield
 
 
-def _convert_into_variable(var_base):
+def _convert_into_variable(tensor):
     """
     Convert Varbase into Variable.
     """
-    if isinstance(var_base, core.VarBase):
+    if isinstance(tensor, (core.eager.Tensor, core.VarBase)):
         # Check whether has been created before.
-        new_var = var_base.block._find_var_recursive(var_base.name)
+        new_var = tensor.block._find_var_recursive(tensor.name)
         if new_var is not None:
             assert isinstance(new_var, framework.Variable)
         # Convert ParamBase into Parameter with same attributes in dy2stat.
-        elif isinstance(var_base, framework.ParamBase):
-            new_var = var_base._to_static_var(to_parameter=True)
+        elif isinstance(tensor,
+                        (framework.EagerParamBase, framework.ParamBase)):
+            new_var = tensor._to_static_var(to_parameter=True)
         else:
             # Note(Aurelius84): Convert VarBase in self._buffers into Variable with
             # same attributes and set persistable=True to allow saving this var.
@@ -120,13 +121,13 @@ def _convert_into_variable(var_base):
 
             # But if its shape is empty while created from `create_variable()`, we consider this buffer
             # non-persistable. See case of `drop_state` in lstm api.
-            is_persistable = len(var_base.shape) > 0
+            is_persistable = len(tensor.shape) > 0
 
-            new_var = var_base._to_static_var(
+            new_var = tensor._to_static_var(
                 to_parameter=False, persistable=is_persistable)
         return new_var
     else:
-        return var_base
+        return tensor
 
 
 def enabled():
diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/function_spec.py b/python/paddle/fluid/dygraph/dygraph_to_static/function_spec.py
index 30012fb8666fcb5256efa889de7440f6d709cccd..900541459f6fcd3f8caaf9d60b0aabba5c6c469e 100644
--- a/python/paddle/fluid/dygraph/dygraph_to_static/function_spec.py
+++ b/python/paddle/fluid/dygraph/dygraph_to_static/function_spec.py
@@ -104,7 +104,7 @@ class FunctionSpec(object):
             if isinstance(input_var, np.ndarray):
                 input_var = paddle.static.InputSpec.from_numpy(input_var)
                 _set_spec_stop_gradient(input_var, True)
-            elif isinstance(input_var, core.VarBase):
+            elif isinstance(input_var, (core.VarBase, core.eager.Tensor)):
                 stop_gradient = input_var.stop_gradient
                 input_var = paddle.static.InputSpec.from_tensor(input_var)
                 _set_spec_stop_gradient(input_var, stop_gradient)
diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py b/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py
index 94fc5558ab162636e59a5569904d770970f812d1..216f955b7510351c2dc6774a34a485b2341e76aa 100644
--- a/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py
+++ b/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py
@@ -61,7 +61,8 @@ class NestSequence(object):
     def _get_var_ids(self):
         var_ids = []
         for idx, var in enumerate(self.__input_list):
-            if isinstance(var, (framework.Variable, core.VarBase)):
+            if isinstance(var, (framework.Variable, core.VarBase,
+                                core.eager.Tensor)):
                 var_ids.append(idx)
 
         return var_ids
@@ -73,7 +74,8 @@ class NestSequence(object):
         if need_check:
             warning_types = set()
             for var in self.__input_list:
-                if not isinstance(var, (framework.Variable, core.VarBase)):
+                if not isinstance(var, (framework.Variable, core.VarBase,
+                                        core.eager.Tensor)):
                     warning_types.add(type(var))
             if warning_types:
                 logging_utils.warn(
@@ -146,10 +148,7 @@ class PartialProgramLayer:
 
         self._origin_main_program = self._verify_program(main_program)
         self._tmp_scope_vec = self._create_scope_vec()
-        # A fake_var to handle empty input or output
-        self.__fake_vars = _create_fake_var()
         # Set default mode to train
-        self._double_grads = self._get_double_grads(self._origin_main_program)
         self.training = True
 
         custom_white_list, custom_black_list = None, None
@@ -161,6 +160,14 @@ class PartialProgramLayer:
             custom_white_list=custom_white_list,
             custom_black_list=custom_black_list)
 
+    @LazyInitialized
+    def __fake_vars(self):
+        return _create_fake_var()
+
+    @LazyInitialized
+    def _double_grads(self):
+        return self._get_double_grads(self._origin_main_program)
+
     @LazyInitialized
     def _infer_program(self):
         """
@@ -301,10 +308,17 @@ class PartialProgramLayer:
             for name in block.vars:
                 if "@GRAD" in name:
                     var_desc = block.vars[name].desc
-                    var_base = core.VarBase(var_desc.dtype(),
-                                            var_desc.shape(),
-                                            var_desc.name(),
-                                            var_desc.type(), False)
+                    var_base = None
+                    if not core._in_eager_mode():
+                        var_base = core.VarBase(var_desc.dtype(),
+                                                var_desc.shape(),
+                                                var_desc.name(),
+                                                var_desc.type(), False)
+                    else:
+                        var_base = core.eager.Tensor(var_desc.dtype(),
+                                                     var_desc.shape(),
+                                                     var_desc.name(),
+                                                     var_desc.type(), False)
                     double_grads.append(var_base)
         return self._valid_vars(double_grads)
 
@@ -347,8 +361,10 @@ class PartialProgramLayer:
 
     def drop_scope_if_no_grad(self):
         tracer = framework._dygraph_tracer()
+        scope = self._tmp_scope_vec.value().get_scope() if isinstance(
+            self._tmp_scope_vec, (core.VarBase)) else self._tmp_scope_vec[0]
         if self.training and not tracer._has_grad:
-            self._tmp_scope_vec.value().get_scope().drop_kids()
+            scope.drop_kids()
 
     @property
     def program(self):
@@ -386,13 +402,22 @@ class PartialProgramLayer:
         expected_place = framework._current_expected_place()
         for i, value in enumerate(flatten_inputs):
             if isinstance(value, np.ndarray):
-                var = core.VarBase(
-                    value=value,
-                    name=self._inputs[i].desc.name(),
-                    persistable=False,
-                    place=expected_place,
-                    zero_copy=True)
-            elif isinstance(value, core.VarBase):
+                var = None
+                if not core._in_eager_mode():
+                    var = core.VarBase(
+                        value=value,
+                        name=self._inputs[i].desc.name(),
+                        persistable=False,
+                        place=expected_place,
+                        zero_copy=True)
+                else:
+                    var = core.eager.Tensor(
+                        value=value,
+                        name=self._inputs[i].desc.name(),
+                        persistable=False,
+                        place=expected_place,
+                        zero_copy=True)
+            elif isinstance(value, (core.VarBase, core.eager.Tensor)):
                 # NOTE(Aurelius84): If var is on CPUPlace, it will be transformed multi times
                 # into CUDAPlace when it's as input of multi Ops. so we move it in advance
                 # to avoid this problem.
@@ -411,9 +436,16 @@ class PartialProgramLayer:
             var = self._outputs[var_id]
             assert isinstance(var, framework.Variable)
             var_desc = var.desc
-            var_base = core.VarBase(var_desc.dtype(),
-                                    var_desc.shape(),
-                                    var_desc.name(), var_desc.type(), False)
+            varbase = None
+            if not core._in_eager_mode():
+                var_base = core.VarBase(var_desc.dtype(),
+                                        var_desc.shape(),
+                                        var_desc.name(), var_desc.type(), False)
+            else:
+                var_base = core.eager.Tensor(var_desc.dtype(),
+                                             var_desc.shape(),
+                                             var_desc.name(),
+                                             var_desc.type(), False)
             return var_base
 
         # Create VarBase to receive output data.
@@ -423,12 +455,15 @@ class PartialProgramLayer:
 
     def _create_scope_vec(self):
         # Hold forward variables
-        tmp_scope_vec = core.VarBase(core.VarDesc.VarType.FP32, [],
-                                     "program_out_scope",
-                                     core.VarDesc.VarType.STEP_SCOPES, True)
-
+        tmp_scope_vec = None
         inner_scope = core.Scope()
-        tmp_scope_vec.value().set_scope(inner_scope)
+        if not core._in_eager_mode():
+            tmp_scope_vec = core.VarBase(core.VarDesc.VarType.FP32, [],
+                                         "program_out_scope",
+                                         core.VarDesc.VarType.STEP_SCOPES, True)
+            tmp_scope_vec.value().set_scope(inner_scope)
+        else:
+            tmp_scope_vec = [inner_scope]
         return tmp_scope_vec
 
     def _restore_out(self, out_vars):
@@ -450,7 +485,8 @@ class PartialProgramLayer:
         return main_program.clone(for_test=True)
 
     def _is_no_value(self, var):
-        if isinstance(var, core.VarBase) and var.shape == [1]:
+        if isinstance(var,
+                      (core.VarBase, core.eager.Tensor)) and var.shape == [1]:
             # NOTE: .numpy() will insert MemcpySync operation, it hits performance.
             if var.numpy()[0] == RETURN_NO_VALUE_MAGIC_NUM:
                 return True
@@ -460,7 +496,7 @@ class PartialProgramLayer:
         """
         Removes invalid value for various-length return statement
         """
-        if isinstance(out_vars, core.VarBase):
+        if isinstance(out_vars, (core.VarBase, core.eager.Tensor)):
             if self._is_no_value(out_vars):
                 return None
             return out_vars
@@ -527,7 +563,7 @@ class PartialProgramLayer:
         param_and_buffer_names_set = set()
         for i, var in enumerate(self._params):
             # self._params constains parameters and buffers with persistable=True.
-            if not isinstance(var, core.VarBase):
+            if not isinstance(var, (core.VarBase, core.eager.Tensor)):
                 raise TypeError(
                     'Type of self._params[{}] in PartialProgramLayer should be Parameter or Variable, but received {}.'.
                     format(i, type(var)))
@@ -559,10 +595,16 @@ def _create_fake_var():
     """
     Create a fake_var (force on CPU) to handle empty input or output
     """
-    return [
-        core.VarBase(core.VarDesc.VarType.FP32, [], "Fake_var",
-                     core.VarDesc.VarType.RAW, False)
-    ]
+    if not core._in_eager_mode():
+        return [
+            core.VarBase(core.VarDesc.VarType.FP32, [], "Fake_var",
+                         core.VarDesc.VarType.RAW, False)
+        ]
+    else:
+        return [
+            core.eager.Tensor(core.VarDesc.VarType.FP32, [], "Fake_var",
+                              core.VarDesc.VarType.RAW, False)
+        ]
 
 
 def partial_program_from(concrete_program):
diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/utils.py b/python/paddle/fluid/dygraph/dygraph_to_static/utils.py
index 04474dcdfe5091b2986eeeedb9870c00d83970db..d440e387da597d20300e50d7862dab80fb161c2a 100644
--- a/python/paddle/fluid/dygraph/dygraph_to_static/utils.py
+++ b/python/paddle/fluid/dygraph/dygraph_to_static/utils.py
@@ -191,7 +191,7 @@ def is_api_in_module(node, module_prefix):
 
         return eval("_is_api_in_module_helper({}, '{}')".format(func_str,
                                                                 module_prefix))
-    except NameError:
+    except Exception:
         return False
 
 
@@ -227,7 +227,7 @@ def is_numpy_api(node):
         # TODO: find a better way
         if not module_result:
             return func_str.startswith("numpy.") or func_str.startswith("np.")
-    except NameError:
+    except Exception:
         return False
 
 
diff --git a/python/paddle/fluid/dygraph/io.py b/python/paddle/fluid/dygraph/io.py
index aad7737350961b34badb89b0b8f542d0037b8c98..f58952d3036c506341955eff2472079bb696bb1f 100644
--- a/python/paddle/fluid/dygraph/io.py
+++ b/python/paddle/fluid/dygraph/io.py
@@ -885,7 +885,7 @@ def _run_dygraph(instance, input, program_holder):
             'start_op_index': 0,
             'end_op_index': end_op_index,
             'is_test': instance._is_test,
-            'program_id': _hash_with_id(trace_program)
+            'program_id': _hash_with_id(trace_program, instance)
         })
     # NOTE: [ why need set param's gradient type here ]
     # if user set sparse gradient mode, the param's gradient
diff --git a/python/paddle/fluid/dygraph/jit.py b/python/paddle/fluid/dygraph/jit.py
index 4bfdc3c27fad628bba3fd16237c12d3ca43244d7..b1865691b2475c4f855f51244e627965047d7720 100644
--- a/python/paddle/fluid/dygraph/jit.py
+++ b/python/paddle/fluid/dygraph/jit.py
@@ -25,7 +25,7 @@ import threading
 
 import six
 import paddle
-from paddle.fluid import core
+from paddle.fluid import core, dygraph
 from paddle.fluid.compiler import BuildStrategy, CompiledProgram, ExecutionStrategy
 from paddle.fluid.data_feeder import check_type
 from paddle.fluid.layers.utils import flatten, pack_sequence_as
@@ -898,30 +898,33 @@ def save(layer, path, input_spec=None, **configs):
                 state_var_dict[var.name] = var
 
             # 3. share parameters from Layer to scope & record var info
-            for param_or_buffer in concrete_program.parameters:
-                # share to scope
-                if param_or_buffer.type == core.VarDesc.VarType.VOCAB:
-                    scr_tensor = param_or_buffer.value().get_map_tensor()
-                    tgt_var = scope.var(param_or_buffer.name)
-                    tgt_var.set_vocab(scr_tensor)
-                else:
-                    param_or_buffer_tensor = scope.var(
-                        param_or_buffer.name).get_tensor()
-                    #src_tensor = param_or_buffer.value().get_tensor()
-                    src_tensor = state_var_dict[param_or_buffer.name].value(
-                    ).get_tensor()
-                    param_or_buffer_tensor._share_data_with(src_tensor)
-                # record var info
-                if param_or_buffer.name not in extra_var_info:
-                    extra_info_dict = dict()
-                    if param_or_buffer.name in state_names_dict:
-                        extra_info_dict['structured_name'] = state_names_dict[
-                            param_or_buffer.name]
-                    extra_info_dict[
-                        'stop_gradient'] = param_or_buffer.stop_gradient
-                    if isinstance(param_or_buffer, ParamBase):
-                        extra_info_dict['trainable'] = param_or_buffer.trainable
-                    extra_var_info[param_or_buffer.name] = extra_info_dict
+            with dygraph.guard():
+                for param_or_buffer in concrete_program.parameters:
+                    # share to scope
+                    if param_or_buffer.type == core.VarDesc.VarType.VOCAB:
+                        scr_tensor = param_or_buffer.value().get_map_tensor()
+                        tgt_var = scope.var(param_or_buffer.name)
+                        tgt_var.set_vocab(scr_tensor)
+                    else:
+                        param_or_buffer_tensor = scope.var(
+                            param_or_buffer.name).get_tensor()
+                        #src_tensor = param_or_buffer.value().get_tensor()
+                        src_tensor = state_var_dict[param_or_buffer.name].value(
+                        ).get_tensor()
+                        param_or_buffer_tensor._share_data_with(src_tensor)
+                    # record var info
+                    if param_or_buffer.name not in extra_var_info:
+                        extra_info_dict = dict()
+                        if param_or_buffer.name in state_names_dict:
+                            extra_info_dict[
+                                'structured_name'] = state_names_dict[
+                                    param_or_buffer.name]
+                        extra_info_dict[
+                            'stop_gradient'] = param_or_buffer.stop_gradient
+                        if isinstance(param_or_buffer, ParamBase):
+                            extra_info_dict[
+                                'trainable'] = param_or_buffer.trainable
+                        extra_var_info[param_or_buffer.name] = extra_info_dict
 
         # 4. build input & output of save_infernece_model
         # NOTE(chenweihang): [ Get input variables name ]
diff --git a/python/paddle/fluid/dygraph/parallel.py b/python/paddle/fluid/dygraph/parallel.py
index 0049f387b707fc853699474b34235f177d4672af..86d76f1b20a74c9f8bb51e23d9fc7d450717f173 100644
--- a/python/paddle/fluid/dygraph/parallel.py
+++ b/python/paddle/fluid/dygraph/parallel.py
@@ -30,7 +30,7 @@ from paddle.fluid.dygraph import to_variable, no_grad
 from paddle.utils import deprecated
 from ..layers import collective
 from paddle.fluid.dygraph import base as imperative_base
-from paddle.fluid.framework import ParamBase
+from paddle.fluid.framework import ParamBase, _in_eager_mode
 
 __all__ = ["prepare_context", "ParallelEnv", "DataParallel"]
 
@@ -128,6 +128,9 @@ class ParallelEnv(object):
         elif core.is_compiled_with_npu():
             selected_npus = os.getenv("FLAGS_selected_npus", "0").split(",")
             self._device_id = int(selected_npus[0])
+        elif core.is_compiled_with_mlu():
+            selected_mlus = os.getenv("FLAGS_selected_mlus", "0").split(",")
+            self._device_id = int(selected_mlus[0])
 
         self._trainer_endpoints = os.getenv("PADDLE_TRAINER_ENDPOINTS",
                                             "").split(",")
@@ -394,6 +397,16 @@ def sync_params_buffers(model,
                    'axis': 0})
 
 
+@imperative_base.no_grad
+@framework.dygraph_only
+def sync_eager_params(model, comm_group=None, src_rank=0):
+    for _, param in model._obtain_parameters_buffers().items():
+        if not isinstance(param, core.eager.Tensor):
+            raise TypeError("The data type of '%s' must be '%s'" %
+                            (param.name, core.eager.Tensor))
+        comm_group.broadcast(param, src_rank).synchronize()
+
+
 class DataParallel(layers.Layer):
     """
     Run the dygraph module with data parallelism.
@@ -573,6 +586,7 @@ class DataParallel(layers.Layer):
         self.process_group = process_group
         self.gradient_as_buffer_view = gradient_as_buffer_view
         self.static_graph = static_graph
+        self.var_dtype = core.eager.Tensor if _in_eager_mode() else core.VarBase
 
         # NOTE(chenweihang): The ParallelStrategy here is not strictly a strategy. 
         # It just stores some environment variables, which can be constructed by 
@@ -589,11 +603,20 @@ class DataParallel(layers.Layer):
             "ParallelContext must be initialized before. You should use init_parallel_env() before" \
             "constructing the DataParallel."
 
+            if self.process_group is None and _in_eager_mode():
+                raise RuntimeError(
+                    "Process group should be built in DataParallel of eager mode."
+                )
+
             # sync buffer and params
             # TODO(liuyuhui) Currently not support xpu. xpu is 
             # still broadcasting parameters when calling layer
             if not paddle.is_compiled_with_xpu():
-                sync_params_buffers(self._layers)
+                if _in_eager_mode():
+                    sync_eager_params(
+                        self._layers, comm_group=self.process_group)
+                else:
+                    sync_params_buffers(self._layers)
 
             self.comm_buffer_size = int(comm_buffer_size * 1024 * 1024)
             # NOTE(shenliang03): We can set environment variables to control 
@@ -617,9 +640,9 @@ class DataParallel(layers.Layer):
                 if param is None or param in params_set:
                     continue
                 params_set.add(param)
-                if not isinstance(param, core.VarBase):
-                    raise TypeError("The data type of '%s' must be Varbase" %
-                                    param.name)
+                if not isinstance(param, self.var_dtype):
+                    raise TypeError("The data type of '%s' must be '%s'" %
+                                    (param.name, self.var_dtype))
                 if param.trainable:
                     layers_param.append((sublayer, param))
 
@@ -646,19 +669,32 @@ class DataParallel(layers.Layer):
             check_layer_sparse(sublayer) for sublayer, _ in layers_param
         ]
 
-        self.group_indices = core.assign_group_by_size(
-            trainable_parameters, is_sparse_gradient,
-            [self.last_comm_buffer_size, self.comm_buffer_size])
+        if _in_eager_mode():
+            self.group_indices = core.eager_assign_group_by_size(
+                trainable_parameters, is_sparse_gradient,
+                [self.last_comm_buffer_size, self.comm_buffer_size])
+
+            self._reducer = core.EagerReducer(
+                trainable_parameters,
+                list(reversed(self.group_indices)), is_sparse_gradient,
+                self.process_group,
+                [self.last_comm_buffer_size, self.comm_buffer_size],
+                self.find_unused_parameters)
+        else:
+            self.group_indices = core.assign_group_by_size(
+                trainable_parameters, is_sparse_gradient,
+                [self.last_comm_buffer_size, self.comm_buffer_size])
 
-        self._reducer = core.Reducer(
-            trainable_parameters,
-            list(reversed(self.group_indices)), is_sparse_gradient,
-            parallel_helper.__parallel_ctx__clz__,
-            [self.last_comm_buffer_size, self.comm_buffer_size],
-            self.find_unused_parameters)
+            self._reducer = core.Reducer(
+                trainable_parameters,
+                list(reversed(self.group_indices)), is_sparse_gradient,
+                parallel_helper.__parallel_ctx__clz__,
+                [self.last_comm_buffer_size, self.comm_buffer_size],
+                self.find_unused_parameters)
 
     def _find_varbase(self, obj):
-        if isinstance(obj, core.VarBase):
+        var_type = core.eager.Tensor if _in_eager_mode() else core.VarBase
+        if isinstance(obj, var_type):
             return [obj]
         if isinstance(obj, (list, tuple)):
             return itertools.chain(*map(self._find_varbase, obj))
diff --git a/python/paddle/fluid/dygraph/tracer.py b/python/paddle/fluid/dygraph/tracer.py
index e0c594b07aeb519dcb3906cdfc03d9af92117059..d0552ca41f0daf56ce23317dd06cb5744baaff84 100644
--- a/python/paddle/fluid/dygraph/tracer.py
+++ b/python/paddle/fluid/dygraph/tracer.py
@@ -29,6 +29,35 @@ final_state_name_mapping = {
         "x": "X",
         "y": "Y",
         "out": "Out",
+    },
+    "trunc": {
+        "final_op_name": "final_state_trunc",
+        "x": "X",
+        "out": "Out",
+    },
+    "abs": {
+        "final_op_name": "final_state_abs",
+        "x": "X",
+        "out": "Out",
+    },
+    "digamma": {
+        "final_op_name": "final_state_digamma",
+        "x": "X",
+        "out": "Out",
+    },
+    "diagonal": {
+        "final_op_name": "final_state_diagonal",
+        "x": "Input",
+        "offset": "offset",
+        "axis1": "axis1",
+        "axis2": "axis2",
+        "out": "Out",
+    },
+    "one_hot": {
+        "final_op_name": "final_state_one_hot",
+        "x": "X",
+        "num_class": "depth",
+        "out": "Out",
     }
 }
 
@@ -117,7 +146,12 @@ class Tracer(core.Tracer):
                             outputs[retname][j].reconstruct_from_(returns[i][j],
                                                                   False)
                     else:
-                        outputs[retname][0].reconstruct_from_(returns[i], False)
+                        if isinstance(outputs[retname], list):
+                            outputs[retname][0].reconstruct_from_(returns[i],
+                                                                  False)
+                        else:
+                            outputs[retname].reconstruct_from_(returns[i],
+                                                               False)
         elif isinstance(returns, list):
             assert len(outputs.keys()) == 1
             key = list(outputs.keys())[0]
diff --git a/python/paddle/fluid/dygraph/varbase_patch_methods.py b/python/paddle/fluid/dygraph/varbase_patch_methods.py
index 65bfba3f6c32e072a6db0e1d294a8c5fc07d9d74..2b67a2029727f6b8f917239094a1b906d5cd6a62 100644
--- a/python/paddle/fluid/dygraph/varbase_patch_methods.py
+++ b/python/paddle/fluid/dygraph/varbase_patch_methods.py
@@ -94,7 +94,7 @@ def monkey_patch_varbase():
         # Note: getattr(self, attr, None) will call x.grad=x.gradient(), but gradient() only available in dygraph.
         # It will fail. So, for propery that different between dynamic and static graph, should not getattr(self, attr, None).
         attr_not_need_keys = ['grad', 'T']
-        if isinstance(self, ParamBase):
+        if isinstance(self, (ParamBase, EagerParamBase)):
             attr_kwargs = self.__dict__.copy()
         else:
             attr_names = []
@@ -111,7 +111,7 @@ def monkey_patch_varbase():
 
         attr_kwargs.update(kwargs)
 
-        if to_parameter or isinstance(self, ParamBase):
+        if to_parameter or isinstance(self, (ParamBase, EagerParamBase)):
             del attr_kwargs['persistable']
             # NOTE(Aurelius84): All parameters should be placed into global block.
             attr_kwargs['block'] = attr_kwargs['block'].program.global_block()
@@ -311,7 +311,7 @@ def monkey_patch_varbase():
 
         """
         if core._in_eager_mode():
-            if not self.grad._is_initialized():
+            if self.grad is None:
                 return None
             # TODO(wanghuancoder) support SELECTED_ROWS
             return self.grad.numpy()
diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py
index 4bbc0ba03c9342afc4a0d2edee6c2b963ad6e0f8..a48cfd9150c657784f46fc6316a797c767fd5dd4 100644
--- a/python/paddle/fluid/io.py
+++ b/python/paddle/fluid/io.py
@@ -1821,7 +1821,7 @@ def _pack_loaded_dict(load_obj):
 @static_only
 def _legacy_save(param_dict, model_path, protocol=2):
     def get_tensor(var):
-        if isinstance(var, core.VarBase):
+        if isinstance(var, (core.VarBase, core.eager.Tensor)):
             return var.numpy()
         elif isinstance(var, core.LoDTensor):
             return np.array(var)
diff --git a/python/paddle/fluid/layers/layer_function_generator.py b/python/paddle/fluid/layers/layer_function_generator.py
index 56af7e341fdd61558265b1ee4ec1eff0c0f916fa..676ee3e3c774eab144468ce42f3c839daa948e4d 100755
--- a/python/paddle/fluid/layers/layer_function_generator.py
+++ b/python/paddle/fluid/layers/layer_function_generator.py
@@ -20,7 +20,7 @@ import string
 
 from six.moves import cStringIO
 from ..proto import framework_pb2
-from ..framework import OpProtoHolder, Variable, core, convert_np_dtype_to_dtype_, in_dygraph_mode
+from ..framework import OpProtoHolder, Variable, core, convert_np_dtype_to_dtype_, in_dygraph_mode, _in_eager_mode
 from ..layer_helper import LayerHelper
 from ..data_feeder import check_variable_and_dtype
 from paddle import _C_ops
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index f022e1791daefb7cbe18434aae8ac1dbc63d39c5..fd7226c48661fdb2cd4dcf7227d0f8383c6c9439 100755
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -10148,6 +10148,9 @@ def flatten(x, axis=1, name=None):
     check_variable_and_dtype(
         x, 'x', ['float32', 'float64', 'int8', 'int32', 'int64', 'uint8'],
         'flatten')
+    if in_dygraph_mode():
+        return _C_ops.flatten2(x, 'axis', axis)[0]
+
     helper = LayerHelper('flatten', **locals())
 
     if not (isinstance(x, Variable)):
diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py
index 76414ea942465d1e7a54084a2e4ee31b9ee41a2d..c63ad42288fd057d8456f31a675c9f1912bdc12f 100644
--- a/python/paddle/fluid/layers/tensor.py
+++ b/python/paddle/fluid/layers/tensor.py
@@ -663,7 +663,9 @@ def assign(input, output=None):
             })
 
     if is_inplace and in_dygraph_mode():
-        output._bump_inplace_version()
+        # TODO(jiabin): Remove this when we support inplace
+        if not core._in_eager_mode():
+            output._bump_inplace_version()
 
     return output
 
diff --git a/python/paddle/fluid/reader.py b/python/paddle/fluid/reader.py
index 727ceca72d1f1cfc0c34dae4e516568052136ba4..cbea289162c849b0efdca0ebfc48e0151dc1213e 100644
--- a/python/paddle/fluid/reader.py
+++ b/python/paddle/fluid/reader.py
@@ -332,8 +332,6 @@ class DataLoader(object):
         self.use_buffer_reader = use_buffer_reader
         self.worker_init_fn = worker_init_fn
 
-        assert isinstance(dataset, Dataset), \
-            "dataset should be subclass instance of paddle.io.Dataset"
         self.dataset = dataset
 
         if not return_list and not in_dygraph_mode():
diff --git a/python/paddle/fluid/tests/custom_op/custom_relu_op.cc b/python/paddle/fluid/tests/custom_op/custom_relu_op.cc
index c89990be34ca06c6277033bb6b6c0844e7d9a327..acaf7cb74280bed23b20feab2a96aa85a9bb5cea 100644
--- a/python/paddle/fluid/tests/custom_op/custom_relu_op.cc
+++ b/python/paddle/fluid/tests/custom_op/custom_relu_op.cc
@@ -153,6 +153,7 @@ PD_BUILD_GRAD_OP(custom_relu_no_x_in_backward)
     .SetInferShapeFn(PD_INFER_SHAPE(ReluBackwardWithoutXInferShape));
 
 void relu_cpu_forward_out(const paddle::Tensor& x, paddle::Tensor* out) {
+  out->reshape(x.shape());
   PD_DISPATCH_FLOATING_TYPES(
       x.type(), "relu_cpu_forward", ([&] {
         relu_cpu_forward_kernel<data_t>(
@@ -164,6 +165,7 @@ void relu_cpu_backward_out(const paddle::Tensor& x,
                            const paddle::Tensor& out,
                            const paddle::Tensor& grad_out,
                            paddle::Tensor* grad_x) {
+  grad_x->reshape(x.shape());
   PD_DISPATCH_FLOATING_TYPES(out.type(), "relu_cpu_backward", ([&] {
                                relu_cpu_backward_kernel<data_t>(
                                    grad_out.data<data_t>(),
diff --git a/python/paddle/fluid/tests/custom_op/custom_relu_op.cu b/python/paddle/fluid/tests/custom_op/custom_relu_op.cu
index 33c5ede299bd47c87490473920bb80b18cd75bf5..4bb773cdaec21712f262bcb217710f6909efd20a 100644
--- a/python/paddle/fluid/tests/custom_op/custom_relu_op.cu
+++ b/python/paddle/fluid/tests/custom_op/custom_relu_op.cu
@@ -94,6 +94,7 @@ void relu_cuda_forward_out(const paddle::Tensor& x, paddle::Tensor* out) {
   int numel = x.size();
   int block = 512;
   int grid = (numel + block - 1) / block;
+  out->reshape(x.shape());
   PD_DISPATCH_FLOATING_AND_HALF_TYPES(
       x.type(), "relu_cuda_forward_kernel", ([&] {
         relu_cuda_forward_kernel<data_t><<<grid, block, 0, x.stream()>>>(
@@ -108,6 +109,7 @@ void relu_cuda_backward_out(const paddle::Tensor& x,
   int numel = out.size();
   int block = 512;
   int grid = (numel + block - 1) / block;
+  grad_x->reshape(x.shape());
   PD_DISPATCH_FLOATING_AND_HALF_TYPES(
       out.type(), "relu_cuda_backward_kernel", ([&] {
         relu_cuda_backward_kernel<data_t><<<grid, block, 0, x.stream()>>>(
diff --git a/python/paddle/fluid/tests/custom_op/test_custom_attrs_jit.py b/python/paddle/fluid/tests/custom_op/test_custom_attrs_jit.py
index 1c9c6eedbaeb8c1c3f06d42d82a8ec5cc28750f6..785bfc74229817c022f7f9e80481dde156d4e178 100644
--- a/python/paddle/fluid/tests/custom_op/test_custom_attrs_jit.py
+++ b/python/paddle/fluid/tests/custom_op/test_custom_attrs_jit.py
@@ -20,6 +20,7 @@ import paddle
 from paddle.utils.cpp_extension import load, get_build_directory
 from utils import paddle_includes, extra_cc_args, extra_nvcc_args
 from paddle.utils.cpp_extension.extension_utils import run_cmd
+from paddle.fluid.framework import _test_eager_guard, _in_eager_mode
 
 # Because Windows don't use docker, the shared lib already exists in the 
 # cache dir, it will not be compiled again unless the shared lib is removed.
@@ -53,7 +54,7 @@ class TestJitCustomAttrs(unittest.TestCase):
         self.int64_vec_attr = [10000000000, 10000000000, 10000000000]
         self.str_vec_attr = ["StrAttr", "StrAttr", "StrAttr"]
 
-    def test_attr_value(self):
+    def func_attr_value(self):
         x = paddle.ones([2, 2], dtype='float32')
         x.stop_gradient = False
         out = custom_attrs.attr_test(
@@ -65,7 +66,12 @@ class TestJitCustomAttrs(unittest.TestCase):
 
         self.assertTrue(np.array_equal(x.numpy(), out.numpy()))
 
-    def test_const_attr_value(self):
+    def test_attr_value(self):
+        with _test_eager_guard():
+            self.func_attr_value()
+        self.func_attr_value()
+
+    def func_const_attr_value(self):
         x = paddle.ones([2, 2], dtype='float32')
         x.stop_gradient = False
         out = custom_attrs.const_attr_test(
@@ -77,6 +83,11 @@ class TestJitCustomAttrs(unittest.TestCase):
 
         self.assertTrue(np.array_equal(x.numpy(), out.numpy()))
 
+    def test_const_attr_value(self):
+        with _test_eager_guard():
+            self.func_const_attr_value()
+        self.func_const_attr_value()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/custom_op/test_custom_concat.py b/python/paddle/fluid/tests/custom_op/test_custom_concat.py
index 9049b604c910c80a41afa2c509e2ec3fdb4ffbfc..62e61c5bc7f5f235c03d146bd77fee41948a2a05 100644
--- a/python/paddle/fluid/tests/custom_op/test_custom_concat.py
+++ b/python/paddle/fluid/tests/custom_op/test_custom_concat.py
@@ -21,6 +21,7 @@ import paddle.static as static
 from paddle.utils.cpp_extension import load, get_build_directory
 from paddle.utils.cpp_extension.extension_utils import run_cmd
 from utils import paddle_includes, extra_cc_args, extra_nvcc_args
+from paddle.fluid.framework import _test_eager_guard, _in_eager_mode
 
 # Because Windows don't use docker, the shared lib already exists in the
 # cache dir, it will not be compiled again unless the shared lib is removed.
@@ -116,7 +117,7 @@ class TestCustomConcatDynamicAxisJit(unittest.TestCase):
             "custom op {}: {},\n paddle api {}: {}".format(name, out, name,
                                                            pd_out))
 
-    def test_dynamic(self):
+    def func_dynamic(self):
         for dtype in self.dtypes:
             for axis in self.axises:
                 out, grad_inputs = concat_dynamic(custom_ops.custom_concat,
@@ -128,6 +129,11 @@ class TestCustomConcatDynamicAxisJit(unittest.TestCase):
                 for x_grad, pd_x_grad in zip(grad_inputs, pd_grad_inputs):
                     self.check_output(x_grad, pd_x_grad, "x_grad")
 
+    def test_dynamic(self):
+        with _test_eager_guard():
+            self.func_dynamic()
+        self.func_dynamic()
+
     def test_static(self):
         for dtype in self.dtypes:
             for axis in self.axises:
@@ -140,7 +146,7 @@ class TestCustomConcatDynamicAxisJit(unittest.TestCase):
                 self.check_output(x1_grad, pd_x1_grad, "x1_grad")
                 self.check_output(x2_grad, pd_x2_grad, "x2_grad")
 
-    def test_dynamic_with_attr(self):
+    def func_dynamic_with_attr(self):
         for dtype in self.dtypes:
             for axis in self.axises:
                 out, grad_inputs = concat_dynamic(
@@ -153,6 +159,11 @@ class TestCustomConcatDynamicAxisJit(unittest.TestCase):
                 for x_grad, pd_x_grad in zip(grad_inputs, pd_grad_inputs):
                     self.check_output(x_grad, pd_x_grad, "x_grad")
 
+    def test_dynamic_with_attr(self):
+        with _test_eager_guard():
+            self.func_dynamic_with_attr()
+        self.func_dynamic_with_attr()
+
     def test_static_with_attr(self):
         for dtype in self.dtypes:
             for axis in self.axises:
diff --git a/python/paddle/fluid/tests/custom_op/test_custom_conj.py b/python/paddle/fluid/tests/custom_op/test_custom_conj.py
index 25c88ee6c6b01daf62553f5f7857bfe06fce25ca..5f3c107a9b22ad2014bd5e2488c0f48a6866fad8 100644
--- a/python/paddle/fluid/tests/custom_op/test_custom_conj.py
+++ b/python/paddle/fluid/tests/custom_op/test_custom_conj.py
@@ -21,6 +21,7 @@ import paddle.static as static
 from paddle.utils.cpp_extension import load, get_build_directory
 from paddle.utils.cpp_extension.extension_utils import run_cmd
 from utils import paddle_includes, extra_cc_args, extra_nvcc_args
+from paddle.fluid.framework import _test_eager_guard
 
 # Because Windows don't use docker, the shared lib already exists in the
 # cache dir, it will not be compiled again unless the shared lib is removed.
@@ -116,11 +117,16 @@ class TestCustomConjJit(unittest.TestCase):
         self.check_output(out, pd_out, "out")
         self.check_output(x_grad, pd_x_grad, "x's grad")
 
-    def test_dynamic(self):
+    def func_dynamic(self):
         for dtype in self.dtypes:
             np_input = np.random.random(self.shape).astype(dtype)
             self.run_dynamic(dtype, np_input)
 
+    def test_dynamic(self):
+        with _test_eager_guard():
+            self.func_dynamic()
+        self.func_dynamic()
+
     def test_static(self):
         for dtype in self.dtypes:
             np_input = np.random.random(self.shape).astype(dtype)
diff --git a/python/paddle/fluid/tests/custom_op/test_custom_linear.py b/python/paddle/fluid/tests/custom_op/test_custom_linear.py
index 0ba70eaa3e06cec619b2a9175db4aa1c8bf75a8b..811eedf1edaf39c961f9fd292054c2cce5154db9 100644
--- a/python/paddle/fluid/tests/custom_op/test_custom_linear.py
+++ b/python/paddle/fluid/tests/custom_op/test_custom_linear.py
@@ -22,6 +22,7 @@ import paddle.nn.functional as F
 from paddle.utils.cpp_extension import load, get_build_directory
 from paddle.utils.cpp_extension.extension_utils import run_cmd
 from utils import paddle_includes, extra_cc_args, extra_nvcc_args
+from paddle.fluid.framework import _test_eager_guard, _in_eager_mode
 
 # Because Windows don't use docker, the shared lib already exists in the
 # cache dir, it will not be compiled again unless the shared lib is removed.
@@ -94,7 +95,7 @@ class TestCustomLinearJit(unittest.TestCase):
                                    self.np_bias)
             self.check_output(pten_out, pd_out, "pten_out")
 
-    def test_dynamic(self):
+    def func_dynamic(self):
         for dtype in self.dtypes:
             pten_out = linear_dynamic(custom_ops.pten_linear, dtype, self.np_x,
                                       self.np_weight, self.np_bias)
@@ -102,6 +103,11 @@ class TestCustomLinearJit(unittest.TestCase):
                                     self.np_bias)
             self.check_output(pten_out, pd_out, "pten_out")
 
+    def test_dynamic(self):
+        with _test_eager_guard():
+            self.func_dynamic()
+        self.func_dynamic()
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/custom_op/test_custom_raw_op_kernel_op.py b/python/paddle/fluid/tests/custom_op/test_custom_raw_op_kernel_op.py
index 207ea87974130698b8bf491bce3cd753045a97e9..4da99b1ea10418c6cb6baddb51596b307c6ba28d 100644
--- a/python/paddle/fluid/tests/custom_op/test_custom_raw_op_kernel_op.py
+++ b/python/paddle/fluid/tests/custom_op/test_custom_raw_op_kernel_op.py
@@ -68,12 +68,6 @@ class TestCustomRawReluOp(unittest.TestCase):
         self.assertTrue(custom_raw_relu_op is not None)
         return custom_raw_relu_op(x)
 
-    def test_dygraph(self):
-        x = paddle.to_tensor(np.random.uniform(low=-1.0, high=1.0, size=[2, 3]))
-        y1 = self.custom_raw_relu(x)
-        y2 = paddle.nn.ReLU()(x)
-        self.assertTrue(np.array_equal(y1.numpy(), y2.numpy()))
-
     def test_static(self):
         paddle.enable_static()
         shape = [2, 3]
diff --git a/python/paddle/fluid/tests/custom_op/test_custom_relu_model.py b/python/paddle/fluid/tests/custom_op/test_custom_relu_model.py
index dddb14eb78e8a1dc6a0820ead2ebfa915b8a09c2..81793f1391d0422393c1f6c1e719f708112d3b6b 100644
--- a/python/paddle/fluid/tests/custom_op/test_custom_relu_model.py
+++ b/python/paddle/fluid/tests/custom_op/test_custom_relu_model.py
@@ -22,6 +22,7 @@ from paddle.utils.cpp_extension import load, get_build_directory
 from paddle.utils.cpp_extension.extension_utils import run_cmd
 
 from utils import paddle_includes, extra_cc_args, extra_nvcc_args, IS_MAC
+from paddle.fluid.framework import _test_eager_guard, _in_eager_mode
 
 # Because Windows don't use docker, the shared lib already exists in the
 # cache dir, it will not be compiled again unless the shared lib is removed.
@@ -98,7 +99,7 @@ class TestDygraphModel(unittest.TestCase):
         self.x_spec = paddle.static.InputSpec(
             shape=[None, self.in_dim], dtype='float32', name='x')
 
-    def test_train_eval(self):
+    def func_train_eval(self):
         for device in self.devices:
             # set device
             paddle.set_device(device)
@@ -106,26 +107,34 @@ class TestDygraphModel(unittest.TestCase):
             # for train
             origin_relu_train_out = self.train_model(use_custom_op=False)
             custom_relu_train_out = self.train_model(use_custom_op=True)
-            custom_relu_dy2stat_train_out = self.train_model(
-                use_custom_op=True, dy2stat=True)  # for to_static
+            # open this when dy2stat is ready for eager 
+            if not _in_eager_mode():
+                custom_relu_dy2stat_train_out = self.train_model(
+                    use_custom_op=True, dy2stat=True)  # for to_static
+                self.assertTrue(
+                    np.array_equal(origin_relu_train_out,
+                                   custom_relu_dy2stat_train_out))
 
             self.assertTrue(
                 np.array_equal(origin_relu_train_out, custom_relu_train_out))
-            self.assertTrue(
-                np.array_equal(origin_relu_train_out,
-                               custom_relu_dy2stat_train_out))
 
             # for eval
             origin_relu_eval_out = self.eval_model(use_custom_op=False)
             custom_relu_eval_out = self.eval_model(use_custom_op=True)
-            custom_relu_dy2stat_eval_out = self.eval_model(
-                use_custom_op=True, dy2stat=True)  # for to_static
+            if not _in_eager_mode():
+                custom_relu_dy2stat_eval_out = self.eval_model(
+                    use_custom_op=True, dy2stat=True)  # for to_static
+                self.assertTrue(
+                    np.array_equal(origin_relu_eval_out,
+                                   custom_relu_dy2stat_eval_out))
 
             self.assertTrue(
                 np.array_equal(origin_relu_eval_out, custom_relu_eval_out))
-            self.assertTrue(
-                np.array_equal(origin_relu_eval_out,
-                               custom_relu_dy2stat_eval_out))
+
+    def test_train_eval(self):
+        with _test_eager_guard():
+            self.func_train_eval()
+        self.func_train_eval()
 
     def train_model(self, use_custom_op=False, dy2stat=False):
         # reset random seed
diff --git a/python/paddle/fluid/tests/custom_op/test_custom_relu_op_jit.py b/python/paddle/fluid/tests/custom_op/test_custom_relu_op_jit.py
index 407eb342ba99ba09d0f6faa8686e12c2a1100cdb..a747d10823ec5572e73e7ec8ae3e5da528e3e88d 100644
--- a/python/paddle/fluid/tests/custom_op/test_custom_relu_op_jit.py
+++ b/python/paddle/fluid/tests/custom_op/test_custom_relu_op_jit.py
@@ -20,7 +20,7 @@ from paddle.utils.cpp_extension import load, get_build_directory
 from paddle.utils.cpp_extension.extension_utils import run_cmd
 from utils import paddle_includes, extra_cc_args, extra_nvcc_args, IS_WINDOWS, IS_MAC
 from test_custom_relu_op_setup import custom_relu_dynamic, custom_relu_static
-
+from paddle.fluid.framework import _test_eager_guard, _in_eager_mode
 # Because Windows don't use docker, the shared lib already exists in the
 # cache dir, it will not be compiled again unless the shared lib is removed.
 file = '{}\\custom_relu_module_jit\\custom_relu_module_jit.pyd'.format(
@@ -75,7 +75,7 @@ class TestJITLoad(unittest.TestCase):
                         "custom op out: {},\n paddle api out: {}".format(
                             out, pd_out))
 
-    def test_dynamic(self):
+    def func_dynamic(self):
         for device in self.devices:
             for dtype in self.dtypes:
                 if device == 'cpu' and dtype == 'float16':
@@ -95,8 +95,14 @@ class TestJITLoad(unittest.TestCase):
                         "custom op x grad: {},\n paddle api x grad: {}".format(
                             x_grad, pd_x_grad))
 
-    def test_exception(self):
+    def test_dynamic(self):
+        with _test_eager_guard():
+            self.func_dynamic()
+        self.func_dynamic()
+
+    def func_exception(self):
         caught_exception = False
+        # if not _in_eager_mode():
         try:
             x = np.random.uniform(-1, 1, [4, 8]).astype('int32')
             custom_relu_dynamic(custom_module.custom_relu, 'cpu', 'int32', x)
@@ -114,11 +120,11 @@ class TestJITLoad(unittest.TestCase):
                     "python/paddle/fluid/tests/custom_op/custom_relu_op.cc" in
                     str(e))
         self.assertTrue(caught_exception)
-
         caught_exception = False
         # MAC-CI don't support GPU
         if IS_MAC:
             return
+        # if not _in_eager_mode():
         try:
             x = np.random.uniform(-1, 1, [4, 8]).astype('int32')
             custom_relu_dynamic(custom_module.custom_relu, 'gpu', 'int32', x)
@@ -132,6 +138,11 @@ class TestJITLoad(unittest.TestCase):
                 str(e))
         self.assertTrue(caught_exception)
 
+    def test_exception(self):
+        with _test_eager_guard():
+            self.func_exception()
+        self.func_exception()
+
     def test_load_multiple_module(self):
         custom_module = load(
             name='custom_conj_jit',
diff --git a/python/paddle/fluid/tests/custom_op/test_custom_relu_op_setup.py b/python/paddle/fluid/tests/custom_op/test_custom_relu_op_setup.py
index 0af0aa16466ea82eeb4a9558bdbcb3de69489bb4..7c61e11a18ecd2ebbcd87fae37a8ba0a39ad56d1 100644
--- a/python/paddle/fluid/tests/custom_op/test_custom_relu_op_setup.py
+++ b/python/paddle/fluid/tests/custom_op/test_custom_relu_op_setup.py
@@ -21,6 +21,7 @@ import paddle.static as static
 import subprocess
 import numpy as np
 from paddle.utils.cpp_extension.extension_utils import run_cmd
+from paddle.fluid.framework import _test_eager_guard
 
 
 def custom_relu_dynamic(func, device, dtype, np_x, use_func=True):
@@ -216,7 +217,7 @@ class TestNewCustomOpSetUpInstall(unittest.TestCase):
                         "custom op out: {},\n paddle api out: {}".format(
                             out, pd_out))
 
-    def test_dynamic(self):
+    def func_dynamic(self):
         for device in self.devices:
             for dtype in self.dtypes:
                 if device == 'cpu' and dtype == 'float16':
@@ -236,6 +237,11 @@ class TestNewCustomOpSetUpInstall(unittest.TestCase):
                         "custom op x grad: {},\n paddle api x grad: {}".format(
                             x_grad, pd_x_grad))
 
+    def test_dynamic(self):
+        with _test_eager_guard():
+            self.func_dynamic()
+        self.func_dynamic()
+
     def test_static_save_and_load_inference_model(self):
         paddle.enable_static()
         np_data = np.random.random((1, 1, 28, 28)).astype("float32")
diff --git a/python/paddle/fluid/tests/custom_op/test_custom_simple_slice.py b/python/paddle/fluid/tests/custom_op/test_custom_simple_slice.py
index c60bac4060b643c01be87d82c1fcde4a8ae4be7e..f68a37b1a2f3b87f4126953ae50464d5ad8e6fe3 100644
--- a/python/paddle/fluid/tests/custom_op/test_custom_simple_slice.py
+++ b/python/paddle/fluid/tests/custom_op/test_custom_simple_slice.py
@@ -20,6 +20,7 @@ import paddle
 from paddle.utils.cpp_extension import load, get_build_directory
 from paddle.utils.cpp_extension.extension_utils import run_cmd
 from utils import paddle_includes, extra_cc_args, extra_nvcc_args
+from paddle.fluid.framework import _test_eager_guard, _in_eager_mode
 
 # Because Windows don't use docker, the shared lib already exists in the
 # cache dir, it will not be compiled again unless the shared lib is removed.
@@ -39,7 +40,7 @@ custom_ops = load(
 
 
 class TestCustomSimpleSliceJit(unittest.TestCase):
-    def test_slice_output(self):
+    def func_slice_output(self):
         np_x = np.random.random((5, 2)).astype("float32")
         x = paddle.to_tensor(np_x)
         custom_op_out = custom_ops.custom_simple_slice(x, 2, 3)
@@ -48,6 +49,11 @@ class TestCustomSimpleSliceJit(unittest.TestCase):
             np.array_equal(custom_op_out, np_out),
             "custom op: {},\n numpy: {}".format(np_out, custom_op_out.numpy()))
 
+    def test_slice_output(self):
+        with _test_eager_guard():
+            self.func_slice_output()
+        self.func_slice_output()
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/custom_op/test_dispatch_jit.py b/python/paddle/fluid/tests/custom_op/test_dispatch_jit.py
index 12e9f50a5e4092a067c533bcdb6bcb03011d35fa..0d2cb941eafaa188c7be458b08f0f5ef35ab6238 100644
--- a/python/paddle/fluid/tests/custom_op/test_dispatch_jit.py
+++ b/python/paddle/fluid/tests/custom_op/test_dispatch_jit.py
@@ -19,7 +19,7 @@ import numpy as np
 from paddle.utils.cpp_extension import load, get_build_directory
 from utils import paddle_includes, extra_cc_args
 from paddle.utils.cpp_extension.extension_utils import run_cmd
-
+from paddle.fluid.framework import _test_eager_guard
 # Because Windows don't use docker, the shared lib already exists in the 
 # cache dir, it will not be compiled again unless the shared lib is removed.
 file = '{}\\dispatch_op\\dispatch_op.pyd'.format(get_build_directory())
@@ -39,7 +39,7 @@ class TestJitDispatch(unittest.TestCase):
     def setUp(self):
         paddle.set_device('cpu')
 
-    def run_dispatch_test(self, func, dtype):
+    def run_dispatch_test_impl(self, func, dtype):
         np_x = np.ones([2, 2]).astype(dtype)
         x = paddle.to_tensor(np_x)
         out = func(x)
@@ -50,6 +50,11 @@ class TestJitDispatch(unittest.TestCase):
             np.array_equal(np_x, np_out),
             "custom op x: {},\n custom op out: {}".format(np_x, np_out))
 
+    def run_dispatch_test(self, func, dtype):
+        with _test_eager_guard():
+            self.run_dispatch_test_impl(func, dtype)
+        self.run_dispatch_test_impl(func, dtype)
+
     def test_dispatch_integer(self):
         dtypes = ["int32", "int64", "int8", "uint8", "int16"]
         for dtype in dtypes:
diff --git a/python/paddle/fluid/tests/custom_op/test_multi_out_jit.py b/python/paddle/fluid/tests/custom_op/test_multi_out_jit.py
index 97b37498c4d3d0a6b2c336c240aaf116b11e0407..4fc9270b0f44cc5778775bdab4c2b7cab95c8c3a 100644
--- a/python/paddle/fluid/tests/custom_op/test_multi_out_jit.py
+++ b/python/paddle/fluid/tests/custom_op/test_multi_out_jit.py
@@ -22,7 +22,7 @@ from paddle.utils.cpp_extension import load
 from paddle.utils.cpp_extension import load, get_build_directory
 from paddle.utils.cpp_extension.extension_utils import run_cmd
 from utils import paddle_includes, extra_cc_args
-
+from paddle.fluid.framework import _test_eager_guard
 # Because Windows don't use docker, the shared lib already exists in the 
 # cache dir, it will not be compiled again unless the shared lib is removed.
 file = '{}\\multi_out_jit\\multi_out_jit.pyd'.format(get_build_directory())
@@ -84,7 +84,7 @@ class TestMultiOutputDtypes(unittest.TestCase):
                 self.check_multi_outputs(res)
         paddle.disable_static()
 
-    def test_dynamic(self):
+    def func_dynamic(self):
         for device in self.devices:
             for dtype in self.dtypes:
                 paddle.set_device(device)
@@ -95,6 +95,11 @@ class TestMultiOutputDtypes(unittest.TestCase):
                 self.assertTrue(len(outs) == 3)
                 self.check_multi_outputs(outs, True)
 
+    def test_dynamic(self):
+        with _test_eager_guard():
+            self.func_dynamic()
+        self.func_dynamic()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt
index 1443eebf29384799154ac88083bd511bea6daf71..cbe360f556cd986646bc7f45b3a80ab0f5edb9eb 100755
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -23,6 +23,8 @@ list(APPEND DIST_TEST_OPS test_parallel_dygraph_mnist)
 list(APPEND DIST_TEST_OPS test_pipeline)
 list(APPEND DIST_TEST_OPS test_ir_pass_pipeline)
 list(APPEND DIST_TEST_OPS test_static_model_parallel)
+list(APPEND DIST_TEST_OPS test_static_model_parallel_fused_feedforward)
+list(APPEND DIST_TEST_OPS test_static_model_parallel_fused_attention)
 list(APPEND DIST_TEST_OPS test_parallel_dygraph_se_resnext)
 list(APPEND DIST_TEST_OPS test_parallel_dygraph_sparse_embedding)
 list(APPEND DIST_TEST_OPS test_parallel_dygraph_sparse_embedding_over_height)
@@ -46,6 +48,7 @@ list(APPEND DIST_TEST_OPS test_parallel_dygraph_sharding_parallel)
 list(APPEND DIST_TEST_OPS test_dygraph_sharding_optimizer_stage2)
 list(APPEND DIST_TEST_OPS test_dygraph_sharding_stage2)
 list(APPEND DIST_TEST_OPS test_dygraph_sharding_stage3)
+list(APPEND DIST_TEST_OPS test_dygraph_group_sharded_api)
 list(APPEND DIST_TEST_OPS test_auto_parallel_parallelizer)
 list(APPEND DIST_TEST_OPS test_parallel_dygraph_mp_layers)
 list(APPEND DIST_TEST_OPS test_hybrid_parallel_inference_helper)
@@ -125,6 +128,17 @@ if(NOT WITH_GPU)
     LIST(REMOVE_ITEM TEST_OPS test_fused_attention_op)
     LIST(REMOVE_ITEM TEST_OPS test_fused_attention_op_api)
     LIST(REMOVE_ITEM TEST_OPS test_fused_transformer_encoder_layer)
+    LIST(REMOVE_ITEM TEST_OPS test_fused_gemm_epilogue_op)
+    LIST(REMOVE_ITEM TEST_OPS test_fused_gemm_epilogue_grad_op)
+    LIST(REMOVE_ITEM TEST_OPS test_fuse_gemm_epilogue_pass)
+endif()
+
+if (WITH_GPU)
+    if (CUDA_VERSION LESS 11.6)
+        LIST(REMOVE_ITEM TEST_OPS test_fused_gemm_epilogue_op)
+        LIST(REMOVE_ITEM TEST_OPS test_fused_gemm_epilogue_grad_op)
+        LIST(REMOVE_ITEM TEST_OPS test_fuse_gemm_epilogue_pass)
+    endif()
 endif()
 
 if(((NOT WITH_ROCM) AND (NOT WITH_GPU)) OR WIN32)
@@ -270,6 +284,7 @@ if ((NOT WITH_GPU) AND (NOT WITH_ROCM))
     list(REMOVE_ITEM TEST_OPS test_dygraph_sharding_optimizer_stage2)
     list(REMOVE_ITEM TEST_OPS test_dygraph_sharding_stage2)
     list(REMOVE_ITEM TEST_OPS test_dygraph_sharding_stage3)
+    list(REMOVE_ITEM TEST_OPS test_dygraph_group_sharded_api)
     list(REMOVE_ITEM TEST_OPS test_auto_parallel_parallelizer)
     list(REMOVE_ITEM TEST_OPS test_parallel_dygraph_mp_layers)
     LIST(REMOVE_ITEM TEST_OPS test_imperative_auto_mixed_precision)
@@ -542,6 +557,7 @@ if (APPLE OR WIN32)
   list(REMOVE_ITEM TEST_OPS test_multiprocess_dataloader_exception)
   list(REMOVE_ITEM TEST_OPS test_multiprocess_dataloader_iterable_dataset)
   list(REMOVE_ITEM TEST_OPS test_multiprocess_dataloader_dataset)
+  list(REMOVE_ITEM TEST_OPS test_paddle_multiprocessing)
 endif()
 
 if (NOT WITH_GLOO)
@@ -933,6 +949,7 @@ if (WITH_DISTRIBUTE AND NOT APPLE)
 endif()
 
 # setting timeout value as 15S
+set_tests_properties(test_run PROPERTIES TIMEOUT 200)
 set_tests_properties(test_sync_batch_norm_op PROPERTIES TIMEOUT 120)
 set_tests_properties(test_cross_op PROPERTIES TIMEOUT 120)
 set_tests_properties(test_imperative_lod_tensor_to_selected_rows PROPERTIES TIMEOUT 200)
@@ -1102,15 +1119,16 @@ set_tests_properties(test_split_program PROPERTIES TIMEOUT 120)
 if(WITH_DISTRIBUTE AND WITH_GPU AND WITH_NCCL)
     set_tests_properties(test_parallel_dygraph_dataparallel PROPERTIES TIMEOUT 120)
     set_tests_properties(test_parallel_dygraph_unused_variables PROPERTIES TIMEOUT 120)
-    set_tests_properties(test_parallel_dygraph_control_flow PROPERTIES TIMEOUT 120)
+    set_tests_properties(test_parallel_dygraph_control_flow PROPERTIES TIMEOUT 200)
     set_tests_properties(test_parallel_dygraph_no_sync PROPERTIES TIMEOUT 120)
     set_tests_properties(test_parallel_dygraph_no_sync_gradient_check PROPERTIES TIMEOUT 30)
-    set_tests_properties(test_parallel_dygraph_pipeline_parallel PROPERTIES TIMEOUT 120)
+    set_tests_properties(test_parallel_dygraph_pipeline_parallel PROPERTIES TIMEOUT 200)
     set_tests_properties(test_parallel_dygraph_tensor_parallel PROPERTIES TIMEOUT 200)
     set_tests_properties(test_parallel_dygraph_sharding_parallel PROPERTIES TIMEOUT 120)
     set_tests_properties(test_dygraph_sharding_optimizer_stage2 PROPERTIES TIMEOUT 120)
     set_tests_properties(test_dygraph_sharding_stage2 PROPERTIES TIMEOUT 120)
     set_tests_properties(test_dygraph_sharding_stage3 PROPERTIES TIMEOUT 120)
+    set_tests_properties(test_dygraph_group_sharded_api PROPERTIES TIMEOUT 120)
     set_tests_properties(test_auto_parallel_parallelizer PROPERTIES TIMEOUT 120)
     set_tests_properties(test_parallel_dygraph_mp_layers PROPERTIES TIMEOUT 120)
     set_tests_properties(test_hybrid_parallel_inference_helper PROPERTIES TIMEOUT 120)
@@ -1139,6 +1157,8 @@ if((WITH_ROCM OR WITH_GPU) AND NOT WIN32)
         set_tests_properties(test_pipeline PROPERTIES TIMEOUT 120)
         set_tests_properties(test_ir_pass_pipeline PROPERTIES TIMEOUT 120)
         set_tests_properties(test_static_model_parallel PROPERTIES TIMEOUT 240)
+        set_tests_properties(test_static_model_parallel_fused_feedforward PROPERTIES TIMEOUT 120)
+        set_tests_properties(test_static_model_parallel_fused_attention PROPERTIES TIMEOUT 120)
         set_tests_properties(test_collective_split_embedding
             test_collective_split_embedding_none_divisible
             test_collective_split_row_linear
@@ -1156,6 +1176,7 @@ if((WITH_ROCM OR WITH_GPU) AND NOT WIN32)
             test_collective_global_scatter
             PROPERTIES LABELS "RUN_TYPE=DIST")
     endif()
+    set_tests_properties(test_paddle_multiprocessing PROPERTIES TIMEOUT 120)
     set_tests_properties(test_reducescatter_api PROPERTIES TIMEOUT 120)
     set_tests_properties(test_broadcast PROPERTIES TIMEOUT 120)
     set_tests_properties(test_reducescatter PROPERTIES TIMEOUT 120)
diff --git a/python/paddle/fluid/tests/unittests/asp/test_fleet_with_asp_sharding.py b/python/paddle/fluid/tests/unittests/asp/test_fleet_with_asp_sharding.py
index 26170015ae8c249fb3a36d13285f5b34491acb3a..d9ddd6c88d727a4cca5e94cf19b122355f3ea6c5 100644
--- a/python/paddle/fluid/tests/unittests/asp/test_fleet_with_asp_sharding.py
+++ b/python/paddle/fluid/tests/unittests/asp/test_fleet_with_asp_sharding.py
@@ -98,7 +98,7 @@ class TestFleetWithASPSharding(unittest.TestCase):
         feeder = fluid.DataFeeder(feed_list=[input_x, input_y], place=place)
         exe.run(startup_prog)
 
-        sparsity.prune_model(train_prog, sharding=True)
+        sparsity.prune_model(train_prog)
 
         data = (np.random.randn(64, 32), np.random.randint(2, size=(64, 1)))
         exe.run(train_prog, feed=feeder.feed([data]))
diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt b/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt
index 0a9eaf34ba512b3dd8649453e6e8d4ed25154c89..1f7ae53acdf4536921ca25e27874279f271b4de8 100644
--- a/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt
@@ -5,7 +5,10 @@ if(WITH_DISTRIBUTE AND WITH_GPU)
     set_tests_properties(test_auto_parallel_relaunch PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 120)
     py_test_modules(test_relaunch_with_planner MODULES test_relaunch_with_planner ENVS ${dist_ENVS})
     set_tests_properties(test_relaunch_with_planner PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 120)
-    py_test_modules(test_relaunch_with_gpt_planner MODULES test_relaunch_with_planner ENVS ${dist_ENVS})
+    py_test_modules(test_relaunch_with_gpt_planner MODULES test_relaunch_with_gpt_planner ENVS ${dist_ENVS})
     set_tests_properties(test_relaunch_with_gpt_planner PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 240)
     py_test_modules(test_engine_api MODULES test_engine_api ENVS ${dist_ENVS})
+    set_tests_properties(test_engine_api PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 80)
+    py_test_modules(test_converter MODULES test_converter ENVS ${dist_ENVS})
+    set_tests_properties(test_converter PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 50)
 endif()
diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/converter.py b/python/paddle/fluid/tests/unittests/auto_parallel/converter.py
new file mode 100644
index 0000000000000000000000000000000000000000..e34f267b4237bf5ebe19adda1c90f1c147294333
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/auto_parallel/converter.py
@@ -0,0 +1,83 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+
+import paddle
+from paddle.distributed.auto_parallel.converter import Converter
+
+
+def test_convert():
+    rank_id = paddle.distributed.get_rank()
+    complete_tensor = np.arange(64).reshape([8, 8])
+    tensor_row = np.split(complete_tensor, 2, axis=0)
+    tensor_col = np.split(complete_tensor, 2, axis=1)
+    tensor_name = "tensor_0"
+    complet_strategy = {
+        tensor_name: {
+            "process_shape": [2],
+            "process_group": [0, 1],
+            "dims_mapping": [-1, -1]
+        }
+    }
+    row_strategy = {
+        tensor_name: {
+            "process_shape": [2],
+            "process_group": [0, 1],
+            "dims_mapping": [0, -1]
+        }
+    }
+    col_strategy = {
+        tensor_name: {
+            "process_shape": [2],
+            "process_group": [0, 1],
+            "dims_mapping": [-1, 0]
+        }
+    }
+
+    # test merge
+    tensor_dict = {tensor_name: tensor_row}
+    converter = Converter(tensor_dict, row_strategy, complet_strategy)
+    convert_tensor_dict = converter.convert()
+    assert np.equal(convert_tensor_dict[tensor_name], complete_tensor).all()
+
+    # test slice
+    tensor_dict = {tensor_name: [complete_tensor]}
+    converter = Converter(tensor_dict, complet_strategy, col_strategy)
+    convert_tensor_dict = converter.convert()
+    assert np.equal(convert_tensor_dict[tensor_name], tensor_col[rank_id]).all()
+
+    # test merge and slice
+    tensor_dict = {tensor_name: tensor_col}
+    converter = Converter(tensor_dict, col_strategy, row_strategy)
+    convert_tensor_dict = converter.convert()
+    assert np.equal(convert_tensor_dict[tensor_name], tensor_row[rank_id]).all()
+
+    # test merge and slice with prefix match
+    new_name = "tensor_1"
+    row_strategy = {
+        new_name: {
+            "process_shape": [2],
+            "process_group": [0, 1],
+            "dims_mapping": [0, -1]
+        }
+    }
+    converter = Converter(tensor_dict, col_strategy, row_strategy)
+    convert_tensor_dict = converter.convert(strict=False)
+    assert np.equal(convert_tensor_dict[new_name], tensor_row[rank_id]).all()
+
+
+if __name__ == "__main__":
+    test_convert()
diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/engine_api.py b/python/paddle/fluid/tests/unittests/auto_parallel/engine_api.py
new file mode 100644
index 0000000000000000000000000000000000000000..8c71c792bf07d0ade5bb024d8087407cde010a6f
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/auto_parallel/engine_api.py
@@ -0,0 +1,132 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import time
+import paddle.fluid as fluid
+import copy
+import os
+import numpy as np
+import subprocess
+import paddle
+import paddle.nn as nn
+import paddle.fluid as fluid
+import paddle.static as static
+import paddle.nn.functional as F
+import paddle.utils as utils
+from paddle.fluid import layers
+from paddle.io import Dataset, IterableDataset, DataLoader
+from paddle.static import InputSpec
+from paddle.distributed import fleet
+import paddle.distributed.auto_parallel as auto
+from paddle.distributed.auto_parallel.engine import Engine
+
+paddle.enable_static()
+global_process_mesh = auto.ProcessMesh(mesh=[0, 1])
+PP_MESH_0 = auto.ProcessMesh([0])
+PP_MESH_1 = auto.ProcessMesh([1])
+batch_size = 1
+batch_num = 10
+hidden_size = 1024
+sequence_len = 512
+image_size = hidden_size
+class_num = 10
+
+paddle.seed(44)
+
+
+class MyDataset(Dataset):
+    def __init__(self, num_samples):
+        super(MyDataset, self).__init__()
+        self.num_samples = num_samples
+
+    def __getitem__(self, index):
+        input = np.random.uniform(size=image_size).astype("float32")
+        label = np.random.randint(0, class_num - 1, dtype="int64")
+        return input, label
+
+    def __len__(self):
+        return self.num_samples
+
+
+class MLPLayer(nn.Layer):
+    def __init__(self,
+                 hidden_size=1024,
+                 intermediate_size=4 * 1024,
+                 dropout_ratio=0.1,
+                 initializer_range=0.02):
+        super(MLPLayer, self).__init__()
+        d_model = hidden_size
+        dim_feedforward = intermediate_size
+        weight_attr = paddle.ParamAttr(initializer=nn.initializer.Normal(
+            mean=0.0, std=initializer_range))
+        bias_attr = None
+
+        self.linear0 = nn.Linear(
+            d_model, dim_feedforward, weight_attr, bias_attr=bias_attr)
+        self.linear1 = nn.Linear(
+            dim_feedforward, d_model, weight_attr, bias_attr=bias_attr)
+        self.linear2 = nn.Linear(d_model, 1, weight_attr, bias_attr=bias_attr)
+        self.norm = nn.LayerNorm(d_model, epsilon=1e-5)
+        self.dropout = nn.Dropout(dropout_ratio, mode="upscale_in_train")
+
+    def forward(self, input):
+        out = auto.shard_op(
+            self.norm, dist_attr={"process_mesh": PP_MESH_0})(input)[0]
+        out = self.linear0(input)
+        out = F.gelu(out, approximate=True)
+        out = auto.shard_op(
+            self.linear1, dist_attr={"process_mesh": PP_MESH_1})(out)[0]
+        out = self.dropout(out)
+        out = self.linear2(out)
+        return out
+
+
+def train():
+    mlp = MLPLayer(
+        hidden_size=hidden_size,
+        intermediate_size=4 * hidden_size,
+        dropout_ratio=0.1,
+        initializer_range=0.02)
+    loss = paddle.nn.CrossEntropyLoss()
+    optimizer = paddle.fluid.optimizer.AdamOptimizer(
+        learning_rate=0.00001,
+        beta1=0.9,
+        beta2=0.999,
+        epsilon=1e-08,
+        grad_clip=None)
+
+    dataset = MyDataset(batch_num * batch_size)
+    data_spec = [
+        InputSpec([batch_size, hidden_size], 'float32', 'x'),
+        InputSpec([batch_size], 'int64', 'label')
+    ]
+
+    dist_strategy = fleet.DistributedStrategy()
+    dist_strategy.amp = False
+    dist_strategy.pipeline = False
+    dist_strategy.recompute = False
+    # init parallel optimizer
+    dist_strategy.semi_auto = True
+    fleet.init(is_collective=True, strategy=dist_strategy)
+
+    engine = Engine(mlp, data_spec, strategy=dist_strategy)
+    engine.prepare(optimizer, loss)
+    engine.fit(dataset,
+               batch_size=batch_size,
+               steps_per_epoch=batch_num * batch_size)
+
+
+if __name__ == "__main__":
+    train()
diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_converter.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_converter.py
new file mode 100644
index 0000000000000000000000000000000000000000..fbadbb7d8c1cfe8bb92e0287694d48a0a546f206
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_converter.py
@@ -0,0 +1,69 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import os
+import sys
+import shutil
+import subprocess
+from paddle.distributed.fleet.launch_utils import run_with_coverage
+from paddle.distributed.auto_parallel.converter import Converter
+
+
+class TestConverter(unittest.TestCase):
+    def test_converter(self):
+        file_dir = os.path.dirname(os.path.abspath(__file__))
+        launch_model_path = os.path.join(file_dir, "converter.py")
+
+        if os.environ.get("WITH_COVERAGE", "OFF") == "ON":
+            coverage_args = ["-m", "coverage", "run", "--branch", "-p"]
+        else:
+            coverage_args = []
+
+        cmd = [sys.executable, "-u"] + coverage_args + [
+            "-m", "launch", "--gpus", "0,1", launch_model_path
+        ]
+
+        process = subprocess.Popen(cmd)
+        process.wait()
+        self.assertEqual(process.returncode, 0)
+
+        # Remove unnecessary files
+        log_path = os.path.join(file_dir, "log")
+        if os.path.exists(log_path):
+            shutil.rmtree(log_path)
+
+    def test_input_invalid(self):
+        with self.assertRaises(ValueError):
+            Converter({}, [], [])
+        with self.assertRaises(TypeError):
+            Converter([0, 1], [], [])
+        with self.assertRaises(ValueError):
+            Converter({"tmp_0": [0]}, {}, [])
+        with self.assertRaises(TypeError):
+            Converter({"tmp_0": [0]}, [0], [])
+
+        strategy_1 = {
+            'tmp_0': {
+                "process_shape": [1],
+                "process_group": [0],
+                "dims_mapping": [-1]
+            }
+        }
+        with self.assertRaises(TypeError):
+            Converter({"tmp_0": [0]}, strategy_1, [])
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_engine_api.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_engine_api.py
index 0fc1ea41033e00543054aa82949a583c6b0cf00f..d150da761aad3de3ab09f257d3b638cf37c27996 100644
--- a/python/paddle/fluid/tests/unittests/auto_parallel/test_engine_api.py
+++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_engine_api.py
@@ -13,122 +13,35 @@
 # limitations under the License.
 
 import unittest
-import time
-import paddle.fluid as fluid
-import copy
 import os
-import numpy as np
+import sys
+import shutil
 import subprocess
-import paddle
-import paddle.nn as nn
-import paddle.fluid as fluid
-import paddle.static as static
-import paddle.nn.functional as F
-import paddle.utils as utils
-from paddle.fluid import layers
-from paddle.io import Dataset, IterableDataset, DataLoader
-from paddle.static import InputSpec
-from paddle.distributed import fleet
-import paddle.distributed.auto_parallel as auto
-from paddle.distributed.auto_parallel.engine import Engine
-
-paddle.enable_static()
-global_process_mesh = auto.ProcessMesh(mesh=[0])
-batch_size = 1
-batch_num = 10
-hidden_size = 1024
-sequence_len = 512
-image_size = hidden_size
-class_num = 10
-
-paddle.seed(44)
-
-
-class MyDataset(Dataset):
-    def __init__(self, num_samples):
-        super(MyDataset, self).__init__()
-        self.num_samples = num_samples
-
-    def __getitem__(self, index):
-        input = np.random.uniform(size=image_size).astype("float32")
-        label = np.random.randint(0, class_num - 1, dtype="int64")
-        return input, label
-
-    def __len__(self):
-        return self.num_samples
-
-
-class MLPLayer(nn.Layer):
-    def __init__(self,
-                 hidden_size=1024,
-                 intermediate_size=4 * 1024,
-                 dropout_ratio=0.1,
-                 initializer_range=0.02):
-        super(MLPLayer, self).__init__()
-        d_model = hidden_size
-        dim_feedforward = intermediate_size
-        weight_attr = paddle.ParamAttr(initializer=nn.initializer.Normal(
-            mean=0.0, std=initializer_range))
-        bias_attr = None
-
-        self.linear0 = nn.Linear(
-            d_model, dim_feedforward, weight_attr, bias_attr=bias_attr)
-        self.linear1 = nn.Linear(
-            dim_feedforward, d_model, weight_attr, bias_attr=bias_attr)
-        self.linear2 = nn.Linear(d_model, 1, weight_attr, bias_attr=bias_attr)
-        # self.norm = nn.LayerNorm(d_model, epsilon=1e-5)
-        # self.dropout = nn.Dropout(dropout_ratio, mode="upscale_in_train")
-
-    def forward(self, input):
-        auto.shard_tensor(
-            input,
-            dist_attr={
-                "process_mesh": global_process_mesh,
-                "dims_mappig": [-1]
-            })
-        # out = self.norm(input)
-        out = self.linear0(input)
-        out = F.gelu(out, approximate=True)
-        out = self.linear1(out)
-        # out = self.dropout(out)
-        out = self.linear2(out)
-        return out
+from paddle.distributed.fleet.launch_utils import run_with_coverage
 
 
 class TestEngineAPI(unittest.TestCase):
     def test_engine_api(self):
-        mlp = MLPLayer(
-            hidden_size=hidden_size,
-            intermediate_size=4 * hidden_size,
-            dropout_ratio=0.1,
-            initializer_range=0.02)
-        loss = paddle.nn.CrossEntropyLoss()
-        optimizer = paddle.fluid.optimizer.AdamOptimizer(
-            learning_rate=0.00001,
-            beta1=0.9,
-            beta2=0.999,
-            epsilon=1e-08,
-            grad_clip=None)
+        file_dir = os.path.dirname(os.path.abspath(__file__))
+        launch_model_path = os.path.join(file_dir, "engine_api.py")
+
+        if os.environ.get("WITH_COVERAGE", "OFF") == "ON":
+            coverage_args = ["-m", "coverage", "run", "--branch", "-p"]
+        else:
+            coverage_args = []
 
-        dataset = MyDataset(batch_num * batch_size)
-        data_spec = [
-            InputSpec([batch_size, hidden_size], 'float32', 'x'),
-            InputSpec([batch_size], 'int64', 'label')
+        cmd = [sys.executable, "-u"] + coverage_args + [
+            "-m", "launch", "--gpus", "0,1", launch_model_path
         ]
 
-        dist_strategy = fleet.DistributedStrategy()
-        dist_strategy.amp = False
-        dist_strategy.pipeline = False
-        dist_strategy.recompute = False
-        # init parallel optimizer
-        dist_strategy.semi_auto = True
-        fleet.init(is_collective=True, strategy=dist_strategy)
+        process = subprocess.Popen(cmd)
+        process.wait()
+        self.assertEqual(process.returncode, 0)
 
-        engine = Engine(mlp, data_spec, strategy=dist_strategy)
-        engine.prepare(optimizer, loss)
-        engine.fit(dataset,
-                   batch_size=batch_size,
-                   steps_per_epoch=batch_num * batch_size)
+        # Remove unnecessary files
+        log_path = os.path.join(file_dir, "log")
+        if os.path.exists(log_path):
+            shutil.rmtree(log_path)
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_tunable_space.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_tunable_space.py
new file mode 100644
index 0000000000000000000000000000000000000000..cb7104f9ef641c00af91f461495eae0caa3c7cd1
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_tunable_space.py
@@ -0,0 +1,138 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+from paddle.distributed.auto_parallel.tuner import tunable_space as ts
+
+
+class TestTunableSpace(unittest.TestCase):
+    def test_fixed(self):
+        space = ts.TunableSpace()
+        fixed = space.fixed("fixed", default=4)
+        self.assertEqual(space.values["fixed"], 4)
+        self.assertEqual(len(space.variables), 1)
+        self.assertEqual(space.variables["fixed"].name, "fixed")
+
+        space.values["fixed"] = 2
+        self.assertEqual(space.get_value("fixed"), 2)
+        self.assertEqual(space.values, {"fixed": 2})
+        self.assertEqual(len(space.variables), 1)
+        self.assertEqual(space.variables["fixed"].name, "fixed")
+
+    def test_boolean(self):
+        space = ts.TunableSpace()
+        boolean = space.boolean("boolean")
+        self.assertEqual(space.values["boolean"], False)
+        self.assertEqual(len(space.variables), 1)
+        self.assertEqual(space.variables["boolean"].name, "boolean")
+
+        space.values["boolean"] = True
+        self.assertEqual(space.get_value("boolean"), True)
+        self.assertEqual(space.values, {"boolean": True})
+        self.assertEqual(len(space.variables), 1)
+        self.assertEqual(space.variables["boolean"].name, "boolean")
+
+    def test_choice(self):
+        space = ts.TunableSpace()
+        choice = space.choice("choice", [1, 2, 3, 4], default=4)
+        self.assertEqual(space.values["choice"], 4)
+        self.assertEqual(len(space.variables), 1)
+        self.assertEqual(space.variables["choice"].name, "choice")
+
+        space.values["choice"] = 2
+        self.assertEqual(space.get_value("choice"), 2)
+        self.assertEqual(space.values, {"choice": 2})
+        self.assertEqual(len(space.variables), 1)
+        self.assertEqual(space.variables["choice"].name, "choice")
+
+    def test_int_range(self):
+        space = ts.TunableSpace()
+        int_range = space.int_range("int_range", start=1, stop=4, default=2)
+        self.assertEqual(space.values["int_range"], 2)
+        self.assertEqual(len(space.variables), 1)
+        self.assertEqual(space.variables["int_range"].name, "int_range")
+
+        space.values["int_range"] = 3
+        self.assertEqual(space.get_value("int_range"), 3)
+        self.assertEqual(space.values, {"int_range": 3})
+        self.assertEqual(len(space.variables), 1)
+        self.assertEqual(space.variables["int_range"].name, "int_range")
+
+    def test_float_range(self):
+        space = ts.TunableSpace()
+        float_range = space.float_range(
+            "float_range", start=0.4, stop=4.4, default=2.0)
+        self.assertEqual(space.values["float_range"], 2.0)
+        self.assertEqual(len(space.variables), 1)
+        self.assertEqual(space.variables["float_range"].name, "float_range")
+
+        space.values["float_range"] = 3.0
+        self.assertEqual(space.get_value("float_range"), 3.0)
+        self.assertEqual(space.values, {"float_range": 3.0})
+        self.assertEqual(len(space.variables), 1)
+        self.assertEqual(space.variables["float_range"].name, "float_range")
+
+    def test_varibles(self):
+        space = ts.TunableSpace()
+        choice = space.choice("choice", [1, 2, 3, 4], default=4)
+        self.assertEqual(space.values["choice"], 4)
+        self.assertEqual(len(space.variables), 1)
+        self.assertEqual(space.variables["choice"].name, "choice")
+
+        int_range = space.int_range("int_range", start=1, stop=4, default=2)
+        self.assertEqual(space.values["int_range"], 2)
+        self.assertEqual(len(space.variables), 2)
+        self.assertEqual(space.variables["int_range"].name, "int_range")
+
+    def test_not_populated_variable(self):
+        space = ts.TunableSpace()
+        choice = space.choice("choice", [1, 2, 3, 4], default=2)
+        self.assertEqual(choice, 2)
+
+    def test_populated_variable(self):
+        space = ts.TunableSpace()
+        space.values["choice"] = 2
+        choice = space.choice("choice", [1, 2, 3, 4], default=4)
+        self.assertEqual(choice, 2)
+
+        space["choice"] = 3
+        self.assertNotEqual(space.values["choice"], 2)
+        self.assertEqual(space.values["choice"], 3)
+
+    def test_state(self):
+        space = ts.TunableSpace()
+        choice = space.choice("choice", [1, 2, 3, 4], default=4)
+        int_range = space.int_range("int_range", start=1, stop=4, default=2)
+
+        new_space = space.from_state(space.get_state())
+        self.assertEqual(new_space.get_value("choice"), 4)
+        self.assertEqual(new_space.get_value("int_range"), 2)
+        self.assertEqual(len(new_space.variables), 2)
+        self.assertEqual(len(new_space.values), 2)
+
+        self.assertEqual(new_space.variables["choice"].name, "choice")
+        self.assertEqual(new_space.variables["choice"].default, 4)
+        self.assertEqual(new_space.variables["choice"].values, [1, 2, 3, 4])
+
+        self.assertEqual(new_space.variables["int_range"].name, "int_range")
+        self.assertEqual(new_space.variables["int_range"].default, 2)
+        self.assertEqual(new_space.variables["int_range"].start, 1)
+        self.assertEqual(new_space.variables["int_range"].stop, 4)
+        self.assertEqual(new_space.variables["int_range"].step, 1)
+        self.assertEqual(new_space.variables["int_range"].endpoint, False)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_tunable_variable.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_tunable_variable.py
new file mode 100644
index 0000000000000000000000000000000000000000..c36fca7a9d09a6fb15664226ac0be441fbf49c3e
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_tunable_variable.py
@@ -0,0 +1,99 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+from paddle.distributed.auto_parallel.tuner import tunable_variable as tv
+
+
+class TestTunableVariable(unittest.TestCase):
+    def test_fixed(self):
+        fixed = tv.Fixed("fixed", True)
+        fixed = tv.Fixed.from_state(fixed.get_state())
+        self.assertEqual(fixed.default, True)
+        self.assertEqual(fixed.random(), True)
+
+        fixed = tv.Fixed("fixed", 1)
+        fixed = tv.Fixed.from_state(fixed.get_state())
+        self.assertEqual(fixed.default, 1)
+        self.assertEqual(fixed.random(), 1)
+
+    def test_boolean(self):
+        boolean = tv.Boolean("bool")
+        boolean = tv.Boolean.from_state(boolean.get_state())
+        self.assertEqual(boolean.default, False)
+        self.assertIn(boolean.random(), [True, False])
+        self.assertIn(boolean.random(1234), [True, False])
+
+        boolean = tv.Boolean("bool", True)
+        boolean = tv.Boolean.from_state(boolean.get_state())
+        self.assertEqual(boolean.default, True)
+        self.assertIn(boolean.random(), [True, False])
+        self.assertIn(boolean.random(1234), [True, False])
+
+    def test_choice(self):
+        choice = tv.Choice("choice", [1, 2, 3, 4])
+        choice = tv.Choice.from_state(choice.get_state())
+        self.assertEqual(choice.default, 1)
+        self.assertIn(choice.random(), [1, 2, 3, 4])
+        self.assertIn(choice.random(1234), [1, 2, 3, 4])
+
+        choice = tv.Choice("choice", [1, 2, 3, 4], default=2)
+        choice = tv.Choice.from_state(choice.get_state())
+        self.assertEqual(choice.default, 2)
+        self.assertIn(choice.random(), [1, 2, 3, 4])
+        self.assertIn(choice.random(1234), [1, 2, 3, 4])
+
+    def test_int_range(self):
+        int_range = tv.IntRange("int_range", start=1, stop=4, default=2)
+        int_range = tv.IntRange.from_state(int_range.get_state())
+        self.assertEqual(int_range.default, 2)
+        self.assertIn(int_range.random(), [1, 2, 3, 4])
+        self.assertIn(int_range.random(1234), [1, 2, 3, 4])
+        self.assertNotEqual(int_range.default, 4)
+
+        int_range = tv.IntRange(
+            "int_range", start=1, stop=8, step=2, default=3, endpoint=True)
+        int_range = tv.IntRange.from_state(int_range.get_state())
+        self.assertEqual(int_range.default, 3)
+        self.assertIn(int_range.random(), [1, 3, 5, 7])
+        self.assertIn(int_range.random(1234), [1, 3, 5, 7])
+        self.assertNotEqual(int_range.default, 2)
+
+    def test_float_range(self):
+        float_range = tv.FloatRange(
+            "float_range", start=0.4, stop=4.4, default=2.0)
+        float_range = tv.FloatRange.from_state(float_range.get_state())
+        self.assertEqual(float_range.default, 2.0)
+        self.assertGreater(float_range.random(), 0.4)
+        self.assertLess(float_range.random(1234), 4.4)
+        self.assertNotAlmostEqual(float_range.random(), 1)
+        self.assertNotAlmostEqual(float_range.random(), 4.4)
+
+        float_range = tv.FloatRange(
+            "float_range",
+            start=0.4,
+            stop=8.4,
+            step=2.0,
+            default=3.0,
+            endpoint=True)
+        float_range = tv.FloatRange.from_state(float_range.get_state())
+        self.assertEqual(float_range.default, 3.0)
+        self.assertGreater(float_range.random(), 0.4)
+        self.assertLessEqual(float_range.random(1234), 8.4)
+        self.assertNotAlmostEqual(float_range.random(), 2)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/auto_parallel_autoconvert.py b/python/paddle/fluid/tests/unittests/auto_parallel_autoconvert.py
index 2277c69674b3faf4f2fddc43b8032152a465fd42..22692fa5debfccea65da32837819818023eaf80b 100644
--- a/python/paddle/fluid/tests/unittests/auto_parallel_autoconvert.py
+++ b/python/paddle/fluid/tests/unittests/auto_parallel_autoconvert.py
@@ -32,6 +32,7 @@ from paddle.fluid.initializer import NumpyArrayInitializer
 from paddle.distributed.auto_parallel.utils import save_distributed_checkpoint, load_distributed_checkpoint, load_checkpoint_into_program
 from paddle.distributed.auto_parallel.utils import get_dist_attr, merge_and_slice_parameter, load_parameter_into_program
 from paddle.distributed.auto_parallel.reshard import HAS_SENT, HAS_RECV, HAS_ALLGATHER
+from paddle.distributed.auto_parallel.dist_context import set_default_distributed_context
 
 paddle.enable_static()
 _global_parallel_strategy = None
@@ -185,6 +186,7 @@ class TestMLPAutoConvert(unittest.TestCase):
             str(paddle.distributed.get_rank())))
 
     def test_mlp_mp2pp(self):
+        set_default_distributed_context(None)
         global _global_parallel_strategy
         _global_parallel_strategy = "mp"
         global _global_process_mesh
@@ -211,6 +213,7 @@ class TestMLPAutoConvert(unittest.TestCase):
                           fetch_list=[loss])
         last_res = res[0]
 
+        set_default_distributed_context(None)
         _global_parallel_strategy = "pp"
         _global_process_mesh = auto.ProcessMesh([0, 1])
         global PP_MESH_0
@@ -266,6 +269,7 @@ class TestMLPAutoConvert2(unittest.TestCase):
             str(paddle.distributed.get_rank())))
 
     def test_mlp_pp2mp(self):
+        set_default_distributed_context(None)
         global _global_parallel_strategy
         _global_parallel_strategy = "pp"
         global _global_process_mesh
@@ -302,6 +306,7 @@ class TestMLPAutoConvert2(unittest.TestCase):
         if paddle.distributed.get_rank() in [1]:
             last_res = res[0]
 
+        set_default_distributed_context(None)
         _global_parallel_strategy = "mp"
         _global_process_mesh = auto.ProcessMesh([0, 1])
 
@@ -345,6 +350,7 @@ class TestMLPAutoConvertInvalid(unittest.TestCase):
         np.random.seed(2021)
 
     def test_input_invalid(self):
+        set_default_distributed_context(None)
         global _global_parallel_strategy
         _global_parallel_strategy = "mp"
         global _global_process_mesh
diff --git a/python/paddle/fluid/tests/unittests/distributed_passes/test_ps_trainer_pass.py b/python/paddle/fluid/tests/unittests/distributed_passes/test_ps_trainer_pass.py
index fd558ef0403296167d6c2a01b738428db90026b6..877136cf6ed0eca04a7fb5907c9b05139f597a60 100755
--- a/python/paddle/fluid/tests/unittests/distributed_passes/test_ps_trainer_pass.py
+++ b/python/paddle/fluid/tests/unittests/distributed_passes/test_ps_trainer_pass.py
@@ -146,9 +146,13 @@ class TestPsTrainerPass(PsPassTestBase):
         self.config['ps_mode_config'] = "../ps/gpu_ps_config.yaml"
 
         self.config['debug_new_minimize'] = '0'
+        self.config['log_dir'] = ps_log_root_dir + "gpubox_log_old_minimize"
+        remove_path_if_exists(self.config['log_dir'])
         self.ps_launch("gpu-ps")
 
         self.config['debug_new_minimize'] = '1'
+        self.config['log_dir'] = ps_log_root_dir + "gpubox_log_new_minimize"
+        remove_path_if_exists(self.config['log_dir'])
         self.ps_launch("gpu-ps")
 
         file1 = '/ps_log/gpubox_run_minimize_debug:_0_worker_main.prototxt'
diff --git a/python/paddle/fluid/tests/unittests/dygraph_group_sharded_api.py b/python/paddle/fluid/tests/unittests/dygraph_group_sharded_api.py
new file mode 100644
index 0000000000000000000000000000000000000000..d4832782c329afe1813e643bee10e57194883894
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/dygraph_group_sharded_api.py
@@ -0,0 +1,147 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import time
+import shutil
+import tempfile
+import numpy as np
+
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid.dygraph.nn import Linear
+from paddle.distributed import fleet
+from paddle.fluid.dygraph import nn
+from paddle.distributed.sharding import group_sharded_parallel, save_group_sharded_model
+
+epoch = 10
+paddle.seed(2022)
+np.random.seed(2022)
+base_lr = 0.1
+momentum_rate = 0.9
+l2_decay = 1e-4
+batch_size = 100
+fleet.init(is_collective=True)
+
+
+class MLP(fluid.Layer):
+    def __init__(self, linear_size=1000, param_attr=None, bias_attr=None):
+        super(MLP, self).__init__()
+
+        self._linear1 = Linear(linear_size, linear_size)
+        self._linear2 = Linear(linear_size, linear_size)
+        self._linear3 = Linear(linear_size, 10)
+
+    def forward(self, inputs):
+        y = self._linear1(inputs)
+        y = self._linear2(y)
+        y = self._linear3(y)
+        return y
+
+
+def reader_decorator(linear_size=1000):
+    def __reader__():
+        for _ in range(100):
+            img = np.random.rand(linear_size).astype('float32')
+            label = np.ones(1).astype('int64')
+            yield img, label
+
+    return __reader__
+
+
+def optimizer_setting(model, use_pure_fp16, opt_group=False):
+    clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0)
+    optimizer = paddle.optimizer.Momentum(
+        parameters=[{
+            "params": list(model.parameters())
+        }] if opt_group else list(model.parameters()),
+        learning_rate=0.001,
+        weight_decay=0.00001,
+        grad_clip=clip,
+        multi_precision=use_pure_fp16)
+
+    return optimizer
+
+
+def train_mlp(model, shard_level, use_pure_fp16, output_dir):
+    group = paddle.distributed.new_group([0, 1])
+
+    optimizer = optimizer_setting(model=model, use_pure_fp16=use_pure_fp16)
+    model = paddle.amp.decorate(models=model, level='O2', save_dtype='float32')
+    scaler = paddle.amp.GradScaler(init_loss_scaling=32768)
+
+    model, optimizer, scaler = group_sharded_parallel(
+        model=model, optimizer=optimizer, level=shard_level, scaler=scaler)
+
+    train_reader = paddle.batch(
+        reader_decorator(), batch_size=batch_size, drop_last=True)
+
+    train_loader = paddle.io.DataLoader.from_generator(
+        capacity=32,
+        use_double_buffer=True,
+        iterable=True,
+        return_list=True,
+        use_multiprocess=True)
+    train_loader.set_sample_list_generator(train_reader)
+
+    for eop in range(epoch):
+        model.train()
+        for batch_id, data in enumerate(train_loader()):
+            img, label = data
+            label.stop_gradient = True
+            img.stop_gradient = True
+            with paddle.amp.auto_cast(True, level='O2'):
+                out = model(img)
+                loss = paddle.nn.functional.cross_entropy(
+                    input=out, label=label)
+            avg_loss = paddle.mean(x=loss.cast(dtype=paddle.float32))
+
+            if not use_pure_fp16:
+                avg_loss.backward()
+                optimizer.step()
+            else:
+                scaler.scale(avg_loss).backward()
+                scaler.step(optimizer)
+                scaler.update()
+
+            optimizer.clear_grad()
+
+    save_group_sharded_model(model, output=output_dir, optimizer=optimizer)
+    return model.parameters()
+
+
+def test_sharding_api():
+    mlp, mlp1, mlp2 = MLP(), MLP(), MLP()
+    state_dict = mlp.state_dict()
+    mlp1.set_state_dict(state_dict)
+    mlp2.set_state_dict(state_dict)
+
+    output_dir = tempfile.mkdtemp()
+
+    # fp16
+    stage2_params = train_mlp(
+        mlp1, shard_level="os_g", use_pure_fp16=True, output_dir=output_dir)
+    stage3_params = train_mlp(
+        mlp2, shard_level="p_g_os", use_pure_fp16=True, output_dir=output_dir)
+
+    for i in range(len(stage3_params)):
+        np.testing.assert_allclose(
+            stage2_params[i].numpy(),
+            stage3_params[i].numpy(),
+            rtol=1e-4,
+            atol=1e-3)
+    shutil.rmtree(output_dir)
+
+
+if __name__ == '__main__':
+    test_sharding_api()
diff --git a/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2.py b/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2.py
index 06935e212c3cb19519c558042cea3210910a8975..fb01fd46c0d28455067fd44139e177a81b25566a 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2.py
@@ -14,8 +14,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import os
+import shutil
 import numpy as np
 import argparse
+import tempfile
 import ast
 import time
 import paddle
@@ -88,7 +91,8 @@ def train_mlp(model,
               batch_size=100,
               use_pure_fp16=False,
               accumulate_grad=False,
-              opt_group=False):
+              opt_group=False,
+              save_model=False):
     if sharding_stage == "dp":
         hcg = fleet.get_hybrid_communicate_group()
         group = hcg.get_check_parallel_group()
@@ -147,6 +151,9 @@ def train_mlp(model,
         if accumulate_grad:
             optimizer.step()
             optimizer.clear_grad()
+
+    if save_model:
+        return model, optimizer
     return model.parameters()
 
 
@@ -158,11 +165,13 @@ def test_dp_stage2():
     mlp3 = MLP()
     mlp4 = MLP()
     mlp5 = MLP()
+    mlp6 = MLP()
     mlp1.set_state_dict(state_dict)
     mlp2.set_state_dict(state_dict)
     mlp3.set_state_dict(state_dict)
     mlp4.set_state_dict(state_dict)
     mlp5.set_state_dict(state_dict)
+    mlp6.set_state_dict(state_dict)
 
     # DP VS stage2
     dp_params = train_mlp(
@@ -186,10 +195,29 @@ def test_dp_stage2():
 
     # stage2 param list VS param group
     stage2_params = train_mlp(
-        mlp2, sharding_stage=2, use_pure_fp16=False, opt_group=True)
+        mlp5, sharding_stage=2, use_pure_fp16=False, opt_group=True)
     for i in range(len(dp_params)):
         np.testing.assert_allclose(
             dp_params[i].numpy(), stage2_params[i].numpy(), rtol=1e-6)
+
+    # save/load model
+    output_dir = tempfile.mkdtemp()
+    model_file = os.path.join(output_dir, "model.pdmodel")
+    optimizer_file = os.path.join(output_dir, "model.pdopt")
+    model_stage2, optimizer_stage2 = train_mlp(
+        mlp6,
+        sharding_stage=2,
+        use_pure_fp16=False,
+        opt_group=False,
+        save_model=True)
+    paddle.save(model_stage2.state_dict(), model_file)
+    paddle.save(optimizer_stage2.state_dict(), optimizer_file)
+    m_state_dict = paddle.load(model_file)
+    opt_state_dict = paddle.load(optimizer_file)
+    model_stage2.set_state_dict(m_state_dict)
+    optimizer_stage2.set_state_dict(opt_state_dict)
+    shutil.rmtree(output_dir)
+
     return
 
 
diff --git a/python/paddle/fluid/tests/unittests/dygraph_sharding_stage3.py b/python/paddle/fluid/tests/unittests/dygraph_sharding_stage3.py
index 6b755cf4c2b593e68c89decba0eeeaf601a37573..82821cd7ee644b5209a594e9a43de7636cdd4958 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_sharding_stage3.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_sharding_stage3.py
@@ -14,6 +14,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import os
+import shutil
+import tempfile
 import numpy as np
 import argparse
 import ast
@@ -83,8 +86,9 @@ def train_mlp(model,
               accumulate_grad=False,
               batch_size=100,
               opt_group=False,
-              recompute=False,
-              test_minimize=False):
+              sync_comm=False,
+              test_minimize=False,
+              save_model=False):
     group = paddle.distributed.new_group([0, 1])
     if opt_group:
         optimizer = optimizer_setting(
@@ -104,7 +108,7 @@ def train_mlp(model,
             model, optimizer, group=group, buffer_max_size=2**21)
     elif sharding_stage == 3:
         model = ShardingStage3(
-            model, optimizer=optimizer, group=group, sync_comm=recompute)
+            model, optimizer=optimizer, group=group, sync_comm=sync_comm)
 
     # check optimizer.minimize() error
     if test_minimize:
@@ -162,12 +166,15 @@ def train_mlp(model,
             optimizer.clear_grad()
     if sharding_stage == 3:
         model.get_all_parameters()
+
+    if save_model:
+        return model, optimizer
     return model.parameters()
 
 
 def test_stage2_stage3():
-    mlp, mlp1, mlp2, mlp3, mlp4, mlp5, mlp6, mlp7, mlp8, mlp9 = MLP(), MLP(
-    ), MLP(), MLP(), MLP(), MLP(), MLP(), MLP(), MLP(), MLP()
+    mlp, mlp1, mlp2, mlp3, mlp4, mlp5, mlp6, mlp7, mlp8, mlp9, mlp10 = MLP(
+    ), MLP(), MLP(), MLP(), MLP(), MLP(), MLP(), MLP(), MLP(), MLP(), MLP()
     state_dict = mlp.state_dict()
     mlp1.set_state_dict(state_dict)
     mlp2.set_state_dict(state_dict)
@@ -178,6 +185,7 @@ def test_stage2_stage3():
     mlp7.set_state_dict(state_dict)
     mlp8.set_state_dict(state_dict)
     mlp9.set_state_dict(state_dict)
+    mlp10.set_state_dict(state_dict)
 
     # fp32 
     stage2_params = train_mlp(
@@ -225,7 +233,7 @@ def test_stage2_stage3():
             rtol=1e-4,
             atol=1e-3)
 
-    # fp16 recompute
+    # fp16 sync_comm
     stage3_params = train_mlp(
         mlp7, sharding_stage=3, use_pure_fp16=True, opt_group=False)
     stage3_params_re = train_mlp(
@@ -233,14 +241,32 @@ def test_stage2_stage3():
         sharding_stage=3,
         use_pure_fp16=True,
         opt_group=False,
-        recompute=True)
+        sync_comm=True)
     for i in range(len(stage3_params)):
         np.testing.assert_allclose(
             stage3_params[i].numpy(), stage3_params_re[i].numpy(), rtol=1e-6)
 
+    # save/load model
+    output_dir = tempfile.mkdtemp()
+    model_file = os.path.join(output_dir, "model.pdmodel")
+    optimizer_file = os.path.join(output_dir, "model.pdopt")
+    model_stage3, optimizer_stage3 = train_mlp(
+        mlp9,
+        sharding_stage=3,
+        use_pure_fp16=False,
+        opt_group=False,
+        save_model=True)
+    paddle.save(model_stage3.state_dict(), model_file)
+    paddle.save(optimizer_stage3.state_dict(), optimizer_file)
+    m_state_dict = paddle.load(model_file)
+    opt_state_dict = paddle.load(optimizer_file)
+    model_stage3.set_state_dict(m_state_dict)
+    optimizer_stage3.set_state_dict(opt_state_dict)
+    shutil.rmtree(output_dir)
+
     # check optimizer.minimize() error
     train_mlp(
-        mlp9,
+        mlp10,
         sharding_stage=3,
         use_pure_fp16=False,
         opt_group=False,
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mobile_net.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mobile_net.py
index 30c1955adcf9f652e77b53eca8375955fb26eb35..c6f491a5484d9f1601993e1dd581e46008f0e27a 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mobile_net.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mobile_net.py
@@ -520,6 +520,7 @@ def predict_static(args, data):
     paddle.enable_static()
     exe = fluid.Executor(args.place)
     # load inference model
+
     [inference_program, feed_target_names,
      fetch_targets] = fluid.io.load_inference_model(
          args.model_save_dir,
diff --git a/python/paddle/fluid/tests/unittests/ipu/CMakeLists.txt b/python/paddle/fluid/tests/unittests/ipu/CMakeLists.txt
index 959700ad743b40420200b56055354279386a9a7c..79a2430a161703348824d8e4e687bf85569c408a 100644
--- a/python/paddle/fluid/tests/unittests/ipu/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/ipu/CMakeLists.txt
@@ -4,5 +4,11 @@ if(WITH_IPU)
 
     foreach(TEST_OP ${TEST_OPS})
         py_test_modules(${TEST_OP} MODULES ${TEST_OP})
+        # set all UTs timeout to 200s
+        set_tests_properties(${TEST_OP} PROPERTIES TIMEOUT 200)    
     endforeach(TEST_OP)
+
+    set_tests_properties(test_conv_op_ipu PROPERTIES TIMEOUT 300)
+    set_tests_properties(test_elemetwise_x_op_ipu PROPERTIES TIMEOUT 300)
+    set_tests_properties(test_reduce_x_op_ipu PROPERTIES TIMEOUT 600)
 endif()
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_activation_x_op.py b/python/paddle/fluid/tests/unittests/ipu/test_activation_x_op.py
deleted file mode 100644
index 58a88c113fc0b6b82c1c58d50a1b0824cb530632..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/ipu/test_activation_x_op.py
+++ /dev/null
@@ -1,126 +0,0 @@
-#  Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import unittest
-
-import numpy as np
-import paddle
-import paddle.fluid as fluid
-import paddle.fluid.compiler as compiler
-import paddle.nn.functional as F
-import paddle.optimizer
-import paddle.static
-from paddle.fluid.tests.unittests.ipu.op_test_ipu import (IPUOpTest,
-                                                          np_dtype_to_fluid_str)
-
-paddle.enable_static()
-
-
-@unittest.skipIf(not paddle.is_compiled_with_ipu(),
-                 "core is not compiled with IPU")
-class TestRelu(IPUOpTest):
-    def setUp(self):
-        self.set_atol()
-        self.set_training()
-        self.init_op()
-
-    def init_op(self):
-        self.op = paddle.fluid.layers.relu
-
-    def set_feed_attr(self):
-        self.feed_shape = [x.shape for x in self.feed.values()]
-        self.feed_list = list(self.feed.keys())
-        self.feed_dtype = [
-            np_dtype_to_fluid_str(x.dtype) for x in self.feed.values()
-        ]
-
-    def _test_base(self, run_ipu=True):
-        scope = fluid.core.Scope()
-        main_prog = paddle.static.Program()
-        startup_prog = paddle.static.Program()
-        SEED = self.SEED
-        main_prog.random_seed = SEED
-        startup_prog.random_seed = SEED
-
-        with fluid.scope_guard(scope):
-            with paddle.static.program_guard(main_prog, startup_prog):
-                x = paddle.static.data(
-                    name=self.feed_list[0],
-                    shape=self.feed_shape[0],
-                    dtype=self.feed_dtype[0])
-                out = self.op(x, **self.attrs)
-
-                fetch_list = [out.name]
-
-            if run_ipu:
-                place = paddle.IPUPlace()
-            else:
-                place = paddle.CPUPlace()
-            exe = paddle.static.Executor(place)
-            exe.run(startup_prog)
-
-            if run_ipu:
-                feed_list = self.feed_list
-                ipu_strategy = paddle.static.IpuStrategy()
-                ipu_strategy.SetGraphConfig(is_training=self.is_training)
-                program = compiler.IpuCompiler(
-                    main_prog,
-                    ipu_strategy=ipu_strategy).compile(feed_list, fetch_list)
-            else:
-                program = main_prog
-
-            result = exe.run(program, feed=self.feed, fetch_list=fetch_list)
-            return result[0]
-
-    def run_test_base(self):
-        res0 = self._test_base(False)
-        res1 = self._test_base(True)
-
-        self.assertTrue(
-            np.allclose(
-                res0.flatten(), res1.flatten(), atol=self.atol))
-
-        self.assertTrue(res0.shape == res1.shape)
-
-    def test_case0(self):
-        self.feed = {
-            "x": np.random.uniform(size=[1, 3, 10, 10]).astype('float32'),
-        }
-        self.attrs = {}
-        self.set_feed_attr()
-        self.run_test_base()
-
-
-class TestTanh(TestRelu):
-    def init_op(self):
-        self.op = F.tanh
-
-
-class TestLog(TestRelu):
-    def init_op(self):
-        self.op = paddle.fluid.layers.log
-
-
-class TestSigmoid(TestRelu):
-    def init_op(self):
-        self.op = F.sigmoid
-
-
-class TestSqrt(TestRelu):
-    def init_op(self):
-        self.op = paddle.fluid.layers.sqrt
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_batch_norm_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_batch_norm_op_ipu.py
index 1dab958c1ecbc806df94c651cf4a2d6cd82f3ddb..c640cd441f1b2589bcc3ffa466b865d1fb34c582 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_batch_norm_op_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_batch_norm_op_ipu.py
@@ -115,7 +115,7 @@ class TestBase(IPUOpTest):
 
 class TestCase1(TestBase):
     def set_atol(self):
-        self.atol = 1e-7
+        self.atol = 1e-6
         self.rtol = 1e-6
         self.atol_fp16 = 1e-3
         self.rtol_fp16 = 1e-3
@@ -129,7 +129,7 @@ class TestCase1(TestBase):
 
 class TestCase2(TestBase):
     def set_atol(self):
-        self.atol = 1e-7
+        self.atol = 1e-6
         self.rtol = 1e-6
         self.atol_fp16 = 1e-3
         self.rtol_fp16 = 1e-3
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_dropout_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_dropout_op_ipu.py
index 8b1560edfd81de65f495a1bd0609bfa09c5f7810..e34da7f70167a601e4cdc0ed48beebb261935d8f 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_dropout_op_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_dropout_op_ipu.py
@@ -16,14 +16,9 @@ import unittest
 
 import numpy as np
 import paddle
-import paddle.fluid as fluid
-import paddle.fluid.compiler as compiler
-import paddle.optimizer
 import paddle.static
-from paddle.fluid.tests.unittests.ipu.op_test_ipu import (IPUOpTest,
-                                                          np_dtype_to_fluid_str)
-
-paddle.enable_static()
+from paddle.fluid.tests.unittests.ipu.op_test_ipu import (ExecutionMode,
+                                                          IPUOpTest)
 
 
 @unittest.skipIf(not paddle.is_compiled_with_ipu(),
@@ -32,81 +27,88 @@ class TestBase(IPUOpTest):
     def setUp(self):
         self.set_atol()
         self.set_training()
-        self.set_feed()
+        self.set_data_feed()
         self.set_feed_attr()
-        self.set_attrs()
+        self.set_op_attrs()
 
-    def set_feed(self):
-        self.feed = {
-            "x": np.random.uniform(size=[1, 3, 10, 10]).astype('float32')
-        }
+    @property
+    def fp16_enabled(self):
+        return True
+
+    def set_data_feed(self):
+        data = np.random.uniform(size=[1, 3, 10, 10])
+        self.feed_fp32 = {'x': data.astype(np.float32)}
+        self.feed_fp16 = {'x': data.astype(np.float16)}
 
     def set_feed_attr(self):
-        self.feed_shape = [x.shape for x in self.feed.values()]
-        self.feed_list = list(self.feed.keys())
-        self.feed_dtype = [
-            np_dtype_to_fluid_str(x.dtype) for x in self.feed.values()
-        ]
+        self.feed_shape = [x.shape for x in self.feed_fp32.values()]
+        self.feed_list = list(self.feed_fp32.keys())
 
-    def set_attrs(self):
+    def set_op_attrs(self):
         self.attrs = {
             "dropout_prob": 0.5,
             "is_test": True,
             "dropout_implementation": "downgrade_in_infer"
         }
 
-    def _test_base(self, run_ipu=True):
-        scope = fluid.core.Scope()
+    def _test_base(self, exec_mode):
+        scope = paddle.static.Scope()
         main_prog = paddle.static.Program()
         startup_prog = paddle.static.Program()
-        SEED = self.SEED
-        main_prog.random_seed = SEED
-        startup_prog.random_seed = SEED
+        main_prog.random_seed = self.SEED
+        startup_prog.random_seed = self.SEED
 
-        with fluid.scope_guard(scope):
+        with paddle.static.scope_guard(scope):
             with paddle.static.program_guard(main_prog, startup_prog):
                 x = paddle.static.data(
                     name=self.feed_list[0],
                     shape=self.feed_shape[0],
-                    dtype=self.feed_dtype[0])
+                    dtype='float32')
+
                 dropout = paddle.fluid.layers.dropout(x, **self.attrs)
                 out = paddle.fluid.layers.elementwise_add(dropout, dropout)
 
                 fetch_list = [out.name]
 
-            if run_ipu:
-                place = paddle.IPUPlace()
-            else:
+            if exec_mode == ExecutionMode.CPU_FP32:
                 place = paddle.CPUPlace()
+            else:
+                place = paddle.IPUPlace()
+
             exe = paddle.static.Executor(place)
             exe.run(startup_prog)
 
-            if run_ipu:
+            if exec_mode != ExecutionMode.CPU_FP32:
                 feed_list = self.feed_list
                 ipu_strategy = paddle.static.IpuStrategy()
-                ipu_strategy.SetGraphConfig(is_training=self.is_training)
-                program = compiler.IPUCompiledProgram(
+                ipu_strategy.set_graph_config(is_training=self.is_training)
+                if exec_mode == ExecutionMode.IPU_POPART_FP16:
+                    ipu_strategy.set_precision_config(enable_fp16=True)
+                program = paddle.static.IpuCompiledProgram(
                     main_prog,
                     ipu_strategy=ipu_strategy).compile(feed_list, fetch_list)
             else:
                 program = main_prog
 
-            result = exe.run(program, feed=self.feed, fetch_list=fetch_list)
-            return result[0]
+            feed = self.feed_fp32
+            if exec_mode > ExecutionMode.IPU_FP32:
+                feed = self.feed_fp16
 
-    def test_base(self):
-        res0 = self._test_base(True)
-        res1 = self._test_base(False)
+            result = exe.run(program, feed=feed, fetch_list=fetch_list)
+            return result[0]
 
-        self.assertTrue(
-            np.allclose(
-                res0.flatten(), res1.flatten(), atol=self.atol))
+    def test(self):
+        output_dict = {}
+        for mode in ExecutionMode:
+            if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled:
+                break
+            output_dict[mode] = self._test_base(mode).flatten()
 
-        self.assertTrue(res0.shape == res1.shape)
+        self.check(output_dict)
 
 
 class TestCase1(TestBase):
-    def set_attrs(self):
+    def set_op_attrs(self):
         self.attrs = {
             "dropout_prob": 0.5,
             "is_test": True,
@@ -115,7 +117,7 @@ class TestCase1(TestBase):
 
 
 class TestCase2(TestBase):
-    def set_attrs(self):
+    def set_op_attrs(self):
         self.attrs = {
             "dropout_prob": 0.0,
             "is_test": False,
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_elemetwise_x_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_elemetwise_x_op_ipu.py
index 07b06d77c90ffb41d3a221c94d52bdc2c7d1a3a5..a9d6d2308326eaab8bc9fd15f3799c6f8cdd505d 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_elemetwise_x_op_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_elemetwise_x_op_ipu.py
@@ -16,14 +16,9 @@ import unittest
 
 import numpy as np
 import paddle
-import paddle.fluid as fluid
-import paddle.fluid.compiler as compiler
-import paddle.optimizer
 import paddle.static
-from paddle.fluid.tests.unittests.ipu.op_test_ipu import (IPUOpTest,
-                                                          np_dtype_to_fluid_str)
-
-paddle.enable_static()
+from paddle.fluid.tests.unittests.ipu.op_test_ipu import (ExecutionMode,
+                                                          IPUOpTest)
 
 
 @unittest.skipIf(not paddle.is_compiled_with_ipu(),
@@ -32,101 +27,136 @@ class TestMul(IPUOpTest):
     def setUp(self):
         self.set_atol()
         self.set_training()
-        self.init_op()
+        self.set_test_op()
+
+    @property
+    def fp16_enabled(self):
+        if IPUOpTest.use_ipumodel():
+            return False
+        else:
+            return True
 
-    def init_op(self):
+    def set_test_op(self):
         self.op = paddle.fluid.layers.elementwise_mul
 
     def set_feed_attr(self):
-        self.feed_shape = [x.shape for x in self.feed.values()]
-        self.feed_list = list(self.feed.keys())
-        self.feed_dtype = [
-            np_dtype_to_fluid_str(x.dtype) for x in self.feed.values()
-        ]
-
-    def _test_base(self, run_ipu=True):
-        scope = fluid.core.Scope()
+        self.feed_shape = [x.shape for x in self.feed_fp32.values()]
+        self.feed_list = list(self.feed_fp32.keys())
+
+    def _test_base(self, exec_mode):
+        scope = paddle.static.Scope()
         main_prog = paddle.static.Program()
         startup_prog = paddle.static.Program()
-        SEED = self.SEED
-        main_prog.random_seed = SEED
-        startup_prog.random_seed = SEED
+        main_prog.random_seed = self.SEED
+        startup_prog.random_seed = self.SEED
 
-        with fluid.scope_guard(scope):
+        with paddle.static.scope_guard(scope):
             with paddle.static.program_guard(main_prog, startup_prog):
                 x = paddle.static.data(
                     name=self.feed_list[0],
                     shape=self.feed_shape[0],
-                    dtype=self.feed_dtype[0])
+                    dtype='float32')
                 y = paddle.static.data(
                     name=self.feed_list[1],
                     shape=self.feed_shape[1],
-                    dtype=self.feed_dtype[1])
+                    dtype='float32')
+
                 out = self.op(x, y, **self.attrs)
 
-                fetch_list = [out.name]
+            fetch_list = [out.name]
 
-            if run_ipu:
-                place = paddle.IPUPlace()
-            else:
+            if exec_mode == ExecutionMode.CPU_FP32:
                 place = paddle.CPUPlace()
+            else:
+                place = paddle.IPUPlace()
+
             exe = paddle.static.Executor(place)
             exe.run(startup_prog)
 
-            if run_ipu:
+            if exec_mode != ExecutionMode.CPU_FP32:
                 feed_list = self.feed_list
                 ipu_strategy = paddle.static.IpuStrategy()
-                ipu_strategy.SetGraphConfig(is_training=self.is_training)
-                program = compiler.IPUCompiledProgram(
+                ipu_strategy.set_graph_config(is_training=self.is_training)
+                if exec_mode == ExecutionMode.IPU_POPART_FP16:
+                    ipu_strategy.set_precision_config(enable_fp16=True)
+                program = paddle.static.IpuCompiledProgram(
                     main_prog,
                     ipu_strategy=ipu_strategy).compile(feed_list, fetch_list)
             else:
                 program = main_prog
 
-            result = exe.run(program, feed=self.feed, fetch_list=fetch_list)
+            feed = self.feed_fp32
+            if exec_mode > ExecutionMode.IPU_FP32:
+                feed = self.feed_fp16
+
+            result = exe.run(program, feed=feed, fetch_list=fetch_list)
             return result[0]
 
     def run_test_base(self):
-        res0 = self._test_base(True)
-        res1 = self._test_base(False)
-
-        self.assertTrue(
-            np.allclose(
-                res0.flatten(), res1.flatten(), atol=self.atol))
+        output_dict = {}
+        for mode in ExecutionMode:
+            if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled:
+                break
+            output_dict[mode] = self._test_base(mode).flatten()
 
-        self.assertTrue(res0.shape == res1.shape)
+        self.check(output_dict)
 
     def test_case0(self):
-        self.feed = {
-            "x": np.random.uniform(size=(2, 3, 4, 5)).astype('float32'),
-            "y": np.random.uniform(size=(2, 3, 4, 5)).astype('float32'),
+        data_x = np.random.uniform(size=(2, 3, 4, 5))
+        data_y = np.random.uniform(size=(2, 3, 4, 5))
+
+        self.feed_fp32 = {
+            "x": data_x.astype('float32'),
+            "y": data_y.astype('float32'),
+        }
+        self.feed_fp16 = {
+            "x": data_x.astype('float16'),
+            "y": data_y.astype('float16'),
         }
         self.attrs = {}
         self.set_feed_attr()
         self.run_test_base()
 
     def test_case1(self):
-        self.feed = {
-            "x": np.random.uniform(size=(2, 3, 4, 5)).astype('float32'),
-            "y": np.random.uniform(size=(3, 4)).astype('float32'),
+        data_x = np.random.uniform(size=(2, 3, 4, 5))
+        data_y = np.random.uniform(size=(3, 4))
+        self.feed_fp32 = {
+            "x": data_x.astype('float32'),
+            "y": data_y.astype('float32'),
+        }
+        self.feed_fp16 = {
+            "x": data_x.astype('float16'),
+            "y": data_y.astype('float16'),
         }
         self.set_feed_attr()
         self.attrs = {"axis": 1}
         self.run_test_base()
 
     def test_case2(self):
-        self.feed = {
-            "x": np.random.uniform(size=(2, 3, 4, 5)).astype('float32'),
-            "y": np.random.uniform(size=(5)).astype('float32'),
+        data_x = np.random.uniform(size=(2, 3, 4, 5))
+        data_y = np.random.uniform(size=(5))
+        self.feed_fp32 = {
+            "x": data_x.astype('float32'),
+            "y": data_y.astype('float32'),
+        }
+        self.feed_fp16 = {
+            "x": data_x.astype('float16'),
+            "y": data_y.astype('float16'),
         }
         self.set_feed_attr()
         self.attrs = {"axis": -1}
         self.run_test_base()
 
     def test_case3(self):
-        self.feed = {
-            "x": np.random.uniform(size=(2, 3, 4, 5)).astype('float32'),
-            "y": np.random.uniform(size=(2)).astype('float32'),
+        data_x = np.random.uniform(size=(2, 3, 4, 5))
+        data_y = np.random.uniform(size=(2))
+        self.feed_fp32 = {
+            "x": data_x.astype('float32'),
+            "y": data_y.astype('float32'),
+        }
+        self.feed_fp16 = {
+            "x": data_x.astype('float16'),
+            "y": data_y.astype('float16'),
         }
         self.set_feed_attr()
         self.attrs = {"axis": 0}
@@ -134,37 +164,43 @@ class TestMul(IPUOpTest):
 
 
 class TestAdd(TestMul):
-    def init_op(self):
+    def set_test_op(self):
         self.op = paddle.fluid.layers.elementwise_add
 
 
 class TestSub(TestMul):
-    def init_op(self):
+    def set_test_op(self):
         self.op = paddle.fluid.layers.elementwise_sub
 
 
 class TestDiv(TestMul):
-    def init_op(self):
+    def set_test_op(self):
         self.op = paddle.fluid.layers.elementwise_div
 
 
 class TestMin(TestMul):
-    def init_op(self):
+    def set_test_op(self):
         self.op = paddle.fluid.layers.elementwise_min
 
 
 class TestMax(TestMul):
-    def init_op(self):
+    def set_test_op(self):
         self.op = paddle.fluid.layers.elementwise_max
 
 
 class TestPow(TestMul):
-    def init_op(self):
+    def set_test_op(self):
         self.op = paddle.fluid.layers.elementwise_pow
 
 
 class TestMod(TestMul):
-    def init_op(self):
+    def set_atol(self):
+        self.atol = 1e-7
+        self.rtol = 1e-5
+        self.atol_fp16 = 1e-2
+        self.rtol_fp16 = 1e-3
+
+    def set_test_op(self):
         self.op = paddle.fluid.layers.elementwise_mod
 
 
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_equal_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_equal_op_ipu.py
index c319894bfae250789bf9931343ac29106629a148..5b18c73851324fc66666a885a25797dd15731d76 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_equal_op_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_equal_op_ipu.py
@@ -16,14 +16,8 @@ import unittest
 
 import numpy as np
 import paddle
-import paddle.fluid as fluid
-import paddle.fluid.compiler as compiler
-import paddle.optimizer
 import paddle.static
-from paddle.fluid.tests.unittests.ipu.op_test_ipu import (IPUOpTest,
-                                                          np_dtype_to_fluid_str)
-
-paddle.enable_static()
+from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode
 
 
 @unittest.skipIf(not paddle.is_compiled_with_ipu(),
@@ -32,94 +26,106 @@ class TestBase(IPUOpTest):
     def setUp(self):
         self.set_atol()
         self.set_training()
-        self.set_feed()
+        self.set_data_feed()
         self.set_feed_attr()
-        self.set_attrs()
-
-    def set_feed(self):
-        self.feed = {
-            "x": np.ones([1, 10]).astype('float32'),
-            "y": np.zeros([1, 10]).astype('float32'),
+        self.set_op_attrs()
+
+    @property
+    def fp16_enabled(self):
+        return True
+
+    def set_data_feed(self):
+        x = np.ones([1, 10])
+        y = np.zeros([1, 10])
+        self.feed_fp32 = {
+            "x": x.astype(np.float32),
+            "y": y.astype(np.float32),
+        }
+        self.feed_fp16 = {
+            "x": x.astype(np.float16),
+            "y": y.astype(np.float16),
         }
 
     def set_feed_attr(self):
-        self.feed_shape = [x.shape for x in self.feed.values()]
-        self.feed_list = list(self.feed.keys())
-        self.feed_dtype = [
-            np_dtype_to_fluid_str(x.dtype) for x in self.feed.values()
-        ]
+        self.feed_shape = [x.shape for x in self.feed_fp32.values()]
+        self.feed_list = list(self.feed_fp32.keys())
 
-    def set_attrs(self):
+    def set_op_attrs(self):
         self.attrs = {}
 
-    def _test_base(self, run_ipu=True):
-        scope = fluid.core.Scope()
+    def _test_base(self, exec_mode):
+        scope = paddle.static.Scope()
         main_prog = paddle.static.Program()
         startup_prog = paddle.static.Program()
-        SEED = self.SEED
-        main_prog.random_seed = SEED
-        startup_prog.random_seed = SEED
+        main_prog.random_seed = self.SEED
+        startup_prog.random_seed = self.SEED
 
-        with fluid.scope_guard(scope):
+        with paddle.static.scope_guard(scope):
             with paddle.static.program_guard(main_prog, startup_prog):
-                # XX
                 x = paddle.static.data(
                     name=self.feed_list[0],
                     shape=self.feed_shape[0],
-                    dtype=self.feed_dtype[0])
+                    dtype='float32')
                 y = paddle.static.data(
                     name=self.feed_list[1],
                     shape=self.feed_shape[1],
-                    dtype=self.feed_dtype[1])
+                    dtype='float32')
+
                 out = paddle.fluid.layers.equal(x, y, **self.attrs)
 
                 fetch_list = [out.name]
 
-            if run_ipu:
-                place = paddle.IPUPlace()
-            else:
+            if exec_mode == ExecutionMode.CPU_FP32:
                 place = paddle.CPUPlace()
+            else:
+                place = paddle.IPUPlace()
+
             exe = paddle.static.Executor(place)
             exe.run(startup_prog)
 
-            if run_ipu:
+            if exec_mode != ExecutionMode.CPU_FP32:
                 feed_list = self.feed_list
                 ipu_strategy = paddle.static.IpuStrategy()
-                ipu_strategy.SetGraphConfig(is_training=self.is_training)
-                program = compiler.IPUCompiledProgram(
+                ipu_strategy.set_graph_config(is_training=self.is_training)
+                if exec_mode == ExecutionMode.IPU_POPART_FP16:
+                    ipu_strategy.set_precision_config(enable_fp16=True)
+                program = paddle.static.IpuCompiledProgram(
                     main_prog,
                     ipu_strategy=ipu_strategy).compile(feed_list, fetch_list)
             else:
                 program = main_prog
 
-            result = exe.run(program, feed=self.feed, fetch_list=fetch_list)
-            return result[0]
+            feed = self.feed_fp32
+            if exec_mode > ExecutionMode.IPU_FP32:
+                feed = self.feed_fp16
 
-    def test_base(self):
-        res0 = self._test_base(True)
-        res1 = self._test_base(False)
+            result = exe.run(program, feed=feed, fetch_list=fetch_list)
+            return result[0]
 
-        self.assertTrue(
-            np.allclose(
-                res0.flatten(), res1.flatten(), atol=self.atol))
+    def test(self):
+        output_dict = {}
+        for mode in ExecutionMode:
+            if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled:
+                break
+            output_dict[mode] = self._test_base(mode).flatten().astype(np.int32)
 
-        self.assertTrue(res0.shape == res1.shape)
+        self.check(output_dict)
 
 
 class TestCase1(TestBase):
-    def set_feed(self):
-        self.feed = {
-            "x": np.ones([1, 10]).astype('float32'),
-            "y": np.ones([1, 10]).astype('float32'),
-        }
+    def set_data_feed(self):
+        x = np.ones([1, 10])
+        y = np.ones([1, 10])
+        self.feed_fp32 = {"x": x.astype(np.float32), "y": y.astype(np.float32)}
+        self.feed_fp16 = {"x": x.astype(np.float16), "y": y.astype(np.float16)}
 
 
 class TestCase2(TestBase):
-    def set_feed(self):
-        self.feed = {
-            "x": np.ones([1, 10]).astype('float32'),
-            "y": np.arange(0, 10).reshape([1, 10]).astype('float32'),
-        }
+    def set_data_feed(self):
+        x = np.ones([1, 10])
+        y = np.arange(0, 10).reshape([1, 10])
+        self.feed_fp32 = {"x": x.astype(np.float32), "y": y.astype(np.float32)}
+        self.feed_fp16 = {"x": x.astype(np.float16), "y": y.astype(np.float16)}
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_expand_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_expand_op_ipu.py
index 5b7ea61568ecd5772d574dd3cc63fac74535c903..966dfdef87b54cd1e37bd9405c6b354a6f264363 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_expand_op_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_expand_op_ipu.py
@@ -16,14 +16,8 @@ import unittest
 
 import numpy as np
 import paddle
-import paddle.fluid as fluid
-import paddle.fluid.compiler as compiler
-import paddle.optimizer
 import paddle.static
-from paddle.fluid.tests.unittests.ipu.op_test_ipu import (IPUOpTest,
-                                                          np_dtype_to_fluid_str)
-
-paddle.enable_static()
+from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode
 
 
 @unittest.skipIf(not paddle.is_compiled_with_ipu(),
@@ -32,125 +26,142 @@ class TestBase(IPUOpTest):
     def setUp(self):
         self.set_atol()
         self.set_training()
-        self.set_feed()
+        self.set_data_feed()
         self.set_feed_attr()
-        self.set_attrs()
+        self.set_op_attrs()
+
+    @property
+    def fp16_enabled(self):
+        return True
 
-    def set_feed(self):
-        self.feed = {"x": np.random.uniform(size=[2, 3, 1]).astype('float32')}
+    def set_data_feed(self):
+        data = np.random.uniform(size=[2, 3, 1])
+        self.feed_fp32 = {'in_0': data.astype(np.float32)}
+        self.feed_fp16 = {'in_0': data.astype(np.float16)}
 
     def set_feed_attr(self):
-        self.feed_shape = [x.shape for x in self.feed.values()]
-        self.feed_list = list(self.feed.keys())
-        self.feed_dtype = [
-            np_dtype_to_fluid_str(x.dtype) for x in self.feed.values()
-        ]
+        self.feed_shape = [x.shape for x in self.feed_fp32.values()]
+        self.feed_list = list(self.feed_fp32.keys())
+        self.feed_dtype = [x.dtype for x in self.feed_fp32.values()]
 
-    def set_attrs(self):
+    def set_op_attrs(self):
         self.attrs = {"expand_times": [1, 2, 2]}
 
-    def _test_base(self, run_ipu=True):
-        scope = fluid.core.Scope()
+    def _test_base(self, exec_mode):
+        scope = paddle.static.Scope()
         main_prog = paddle.static.Program()
         startup_prog = paddle.static.Program()
-        SEED = self.SEED
-        main_prog.random_seed = SEED
-        startup_prog.random_seed = SEED
+        main_prog.random_seed = self.SEED
+        startup_prog.random_seed = self.SEED
 
-        with fluid.scope_guard(scope):
+        with paddle.static.scope_guard(scope):
             with paddle.static.program_guard(main_prog, startup_prog):
                 x = paddle.static.data(
                     name=self.feed_list[0],
                     shape=self.feed_shape[0],
-                    dtype=self.feed_dtype[0])
+                    dtype="float32")
+
                 out = paddle.fluid.layers.expand(x, **self.attrs)
 
                 fetch_list = [out.name]
 
-            if run_ipu:
-                place = paddle.IPUPlace()
-            else:
+            if exec_mode == ExecutionMode.CPU_FP32:
                 place = paddle.CPUPlace()
+            else:
+                place = paddle.IPUPlace()
+
             exe = paddle.static.Executor(place)
             exe.run(startup_prog)
 
-            if run_ipu:
+            if exec_mode != ExecutionMode.CPU_FP32:
                 feed_list = self.feed_list
                 ipu_strategy = paddle.static.IpuStrategy()
-                ipu_strategy.SetGraphConfig(is_training=self.is_training)
-                program = compiler.IPUCompiledProgram(
+                ipu_strategy.set_graph_config(is_training=self.is_training)
+                if exec_mode == ExecutionMode.IPU_POPART_FP16:
+                    ipu_strategy.set_precision_config(enable_fp16=True)
+                program = paddle.static.IpuCompiledProgram(
                     main_prog,
                     ipu_strategy=ipu_strategy).compile(feed_list, fetch_list)
             else:
                 program = main_prog
 
-            result = exe.run(program, feed=self.feed, fetch_list=fetch_list)
-            return result[0]
+            feed = self.feed_fp32
+            if exec_mode > ExecutionMode.IPU_FP32:
+                feed = self.feed_fp16
 
-    def test_base(self):
-        res0 = self._test_base(False)
-        res1 = self._test_base(True)
+            result = exe.run(program, feed=feed, fetch_list=fetch_list)
+            return result[0]
 
-        self.assertTrue(
-            np.allclose(
-                res0.flatten(), res1.flatten(), atol=self.atol))
+    def test(self):
+        output_dict = {}
+        for mode in ExecutionMode:
+            if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled:
+                break
+            output_dict[mode] = self._test_base(mode).flatten()
 
-        self.assertTrue(res0.shape == res1.shape)
+        self.check(output_dict)
 
 
 class TestCase1(TestBase):
-    def set_feed(self):
-        self.feed = {"x": np.random.uniform(size=[2, 2]).astype('float32')}
+    def set_data_feed(self):
+        x = np.random.uniform(size=[2, 2])
+        self.feed_fp32 = {"x": x.astype(np.float32)}
+        self.feed_fp16 = {"x": x.astype(np.float16)}
 
     def set_feed_attr(self):
-        self.feed_shape = [x.shape for x in self.feed.values()]
-        self.feed_list = list(self.feed.keys())
-        self.feed_dtype = [
-            np_dtype_to_fluid_str(x.dtype) for x in self.feed.values()
-        ]
+        self.feed_shape = [x.shape for x in self.feed_fp32.values()]
+        self.feed_list = list(self.feed_fp32.keys())
+        self.feed_dtype = [x.dtype for x in self.feed_fp32.values()]
 
-    def set_attrs(self):
+    def set_op_attrs(self):
         self.attrs = {}
 
-    def _test_base(self, run_ipu=True):
-        scope = fluid.core.Scope()
+    def _test_base(self, exec_mode):
+        scope = paddle.static.Scope()
         main_prog = paddle.static.Program()
         startup_prog = paddle.static.Program()
-        SEED = self.SEED
-        main_prog.random_seed = SEED
-        startup_prog.random_seed = SEED
+        main_prog.random_seed = self.SEED
+        startup_prog.random_seed = self.SEED
 
-        with fluid.scope_guard(scope):
+        with paddle.static.scope_guard(scope):
             with paddle.static.program_guard(main_prog, startup_prog):
                 x = paddle.static.data(
                     name=self.feed_list[0],
                     shape=self.feed_shape[0],
-                    dtype=self.feed_dtype[0])
-                expand_times = fluid.layers.fill_constant(
+                    dtype="float32")
+
+                expand_times = paddle.fluid.layers.fill_constant(
                     shape=[len(self.feed_shape[0])], dtype="int32", value=2)
                 out = paddle.fluid.layers.expand(
                     x, expand_times=expand_times, **self.attrs)
 
                 fetch_list = [out.name]
 
-            if run_ipu:
-                place = paddle.IPUPlace()
-            else:
+            if exec_mode == ExecutionMode.CPU_FP32:
                 place = paddle.CPUPlace()
+            else:
+                place = paddle.IPUPlace()
+
             exe = paddle.static.Executor(place)
             exe.run(startup_prog)
 
-            if run_ipu:
+            if exec_mode != ExecutionMode.CPU_FP32:
                 feed_list = self.feed_list
                 ipu_strategy = paddle.static.IpuStrategy()
-                ipu_strategy.SetGraphConfig(is_training=self.is_training)
-                program = compiler.IPUCompiledProgram(
+                ipu_strategy.set_graph_config(is_training=self.is_training)
+                if exec_mode == ExecutionMode.IPU_POPART_FP16:
+                    ipu_strategy.set_precision_config(enable_fp16=True)
+                program = paddle.static.IpuCompiledProgram(
                     main_prog,
                     ipu_strategy=ipu_strategy).compile(feed_list, fetch_list)
             else:
                 program = main_prog
 
-            result = exe.run(program, feed=self.feed, fetch_list=fetch_list)
+            feed = self.feed_fp32
+            if exec_mode > ExecutionMode.IPU_FP32:
+                feed = self.feed_fp16
+
+            result = exe.run(program, feed=feed, fetch_list=fetch_list)
             return result[0]
 
 
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_fill_any_like_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_fill_any_like_op_ipu.py
new file mode 100644
index 0000000000000000000000000000000000000000..00b855a5a7a4297275171f288ddc68a61a561745
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/ipu/test_fill_any_like_op_ipu.py
@@ -0,0 +1,111 @@
+#  Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+import paddle
+import paddle.static
+from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode
+
+
+@unittest.skipIf(not paddle.is_compiled_with_ipu(),
+                 "core is not compiled with IPU")
+class TestBase(IPUOpTest):
+    def setUp(self):
+        self.set_atol()
+        self.set_training()
+        self.set_data_feed()
+        self.set_feed_attr()
+        self.set_op_attrs()
+
+    @property
+    def fp16_enabled(self):
+        return True
+
+    def set_data_feed(self):
+        data = np.random.uniform(size=[2, 3, 1])
+        self.feed_fp32 = {'in_0': data.astype(np.float32)}
+        self.feed_fp16 = {'in_0': data.astype(np.float16)}
+
+    def set_feed_attr(self):
+        self.feed_shape = [x.shape for x in self.feed_fp32.values()]
+        self.feed_list = list(self.feed_fp32.keys())
+
+    def set_op_attrs(self):
+        self.attrs = {'fill_value': 0.3, 'dtype': 'float32'}
+
+    def _test_base(self, exec_mode):
+        scope = paddle.static.Scope()
+        main_prog = paddle.static.Program()
+        startup_prog = paddle.static.Program()
+        main_prog.random_seed = self.SEED
+        startup_prog.random_seed = self.SEED
+
+        with paddle.static.scope_guard(scope):
+            with paddle.static.program_guard(main_prog, startup_prog):
+                x = paddle.static.data(
+                    name=self.feed_list[0],
+                    shape=self.feed_shape[0],
+                    dtype='float32')
+
+                x_fill = paddle.full_like(x, **self.attrs)
+                out = paddle.fluid.layers.elementwise_add(x_fill, x_fill)
+
+                fetch_list = [out.name]
+
+            if exec_mode == ExecutionMode.CPU_FP32:
+                place = paddle.CPUPlace()
+            else:
+                place = paddle.IPUPlace()
+
+            exe = paddle.static.Executor(place)
+            exe.run(startup_prog)
+
+            if exec_mode != ExecutionMode.CPU_FP32:
+                feed_list = self.feed_list
+                ipu_strategy = paddle.static.IpuStrategy()
+                ipu_strategy.set_graph_config(is_training=self.is_training)
+                if exec_mode == ExecutionMode.IPU_POPART_FP16:
+                    ipu_strategy.set_precision_config(enable_fp16=True)
+                program = paddle.static.IpuCompiledProgram(
+                    main_prog,
+                    ipu_strategy=ipu_strategy).compile(feed_list, fetch_list)
+            else:
+                program = main_prog
+
+            feed = self.feed_fp32
+            if exec_mode > ExecutionMode.IPU_FP32:
+                feed = self.feed_fp16
+
+            result = exe.run(program, feed=feed, fetch_list=fetch_list)
+            return result[0]
+
+    def test(self):
+        output_dict = {}
+        for mode in ExecutionMode:
+            if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled:
+                break
+            output_dict[mode] = self._test_base(mode).flatten()
+
+        self.check(output_dict)
+
+
+class TestCase1(TestBase):
+    def set_op_attrs(self):
+        self.attrs = {'fill_value': 3, 'dtype': 'int32'}
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_fill_constant_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_fill_constant_op_ipu.py
index c62e0c08f9c79c2ee1217b2b74df84c26cf30f1f..3a1c202bf1133aecbe1bf2954b3bf8d5916776ab 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_fill_constant_op_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_fill_constant_op_ipu.py
@@ -16,14 +16,8 @@ import unittest
 
 import numpy as np
 import paddle
-import paddle.fluid as fluid
-import paddle.fluid.compiler as compiler
-import paddle.optimizer
 import paddle.static
-from paddle.fluid.tests.unittests.ipu.op_test_ipu import (IPUOpTest,
-                                                          np_dtype_to_fluid_str)
-
-paddle.enable_static()
+from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode
 
 
 @unittest.skipIf(not paddle.is_compiled_with_ipu(),
@@ -32,21 +26,23 @@ class TestBase(IPUOpTest):
     def setUp(self):
         self.set_atol()
         self.set_training()
-        self.set_feed()
+        self.set_data_feed()
         self.set_feed_attr()
-        self.set_attrs()
+        self.set_op_attrs()
+
+    @property
+    def fp16_enabled(self):
+        return True
 
-    def set_feed(self):
+    def set_data_feed(self):
         self.feed = {}
 
     def set_feed_attr(self):
         self.feed_shape = [x.shape for x in self.feed.values()]
         self.feed_list = list(self.feed.keys())
-        self.feed_dtype = [
-            np_dtype_to_fluid_str(x.dtype) for x in self.feed.values()
-        ]
+        self.feed_dtype = [x.dtype for x in self.feed.values()]
 
-    def set_attrs(self):
+    def set_op_attrs(self):
         self.attrs = {
             'name': 'x',
             'shape': [1, 3, 3, 3],
@@ -54,33 +50,34 @@ class TestBase(IPUOpTest):
             'value': 0.3,
         }
 
-    def _test_base(self, run_ipu=True):
-        scope = fluid.core.Scope()
+    def _test_base(self, exec_mode):
+        scope = paddle.static.Scope()
         main_prog = paddle.static.Program()
         startup_prog = paddle.static.Program()
-        SEED = self.SEED
-        main_prog.random_seed = SEED
-        startup_prog.random_seed = SEED
+        main_prog.random_seed = self.SEED
+        startup_prog.random_seed = self.SEED
 
-        with fluid.scope_guard(scope):
+        with paddle.static.scope_guard(scope):
             with paddle.static.program_guard(main_prog, startup_prog):
                 x = paddle.fluid.layers.fill_constant(**self.attrs)
                 out = paddle.fluid.layers.elementwise_add(x, x)
-
                 fetch_list = [out.name]
 
-            if run_ipu:
-                place = paddle.IPUPlace()
-            else:
+            if exec_mode == ExecutionMode.CPU_FP32:
                 place = paddle.CPUPlace()
+            else:
+                place = paddle.IPUPlace()
+
             exe = paddle.static.Executor(place)
             exe.run(startup_prog)
 
-            if run_ipu:
+            if exec_mode != ExecutionMode.CPU_FP32:
                 feed_list = self.feed_list
                 ipu_strategy = paddle.static.IpuStrategy()
-                ipu_strategy.SetGraphConfig(is_training=self.is_training)
-                program = compiler.IPUCompiledProgram(
+                ipu_strategy.set_graph_config(is_training=self.is_training)
+                if exec_mode == ExecutionMode.IPU_POPART_FP16:
+                    ipu_strategy.set_precision_config(enable_fp16=True)
+                program = paddle.static.IpuCompiledProgram(
                     main_prog,
                     ipu_strategy=ipu_strategy).compile(feed_list, fetch_list)
             else:
@@ -89,19 +86,18 @@ class TestBase(IPUOpTest):
             result = exe.run(program, feed=self.feed, fetch_list=fetch_list)
             return result[0]
 
-    def test_base(self):
-        res0 = self._test_base(False)
-        res1 = self._test_base(True)
-
-        self.assertTrue(
-            np.allclose(
-                res0.flatten(), res1.flatten(), atol=self.atol))
+    def test(self):
+        output_dict = {}
+        for mode in ExecutionMode:
+            if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled:
+                break
+            output_dict[mode] = self._test_base(mode).flatten()
 
-        self.assertTrue(res0.shape == res1.shape)
+        self.check(output_dict)
 
 
 class TestCase1(TestBase):
-    def set_attrs(self):
+    def set_op_attrs(self):
         self.attrs = {
             'name': 'x',
             'shape': [1, 3, 3, 3],
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_flatten_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_flatten_op_ipu.py
new file mode 100644
index 0000000000000000000000000000000000000000..6f0cafc66805e75d239e149c2595197d004652c0
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/ipu/test_flatten_op_ipu.py
@@ -0,0 +1,118 @@
+#  Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+import paddle
+import paddle.static
+from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode
+
+
+@unittest.skipIf(not paddle.is_compiled_with_ipu(),
+                 "core is not compiled with IPU")
+class TestBase(IPUOpTest):
+    def setUp(self):
+        self.set_atol()
+        self.set_training()
+        self.set_data_feed()
+        self.set_feed_attr()
+        self.set_op_attrs()
+
+    @property
+    def fp16_enabled(self):
+        return True
+
+    def set_data_feed(self):
+        data = np.random.uniform(size=[2, 2, 4, 6])
+        self.feed_fp32 = {"in_0": data.astype(np.float32)}
+        self.feed_fp16 = {"in_0": data.astype(np.float16)}
+
+    def set_feed_attr(self):
+        self.feed_shape = [x.shape for x in self.feed_fp32.values()]
+        self.feed_list = list(self.feed_fp32.keys())
+
+    def set_op_attrs(self):
+        self.attrs = {}
+        self.attrs['axis'] = 1
+
+    def _test_base(self, exec_mode):
+        scope = paddle.static.Scope()
+        main_prog = paddle.static.Program()
+        startup_prog = paddle.static.Program()
+        main_prog.random_seed = self.SEED
+        startup_prog.random_seed = self.SEED
+
+        with paddle.static.scope_guard(scope):
+            with paddle.static.program_guard(main_prog, startup_prog):
+                x = paddle.static.data(
+                    name=self.feed_list[0],
+                    shape=self.feed_shape[0],
+                    dtype='float32')
+
+                out = paddle.fluid.layers.flatten(x=x, **self.attrs)
+
+            fetch_list = [out.name]
+
+            if exec_mode == ExecutionMode.CPU_FP32:
+                place = paddle.CPUPlace()
+            else:
+                place = paddle.IPUPlace()
+
+            exe = paddle.static.Executor(place)
+            exe.run(startup_prog)
+
+            if exec_mode != ExecutionMode.CPU_FP32:
+                feed_list = self.feed_list
+                ipu_strategy = paddle.static.IpuStrategy()
+                ipu_strategy.set_graph_config(is_training=self.is_training)
+                if exec_mode == ExecutionMode.IPU_POPART_FP16:
+                    ipu_strategy.set_precision_config(enable_fp16=True)
+                program = paddle.static.IpuCompiledProgram(
+                    main_prog,
+                    ipu_strategy=ipu_strategy).compile(feed_list, fetch_list)
+            else:
+                program = main_prog
+
+            feed = self.feed_fp32
+            if exec_mode > ExecutionMode.IPU_FP32:
+                feed = self.feed_fp16
+
+            result = exe.run(program, feed=feed, fetch_list=fetch_list)
+            return result[0]
+
+    def test_base(self):
+        output_dict = {}
+        for mode in ExecutionMode:
+            if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled:
+                break
+            output_dict[mode] = self._test_base(mode)
+
+        self.check(output_dict, check_shape=True)
+
+
+class TestCase1(TestBase):
+    def set_op_attrs(self):
+        self.attrs = {}
+        self.attrs['axis'] = 0
+
+
+class TestCase2(TestBase):
+    def set_op_attrs(self):
+        self.attrs = {}
+        self.attrs['axis'] = 2
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_fp16_inference_io_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_fp16_inference_io_ipu.py
new file mode 100644
index 0000000000000000000000000000000000000000..cd29ff705b88f012d434d4578816a112dcbd81a6
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/ipu/test_fp16_inference_io_ipu.py
@@ -0,0 +1,160 @@
+#  Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import shutil
+
+import numpy as np
+import paddle
+import paddle.fluid as fluid
+import paddle.static
+from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest
+
+
+@unittest.skipIf(not paddle.is_compiled_with_ipu(),
+                 "core is not compiled with IPU")
+class TestBase(IPUOpTest):
+    def setUp(self):
+        self.set_atol()
+        self.set_data_feed()
+        self.set_feed_attr()
+        self.set_op_attrs()
+
+    def set_atol(self):
+        self.atol = 1e-6
+        self.rtol = 1e-5
+        self.atol_fp16 = 1e-2
+        self.rtol_fp16 = 1e-3
+
+    def set_data_feed(self):
+        data = np.random.uniform(size=[1, 3, 10, 10])
+        self.feed_fp32 = {"in_0": data.astype(np.float32)}
+        self.feed_fp16 = {"in_0": data.astype(np.float16)}
+
+    def set_feed_attr(self):
+        self.feed_shape = [x.shape for x in self.feed_fp32.values()]
+        self.feed_list = list(self.feed_fp32.keys())
+
+    def set_op_attrs(self):
+        self.attrs = {}
+        self.attrs['steps'] = 100
+        self.attrs['save_at_step'] = 20
+        self.attrs['is_training'] = True
+        self.attrs['opt_type'] = 'sgd'
+        self.attrs['path'] = 'model'
+        self.attrs['model_name'] = 'test'
+
+    def _test_save(self):
+        scope = paddle.static.Scope()
+        main_prog = paddle.static.Program()
+        startup_prog = paddle.static.Program()
+        main_prog.random_seed = self.SEED
+        startup_prog.random_seed = self.SEED
+        generator = paddle.fluid.unique_name.UniqueNameGenerator()
+        self.full_name = '/'.join(
+            [self.attrs['path'], self.attrs['model_name']])
+
+        with paddle.fluid.unique_name.guard(generator):
+            with paddle.static.scope_guard(scope):
+                with paddle.static.program_guard(main_prog, startup_prog):
+                    x = paddle.static.data(
+                        name=self.feed_list[0],
+                        shape=self.feed_shape[0],
+                        dtype='float32')
+
+                    scale = paddle.fluid.layers.scale(
+                        x, scale=1.0, bias=0.0, bias_after_scale=True)
+                    conv = paddle.static.nn.conv2d(
+                        scale,
+                        num_filters=3,
+                        filter_size=3,
+                        bias_attr=False,
+                        name='conv2d')
+                    loss = paddle.mean(conv)
+
+                    if self.attrs['is_training']:
+                        if self.attrs['opt_type'] == 'sgd':
+                            sgd = paddle.optimizer.SGD(learning_rate=1e-2)
+                            sgd.minimize(loss)
+                        elif self.attrs['opt_type'] == 'adam':
+                            adam = paddle.optimizer.Adam(learning_rate=1e-2)
+                            adam.minimize(loss)
+                        elif self.attrs['opt_type'] == 'lamb':
+                            lamb = paddle.optimizer.Lamb(learning_rate=1e-2)
+                            lamb.minimize(loss)
+
+                fetch_list = [loss.name]
+
+                place = paddle.IPUPlace()
+                exe = paddle.static.Executor(place)
+                exe.run(startup_prog)
+
+                ipu_strategy = paddle.static.IpuStrategy()
+                ipu_strategy.set_graph_config(is_training=True)
+                ipu_strategy.set_precision_config(enable_fp16=True)
+                program = paddle.static.IpuCompiledProgram(
+                    main_prog, ipu_strategy=ipu_strategy).compile(
+                        self.feed_list, fetch_list)
+
+                for _ in range(self.attrs['steps']):
+                    exe.run(program, feed=self.feed_fp16, fetch_list=fetch_list)
+
+                paddle.static.save_inference_model(
+                    self.full_name, x, loss, exe, program=program.org_program)
+
+    def _test_load(self, run_ipu):
+        if run_ipu:
+            place = paddle.IPUPlace()
+        else:
+            place = paddle.CPUPlace()
+        exe = paddle.static.Executor(place)
+
+        [inference_program, feed_target_names, fetch_targets] = (
+            paddle.static.load_inference_model(self.full_name, exe))
+
+        if run_ipu:
+            feed_list = feed_target_names
+            fetch_list = [fetch_targets[0].name]
+            ipu_strategy = paddle.static.IpuStrategy()
+            ipu_strategy.set_graph_config(is_training=False)
+            ipu_strategy.set_precision_config(enable_fp16=True)
+            program = paddle.static.IpuCompiledProgram(
+                inference_program,
+                ipu_strategy=ipu_strategy).compile(feed_list, fetch_list)
+        else:
+            program = inference_program
+
+        feed = self.feed_fp16 if run_ipu else self.feed_fp32
+        result = []
+        for i in range(10):
+            feed["in_0"] += np.array([1.1 * i]).astype(feed["in_0"].dtype)
+            out = exe.run(program, feed=feed, fetch_list=[fetch_targets])
+            result.append(out)
+
+        return np.array(result)
+
+    def test_base(self):
+        self._test_save()
+        cpu_res = self._test_load(False)
+        ipu_res = self._test_load(True).astype(np.float32)
+
+        self.assertTrue(
+            np.allclose(
+                cpu_res, ipu_res, rtol=self.rtol_fp16, atol=self.atol_fp16))
+
+        shutil.rmtree(self.attrs['path'], True)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_gather_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_gather_op_ipu.py
index d5be8ae0cf77526a6aefa1fd060a510689c43cc0..01a56fd14be04b280eef86c3088b1b79f180815a 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_gather_op_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_gather_op_ipu.py
@@ -16,14 +16,8 @@ import unittest
 
 import numpy as np
 import paddle
-import paddle.fluid as fluid
-import paddle.fluid.compiler as compiler
-import paddle.optimizer
 import paddle.static
-from paddle.fluid.tests.unittests.ipu.op_test_ipu import (IPUOpTest,
-                                                          np_dtype_to_fluid_str)
-
-paddle.enable_static()
+from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode
 
 
 @unittest.skipIf(not paddle.is_compiled_with_ipu(),
@@ -32,85 +26,92 @@ class TestBase(IPUOpTest):
     def setUp(self):
         self.set_atol()
         self.set_training()
-        self.set_feed()
+        self.set_data_feed()
         self.set_feed_attr()
-        self.set_attrs()
+        self.set_op_attrs()
+
+    @property
+    def fp16_enabled(self):
+        return True
 
-    def set_feed(self):
-        self.feed = {
-            "x": np.random.uniform(size=[10, 20]).astype('float32'),
-            "y": np.array([1, 3, 5]).astype('int32'),
-        }
+    def set_data_feed(self):
+        x = np.random.uniform(size=[10, 20])
+        y = np.array([1, 3, 5])
+        self.feed_fp32 = {"x": x.astype(np.float32), "y": y.astype(np.int32)}
+        self.feed_fp16 = {"x": x.astype(np.float16), "y": y.astype(np.int32)}
 
     def set_feed_attr(self):
-        self.feed_shape = [x.shape for x in self.feed.values()]
-        self.feed_list = list(self.feed.keys())
-        self.feed_dtype = [
-            np_dtype_to_fluid_str(x.dtype) for x in self.feed.values()
-        ]
+        self.feed_shape = [x.shape for x in self.feed_fp32.values()]
+        self.feed_list = list(self.feed_fp32.keys())
 
-    def set_attrs(self):
+    def set_op_attrs(self):
         self.attrs = {}
 
-    def _test_base(self, run_ipu=True):
-        scope = fluid.core.Scope()
+    def _test_base(self, exec_mode):
+        scope = paddle.static.Scope()
         main_prog = paddle.static.Program()
         startup_prog = paddle.static.Program()
-        SEED = self.SEED
-        main_prog.random_seed = SEED
-        startup_prog.random_seed = SEED
+        main_prog.random_seed = self.SEED
+        startup_prog.random_seed = self.SEED
 
-        with fluid.scope_guard(scope):
+        with paddle.static.scope_guard(scope):
             with paddle.static.program_guard(main_prog, startup_prog):
                 x = paddle.static.data(
                     name=self.feed_list[0],
                     shape=self.feed_shape[0],
-                    dtype=self.feed_dtype[0])
+                    dtype='float32')
                 y = paddle.static.data(
                     name=self.feed_list[1],
                     shape=self.feed_shape[1],
-                    dtype=self.feed_dtype[1])
+                    dtype='int32')
+
                 out = paddle.fluid.layers.gather(x, index=y, **self.attrs)
 
                 fetch_list = [out.name]
 
-            if run_ipu:
-                place = paddle.IPUPlace()
-            else:
+            if exec_mode == ExecutionMode.CPU_FP32:
                 place = paddle.CPUPlace()
+            else:
+                place = paddle.IPUPlace()
+
             exe = paddle.static.Executor(place)
             exe.run(startup_prog)
 
-            if run_ipu:
+            if exec_mode != ExecutionMode.CPU_FP32:
                 feed_list = self.feed_list
                 ipu_strategy = paddle.static.IpuStrategy()
-                ipu_strategy.SetGraphConfig(is_training=self.is_training)
-                program = compiler.IPUCompiledProgram(
+                ipu_strategy.set_graph_config(is_training=self.is_training)
+                if exec_mode == ExecutionMode.IPU_POPART_FP16:
+                    ipu_strategy.set_precision_config(enable_fp16=True)
+                program = paddle.static.IpuCompiledProgram(
                     main_prog,
                     ipu_strategy=ipu_strategy).compile(feed_list, fetch_list)
             else:
                 program = main_prog
 
-            result = exe.run(program, feed=self.feed, fetch_list=fetch_list)
-            return result[0]
+            feed = self.feed_fp32
+            if exec_mode > ExecutionMode.IPU_FP32:
+                feed = self.feed_fp16
 
-    def test_base(self):
-        res0 = self._test_base(False)
-        res1 = self._test_base(True)
+            result = exe.run(program, feed=feed, fetch_list=fetch_list)
+            return result[0]
 
-        self.assertTrue(
-            np.allclose(
-                res0.flatten(), res1.flatten(), atol=self.atol))
+    def test(self):
+        output_dict = {}
+        for mode in ExecutionMode:
+            if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled:
+                break
+            output_dict[mode] = self._test_base(mode).flatten()
 
-        self.assertTrue(res0.shape == res1.shape)
+        self.check(output_dict)
 
 
 class TestCase1(TestBase):
-    def set_feed(self):
-        self.feed = {
-            "x": np.random.uniform(size=[100]).astype('float32'),
-            "y": np.array([1, 3, 5]).astype('int32'),
-        }
+    def set_data_feed(self):
+        x = np.random.uniform(size=[100])
+        y = np.array([1, 3, 5])
+        self.feed_fp32 = {"x": x.astype(np.float32), "y": y.astype(np.int32)}
+        self.feed_fp16 = {"x": x.astype(np.float16), "y": y.astype(np.int32)}
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_gelu_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_gelu_op_ipu.py
index ca8c0935d782cc275838871e2c73a3eba9454e5b..602289f3f19041151e5b919f402a986c77207bd6 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_gelu_op_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_gelu_op_ipu.py
@@ -16,14 +16,8 @@ import unittest
 
 import numpy as np
 import paddle
-import paddle.fluid as fluid
-import paddle.fluid.compiler as compiler
-import paddle.optimizer
 import paddle.static
-from paddle.fluid.tests.unittests.ipu.op_test_ipu import (IPUOpTest,
-                                                          np_dtype_to_fluid_str)
-
-paddle.enable_static()
+from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode
 
 
 @unittest.skipIf(not paddle.is_compiled_with_ipu(),
@@ -32,80 +26,89 @@ class TestBase(IPUOpTest):
     def setUp(self):
         self.set_atol()
         self.set_training()
-        self.set_feed()
+        self.set_data_feed()
         self.set_feed_attr()
-        self.set_attrs()
+        self.set_op_attrs()
 
-    def set_atol(self):
-        self.atol = 1e-3
+    @property
+    def fp16_enabled(self):
+        return True
 
-    def set_feed(self):
-        self.feed = {
-            "x": np.random.uniform(size=[1, 3, 10, 10]).astype('float32')
-        }
+    def set_data_feed(self):
+        data = np.random.uniform(size=[1, 3, 10, 10])
+        self.feed_fp32 = {'in_0': data.astype(np.float32)}
+        self.feed_fp16 = {'in_0': data.astype(np.float16)}
 
     def set_feed_attr(self):
-        self.feed_shape = [x.shape for x in self.feed.values()]
-        self.feed_list = list(self.feed.keys())
-        self.feed_dtype = [
-            np_dtype_to_fluid_str(x.dtype) for x in self.feed.values()
-        ]
+        self.feed_shape = [x.shape for x in self.feed_fp32.values()]
+        self.feed_list = list(self.feed_fp32.keys())
 
-    def set_attrs(self):
+    def set_op_attrs(self):
         self.attrs = {"approximate": False}
 
-    def _test_base(self, run_ipu=True):
-        scope = fluid.core.Scope()
+    def _test_base(self, exec_mode):
+        scope = paddle.static.Scope()
         main_prog = paddle.static.Program()
         startup_prog = paddle.static.Program()
-        SEED = self.SEED
-        main_prog.random_seed = SEED
-        startup_prog.random_seed = SEED
+        main_prog.random_seed = self.SEED
+        startup_prog.random_seed = self.SEED
 
-        with fluid.scope_guard(scope):
+        with paddle.static.scope_guard(scope):
             with paddle.static.program_guard(main_prog, startup_prog):
                 x = paddle.static.data(
                     name=self.feed_list[0],
                     shape=self.feed_shape[0],
-                    dtype=self.feed_dtype[0])
+                    dtype='float32')
+
                 out = paddle.fluid.layers.gelu(x, **self.attrs)
 
                 fetch_list = [out.name]
 
-            if run_ipu:
-                place = paddle.IPUPlace()
-            else:
+            if exec_mode == ExecutionMode.CPU_FP32:
                 place = paddle.CPUPlace()
+            else:
+                place = paddle.IPUPlace()
+
             exe = paddle.static.Executor(place)
             exe.run(startup_prog)
 
-            if run_ipu:
+            if exec_mode != ExecutionMode.CPU_FP32:
                 feed_list = self.feed_list
                 ipu_strategy = paddle.static.IpuStrategy()
-                ipu_strategy.SetGraphConfig(is_training=self.is_training)
-                program = compiler.IPUCompiledProgram(
+                ipu_strategy.set_graph_config(is_training=self.is_training)
+                if exec_mode == ExecutionMode.IPU_POPART_FP16:
+                    ipu_strategy.set_precision_config(enable_fp16=True)
+                program = paddle.static.IpuCompiledProgram(
                     main_prog,
                     ipu_strategy=ipu_strategy).compile(feed_list, fetch_list)
             else:
                 program = main_prog
 
-            result = exe.run(program, feed=self.feed, fetch_list=fetch_list)
-            return result[0]
+            feed = self.feed_fp32
+            if exec_mode > ExecutionMode.IPU_FP32:
+                feed = self.feed_fp16
 
-    def test_base(self):
-        res0 = self._test_base(False)
-        res1 = self._test_base(True)
+            result = exe.run(program, feed=feed, fetch_list=fetch_list)
+            return result[0]
 
-        self.assertTrue(
-            np.allclose(
-                res0.flatten(), res1.flatten(), atol=self.atol))
+    def test(self):
+        output_dict = {}
+        for mode in ExecutionMode:
+            if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled:
+                break
+            output_dict[mode] = self._test_base(mode).flatten()
 
-        self.assertTrue(res0.shape == res1.shape)
+        self.check(output_dict)
 
 
-@unittest.skip('approximate=True is not supported')
 class TestCase1(TestBase):
-    def set_attrs(self):
+    def set_atol(self):
+        self.atol = 1e-10
+        self.rtol = 1e-6
+        self.atol_fp16 = 2e-3
+        self.rtol_fp16 = 1e-3
+
+    def set_op_attrs(self):
         self.attrs = {"approximate": True}
 
 
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_greater_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_greater_op_ipu.py
new file mode 100644
index 0000000000000000000000000000000000000000..05a37dcb3d51475e896d6a8f1e9458b31ece85b5
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/ipu/test_greater_op_ipu.py
@@ -0,0 +1,140 @@
+#  Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+import paddle
+import paddle.static
+from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode
+
+
+@unittest.skipIf(not paddle.is_compiled_with_ipu(),
+                 "core is not compiled with IPU")
+class TestBase(IPUOpTest):
+    def setUp(self):
+        self.set_atol()
+        self.set_training()
+        self.set_data_feed()
+        self.set_feed_attr()
+        self.set_op_attrs()
+
+    @property
+    def fp16_enabled(self):
+        return True
+
+    def set_data_feed(self):
+        x = np.random.randn(3, 4, 5)
+        y = np.random.randn(3, 4, 5)
+        self.feed_fp32 = {
+            "x": x.astype(np.float32),
+            "y": y.astype(np.float32),
+        }
+        self.feed_fp16 = {
+            "x": x.astype(np.float16),
+            "y": y.astype(np.float16),
+        }
+
+    def set_feed_attr(self):
+        self.feed_shape = [x.shape for x in self.feed_fp32.values()]
+        self.feed_list = list(self.feed_fp32.keys())
+
+    def set_op_attrs(self):
+        self.attrs = {}
+
+    def _test_base(self, exec_mode):
+        scope = paddle.static.Scope()
+        main_prog = paddle.static.Program()
+        startup_prog = paddle.static.Program()
+        main_prog.random_seed = self.SEED
+        startup_prog.random_seed = self.SEED
+
+        with paddle.static.scope_guard(scope):
+            with paddle.static.program_guard(main_prog, startup_prog):
+                x = paddle.static.data(
+                    name=self.feed_list[0],
+                    shape=self.feed_shape[0],
+                    dtype='float32')
+                y = paddle.static.data(
+                    name=self.feed_list[1],
+                    shape=self.feed_shape[1],
+                    dtype='float32')
+
+                out = paddle.fluid.layers.greater_than(x, y, **self.attrs)
+
+                fetch_list = [out.name]
+
+            if exec_mode == ExecutionMode.CPU_FP32:
+                place = paddle.CPUPlace()
+            else:
+                place = paddle.IPUPlace()
+
+            exe = paddle.static.Executor(place)
+            exe.run(startup_prog)
+
+            if exec_mode != ExecutionMode.CPU_FP32:
+                feed_list = self.feed_list
+                ipu_strategy = paddle.static.IpuStrategy()
+                ipu_strategy.set_graph_config(is_training=self.is_training)
+                if exec_mode == ExecutionMode.IPU_POPART_FP16:
+                    ipu_strategy.set_precision_config(enable_fp16=True)
+                program = paddle.static.IpuCompiledProgram(
+                    main_prog,
+                    ipu_strategy=ipu_strategy).compile(feed_list, fetch_list)
+            else:
+                program = main_prog
+
+            feed = self.feed_fp32
+            if exec_mode > ExecutionMode.IPU_FP32:
+                feed = self.feed_fp16
+
+            result = exe.run(program, feed=feed, fetch_list=fetch_list)
+            return result[0]
+
+    def test(self):
+        output_dict = {}
+        for mode in ExecutionMode:
+            if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled:
+                break
+            output_dict[mode] = self._test_base(mode).flatten().astype(np.int32)
+
+        self.check(output_dict)
+
+
+class TestCase1(TestBase):
+    def set_data_feed(self):
+        x = np.ones([1, 10])
+        y = np.ones([10])
+        self.feed_fp32 = {"x": x.astype(np.float32), "y": y.astype(np.float32)}
+        self.feed_fp16 = {"x": x.astype(np.float16), "y": y.astype(np.float16)}
+
+
+class TestCase2(TestBase):
+    def set_data_feed(self):
+        x = np.ones([1, 10])
+        y = np.zeros([1, 10])
+        self.feed_fp32 = {"x": x.astype(np.float32), "y": y.astype(np.float32)}
+        self.feed_fp16 = {"x": x.astype(np.float16), "y": y.astype(np.float16)}
+
+
+class TestCase3(TestBase):
+    def set_data_feed(self):
+        x = np.zeros([1, 10])
+        y = np.ones([1, 10])
+        self.feed_fp32 = {"x": x.astype(np.float32), "y": y.astype(np.float32)}
+        self.feed_fp16 = {"x": x.astype(np.float16), "y": y.astype(np.float16)}
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_groupnorm_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_groupnorm_op_ipu.py
index eb644c2c6670f5beedbb3ad0b1868bc8a0f9434e..102e764cb2f172b369c064fcffabc83aa9497e21 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_groupnorm_op_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_groupnorm_op_ipu.py
@@ -16,14 +16,8 @@ import unittest
 
 import numpy as np
 import paddle
-import paddle.fluid as fluid
-import paddle.fluid.compiler as compiler
-import paddle.optimizer
 import paddle.static
-from paddle.fluid.tests.unittests.ipu.op_test_ipu import (IPUOpTest,
-                                                          np_dtype_to_fluid_str)
-
-paddle.enable_static()
+from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode
 
 
 @unittest.skipIf(not paddle.is_compiled_with_ipu(),
@@ -32,43 +26,49 @@ class TestBase(IPUOpTest):
     def setUp(self):
         self.set_atol()
         self.set_training()
-        self.set_feed()
+        self.set_data_feed()
         self.set_feed_attr()
-        self.set_attrs()
+        self.set_op_attrs()
 
-    def set_feed(self):
-        self.feed = {
-            "x": np.random.uniform(size=[1, 8, 10, 10]).astype('float32'),
-        }
+    @property
+    def fp16_enabled(self):
+        return True
+
+    def set_atol(self):
+        self.atol = 3e-6
+        self.rtol = 1e-6
+        self.atol_fp16 = 4e-3
+        self.rtol_fp16 = 1e-3
+
+    def set_data_feed(self):
+        data = np.random.uniform(size=[1, 8, 10, 10])
+        self.feed_fp32 = {'in_0': data.astype(np.float32)}
+        self.feed_fp16 = {'in_0': data.astype(np.float16)}
 
     def set_feed_attr(self):
-        self.feed_shape = [x.shape for x in self.feed.values()]
-        self.feed_list = list(self.feed.keys())
-        self.feed_dtype = [
-            np_dtype_to_fluid_str(x.dtype) for x in self.feed.values()
-        ]
+        self.feed_shape = [x.shape for x in self.feed_fp32.values()]
+        self.feed_list = list(self.feed_fp32.keys())
 
-    def set_attrs(self):
+    def set_op_attrs(self):
         self.attrs = {
             "groups": 8,
             "epsilon": 1e-05,
             "data_layout": 'NCHW',
         }
 
-    def _test_base(self, run_ipu=True):
-        scope = fluid.core.Scope()
+    def _test_base(self, exec_mode):
+        scope = paddle.static.Scope()
         main_prog = paddle.static.Program()
         startup_prog = paddle.static.Program()
-        SEED = self.SEED
-        main_prog.random_seed = SEED
-        startup_prog.random_seed = SEED
+        main_prog.random_seed = self.SEED
+        startup_prog.random_seed = self.SEED
 
-        with fluid.scope_guard(scope):
+        with paddle.static.scope_guard(scope):
             with paddle.static.program_guard(main_prog, startup_prog):
                 x = paddle.static.data(
                     name=self.feed_list[0],
                     shape=self.feed_shape[0],
-                    dtype=self.feed_dtype[0])
+                    dtype='float32')
 
                 if self.is_training:
                     ch = self.feed_shape[0][1]
@@ -78,62 +78,68 @@ class TestBase(IPUOpTest):
                     bias = paddle.ParamAttr(trainable=True)
                     out = paddle.fluid.layers.nn.group_norm(
                         conv1, param_attr=scale, bias_attr=bias, **self.attrs)
+                    loss = paddle.mean(out)
+                    adam = paddle.optimizer.Adam(learning_rate=1e-2)
+                    adam.minimize(loss)
                 else:
-                    scale = True
-                    bias = True
                     out = paddle.fluid.layers.nn.group_norm(
-                        x, param_attr=scale, bias_attr=bias, **self.attrs)
+                        x, param_attr=True, bias_attr=True, **self.attrs)
 
                 if self.is_training:
-                    loss = paddle.mean(out)
-                    adam = paddle.optimizer.Adam(learning_rate=1e-2)
-                    adam.minimize(loss)
                     fetch_list = [loss.name]
                 else:
                     fetch_list = [out.name]
 
-            if run_ipu:
-                place = paddle.IPUPlace()
-            else:
+            if exec_mode == ExecutionMode.CPU_FP32:
                 place = paddle.CPUPlace()
+            else:
+                place = paddle.IPUPlace()
+
             exe = paddle.static.Executor(place)
             exe.run(startup_prog)
 
-            if run_ipu:
+            if exec_mode != ExecutionMode.CPU_FP32:
                 feed_list = self.feed_list
                 ipu_strategy = paddle.static.IpuStrategy()
-                ipu_strategy.SetGraphConfig(is_training=self.is_training)
-                program = compiler.IPUCompiledProgram(
+                ipu_strategy.set_graph_config(is_training=self.is_training)
+                if exec_mode == ExecutionMode.IPU_POPART_FP16:
+                    ipu_strategy.set_precision_config(enable_fp16=True)
+                program = paddle.static.IpuCompiledProgram(
                     main_prog,
                     ipu_strategy=ipu_strategy).compile(feed_list, fetch_list)
             else:
                 program = main_prog
 
+            feed = self.feed_fp32
+            if exec_mode > ExecutionMode.IPU_FP32:
+                feed = self.feed_fp16
+
             if self.is_training:
                 result = []
                 for _ in range(self.epoch):
                     loss_res = exe.run(program,
-                                       feed=self.feed,
+                                       feed=feed,
                                        fetch_list=fetch_list)
                     result.append(loss_res[0])
                 return np.array(result)
             else:
-                result = exe.run(program, feed=self.feed, fetch_list=fetch_list)
+                result = exe.run(program, feed=feed, fetch_list=fetch_list)
                 return result[0]
 
     def test_base(self):
-        res0 = self._test_base(False)
-        res1 = self._test_base(True)
+        output_dict = {}
+        for mode in ExecutionMode:
+            if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled:
+                break
+            if mode > ExecutionMode.IPU_FP32 and self.is_training:
+                break
+            output_dict[mode] = self._test_base(mode).flatten()
 
-        self.assertTrue(
-            np.allclose(
-                res0.flatten(), res1.flatten(), atol=self.atol))
-
-        self.assertTrue(res0.shape == res1.shape)
+        self.check(output_dict)
 
 
 class TestCase1(TestBase):
-    def set_attrs(self):
+    def set_op_attrs(self):
         self.attrs = {
             "groups": 4,
             "epsilon": 1e-05,
@@ -147,11 +153,15 @@ class TestTrainCase1(TestBase):
         self.epoch = 10
 
 
+@unittest.skipIf(IPUOpTest.use_ipumodel(), "skip for ipumodel")
 class TestTrainCase2(TestBase):
     def set_atol(self):
-        self.atol = 1e-3
+        self.atol = 7e-4
+        self.rtol = 1e-6
+        self.atol_fp16 = 4e-3
+        self.rtol_fp16 = 1e-3
 
-    def set_attrs(self):
+    def set_op_attrs(self):
         self.attrs = {
             "groups": 4,
             "epsilon": 1e-05,
@@ -163,7 +173,5 @@ class TestTrainCase2(TestBase):
         self.epoch = 10
 
 
-# not support `group_norm(x, param_attr=False, bias_attr=False, **self.attrs)`
-
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_ipu_inference_model_io.py b/python/paddle/fluid/tests/unittests/ipu/test_inference_model_io_ipu.py
similarity index 78%
rename from python/paddle/fluid/tests/unittests/ipu/test_ipu_inference_model_io.py
rename to python/paddle/fluid/tests/unittests/ipu/test_inference_model_io_ipu.py
index 0a331d804545d49eeaffdf0c8054db89041f2c29..33a63a80e3bc0dc6aed50f39029b094461562d62 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_ipu_inference_model_io.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_inference_model_io_ipu.py
@@ -12,59 +12,59 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import tempfile
 import unittest
-import shutil
 
 import numpy as np
 import paddle
-import paddle.fluid as fluid
-import paddle.fluid.compiler as compiler
-import paddle.optimizer
 import paddle.static
 from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest
 
-paddle.enable_static()
-
 
 @unittest.skipIf(not paddle.is_compiled_with_ipu(),
                  "core is not compiled with IPU")
 class TestBase(IPUOpTest):
     def setUp(self):
         self.set_atol()
-        self.set_feed()
-        self.set_attrs()
-
-    def set_feed(self):
-        self.feed_shape = []
-        self.feed_shape.append([1, 3, 10, 10])
-
-        self.feed = {}
-        self.feed["in_0"] = np.random.uniform(
-            size=self.feed_shape[0]).astype(np.float32)
-
+        self.set_data_feed()
+        self.set_feed_attr()
+        self.set_op_attrs()
+
+    def set_atol(self):
+        self.atol = 1e-6
+        self.rtol = 1e-5
+        self.atol_fp16 = 1e-2
+        self.rtol_fp16 = 1e-3
+
+    def set_data_feed(self):
+        data = np.random.uniform(size=[1, 3, 10, 10])
+        self.feed = {"in_0": data.astype(np.float32)}
+
+    def set_feed_attr(self):
+        self.feed_shape = [x.shape for x in self.feed.values()]
         self.feed_list = list(self.feed.keys())
 
-    def set_attrs(self):
+    def set_op_attrs(self):
         self.attrs = {}
         self.attrs['steps'] = 100
         self.attrs['save_at_step'] = 20
         self.attrs['is_training'] = True
         self.attrs['opt_type'] = 'sgd'
-        self.attrs['path'] = 'model'
+        self.attrs['path'] = tempfile.TemporaryDirectory()
         self.attrs['model_name'] = 'test'
 
     def _test_save(self):
-        scope = fluid.core.Scope()
+        scope = paddle.static.Scope()
         main_prog = paddle.static.Program()
         startup_prog = paddle.static.Program()
         main_prog.random_seed = self.SEED
         startup_prog.random_seed = self.SEED
-        generator = fluid.unique_name.UniqueNameGenerator()
+        generator = paddle.fluid.unique_name.UniqueNameGenerator()
         self.full_name = '/'.join(
-            [self.attrs['path'], self.attrs['model_name']])
+            [self.attrs['path'].name, self.attrs['model_name']])
 
-        with fluid.unique_name.guard(generator):
-            with fluid.scope_guard(scope):
+        with paddle.fluid.unique_name.guard(generator):
+            with paddle.static.scope_guard(scope):
                 with paddle.static.program_guard(main_prog, startup_prog):
                     x = paddle.static.data(
                         name=self.feed_list[0],
@@ -88,16 +88,16 @@ class TestBase(IPUOpTest):
                         elif self.attrs['opt_type'] == 'lamb':
                             lamb = paddle.optimizer.Lamb(learning_rate=1e-2)
                             lamb.minimize(loss)
-                    fetch_list = [loss.name]
+                fetch_list = [loss.name]
 
                 place = paddle.IPUPlace()
                 exe = paddle.static.Executor(place)
                 exe.run(startup_prog)
 
                 ipu_strategy = paddle.static.IpuStrategy()
-                ipu_strategy.SetGraphConfig(
+                ipu_strategy.set_graph_config(
                     is_training=self.attrs['is_training'])
-                program = compiler.IPUCompiledProgram(
+                program = paddle.static.IpuCompiledProgram(
                     main_prog, ipu_strategy=ipu_strategy).compile(
                         self.feed_list, fetch_list)
 
@@ -125,8 +125,8 @@ class TestBase(IPUOpTest):
             feed_list = feed_target_names
             fetch_list = [fetch_targets[0].name]
             ipu_strategy = paddle.static.IpuStrategy()
-            ipu_strategy.SetGraphConfig(is_training=False)
-            program = compiler.IPUCompiledProgram(
+            ipu_strategy.set_graph_config(is_training=False)
+            program = paddle.static.IpuCompiledProgram(
                 inference_program,
                 ipu_strategy=ipu_strategy).compile(feed_list, fetch_list)
         else:
@@ -134,7 +134,7 @@ class TestBase(IPUOpTest):
 
         tmp = exe.run(program, feed=self.feed, fetch_list=[fetch_targets])
 
-        return tmp
+        return np.array(tmp)
 
     def test_base(self):
         self._test_save()
@@ -142,27 +142,26 @@ class TestBase(IPUOpTest):
         ipu_res = self._test_load(True)
 
         self.assertTrue(np.allclose(cpu_res, ipu_res, atol=self.atol))
-
-        shutil.rmtree(self.attrs['path'], True)
+        self.attrs['path'].cleanup()
 
 
 class TestAdam(TestBase):
-    def set_attrs(self):
+    def set_op_attrs(self):
         self.attrs = {}
         self.attrs['steps'] = 100
         self.attrs['is_training'] = True
         self.attrs['opt_type'] = 'adam'
-        self.attrs['path'] = 'model'
+        self.attrs['path'] = tempfile.TemporaryDirectory()
         self.attrs['model_name'] = 'test'
 
 
 class TestLamb(TestBase):
-    def set_attrs(self):
+    def set_op_attrs(self):
         self.attrs = {}
         self.attrs['steps'] = 100
         self.attrs['is_training'] = True
         self.attrs['opt_type'] = 'lamb'
-        self.attrs['path'] = 'model'
+        self.attrs['path'] = tempfile.TemporaryDirectory()
         self.attrs['model_name'] = 'test'
 
 
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_instancenorm_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_instancenorm_op_ipu.py
index ee9cd875cf29884da58dc7f0488e7f9bfc50d4e5..ed8f3950ace82c13cd41a62d5a6e13055149187a 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_instancenorm_op_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_instancenorm_op_ipu.py
@@ -16,14 +16,8 @@ import unittest
 
 import numpy as np
 import paddle
-import paddle.fluid as fluid
-import paddle.fluid.compiler as compiler
-import paddle.optimizer
 import paddle.static
-from paddle.fluid.tests.unittests.ipu.op_test_ipu import (IPUOpTest,
-                                                          np_dtype_to_fluid_str)
-
-paddle.enable_static()
+from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode
 
 
 @unittest.skipIf(not paddle.is_compiled_with_ipu(),
@@ -32,39 +26,45 @@ class TestBase(IPUOpTest):
     def setUp(self):
         self.set_atol()
         self.set_training()
-        self.set_feed()
+        self.set_data_feed()
         self.set_feed_attr()
-        self.set_attrs()
+        self.set_op_attrs()
+
+    @property
+    def fp16_enabled(self):
+        return True
+
+    def set_atol(self):
+        self.atol = 1e-6
+        self.rtol = 1e-5
+        self.atol_fp16 = 1e-2
+        self.rtol_fp16 = 1e-3
 
-    def set_feed(self):
-        self.feed = {
-            "x": np.random.uniform(size=[1, 3, 10, 10]).astype('float32'),
-        }
+    def set_data_feed(self):
+        x = np.random.uniform(size=[1, 3, 10, 10])
+        self.feed_fp32 = {"x": x.astype(np.float32)}
+        self.feed_fp16 = {"x": x.astype(np.float16)}
 
     def set_feed_attr(self):
-        self.feed_shape = [x.shape for x in self.feed.values()]
-        self.feed_list = list(self.feed.keys())
-        self.feed_dtype = [
-            np_dtype_to_fluid_str(x.dtype) for x in self.feed.values()
-        ]
+        self.feed_shape = [x.shape for x in self.feed_fp32.values()]
+        self.feed_list = list(self.feed_fp32.keys())
 
-    def set_attrs(self):
+    def set_op_attrs(self):
         self.attrs = {"epsilon": 1e-05}
 
-    def _test_base(self, run_ipu=True):
-        scope = fluid.core.Scope()
+    def _test_base(self, exec_mode):
+        scope = paddle.static.Scope()
         main_prog = paddle.static.Program()
         startup_prog = paddle.static.Program()
-        SEED = self.SEED
-        main_prog.random_seed = SEED
-        startup_prog.random_seed = SEED
+        main_prog.random_seed = self.SEED
+        startup_prog.random_seed = self.SEED
 
-        with fluid.scope_guard(scope):
+        with paddle.static.scope_guard(scope):
             with paddle.static.program_guard(main_prog, startup_prog):
                 x = paddle.static.data(
                     name=self.feed_list[0],
                     shape=self.feed_shape[0],
-                    dtype=self.feed_dtype[0])
+                    dtype='float32')
 
                 if self.is_training:
                     ch = self.feed_shape[0][1]
@@ -74,58 +74,64 @@ class TestBase(IPUOpTest):
                     bias = paddle.ParamAttr(trainable=True)
                     out = paddle.fluid.layers.nn.instance_norm(
                         conv1, param_attr=scale, bias_attr=bias, **self.attrs)
+                    loss = paddle.mean(out)
+                    adam = paddle.optimizer.Adam(learning_rate=1e-2)
+                    adam.minimize(loss)
                 else:
-                    scale = True
-                    bias = True
                     out = paddle.fluid.layers.nn.instance_norm(
-                        x, param_attr=scale, bias_attr=bias, **self.attrs)
+                        x, param_attr=True, bias_attr=True, **self.attrs)
 
                 if self.is_training:
-                    loss = paddle.mean(out)
-                    adam = paddle.optimizer.Adam(learning_rate=1e-2)
-                    adam.minimize(loss)
                     fetch_list = [loss.name]
                 else:
                     fetch_list = [out.name]
 
-            if run_ipu:
-                place = paddle.IPUPlace()
-            else:
+            if exec_mode == ExecutionMode.CPU_FP32:
                 place = paddle.CPUPlace()
+            else:
+                place = paddle.IPUPlace()
+
             exe = paddle.static.Executor(place)
             exe.run(startup_prog)
 
-            if run_ipu:
+            if exec_mode != ExecutionMode.CPU_FP32:
                 feed_list = self.feed_list
                 ipu_strategy = paddle.static.IpuStrategy()
-                ipu_strategy.SetGraphConfig(is_training=self.is_training)
-                program = compiler.IPUCompiledProgram(
+                ipu_strategy.set_graph_config(is_training=self.is_training)
+                if exec_mode == ExecutionMode.IPU_POPART_FP16:
+                    ipu_strategy.set_precision_config(enable_fp16=True)
+                program = paddle.static.IpuCompiledProgram(
                     main_prog,
                     ipu_strategy=ipu_strategy).compile(feed_list, fetch_list)
             else:
                 program = main_prog
 
+            feed = self.feed_fp32
+            if exec_mode > ExecutionMode.IPU_FP32:
+                feed = self.feed_fp16
+
             if self.is_training:
                 result = []
                 for _ in range(self.epoch):
                     loss_res = exe.run(program,
-                                       feed=self.feed,
+                                       feed=feed,
                                        fetch_list=fetch_list)
                     result.append(loss_res)
                 return np.array(result)
             else:
-                result = exe.run(program, feed=self.feed, fetch_list=fetch_list)
+                result = exe.run(program, feed=feed, fetch_list=fetch_list)
                 return result[0]
 
-    def test_base(self):
-        res0 = self._test_base(False)
-        res1 = self._test_base(True)
+    def test(self):
+        output_dict = {}
+        for mode in ExecutionMode:
+            if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled:
+                break
+            if mode > ExecutionMode.IPU_FP32 and self.is_training:
+                break
+            output_dict[mode] = self._test_base(mode).flatten()
 
-        self.assertTrue(
-            np.allclose(
-                res0.flatten(), res1.flatten(), atol=self.atol))
-
-        self.assertTrue(res0.shape == res1.shape)
+        self.check(output_dict)
 
 
 class TestTrainCase1(TestBase):
@@ -134,7 +140,5 @@ class TestTrainCase1(TestBase):
         self.epoch = 10
 
 
-# not support `instance_norm(x, param_attr=False, bias_attr=False, **self.attrs)`
-
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_ipu_fp16_support.py b/python/paddle/fluid/tests/unittests/ipu/test_ipu_fp16_support.py
deleted file mode 100644
index aa6c05dc59a87f844c19912be484a4b007f0adfc..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/ipu/test_ipu_fp16_support.py
+++ /dev/null
@@ -1,109 +0,0 @@
-#  Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import unittest
-
-import numpy as np
-import paddle
-import paddle.fluid as fluid
-import paddle.fluid.compiler as compiler
-import paddle.optimizer
-import paddle.static
-from paddle.fluid.tests.unittests.ipu.op_test_ipu import (IPUOpTest,
-                                                          np_dtype_to_fluid_str)
-
-paddle.enable_static()
-
-
-@unittest.skipIf(not paddle.is_compiled_with_ipu(),
-                 "core is not compiled with IPU")
-class TestBase(IPUOpTest):
-    def setUp(self):
-        self.set_atol()
-        self.set_feed()
-        self.set_feed_attr()
-        self.set_attrs()
-
-    def set_feed(self):
-        np_data = np.random.uniform(low=-1, high=1, size=[1, 3, 100, 100])
-        self.feed_ipu = {"x": np_data.astype('float16')}
-        self.feed_cpu = {"x": np_data.astype('float32')}
-
-    def set_feed_attr(self):
-        self.feed_shape = [x.shape for x in self.feed_cpu.values()]
-        self.feed_list = list(self.feed_cpu.keys())
-        self.feed_dtype = [
-            np_dtype_to_fluid_str(x.dtype) for x in self.feed_cpu.values()
-        ]
-
-    def set_attrs(self):
-        self.attrs = {}
-
-    def _test_base(self, run_ipu=True):
-        scope = fluid.core.Scope()
-        main_prog = paddle.static.Program()
-        startup_prog = paddle.static.Program()
-        SEED = self.SEED
-        main_prog.random_seed = SEED
-        startup_prog.random_seed = SEED
-
-        with fluid.scope_guard(scope):
-            with paddle.static.program_guard(main_prog, startup_prog):
-                x = paddle.static.data(
-                    name=self.feed_list[0],
-                    shape=self.feed_shape[0],
-                    dtype=self.feed_dtype[0])
-                conv1 = paddle.static.nn.conv2d(
-                    x, num_filters=3, filter_size=3, bias_attr=False)
-                conv2 = paddle.static.nn.conv2d(
-                    x, num_filters=3, filter_size=3, bias_attr=False)
-                add1 = conv1 + conv2
-                conv3 = paddle.static.nn.conv2d(
-                    add1, num_filters=8, filter_size=8, bias_attr=False)
-                out = paddle.fluid.layers.relu(conv3, **self.attrs)
-                fetch_list = [out.name]
-            if run_ipu:
-                place = paddle.IPUPlace()
-            else:
-                place = paddle.CPUPlace()
-            exe = paddle.static.Executor(place)
-            exe.run(startup_prog)
-
-            feed = self.feed_ipu if run_ipu else self.feed_cpu
-            if run_ipu:
-                feed_list = self.feed_list
-                ipu_strategy = paddle.static.IpuStrategy()
-                ipu_strategy.SetGraphConfig(is_training=False)
-                ipu_strategy.SetHalfConfig(enable_fp16=True)
-                program = compiler.IPUCompiledProgram(
-                    main_prog,
-                    ipu_strategy=ipu_strategy).compile(feed_list, fetch_list)
-            else:
-                feed_list = self.feed_list
-                program = main_prog
-            result = exe.run(program, feed=feed, fetch_list=fetch_list)
-            return result[0]
-
-    def test_base(self):
-        res0 = self._test_base(False)
-        res1 = self._test_base(True)
-
-        self.assertTrue(res0.shape == res1.shape)
-        mae = np.mean(np.abs(res0.flatten() - res1.flatten()))
-        print("mae is ", mae)
-        self.assertTrue(mae < 0.001)
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_ipu_pipeline.py b/python/paddle/fluid/tests/unittests/ipu/test_ipu_pipeline.py
deleted file mode 100644
index beab68553d723c5c2277861a74a8c7afaa4d2c38..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/ipu/test_ipu_pipeline.py
+++ /dev/null
@@ -1,71 +0,0 @@
-#  Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import numpy as np
-import unittest
-import sys
-import paddle
-import paddle.fluid as fluid
-
-paddle.enable_static()
-
-
-@unittest.skipIf(not paddle.is_compiled_with_ipu(),
-                 "core is not compiled with IPU")
-class TestIpuShard(unittest.TestCase):
-    def _test(self):
-        # build graph
-        a = paddle.static.data(name='data', shape=[None, 1], dtype='int32')
-        b = a + 2  # scale : scale * x + bias, ipu_stage : no
-
-        with paddle.fluid.ipu_shard(ipu_stage=1):
-            c = b + 1  # scale, ipu_stage : 1
-            with paddle.fluid.ipu_shard(ipu_stage=2):
-                d = c * 2  # scale, ipu_stage : 2
-            with paddle.fluid.ipu_shard(ipu_stage=3):
-                e = d + 3  # scale, ipu_stage : 3
-                with paddle.fluid.ipu_shard(ipu_stage=1):
-                    e = e + 3  # scale, ipu_stage : 1
-                    with paddle.fluid.ipu_shard(ipu_stage=2):
-                        e = e + 3  # scale, ipu_stage : 2
-
-        with paddle.fluid.ipu_shard(ipu_stage=1):
-            f = paddle.tensor.pow(e, 2.0)  # pow, ipu_stage : 1
-
-        with paddle.fluid.ipu_shard(ipu_stage=2):
-            g = f - 1  # scale, ipu_stage : 2
-
-        h = g + 1  # scale, ipu_stage : no
-
-        ipu_index_list = []
-        main_prog = paddle.static.default_main_program()
-        for op in main_prog.global_block().ops:
-            if op.desc.has_attr("ipu_stage"):
-                ipu_index_list.append(op.desc.attr("ipu_stage"))
-
-        return ipu_index_list
-
-    def test_ipu_shard(self):
-        ipu_index_list = self._test()
-        expected_ipu_index_list = [1, 2, 3, 1, 2, 1, 2]
-
-        self.assertTrue(
-            np.allclose(
-                ipu_index_list, expected_ipu_index_list, atol=0))
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_ipu_place.py b/python/paddle/fluid/tests/unittests/ipu/test_ipu_place.py
deleted file mode 100644
index 48ab046deb37038c07a8d02b6ca6a8638d58fa71..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/ipu/test_ipu_place.py
+++ /dev/null
@@ -1,51 +0,0 @@
-#  Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import numpy as np
-import unittest
-import sys
-sys.path.append("..")
-import paddle
-import paddle.fluid as fluid
-
-paddle.enable_static()
-
-
-@unittest.skipIf(not paddle.is_compiled_with_ipu(),
-                 "core is not compiled with IPU")
-class TestIpuPlace(unittest.TestCase):
-    def test_ipu_place(self):
-        num_devices = fluid.core.get_ipu_device_count()
-        self.assertGreater(num_devices, 0)
-
-        for i in range(num_devices):
-            place = paddle.IPUPlace()
-            p = fluid.core.Place()
-            p.set_place(place)
-            self.assertTrue(p.is_ipu_place())
-
-    def test_ipu_set_device(self):
-        num_devices = fluid.core.get_ipu_device_count()
-        self.assertGreater(num_devices, 0)
-
-        for i in range(num_devices):
-            paddle.set_device('ipu')
-            device = paddle.get_device()
-            self.assertTrue(device == "ipus:{{0-{}}}".format(num_devices - 1))
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_ipu_shard.py b/python/paddle/fluid/tests/unittests/ipu/test_ipu_shard.py
deleted file mode 100644
index 368556d8b2f2d485fe09a6503668559c4e73911e..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/ipu/test_ipu_shard.py
+++ /dev/null
@@ -1,70 +0,0 @@
-#  Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import numpy as np
-import unittest
-import sys
-import paddle
-import paddle.fluid as fluid
-
-paddle.enable_static()
-
-
-@unittest.skipIf(not paddle.is_compiled_with_ipu(),
-                 "core is not compiled with IPU")
-class TestIpuShard(unittest.TestCase):
-    def _test(self):
-        # build graph
-        a = paddle.static.data(name='data', shape=[None, 1], dtype='int32')
-        b = a + 2  # scale : scale * x + bias, ipu_index : no
-
-        with paddle.fluid.ipu_shard(ipu_index=1):
-            c = b + 1  # scale, ipu_index : 1
-            with paddle.fluid.ipu_shard(ipu_index=2):
-                d = c * 2  # scale, ipu_index : 2
-            with paddle.fluid.ipu_shard(ipu_index=3):
-                e = d + 3  # scale, ipu_index : 3
-                with paddle.fluid.ipu_shard(ipu_index=1):
-                    e = e + 3  # scale, ipu_index : 1
-                    with paddle.fluid.ipu_shard(ipu_index=2):
-                        e = e + 3  # scale, ipu_index : 2
-
-        with paddle.fluid.ipu_shard(ipu_index=1):
-            f = paddle.tensor.pow(e, 2.0)  # pow, ipu_index : 1
-
-        with paddle.fluid.ipu_shard(ipu_index=2):
-            g = f - 1  # scale, ipu_index : 2
-
-        h = g + 1  # scale, ipu_index : no
-
-        ipu_index_list = []
-        main_prog = paddle.static.default_main_program()
-        for op in main_prog.global_block().ops:
-            if op.desc.has_attr("ipu_index"):
-                ipu_index_list.append(op.desc.attr("ipu_index"))
-
-        return ipu_index_list
-
-    def test_ipu_shard(self):
-        ipu_index_list = self._test()
-        expected_ipu_index_list = [1, 2, 3, 1, 2, 1, 2]
-        self.assertTrue(
-            np.allclose(
-                ipu_index_list, expected_ipu_index_list, atol=0))
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_ipu_shard_api_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_ipu_shard_api_ipu.py
new file mode 100644
index 0000000000000000000000000000000000000000..026b19eccf18721ba7be062d3fd4516deefb2aa0
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/ipu/test_ipu_shard_api_ipu.py
@@ -0,0 +1,112 @@
+#  Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import numpy as np
+import unittest
+import paddle
+
+paddle.enable_static()
+
+
+@unittest.skipIf(not paddle.is_compiled_with_ipu(),
+                 "core is not compiled with IPU")
+class TestIpuShard(unittest.TestCase):
+    def _test(self):
+        # build graph
+        a = paddle.static.data(name='data', shape=[None, 1], dtype='int32')
+        b = a + 2  # scale : scale * x + bias, ipu_index : no
+
+        with paddle.static.ipu_shard_guard(index=1):
+            c = b + 1  # scale, ipu_index : 1
+            with paddle.static.ipu_shard_guard(index=2):
+                d = c * 2  # scale, ipu_index : 2
+            with paddle.static.ipu_shard_guard(index=3):
+                e = d + 3  # scale, ipu_index : 3
+                with paddle.static.ipu_shard_guard(index=1):
+                    e = e + 3  # scale, ipu_index : 1
+                    with paddle.static.ipu_shard_guard(index=2):
+                        e = e + 3  # scale, ipu_index : 2
+
+        with paddle.static.ipu_shard_guard(index=1):
+            f = paddle.tensor.pow(e, 2.0)  # pow, ipu_index : 1
+
+        with paddle.static.ipu_shard_guard(index=2):
+            g = f - 1  # scale, ipu_index : 2
+
+        h = g + 1  # scale, ipu_index : no
+
+        ipu_index_list = []
+        main_prog = paddle.static.default_main_program()
+        for op in main_prog.global_block().ops:
+            if op.desc.has_attr("ipu_index"):
+                ipu_index_list.append(op.desc.attr("ipu_index"))
+
+        return ipu_index_list
+
+    def test_ipu_shard(self):
+        ipu_index_list = self._test()
+        expected_ipu_index_list = [1, 2, 3, 1, 2, 1, 2]
+        self.assertTrue(
+            np.allclose(
+                ipu_index_list, expected_ipu_index_list, atol=0))
+
+
+@unittest.skipIf(not paddle.is_compiled_with_ipu(),
+                 "core is not compiled with IPU")
+class TestIpuPipeline(unittest.TestCase):
+    def _test(self):
+        # build graph
+        a = paddle.static.data(name='data', shape=[None, 1], dtype='int32')
+        b = a + 2  # scale : scale * x + bias, ipu_stage : no
+
+        with paddle.static.ipu_shard_guard(stage=1):
+            c = b + 1  # scale, ipu_stage : 1
+            with paddle.static.ipu_shard_guard(stage=2):
+                d = c * 2  # scale, ipu_stage : 2
+            with paddle.static.ipu_shard_guard(stage=3):
+                e = d + 3  # scale, ipu_stage : 3
+                with paddle.static.ipu_shard_guard(stage=1):
+                    e = e + 3  # scale, ipu_stage : 1
+                    with paddle.static.ipu_shard_guard(stage=2):
+                        e = e + 3  # scale, ipu_stage : 2
+
+        with paddle.static.ipu_shard_guard(stage=1):
+            f = paddle.tensor.pow(e, 2.0)  # pow, ipu_stage : 1
+
+        with paddle.static.ipu_shard_guard(stage=2):
+            g = f - 1  # scale, ipu_stage : 2
+
+        h = g + 1  # scale, ipu_stage : no
+
+        ipu_index_list = []
+        main_prog = paddle.static.default_main_program()
+        for op in main_prog.global_block().ops:
+            if op.desc.has_attr("ipu_stage"):
+                ipu_index_list.append(op.desc.attr("ipu_stage"))
+
+        return ipu_index_list
+
+    def test_ipu_shard(self):
+        ipu_index_list = self._test()
+        expected_ipu_index_list = [1, 2, 3, 1, 2, 1, 2]
+
+        self.assertTrue(
+            np.allclose(
+                ipu_index_list, expected_ipu_index_list, atol=0))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_ipu_strategy.py b/python/paddle/fluid/tests/unittests/ipu/test_ipu_strategy.py
deleted file mode 100644
index afeec9ee1b6fa75961aa76bd2f2c2f6701e200b5..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/ipu/test_ipu_strategy.py
+++ /dev/null
@@ -1,56 +0,0 @@
-#  Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import numpy as np
-import unittest
-import sys
-import paddle
-import paddle.fluid as fluid
-import paddle.fluid.compiler as compiler
-
-paddle.enable_static()
-SEED = 2021
-
-
-@unittest.skipIf(not paddle.is_compiled_with_ipu(),
-                 "core is not compiled with IPU")
-class TestConvNet(unittest.TestCase):
-    def test_training(self):
-        ipu_strategy = paddle.static.IpuStrategy()
-
-        assert ipu_strategy.num_ipus == 1, "Default num_ipus must be 1"
-        assert ipu_strategy.is_training == True, "Default is_training is True"
-        assert ipu_strategy.enable_pipelining == False, \
-            "Default enable_pipelining is False"
-        assert ipu_strategy.enable_manual_shard == False, \
-            "Default enable_manual_shard is False"
-
-        ipu_strategy.SetGraphConfig(
-            num_ipus=2, is_training=False, enable_manual_shard=True)
-        ipu_strategy.SetPipeliningConfig(enable_pipelining=True)
-        assert ipu_strategy.num_ipus == 2, "Set num_ipus Failed"
-
-        assert ipu_strategy.is_training == False, "Set is_training Failed"
-
-        assert ipu_strategy.enable_pipelining == True, \
-            "Set enable_pipelining Failed"
-
-        assert ipu_strategy.enable_manual_shard == True, \
-            "Set enable_manual_shard Failed"
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_ipu_strategy_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_ipu_strategy_ipu.py
new file mode 100644
index 0000000000000000000000000000000000000000..f120f5594914e8e555bd0b024fa55224e2781f9b
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/ipu/test_ipu_strategy_ipu.py
@@ -0,0 +1,72 @@
+#  Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import paddle
+import paddle.static
+
+paddle.enable_static()
+
+
+@unittest.skipIf(not paddle.is_compiled_with_ipu(),
+                 "core is not compiled with IPU")
+class TestIpuStrategy(unittest.TestCase):
+    def test_set_options(self):
+        ipu_strategy = paddle.static.IpuStrategy()
+        all_option_names = ipu_strategy._ipu_strategy.get_all_option_names()
+        for option_name in all_option_names:
+            option = ipu_strategy._ipu_strategy.get_option(option_name)
+            option_type = option['type']
+            option_value = option['value']
+            if option_type in ['double']:
+                set_value = option_value + 0.5
+            elif option_type == 'uint64':
+                set_value = option_value + 1
+            elif option_type == 'bool':
+                set_value = not option_value
+            else:
+                continue
+            ipu_strategy.set_options({option_name: set_value})
+            new_value = ipu_strategy.get_option(option_name)
+            assert new_value == set_value, f"set {option_name} to {set_value} failed"
+
+    def test_set_string_options(self):
+        ipu_strategy = paddle.static.IpuStrategy()
+        options = {
+            'cache_path': 'paddle_cache',
+            'log_dir': 'paddle_log',
+            'partials_type_matmuls': 'half',
+            'partials_type_matmuls': 'float',
+        }
+        ipu_strategy.set_options(options)
+        for k, v in options.items():
+            assert v == ipu_strategy.get_option(k), f"set {k} to {v} failed "
+
+    def test_set_other_options(self):
+        ipu_strategy = paddle.static.IpuStrategy()
+        options = {}
+        options['dot_checks'] = ['0', '1', '2', '3']
+        options['engine_options'] = {
+            'debug.allowOutOfMemory': 'true',
+            'autoReport.directory': 'path',
+            'autoReport.all': 'true'
+        }
+        for k, v in options.items():
+            ipu_strategy.set_options({k: v})
+            assert v == ipu_strategy.get_option(k), f"set {k} to {v} failed "
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_layernorm_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_layernorm_op_ipu.py
index 196f94b68f94a08f1b08871c805a9d7ceee14ffa..a52946bba1567b168ff1bbccbefd7bc724241eba 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_layernorm_op_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_layernorm_op_ipu.py
@@ -16,14 +16,8 @@ import unittest
 
 import numpy as np
 import paddle
-import paddle.fluid as fluid
-import paddle.fluid.compiler as compiler
-import paddle.optimizer
 import paddle.static
-from paddle.fluid.tests.unittests.ipu.op_test_ipu import (IPUOpTest,
-                                                          np_dtype_to_fluid_str)
-
-paddle.enable_static()
+from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode
 
 
 @unittest.skipIf(not paddle.is_compiled_with_ipu(),
@@ -32,44 +26,52 @@ class TestBase(IPUOpTest):
     def setUp(self):
         self.set_atol()
         self.set_training()
-        self.set_feed()
+        self.set_data_feed()
         self.set_feed_attr()
-        self.set_attrs()
+        self.set_op_attrs()
 
-    def set_feed(self):
-        self.feed = {
-            "x": np.random.uniform(size=[1, 3, 10, 10]).astype('float32'),
-        }
+    @property
+    def fp16_enabled(self):
+        return True
+
+    def set_atol(self):
+        self.atol = 1e-6
+        self.rtol = 1e-5
+        self.atol_fp16 = 1e-2
+        self.rtol_fp16 = 1e-3
+
+    def set_data_feed(self):
+        x = np.random.uniform(size=[1, 3, 10, 10])
+        self.feed_fp32 = {"x": x.astype(np.float32)}
+        self.feed_fp16 = {"x": x.astype(np.float16)}
 
     def set_feed_attr(self):
-        self.feed_shape = [x.shape for x in self.feed.values()]
-        self.feed_list = list(self.feed.keys())
-        self.feed_dtype = [
-            np_dtype_to_fluid_str(x.dtype) for x in self.feed.values()
-        ]
+        self.feed_shape = [x.shape for x in self.feed_fp32.values()]
+        self.feed_list = list(self.feed_fp32.keys())
+        self.feed_dtype = [x.dtype for x in self.feed_fp32.values()]
 
-    def set_attrs(self):
+    def set_op_attrs(self):
         self.attrs = {
             "scale": True,
             "shift": True,
             "begin_norm_axis": 1,
             "epsilon": 1e-05,
         }
+        self.optimizer = None
 
-    def _test_base(self, run_ipu=True):
-        scope = fluid.core.Scope()
+    def _test_base(self, exec_mode):
+        scope = paddle.static.Scope()
         main_prog = paddle.static.Program()
         startup_prog = paddle.static.Program()
-        SEED = self.SEED
-        main_prog.random_seed = SEED
-        startup_prog.random_seed = SEED
+        main_prog.random_seed = self.SEED
+        startup_prog.random_seed = self.SEED
 
-        with fluid.scope_guard(scope):
+        with paddle.static.scope_guard(scope):
             with paddle.static.program_guard(main_prog, startup_prog):
                 x = paddle.static.data(
                     name=self.feed_list[0],
                     shape=self.feed_shape[0],
-                    dtype=self.feed_dtype[0])
+                    dtype='float32')
 
                 if self.is_training:
                     ch = self.feed_shape[0][1]
@@ -80,33 +82,38 @@ class TestBase(IPUOpTest):
                     out = paddle.fluid.layers.nn.layer_norm(
                         conv1, param_attr=scale, bias_attr=bias, **self.attrs)
                 else:
-                    # scale = True
-                    # bias = True
                     scale = self.attrs['scale']
                     bias = self.attrs['shift']
                     out = paddle.fluid.layers.nn.layer_norm(
                         x, param_attr=scale, bias_attr=bias, **self.attrs)
+                loss = paddle.mean(out)
 
-                if self.is_training:
-                    loss = paddle.mean(out)
-                    adam = paddle.optimizer.Adam(learning_rate=1e-2)
-                    adam.minimize(loss)
-                    fetch_list = [loss.name]
-                else:
-                    fetch_list = [out.name]
+                fetch_list = [loss.name]
 
-            if run_ipu:
+                if self.is_training:
+                    optimizer = None
+                    if self.optimizer == 'sgd':
+                        optimizer = paddle.optimizer.SGD(learning_rate=1e-2)
+                    elif self.optimizer == 'adam':
+                        optimizer = paddle.optimizer.Adam(learning_rate=1e-2)
+                    elif self.optimizer == 'lamb':
+                        optimizer = paddle.optimizer.Lamb(
+                            learning_rate=1e-2, lamb_weight_decay=0.0)
+                    if optimizer is not None:
+                        optimizer.minimize(loss)
+
+            if exec_mode:
                 place = paddle.IPUPlace()
             else:
                 place = paddle.CPUPlace()
             exe = paddle.static.Executor(place)
             exe.run(startup_prog)
 
-            if run_ipu:
+            if exec_mode:
                 feed_list = self.feed_list
                 ipu_strategy = paddle.static.IpuStrategy()
-                ipu_strategy.SetGraphConfig(is_training=self.is_training)
-                program = compiler.IPUCompiledProgram(
+                ipu_strategy.set_graph_config(is_training=self.is_training)
+                program = paddle.static.IpuCompiledProgram(
                     main_prog,
                     ipu_strategy=ipu_strategy).compile(feed_list, fetch_list)
             else:
@@ -116,12 +123,14 @@ class TestBase(IPUOpTest):
                 result = []
                 for _ in range(self.epoch):
                     loss_res = exe.run(program,
-                                       feed=self.feed,
+                                       feed=self.feed_fp32,
                                        fetch_list=fetch_list)
                     result.append(loss_res[0])
                 return np.array(result)
             else:
-                result = exe.run(program, feed=self.feed, fetch_list=fetch_list)
+                result = exe.run(program,
+                                 feed=self.feed_fp32,
+                                 fetch_list=fetch_list)
                 return result[0]
 
     def test_base(self):
@@ -137,7 +146,7 @@ class TestBase(IPUOpTest):
 
 @unittest.skip('raise error')
 class TestCase1(TestBase):
-    def set_attrs(self):
+    def set_op_attrs(self):
         self.attrs = {
             "scale": False,
             "shift": True,
@@ -148,7 +157,7 @@ class TestCase1(TestBase):
 
 @unittest.skip('raise error')
 class TestCase2(TestBase):
-    def set_attrs(self):
+    def set_op_attrs(self):
         self.attrs = {
             "scale": True,
             "shift": False,
@@ -158,18 +167,28 @@ class TestCase2(TestBase):
 
 
 class TestCase3(TestBase):
-    def set_attrs(self):
+    def set_op_attrs(self):
         self.attrs = {
             "scale": True,
             "shift": True,
             "begin_norm_axis": 2,
             "epsilon": 1e-05,
         }
+        self.optimizer = None
 
 
 class TestTrainCase1(TestBase):
+    def set_op_attrs(self):
+        self.attrs = {
+            "scale": True,
+            "shift": True,
+            "begin_norm_axis": 1,
+            "epsilon": 1e-05
+        }
+        self.optimizer = 'sgd'
+
     def set_atol(self):
-        self.atol = 1e-3
+        self.atol = 1e-6
 
     def set_training(self):
         self.is_training = True
@@ -178,15 +197,34 @@ class TestTrainCase1(TestBase):
 
 class TestTrainCase2(TestBase):
     def set_atol(self):
-        self.atol = 1e-3
+        self.atol = 5e-4
 
-    def set_attrs(self):
+    def set_op_attrs(self):
         self.attrs = {
             "scale": True,
             "shift": True,
             "begin_norm_axis": 2,
-            "epsilon": 1e-05,
+            "epsilon": 1e-05
+        }
+        self.optimizer = 'adam'
+
+    def set_training(self):
+        self.is_training = True
+        self.epoch = 10
+
+
+class TestTrainCase3(TestBase):
+    def set_atol(self):
+        self.atol = 5e-3
+
+    def set_op_attrs(self):
+        self.attrs = {
+            "scale": True,
+            "shift": True,
+            "begin_norm_axis": 2,
+            "epsilon": 1e-05
         }
+        self.optimizer = 'lamb'
 
     def set_training(self):
         self.is_training = True
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_log_softmax_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_log_softmax_op_ipu.py
index dc3cab6ac5e114f7083937687c1dfedf0ebd1c44..fad7516e442a728be554cd5f5059a5229c357014 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_log_softmax_op_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_log_softmax_op_ipu.py
@@ -16,15 +16,9 @@ import unittest
 
 import numpy as np
 import paddle
-import paddle.fluid as fluid
-import paddle.fluid.compiler as compiler
-import paddle.optimizer
-import paddle.static
-from paddle.fluid.tests.unittests.ipu.op_test_ipu import (IPUOpTest,
-                                                          np_dtype_to_fluid_str)
 import paddle.nn.functional as F
-
-paddle.enable_static()
+import paddle.static
+from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode
 
 
 @unittest.skipIf(not paddle.is_compiled_with_ipu(),
@@ -33,72 +27,81 @@ class TestBase(IPUOpTest):
     def setUp(self):
         self.set_atol()
         self.set_training()
-        self.set_feed()
+        self.set_data_feed()
         self.set_feed_attr()
-        self.set_attrs()
+        self.set_op_attrs()
+
+    @property
+    def fp16_enabled(self):
+        return True
 
-    def set_feed(self):
-        self.feed = {
-            "x": np.random.uniform(size=[1, 3, 10, 10]).astype('float32')
-        }
+    def set_data_feed(self):
+        data = np.random.uniform(size=[1, 3, 10, 10])
+        self.feed_fp32 = {'in_0': data.astype(np.float32)}
+        self.feed_fp16 = {'in_0': data.astype(np.float16)}
+        self.feed_list = list(self.feed_fp32.keys())
 
     def set_feed_attr(self):
-        self.feed_shape = [x.shape for x in self.feed.values()]
-        self.feed_list = list(self.feed.keys())
-        self.feed_dtype = [
-            np_dtype_to_fluid_str(x.dtype) for x in self.feed.values()
-        ]
+        self.feed_shape = [x.shape for x in self.feed_fp32.values()]
+        self.feed_list = list(self.feed_fp32.keys())
+        self.feed_dtype = [x.dtype for x in self.feed_fp32.values()]
 
-    def set_attrs(self):
+    def set_op_attrs(self):
         self.attrs = {"axis": -1}
 
-    def _test_base(self, run_ipu=True):
-        scope = fluid.core.Scope()
+    def _test_base(self, exec_mode):
+        scope = paddle.static.Scope()
         main_prog = paddle.static.Program()
         startup_prog = paddle.static.Program()
-        SEED = self.SEED
-        main_prog.random_seed = SEED
-        startup_prog.random_seed = SEED
+        main_prog.random_seed = self.SEED
+        startup_prog.random_seed = self.SEED
 
-        with fluid.scope_guard(scope):
+        with paddle.static.scope_guard(scope):
             with paddle.static.program_guard(main_prog, startup_prog):
                 x = paddle.static.data(
                     name=self.feed_list[0],
                     shape=self.feed_shape[0],
-                    dtype=self.feed_dtype[0])
+                    dtype='float32')
+
                 out = F.log_softmax(x, **self.attrs)
 
                 fetch_list = [out.name]
 
-            if run_ipu:
-                place = paddle.IPUPlace()
-            else:
+            if exec_mode == ExecutionMode.CPU_FP32:
                 place = paddle.CPUPlace()
+            else:
+                place = paddle.IPUPlace()
+
             exe = paddle.static.Executor(place)
             exe.run(startup_prog)
 
-            if run_ipu:
+            if exec_mode != ExecutionMode.CPU_FP32:
                 feed_list = self.feed_list
                 ipu_strategy = paddle.static.IpuStrategy()
-                ipu_strategy.SetGraphConfig(is_training=self.is_training)
-                program = compiler.IPUCompiledProgram(
+                ipu_strategy.set_graph_config(is_training=self.is_training)
+                if exec_mode == ExecutionMode.IPU_POPART_FP16:
+                    ipu_strategy.set_precision_config(enable_fp16=True)
+                program = paddle.static.IpuCompiledProgram(
                     main_prog,
                     ipu_strategy=ipu_strategy).compile(feed_list, fetch_list)
             else:
                 program = main_prog
 
-            result = exe.run(program, feed=self.feed, fetch_list=fetch_list)
-            return result[0]
+            feed = self.feed_fp32
+            if exec_mode > ExecutionMode.IPU_FP32:
+                feed = self.feed_fp16
 
-    def test_base(self):
-        res0 = self._test_base(False)
-        res1 = self._test_base(True)
+            result = exe.run(program, feed=feed, fetch_list=fetch_list)
+            return result[0]
 
-        self.assertTrue(
-            np.allclose(
-                res0.flatten(), res1.flatten(), atol=self.atol))
+    def test(self):
+        output_dict = {}
+        for mode in ExecutionMode:
+            if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled:
+                break
+            output_dict[mode] = self._test_base(mode).flatten()
 
-        self.assertTrue(res0.shape == res1.shape)
+        self.check(output_dict)
 
 
 class TestCase1(TestBase):
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_logical_not_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_logical_not_op_ipu.py
new file mode 100644
index 0000000000000000000000000000000000000000..3f8472890d03e69042d8e8d6db33417c9a80dea3
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/ipu/test_logical_not_op_ipu.py
@@ -0,0 +1,97 @@
+#  Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+import paddle
+import paddle.static
+from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode
+
+
+@unittest.skipIf(not paddle.is_compiled_with_ipu(),
+                 "core is not compiled with IPU")
+class TestBase(IPUOpTest):
+    def setUp(self):
+        self.set_atol()
+        self.set_training()
+        self.set_data_feed()
+        self.set_feed_attr()
+
+    @property
+    def fp16_enabled(self):
+        return True
+
+    def set_data_feed(self):
+        data = np.random.uniform(size=[2, 20, 30528])
+        self.feed = {"in_0": data.astype('bool')}
+
+    def set_feed_attr(self):
+        self.feed_shape = [x.shape for x in self.feed.values()]
+        self.feed_list = list(self.feed.keys())
+        self.feed_dtype = [x.dtype for x in self.feed.values()]
+
+    def _test_base(self, exec_mode):
+        scope = paddle.static.Scope()
+        main_prog = paddle.static.Program()
+        startup_prog = paddle.static.Program()
+        main_prog.random_seed = self.SEED
+        startup_prog.random_seed = self.SEED
+
+        with paddle.static.scope_guard(scope):
+            with paddle.static.program_guard(main_prog, startup_prog):
+                x = paddle.static.data(
+                    name=self.feed_list[0],
+                    shape=self.feed_shape[0],
+                    dtype="bool")
+
+                out = paddle.fluid.layers.logical_not(x)
+
+            fetch_list = [out.name]
+
+            if exec_mode == ExecutionMode.CPU_FP32:
+                place = paddle.CPUPlace()
+            else:
+                place = paddle.IPUPlace()
+
+            exe = paddle.static.Executor(place)
+            exe.run(startup_prog)
+
+            if exec_mode != ExecutionMode.CPU_FP32:
+                feed_list = self.feed_list
+                ipu_strategy = paddle.static.IpuStrategy()
+                ipu_strategy.set_graph_config(is_training=self.is_training)
+                if exec_mode == ExecutionMode.IPU_POPART_FP16:
+                    ipu_strategy.set_precision_config(enable_fp16=True)
+                program = paddle.static.IpuCompiledProgram(
+                    main_prog,
+                    ipu_strategy=ipu_strategy).compile(feed_list, fetch_list)
+            else:
+                program = main_prog
+
+            result = exe.run(program, feed=self.feed, fetch_list=fetch_list)
+            return result[0]
+
+    def test_base(self):
+        output_dict = {}
+        for mode in ExecutionMode:
+            if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled:
+                break
+            output_dict[mode] = self._test_base(mode).astype(np.int32)
+
+        self.check(output_dict, check_shape=True)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_lookuptable_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_lookuptable_op_ipu.py
index 31b0c99603c3f707328eef5a3713bcd9b882b948..4a877ddce4e3ce1af9720f954b46bd414783fb03 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_lookuptable_op_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_lookuptable_op_ipu.py
@@ -16,14 +16,8 @@ import unittest
 
 import numpy as np
 import paddle
-import paddle.fluid as fluid
-import paddle.fluid.compiler as compiler
-import paddle.optimizer
 import paddle.static
-from paddle.fluid.tests.unittests.ipu.op_test_ipu import (IPUOpTest,
-                                                          np_dtype_to_fluid_str)
-
-paddle.enable_static()
+from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode
 
 
 @unittest.skipIf(not paddle.is_compiled_with_ipu(),
@@ -32,16 +26,25 @@ class TestBase(IPUOpTest):
     def setUp(self):
         self.set_atol()
         self.set_training()
-        self.set_attrs()
+        self.set_data_feed()
+        self.set_feed_attr()
+        self.set_op_attrs()
+
+    @property
+    def fp16_enabled(self):
+        return True
+
+    def set_data_feed(self):
+        data = np.array([[[1], [3]], [[2], [4]], [[4], [127]]])
+        self.feed_cpu = {"x": data.astype(np.int64)}
+        self.feed_ipu = {"x": data.astype(np.int32)}
 
     def set_feed_attr(self):
-        self.feed_shape = [x.shape for x in self.feed.values()]
-        self.feed_list = list(self.feed.keys())
-        self.feed_dtype = [
-            np_dtype_to_fluid_str(x.dtype) for x in self.feed.values()
-        ]
+        self.feed_shape = [x.shape for x in self.feed_cpu.values()]
+        self.feed_list = list(self.feed_cpu.keys())
+        self.feed_dtype = [x.dtype for x in self.feed_cpu.values()]
 
-    def set_attrs(self):
+    def set_op_attrs(self):
         self.attrs = {
             "size": [128, 16],
             "is_sparse": False,
@@ -50,33 +53,20 @@ class TestBase(IPUOpTest):
             "dtype": 'float32'
         }
 
-    def _test_base(self, run_ipu=True):
-        scope = fluid.core.Scope()
+    def _test_base(self, exec_mode):
+        scope = paddle.static.Scope()
         main_prog = paddle.static.Program()
         startup_prog = paddle.static.Program()
-        SEED = self.SEED
-        main_prog.random_seed = SEED
-        startup_prog.random_seed = SEED
-
-        if run_ipu:
-            self.feed = {
-                "x": np.array(
-                    [[[1], [3]], [[2], [4]], [[4], [127]]]).astype(np.int32)
-            }
-        else:
-            self.feed = {
-                "x": np.array(
-                    [[[1], [3]], [[2], [4]], [[4], [127]]]).astype(np.int64)
-            }
+        main_prog.random_seed = self.SEED
+        startup_prog.random_seed = self.SEED
 
-        self.set_feed_attr()
-
-        with fluid.scope_guard(scope):
+        with paddle.static.scope_guard(scope):
             with paddle.static.program_guard(main_prog, startup_prog):
                 x = paddle.static.data(
                     name=self.feed_list[0],
                     shape=self.feed_shape[0],
-                    dtype=self.feed_dtype[0])
+                    dtype='int64')
+
                 out = paddle.fluid.layers.embedding(x, **self.attrs)
 
                 if self.is_training:
@@ -87,47 +77,61 @@ class TestBase(IPUOpTest):
                 else:
                     fetch_list = [out.name]
 
-            if run_ipu:
-                place = paddle.IPUPlace()
-            else:
+            if exec_mode == ExecutionMode.CPU_FP32:
                 place = paddle.CPUPlace()
+            else:
+                place = paddle.IPUPlace()
+
             exe = paddle.static.Executor(place)
             exe.run(startup_prog)
 
-            if run_ipu:
+            if exec_mode != ExecutionMode.CPU_FP32:
                 feed_list = self.feed_list
                 ipu_strategy = paddle.static.IpuStrategy()
-                ipu_strategy.SetGraphConfig(is_training=self.is_training)
-                program = compiler.IPUCompiledProgram(
+                ipu_strategy.set_graph_config(is_training=self.is_training)
+                if exec_mode == ExecutionMode.IPU_POPART_FP16:
+                    ipu_strategy.set_precision_config(enable_fp16=True)
+                program = paddle.static.IpuCompiledProgram(
                     main_prog,
                     ipu_strategy=ipu_strategy).compile(feed_list, fetch_list)
             else:
                 program = main_prog
 
+            feed = self.feed_cpu
+            if exec_mode > ExecutionMode.CPU_FP32:
+                feed = self.feed_ipu
+
             if self.is_training:
                 result = []
                 for _ in range(self.epoch):
                     loss_res = exe.run(program,
-                                       feed=self.feed,
+                                       feed=feed,
                                        fetch_list=fetch_list)
                     result.append(loss_res[0])
                 return np.array(result)
             else:
-                result = exe.run(program, feed=self.feed, fetch_list=fetch_list)
+                result = exe.run(program, feed=feed, fetch_list=fetch_list)
                 return result[0]
 
-    def test_base(self):
-        res0 = self._test_base(False)
-        res1 = self._test_base(True)
+    def test(self):
+        output_dict = {}
+        for mode in ExecutionMode:
+            if mode > ExecutionMode.IPU_FP32 and (not self.fp16_enabled or
+                                                  self.is_training):
+                break
 
-        self.assertTrue(
-            np.allclose(
-                res0.flatten(), res1.flatten(), atol=self.atol))
+            output_dict[mode] = self._test_base(mode).flatten()
 
-        self.assertTrue(res0.shape == res1.shape)
+        self.check(output_dict)
 
 
 class TestTrainCase1(TestBase):
+    def set_atol(self):
+        self.atol = 1e-7
+        self.rtol = 1e-6
+        self.atol_fp16 = 1e-3
+        self.rtol_fp16 = 1e-3
+
     def set_training(self):
         self.is_training = True
         self.epoch = 10
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_lookuptable_v2_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_lookuptable_v2_op_ipu.py
new file mode 100644
index 0000000000000000000000000000000000000000..da8048fb3205e5f1c8c4c0a4e32fb5f0ff264554
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/ipu/test_lookuptable_v2_op_ipu.py
@@ -0,0 +1,141 @@
+#  Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+import paddle
+import paddle.static
+from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode
+
+
+@unittest.skipIf(not paddle.is_compiled_with_ipu(),
+                 "core is not compiled with IPU")
+class TestBase(IPUOpTest):
+    def setUp(self):
+        self.set_atol()
+        self.set_training()
+        self.set_data_feed()
+        self.set_feed_attr()
+        self.set_op_attrs()
+
+    @property
+    def fp16_enabled(self):
+        return True
+
+    def set_data_feed(self):
+        x = np.array([[[1], [3]], [[2], [4]], [[4], [127]]])
+        self.feed_cpu = {"x": x.astype(np.int64)}
+        self.feed_ipu = {"x": x.astype(np.int32)}
+
+    def set_feed_attr(self):
+        self.feed_shape = [x.shape for x in self.feed_cpu.values()]
+        self.feed_list = list(self.feed_cpu.keys())
+        self.feed_dtype = [x.dtype for x in self.feed_cpu.values()]
+
+    def set_op_attrs(self):
+        self.attrs = {
+            "num_embeddings": 128,
+            "embedding_dim": 16,
+            "sparse": False,
+            "padding_idx": -1,
+            "weight_attr": None
+        }
+
+    def _test_base(self, exec_mode):
+        scope = paddle.static.Scope()
+        main_prog = paddle.static.Program()
+        startup_prog = paddle.static.Program()
+        main_prog.random_seed = self.SEED
+        startup_prog.random_seed = self.SEED
+
+        with paddle.static.scope_guard(scope):
+            with paddle.static.program_guard(main_prog, startup_prog):
+                x = paddle.static.data(
+                    name=self.feed_list[0],
+                    shape=self.feed_shape[0],
+                    dtype='int64')
+
+                embedding = paddle.nn.Embedding(**self.attrs)
+                out = embedding(x)
+
+                if self.is_training:
+                    loss = paddle.mean(out)
+                    adam = paddle.optimizer.Adam(learning_rate=1e-2)
+                    adam.minimize(loss)
+                    fetch_list = [loss.name]
+                else:
+                    fetch_list = [out.name]
+
+            if exec_mode == ExecutionMode.CPU_FP32:
+                place = paddle.CPUPlace()
+            else:
+                place = paddle.IPUPlace()
+
+            exe = paddle.static.Executor(place)
+            exe.run(startup_prog)
+
+            if exec_mode != ExecutionMode.CPU_FP32:
+                feed_list = self.feed_list
+                ipu_strategy = paddle.static.IpuStrategy()
+                ipu_strategy.set_graph_config(is_training=self.is_training)
+                if exec_mode == ExecutionMode.IPU_POPART_FP16:
+                    ipu_strategy.set_precision_config(enable_fp16=True)
+                program = paddle.static.IpuCompiledProgram(
+                    main_prog,
+                    ipu_strategy=ipu_strategy).compile(feed_list, fetch_list)
+            else:
+                program = main_prog
+
+            feed = self.feed_cpu
+            if exec_mode > ExecutionMode.CPU_FP32:
+                feed = self.feed_ipu
+
+            if self.is_training:
+                result = []
+                for _ in range(self.epoch):
+                    loss_res = exe.run(program,
+                                       feed=feed,
+                                       fetch_list=fetch_list)
+                    result.append(loss_res[0])
+                return np.array(result)
+            else:
+                result = exe.run(program, feed=feed, fetch_list=fetch_list)
+                return result[0]
+
+    def test(self):
+        output_dict = {}
+        for mode in ExecutionMode:
+            if mode > ExecutionMode.IPU_FP32 and (not self.fp16_enabled or
+                                                  self.is_training):
+                break
+            output_dict[mode] = self._test_base(mode).flatten()
+
+        self.check(output_dict)
+
+
+class TestTrainCase1(TestBase):
+    def set_atol(self):
+        self.atol = 1e-7
+        self.rtol = 1e-6
+        self.atol_fp16 = 1e-3
+        self.rtol_fp16 = 1e-3
+
+    def set_training(self):
+        self.is_training = True
+        self.epoch = 10
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_lr_sheduelr.py b/python/paddle/fluid/tests/unittests/ipu/test_lr_sheduler_ipu.py
similarity index 95%
rename from python/paddle/fluid/tests/unittests/ipu/test_lr_sheduelr.py
rename to python/paddle/fluid/tests/unittests/ipu/test_lr_sheduler_ipu.py
index 38b91785aeec8c061a5d4f6203363645569c2824..58f018e2ae649accd27f7c139bc4eb78d4d70d9a 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_lr_sheduelr.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_lr_sheduler_ipu.py
@@ -19,7 +19,7 @@ import unittest
 import sys
 import paddle
 import paddle.fluid as fluid
-import paddle.fluid.compiler as compiler
+import paddle.static
 from paddle.optimizer.lr import LRScheduler
 
 paddle.enable_static()
@@ -71,8 +71,8 @@ class TestConvNet(unittest.TestCase):
                 feed_list = [image.name]
                 fetch_list = [loss.name]
                 ipu_strategy = paddle.static.IpuStrategy()
-                ipu_strategy.SetGraphConfig(is_training=True)
-                program = compiler.IPUCompiledProgram(
+                ipu_strategy.set_graph_config(is_training=True)
+                program = paddle.static.IpuCompiledProgram(
                     main_prog, ipu_strategy=ipu_strategy).compile(feed_list,
                                                                   fetch_list)
             else:
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_matmul_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_matmul_op_ipu.py
index c6702b92ab969ec7b1b97c44bc9245c6df41d83a..6929ded6ebf90ea9081c12ed9af97cb6d7630cab 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_matmul_op_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_matmul_op_ipu.py
@@ -16,14 +16,8 @@ import unittest
 
 import numpy as np
 import paddle
-import paddle.fluid as fluid
-import paddle.fluid.compiler as compiler
-import paddle.optimizer
 import paddle.static
-from paddle.fluid.tests.unittests.ipu.op_test_ipu import (IPUOpTest,
-                                                          np_dtype_to_fluid_str)
-
-paddle.enable_static()
+from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode
 
 
 @unittest.skipIf(not paddle.is_compiled_with_ipu(),
@@ -32,85 +26,93 @@ class TestBase(IPUOpTest):
     def setUp(self):
         self.set_atol()
         self.set_training()
-        self.set_feed()
+        self.set_data_feed()
         self.set_feed_attr()
-        self.set_attrs()
+        self.set_op_attrs()
 
-    def set_feed(self):
-        self.feed = {
-            "x": np.random.uniform(size=[2, 3]).astype('float32'),
-            "y": np.random.uniform(size=[3, 2]).astype('float32'),
-        }
+    @property
+    def fp16_enabled(self):
+        return True
+
+    def set_data_feed(self):
+        x = np.random.uniform(size=[20, 30])
+        y = np.random.uniform(size=[30, 20])
+        self.feed_fp32 = {"x": x.astype(np.float32), "y": y.astype(np.float32)}
+        self.feed_fp16 = {"x": x.astype(np.float16), "y": y.astype(np.float16)}
 
     def set_feed_attr(self):
-        self.feed_shape = [x.shape for x in self.feed.values()]
-        self.feed_list = list(self.feed.keys())
-        self.feed_dtype = [
-            np_dtype_to_fluid_str(x.dtype) for x in self.feed.values()
-        ]
+        self.feed_shape = [x.shape for x in self.feed_fp32.values()]
+        self.feed_list = list(self.feed_fp32.keys())
+        self.feed_dtype = [x.dtype for x in self.feed_fp32.values()]
 
-    def set_attrs(self):
+    def set_op_attrs(self):
         self.attrs = {
             "transpose_x": False,
             "transpose_y": False,
             "alpha": 1.0,
         }
 
-    def _test_base(self, run_ipu=True):
-        scope = fluid.core.Scope()
+    def _test_base(self, exec_mode):
+        scope = paddle.static.Scope()
         main_prog = paddle.static.Program()
         startup_prog = paddle.static.Program()
-        SEED = self.SEED
-        main_prog.random_seed = SEED
-        startup_prog.random_seed = SEED
+        main_prog.random_seed = self.SEED
+        startup_prog.random_seed = self.SEED
 
-        with fluid.scope_guard(scope):
+        with paddle.static.scope_guard(scope):
             with paddle.static.program_guard(main_prog, startup_prog):
                 x = paddle.static.data(
                     name=self.feed_list[0],
                     shape=self.feed_shape[0],
-                    dtype=self.feed_dtype[0])
+                    dtype='float32')
                 y = paddle.static.data(
                     name=self.feed_list[1],
                     shape=self.feed_shape[1],
-                    dtype=self.feed_dtype[1])
+                    dtype='float32')
+
                 out = paddle.fluid.layers.matmul(x, y, **self.attrs)
 
-                fetch_list = [out.name]
+            fetch_list = [out.name]
 
-            if run_ipu:
-                place = paddle.IPUPlace()
-            else:
+            if exec_mode == ExecutionMode.CPU_FP32:
                 place = paddle.CPUPlace()
+            else:
+                place = paddle.IPUPlace()
+
             exe = paddle.static.Executor(place)
             exe.run(startup_prog)
 
-            if run_ipu:
+            if exec_mode != ExecutionMode.CPU_FP32:
                 feed_list = self.feed_list
                 ipu_strategy = paddle.static.IpuStrategy()
-                ipu_strategy.SetGraphConfig(is_training=self.is_training)
-                program = compiler.IPUCompiledProgram(
+                ipu_strategy.set_graph_config(is_training=self.is_training)
+                if exec_mode == ExecutionMode.IPU_POPART_FP16:
+                    ipu_strategy.set_precision_config(enable_fp16=True)
+                program = paddle.static.IpuCompiledProgram(
                     main_prog,
                     ipu_strategy=ipu_strategy).compile(feed_list, fetch_list)
             else:
                 program = main_prog
 
-            result = exe.run(program, feed=self.feed, fetch_list=fetch_list)
+            feed = self.feed_fp32
+            if exec_mode > ExecutionMode.IPU_FP32:
+                feed = self.feed_fp16
+
+            result = exe.run(program, feed=feed, fetch_list=fetch_list)
             return result[0]
 
     def test_base(self):
-        res0 = self._test_base(False)
-        res1 = self._test_base(True)
-
-        self.assertTrue(
-            np.allclose(
-                res0.flatten(), res1.flatten(), atol=self.atol))
+        output_dict = {}
+        for mode in ExecutionMode:
+            if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled:
+                break
+            output_dict[mode] = self._test_base(mode).flatten()
 
-        self.assertTrue(res0.shape == res1.shape)
+        self.check(output_dict)
 
 
 class TestCase1(TestBase):
-    def set_attrs(self):
+    def set_op_attrs(self):
         self.attrs = {
             "transpose_x": True,
             "transpose_y": True,
@@ -119,55 +121,64 @@ class TestCase1(TestBase):
 
 
 class TestCase2(TestBase):
-    def set_attrs(self):
+    def set_op_attrs(self):
         self.attrs = {
             "transpose_x": True,
             "transpose_y": True,
             "alpha": 3.14,
         }
 
+    def set_atol(self):
+        self.atol = 1e-10
+        self.rtol = 1e-6
+        self.atol_fp16 = 1e-2
+        self.rtol_fp16 = 1e-3
+
 
 class TestCase3(TestBase):
-    def set_feed(self):
-        self.feed = {
-            "x": np.random.uniform(size=[5, 4, 2, 3]).astype('float32'),
-            "y": np.random.uniform(size=[5, 4, 3, 2]).astype('float32'),
-        }
+    def set_data_feed(self):
+        x = np.random.uniform(size=[5, 4, 3, 2])
+        y = np.random.uniform(size=[5, 4, 2, 3])
+
+        self.feed_fp32 = {"x": x.astype(np.float32), "y": y.astype(np.float32)}
+        self.feed_fp16 = {"x": x.astype(np.float16), "y": y.astype(np.float16)}
 
 
 class TestCase4(TestBase):
-    def set_feed(self):
-        self.feed = {
-            "x": np.random.uniform(size=[4, 2, 3]).astype('float32'),
-            "y": np.random.uniform(size=[4, 3, 2]).astype('float32'),
-        }
+    def set_data_feed(self):
+        x = np.random.uniform(size=[4, 3, 2])
+        y = np.random.uniform(size=[4, 2, 3])
+
+        self.feed_fp32 = {"x": x.astype(np.float32), "y": y.astype(np.float32)}
+        self.feed_fp16 = {"x": x.astype(np.float16), "y": y.astype(np.float16)}
 
 
 class TestCase5(TestBase):
-    def set_feed(self):
-        self.feed = {
-            "x": np.random.uniform(size=[4, 2, 3]).astype('float32'),
-            "y": np.random.uniform(size=[3, 2]).astype('float32'),
-        }
+    def set_data_feed(self):
+        x = np.random.uniform(size=[4, 2, 3])
+        y = np.random.uniform(size=[3, 2])
+
+        self.feed_fp32 = {"x": x.astype(np.float32), "y": y.astype(np.float32)}
+        self.feed_fp16 = {"x": x.astype(np.float16), "y": y.astype(np.float16)}
 
 
 class TestCase6(TestBase):
-    def set_feed(self):
-        self.feed = {
-            "x": np.random.uniform(size=[3]).astype('float32'),
-            "y": np.random.uniform(size=[3]).astype('float32'),
-        }
+    def set_data_feed(self):
+        x = np.random.uniform(size=[3])
+
+        self.feed_fp32 = {"x": x.astype(np.float32), "y": x.astype(np.float32)}
+        self.feed_fp16 = {"x": x.astype(np.float16), "y": x.astype(np.float16)}
 
 
 @unittest.skip("not supported")
 class TestCase6_2(TestCase6):
-    def set_feed(self):
-        self.feed = {
-            "x": np.random.uniform(size=[3]).astype('float32'),
-            "y": np.random.uniform(size=[3]).astype('float32'),
-        }
+    def set_data_feed(self):
+        x = np.random.uniform(size=[3])
+
+        self.feed_fp32 = {"x": x.astype(np.float32), "y": x.astype(np.float32)}
+        self.feed_fp16 = {"x": x.astype(np.float16), "y": x.astype(np.float16)}
 
-    def set_attrs(self):
+    def set_op_attrs(self):
         self.attrs = {
             "transpose_x": True,
             "transpose_y": True,
@@ -176,27 +187,36 @@ class TestCase6_2(TestCase6):
 
 
 class TestCase7(TestBase):
-    def set_feed(self):
-        self.feed = {
-            "x": np.random.uniform(size=[3, 1]).astype('float32'),
-            "y": np.random.uniform(size=[1, 2]).astype('float32'),
-        }
+    def set_data_feed(self):
+        x = np.random.uniform(size=[1, 12, 128, 64])
+        y = np.random.uniform(size=[1, 12, 128, 64])
+
+        self.feed_fp32 = {"x": x.astype(np.float32), "y": y.astype(np.float32)}
+        self.feed_fp16 = {"x": x.astype(np.float16), "y": y.astype(np.float16)}
+
+    def set_op_attrs(self):
+        self.attrs = {"transpose_x": False, "transpose_y": True, "alpha": 0.125}
+
+
+class TestCase8(TestBase):
+    def set_data_feed(self):
+        x = np.random.uniform(size=[3, 1])
+        y = np.random.uniform(size=[1, 2])
+
+        self.feed_fp32 = {"x": x.astype(np.float32), "y": y.astype(np.float32)}
+        self.feed_fp16 = {"x": x.astype(np.float16), "y": y.astype(np.float16)}
 
 
 @unittest.skip("not supported")
-class TestCase7_2(TestBase):
-    def set_feed(self):
-        self.feed = {
-            "x": np.random.uniform(size=[3]).astype('float32'),
-            "y": np.random.uniform(size=[2]).astype('float32'),
-        }
-        # equal to
-        # self.feed = {
-        #     "x": np.random.uniform(size=[3, 1]).astype('float32'),
-        #     "y": np.random.uniform(size=[1, 2]).astype('float32'),
-        # }
+class TestCase8_2(TestBase):
+    def set_data_feed(self):
+        x = np.random.uniform(size=[3])
+        y = np.random.uniform(size=[2])
 
-    def set_attrs(self):
+        self.feed_fp32 = {"x": x.astype(np.float32), "y": y.astype(np.float32)}
+        self.feed_fp16 = {"x": x.astype(np.float16), "y": y.astype(np.float16)}
+
+    def set_op_attrs(self):
         self.attrs = {
             "transpose_x": True,
             "transpose_y": True,
@@ -205,12 +225,12 @@ class TestCase7_2(TestBase):
 
 
 @unittest.skip("dim > 4 is not supported")
-class TestCase8(TestBase):
-    def set_feed(self):
-        self.feed = {
-            "x": np.random.uniform(size=[6, 5, 4, 2, 3]).astype('float32'),
-            "y": np.random.uniform(size=[6, 5, 4, 3, 2]).astype('float32'),
-        }
+class TestCase9(TestBase):
+    def set_data_feed(self):
+        x = np.random.uniform(size=[6, 5, 4, 2, 3])
+
+        self.feed_fp32 = {"x": x.astype(np.float32), "y": x.astype(np.float32)}
+        self.feed_fp16 = {"x": x.astype(np.float16), "y": x.astype(np.float16)}
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_matmul_v2_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_matmul_v2_op_ipu.py
new file mode 100644
index 0000000000000000000000000000000000000000..9f1c115403adf2037deb8c8e2b3a87f341d1062e
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/ipu/test_matmul_v2_op_ipu.py
@@ -0,0 +1,186 @@
+#  Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+import paddle
+import paddle.static
+from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode
+
+
+@unittest.skipIf(not paddle.is_compiled_with_ipu(),
+                 "core is not compiled with IPU")
+class TestBase(IPUOpTest):
+    def setUp(self):
+        self.set_atol()
+        self.set_training()
+        self.set_data_feed()
+        self.set_feed_attr()
+        self.set_op_attrs()
+
+    @property
+    def fp16_enabled(self):
+        return True
+
+    def set_data_feed(self):
+        x = np.random.uniform(size=[2, 3])
+        y = np.random.uniform(size=[3, 2])
+
+        self.feed_fp32 = {"x": x.astype(np.float32), "y": y.astype(np.float32)}
+        self.feed_fp16 = {"x": x.astype(np.float16), "y": y.astype(np.float16)}
+
+    def set_feed_attr(self):
+        self.feed_shape = [x.shape for x in self.feed_fp32.values()]
+        self.feed_list = list(self.feed_fp32.keys())
+
+    def set_op_attrs(self):
+        self.attrs = {"transpose_x": False, "transpose_y": False}
+
+    def _test_base(self, exec_mode):
+        scope = paddle.static.Scope()
+        main_prog = paddle.static.Program()
+        startup_prog = paddle.static.Program()
+        main_prog.random_seed = self.SEED
+        startup_prog.random_seed = self.SEED
+
+        with paddle.static.scope_guard(scope):
+            with paddle.static.program_guard(main_prog, startup_prog):
+                x = paddle.static.data(
+                    name=self.feed_list[0],
+                    shape=self.feed_shape[0],
+                    dtype='float32')
+                y = paddle.static.data(
+                    name=self.feed_list[1],
+                    shape=self.feed_shape[1],
+                    dtype='float32')
+
+                out = paddle.matmul(x, y, **self.attrs)
+
+            fetch_list = [out.name]
+
+            if exec_mode == ExecutionMode.CPU_FP32:
+                place = paddle.CPUPlace()
+            else:
+                place = paddle.IPUPlace()
+
+            exe = paddle.static.Executor(place)
+            exe.run(startup_prog)
+
+            if exec_mode != ExecutionMode.CPU_FP32:
+                feed_list = self.feed_list
+                ipu_strategy = paddle.static.IpuStrategy()
+                ipu_strategy.set_graph_config(is_training=self.is_training)
+                if exec_mode == ExecutionMode.IPU_POPART_FP16:
+                    ipu_strategy.set_precision_config(enable_fp16=True)
+                program = paddle.static.IpuCompiledProgram(
+                    main_prog,
+                    ipu_strategy=ipu_strategy).compile(feed_list, fetch_list)
+            else:
+                program = main_prog
+
+            feed = self.feed_fp32
+            if exec_mode > ExecutionMode.IPU_FP32:
+                feed = self.feed_fp16
+
+            result = exe.run(program, feed=feed, fetch_list=fetch_list)
+            return result[0]
+
+    def test_base(self):
+        output_dict = {}
+        for mode in ExecutionMode:
+            if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled:
+                break
+            output_dict[mode] = self._test_base(mode).flatten()
+
+        self.check(output_dict)
+
+
+class TestCase1(TestBase):
+    def set_op_attrs(self):
+        self.attrs = {
+            "transpose_x": True,
+            "transpose_y": True,
+        }
+
+
+class TestCase3(TestBase):
+    def set_data_feed(self):
+        x = np.random.uniform(size=[5, 4, 2, 3])
+        y = np.random.uniform(size=[5, 4, 3, 2])
+
+        self.feed_fp32 = {"x": x.astype(np.float32), "y": y.astype(np.float32)}
+        self.feed_fp16 = {"x": x.astype(np.float16), "y": y.astype(np.float16)}
+
+
+class TestCase4(TestBase):
+    def set_data_feed(self):
+        x = np.random.uniform(size=[4, 2, 3])
+        y = np.random.uniform(size=[4, 3, 2])
+
+        self.feed_fp32 = {"x": x.astype(np.float32), "y": y.astype(np.float32)}
+        self.feed_fp16 = {"x": x.astype(np.float16), "y": y.astype(np.float16)}
+
+
+class TestCase5(TestBase):
+    def set_data_feed(self):
+        x = np.random.uniform(size=[4, 2, 3])
+        y = np.random.uniform(size=[3, 2])
+
+        self.feed_fp32 = {"x": x.astype(np.float32), "y": y.astype(np.float32)}
+        self.feed_fp16 = {"x": x.astype(np.float16), "y": y.astype(np.float16)}
+
+
+class TestCase6(TestBase):
+    def set_data_feed(self):
+        x = np.random.uniform(size=[3])
+        y = np.random.uniform(size=[3])
+
+        self.feed_fp32 = {"x": x.astype(np.float32), "y": y.astype(np.float32)}
+        self.feed_fp16 = {"x": x.astype(np.float16), "y": y.astype(np.float16)}
+
+
+@unittest.skip("not supported")
+class TestCase6_2(TestCase6):
+    def set_data_feed(self):
+        x = np.random.uniform(size=[3])
+        y = np.random.uniform(size=[3])
+
+        self.feed_fp32 = {"x": x.astype(np.float32), "y": y.astype(np.float32)}
+        self.feed_fp16 = {"x": x.astype(np.float16), "y": y.astype(np.float16)}
+
+    def set_op_attrs(self):
+        self.attrs = {"transpose_x": True, "transpose_y": True}
+
+
+class TestCase7(TestBase):
+    def set_data_feed(self):
+        x = np.random.uniform(size=[3, 1])
+        y = np.random.uniform(size=[1, 2])
+
+        self.feed_fp32 = {"x": x.astype(np.float32), "y": y.astype(np.float32)}
+        self.feed_fp16 = {"x": x.astype(np.float16), "y": y.astype(np.float16)}
+
+
+@unittest.skip("dim > 4 is not supported")
+class TestCase8(TestBase):
+    def set_data_feed(self):
+        self.feed = {
+            "x": np.random.uniform(size=[6, 5, 4, 2, 3]).astype('float32'),
+            "y": np.random.uniform(size=[6, 5, 4, 3, 2]).astype('float32'),
+        }
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_mean_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_mean_op_ipu.py
index f04d712755deadd44158431569430909f19a093e..b9dd7358b79550ffdc059dde189aded5d427449a 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_mean_op_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_mean_op_ipu.py
@@ -16,13 +16,8 @@ import unittest
 
 import numpy as np
 import paddle
-import paddle.fluid as fluid
-import paddle.fluid.compiler as compiler
-import paddle.optimizer
 import paddle.static
-from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest
-
-paddle.enable_static()
+from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode
 
 
 @unittest.skipIf(not paddle.is_compiled_with_ipu(),
@@ -31,97 +26,79 @@ class TestBase(IPUOpTest):
     def setUp(self):
         self.set_atol()
         self.set_training()
-        self.set_feed()
-        self.set_attrs()
+        self.set_data_feed()
+        self.set_feed_attr()
+        self.set_op_attrs()
 
-    def set_feed(self):
-        self.feed_shape = []
-        self.feed_shape.append([1, 3, 10, 10])
+    @property
+    def fp16_enabled(self):
+        return True
 
-        self.feed = {}
-        self.feed["in_0"] = np.random.uniform(
-            size=self.feed_shape[0]).astype(np.float32)
+    def set_data_feed(self):
+        data = np.random.uniform(size=[1, 3, 10, 10])
+        self.feed_fp32 = {"in_0": data.astype(np.float32)}
+        self.feed_fp16 = {"in_0": data.astype(np.float16)}
 
-        self.feed_list = list(self.feed.keys())
+    def set_feed_attr(self):
+        self.feed_shape = [x.shape for x in self.feed_fp32.values()]
+        self.feed_list = list(self.feed_fp32.keys())
 
-    def set_attrs(self):
+    def set_op_attrs(self):
         self.attrs = {}
-        self.attrs['axis'] = None
-        self.attrs['keepdim'] = False
 
-    def _test_base(self, run_ipu=True):
-        scope = fluid.core.Scope()
+    def _test_base(self, exec_mode):
+        scope = paddle.static.Scope()
         main_prog = paddle.static.Program()
         startup_prog = paddle.static.Program()
-        SEED = self.SEED
-        main_prog.random_seed = SEED
-        startup_prog.random_seed = SEED
+        main_prog.random_seed = self.SEED
+        startup_prog.random_seed = self.SEED
 
-        with fluid.scope_guard(scope):
+        with paddle.static.scope_guard(scope):
             with paddle.static.program_guard(main_prog, startup_prog):
                 x = paddle.static.data(
                     name=self.feed_list[0],
                     shape=self.feed_shape[0],
                     dtype='float32')
-                out = paddle.mean(x, **self.attrs)
 
-                fetch_list = [out.name]
+                out = paddle.fluid.layers.mean(x)
 
-            if run_ipu:
-                place = paddle.IPUPlace()
-            else:
+            fetch_list = [out.name]
+
+            if exec_mode == ExecutionMode.CPU_FP32:
                 place = paddle.CPUPlace()
+            else:
+                place = paddle.IPUPlace()
+
             exe = paddle.static.Executor(place)
             exe.run(startup_prog)
 
-            if run_ipu:
+            if exec_mode != ExecutionMode.CPU_FP32:
                 feed_list = self.feed_list
                 ipu_strategy = paddle.static.IpuStrategy()
-                ipu_strategy.SetGraphConfig(is_training=self.is_training)
-                program = compiler.IPUCompiledProgram(
+                ipu_strategy.set_graph_config(is_training=self.is_training)
+                if exec_mode == ExecutionMode.IPU_POPART_FP16:
+                    ipu_strategy.set_precision_config(enable_fp16=True)
+                program = paddle.static.IpuCompiledProgram(
                     main_prog,
                     ipu_strategy=ipu_strategy).compile(feed_list, fetch_list)
             else:
                 program = main_prog
 
-            result = exe.run(program, feed=self.feed, fetch_list=fetch_list)
+            feed = self.feed_fp32
+            if exec_mode > ExecutionMode.IPU_FP32:
+                feed = self.feed_fp16
+
+            result = exe.run(program, feed=feed, fetch_list=fetch_list)
             return result[0]
 
     def test_base(self):
-        res0 = self._test_base(True)
-        res1 = self._test_base(False)
-
-        self.assertTrue(
-            np.allclose(
-                res0.flatten(), res1.flatten(), atol=self.atol))
-
+        output_dict = {}
+        for mode in ExecutionMode:
+            if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled:
+                break
+            output_dict[mode] = self._test_base(mode).flatten()
 
-class TestCase1(TestBase):
-    def set_attrs(self):
-        self.attrs = {}
-        self.attrs['axis'] = 1
-        self.attrs['keepdim'] = False
-
-
-class TestCase2(TestBase):
-    def set_attrs(self):
-        self.attrs = {}
-        self.attrs['axis'] = 2
-        self.attrs['keepdim'] = False
-
-
-class TestCase3(TestBase):
-    def set_attrs(self):
-        self.attrs = {}
-        self.attrs['axis'] = 2
-        self.attrs['keepdim'] = True
-
-
-class TestCase4(TestBase):
-    def set_attrs(self):
-        self.attrs = {}
-        self.attrs['axis'] = None
-        self.attrs['keepdim'] = True
+        self.check(output_dict)
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_ipu_model_pipeline.py b/python/paddle/fluid/tests/unittests/ipu/test_model_pipeline_ipu.py
similarity index 86%
rename from python/paddle/fluid/tests/unittests/ipu/test_ipu_model_pipeline.py
rename to python/paddle/fluid/tests/unittests/ipu/test_model_pipeline_ipu.py
index e1ed7603ed6272ba91cf485d91f512f07b72a258..7e70239964002184819fcaabc89e32704547b22c 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_ipu_model_pipeline.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_model_pipeline_ipu.py
@@ -17,8 +17,7 @@ from __future__ import print_function
 import numpy as np
 import unittest
 import paddle
-import paddle.fluid as fluid
-import paddle.fluid.compiler as compiler
+import paddle.static
 
 paddle.enable_static()
 SEED = 2021
@@ -28,7 +27,7 @@ SEED = 2021
                  "core is not compiled with IPU")
 class TestCastNet(unittest.TestCase):
     def _test(self, run_ipu=True):
-        scope = fluid.core.Scope()
+        scope = paddle.static.Scope()
         main_prog = paddle.static.Program()
         startup_prog = paddle.static.Program()
         main_prog.random_seed = SEED
@@ -37,14 +36,14 @@ class TestCastNet(unittest.TestCase):
 
         np_image = np.random.rand(1, 3, 10, 10).astype(np.float32)
 
-        with fluid.scope_guard(scope):
+        with paddle.static.scope_guard(scope):
             with paddle.static.program_guard(main_prog, startup_prog):
                 image = paddle.static.data(
                     name='image', shape=[1, 3, 10, 10], dtype='float32')
-                with fluid.ipu_shard(ipu_index=0):
+                with paddle.static.ipu_shard_guard(index=0):
                     conv1 = paddle.static.nn.conv2d(
                         image, num_filters=3, filter_size=3, bias_attr=False)
-                with fluid.ipu_shard(ipu_index=1):
+                with paddle.static.ipu_shard_guard(index=1):
                     conv2 = paddle.static.nn.conv2d(
                         conv1, num_filters=3, filter_size=3, bias_attr=False)
                     loss = paddle.mean(conv2)
@@ -60,9 +59,10 @@ class TestCastNet(unittest.TestCase):
                 feed_list = [image.name]
                 fetch_list = [loss.name]
                 ipu_strategy = paddle.static.IpuStrategy()
-                ipu_strategy.SetGraphConfig(
+                ipu_strategy.set_graph_config(
                     num_ipus=2, is_training=False, enable_manual_shard=True)
-                program = compiler.IPUCompiledProgram(
+                ipu_strategy.set_pipelining_config(enable_pipelining=False)
+                program = paddle.static.IpuCompiledProgram(
                     main_prog,
                     ipu_strategy=ipu_strategy).compile(feed_list, fetch_list)
             else:
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_mul_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_mul_op_ipu.py
index 78a2589d9aca59ec72d22aa6fa35f9f934f94caf..7a9135626df7902eb8ddf650f3911cd854b8b6e2 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_mul_op_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_mul_op_ipu.py
@@ -16,14 +16,8 @@ import unittest
 
 import numpy as np
 import paddle
-import paddle.fluid as fluid
-import paddle.fluid.compiler as compiler
-import paddle.optimizer
 import paddle.static
-from paddle.fluid.tests.unittests.ipu.op_test_ipu import (IPUOpTest,
-                                                          np_dtype_to_fluid_str)
-
-paddle.enable_static()
+from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode
 
 
 @unittest.skipIf(not paddle.is_compiled_with_ipu(),
@@ -32,90 +26,98 @@ class TestBase(IPUOpTest):
     def setUp(self):
         self.set_atol()
         self.set_training()
-        self.set_feed()
+        self.set_data_feed()
         self.set_feed_attr()
-        self.set_attrs()
+        self.set_op_attrs()
 
-    def set_feed(self):
-        self.feed = {
-            "x": np.random.uniform(size=[2, 5]).astype('float32'),
-            "y": np.random.uniform(size=[5, 3]).astype('float32'),
-        }
+    @property
+    def fp16_enabled(self):
+        return True
+
+    def set_data_feed(self):
+        x = np.random.uniform(size=[2, 5])
+        y = np.random.uniform(size=[5, 3])
+        self.feed_fp32 = {"x": x.astype(np.float32), "y": y.astype(np.float32)}
+        self.feed_fp16 = {"x": x.astype(np.float16), "y": y.astype(np.float16)}
 
     def set_feed_attr(self):
-        self.feed_shape = [x.shape for x in self.feed.values()]
-        self.feed_list = list(self.feed.keys())
-        self.feed_dtype = [
-            np_dtype_to_fluid_str(x.dtype) for x in self.feed.values()
-        ]
+        self.feed_shape = [x.shape for x in self.feed_fp32.values()]
+        self.feed_list = list(self.feed_fp32.keys())
+        self.feed_dtype = [x.dtype for x in self.feed_fp32.values()]
 
-    def set_attrs(self):
+    def set_op_attrs(self):
         self.attrs = {
             "x_num_col_dims": 1,
             "y_num_col_dims": 1,
         }
 
-    def _test_base(self, run_ipu=True):
-        scope = fluid.core.Scope()
+    def _test_base(self, exec_mode):
+        scope = paddle.static.Scope()
         main_prog = paddle.static.Program()
         startup_prog = paddle.static.Program()
-        SEED = self.SEED
-        main_prog.random_seed = SEED
-        startup_prog.random_seed = SEED
+        main_prog.random_seed = self.SEED
+        startup_prog.random_seed = self.SEED
 
-        with fluid.scope_guard(scope):
+        with paddle.static.scope_guard(scope):
             with paddle.static.program_guard(main_prog, startup_prog):
                 x = paddle.static.data(
                     name=self.feed_list[0],
                     shape=self.feed_shape[0],
-                    dtype=self.feed_dtype[0])
+                    dtype='float32')
                 y = paddle.static.data(
                     name=self.feed_list[1],
                     shape=self.feed_shape[1],
-                    dtype=self.feed_dtype[1])
+                    dtype='float32')
+
                 out = paddle.fluid.layers.mul(x, y, **self.attrs)
 
-                fetch_list = [out.name]
+            fetch_list = [out.name]
 
-            if run_ipu:
-                place = paddle.IPUPlace()
-            else:
+            if exec_mode == ExecutionMode.CPU_FP32:
                 place = paddle.CPUPlace()
+            else:
+                place = paddle.IPUPlace()
+
             exe = paddle.static.Executor(place)
             exe.run(startup_prog)
 
-            if run_ipu:
+            if exec_mode != ExecutionMode.CPU_FP32:
                 feed_list = self.feed_list
                 ipu_strategy = paddle.static.IpuStrategy()
-                ipu_strategy.SetGraphConfig(is_training=self.is_training)
-                program = compiler.IPUCompiledProgram(
+                ipu_strategy.set_graph_config(is_training=self.is_training)
+                if exec_mode == ExecutionMode.IPU_POPART_FP16:
+                    ipu_strategy.set_precision_config(enable_fp16=True)
+                program = paddle.static.IpuCompiledProgram(
                     main_prog,
                     ipu_strategy=ipu_strategy).compile(feed_list, fetch_list)
             else:
                 program = main_prog
 
-            result = exe.run(program, feed=self.feed, fetch_list=fetch_list)
+            feed = self.feed_fp32
+            if exec_mode > ExecutionMode.IPU_FP32:
+                feed = self.feed_fp16
+
+            result = exe.run(program, feed=feed, fetch_list=fetch_list)
             return result[0]
 
     def test_base(self):
-        res0 = self._test_base(False)
-        res1 = self._test_base(True)
+        output_dict = {}
+        for mode in ExecutionMode:
+            if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled:
+                break
+            output_dict[mode] = self._test_base(mode).flatten()
 
-        self.assertTrue(
-            np.allclose(
-                res0.flatten(), res1.flatten(), atol=self.atol))
-
-        self.assertTrue(res0.shape == res1.shape)
+        self.check(output_dict)
 
 
 class TestCase1(TestBase):
-    def set_feed(self):
-        self.feed = {
-            "x": np.random.uniform(size=[1, 2, 5]).astype('float32'),
-            "y": np.random.uniform(size=[5, 3]).astype('float32'),
-        }
+    def set_data_feed(self):
+        x = np.random.uniform(size=[1, 2, 5])
+        y = np.random.uniform(size=[5, 3])
+        self.feed_fp32 = {"x": x.astype(np.float32), "y": y.astype(np.float32)}
+        self.feed_fp16 = {"x": x.astype(np.float16), "y": y.astype(np.float16)}
 
-    def set_attrs(self):
+    def set_op_attrs(self):
         self.attrs = {
             "x_num_col_dims": 2,
             "y_num_col_dims": 1,
@@ -123,13 +125,13 @@ class TestCase1(TestBase):
 
 
 class TestCase2(TestBase):
-    def set_feed(self):
-        self.feed = {
-            "x": np.random.uniform(size=[3, 4, 2, 9]).astype('float32'),
-            "y": np.random.uniform(size=[3, 6, 1, 2, 3]).astype('float32'),
-        }
+    def set_data_feed(self):
+        x = np.random.uniform(size=[3, 4, 2, 9])
+        y = np.random.uniform(size=[3, 6, 1, 2, 3])
+        self.feed_fp32 = {"x": x.astype(np.float32), "y": y.astype(np.float32)}
+        self.feed_fp16 = {"x": x.astype(np.float16), "y": y.astype(np.float16)}
 
-    def set_attrs(self):
+    def set_op_attrs(self):
         self.attrs = {
             'x_num_col_dims': 2,
             'y_num_col_dims': 2,
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_optimizer_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_optimizer_ipu.py
new file mode 100644
index 0000000000000000000000000000000000000000..1cc10da3d73444f329bbcbf53694c9b4ff93fdfc
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/ipu/test_optimizer_ipu.py
@@ -0,0 +1,165 @@
+#  Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import numpy as np
+import unittest
+import paddle
+import paddle.static
+from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest
+
+
+@unittest.skipIf(not paddle.is_compiled_with_ipu(),
+                 "core is not compiled with IPU")
+class TestBase(IPUOpTest):
+    def setUp(self):
+        self.set_atol()
+        self.set_data_feed()
+        self.set_feed_attr()
+        self.set_attrs()
+
+    def set_atol(self):
+        self.atol = 1e-6
+
+    def set_data_feed(self):
+        self.feed = {
+            "image": np.random.uniform(size=[1, 3, 10, 10]).astype('float32'),
+        }
+
+    def set_feed_attr(self):
+        self.feed_shape = [x.shape for x in self.feed.values()]
+        self.feed_list = list(self.feed.keys())
+        self.feed_dtype = [x.dtype for x in self.feed.values()]
+
+    def set_attrs(self):
+        self.attrs = {
+            "optimizer": 'sgd',
+            "weight_decay": 0.0,
+            "loss_scaling": 1.0,
+        }
+
+    def _test_optimizer(self, run_ipu=True):
+        scope = paddle.static.Scope()
+        main_prog = paddle.static.Program()
+        startup_prog = paddle.static.Program()
+        main_prog.random_seed = self.SEED
+        startup_prog.random_seed = self.SEED
+        np.random.seed(self.SEED)
+
+        with paddle.static.scope_guard(scope):
+            with paddle.static.program_guard(main_prog, startup_prog):
+                image = paddle.static.data(
+                    name='image', shape=[1, 3, 10, 10], dtype='float32')
+                conv1 = paddle.static.nn.conv2d(
+                    image, num_filters=3, filter_size=3, bias_attr=False)
+                loss = paddle.mean(conv1)
+
+                weight_decay = self.attrs['weight_decay']
+                opt = paddle.optimizer.SGD(learning_rate=1e-1,
+                                           weight_decay=weight_decay)
+                if self.attrs['optimizer'] == 'adam':
+                    opt = paddle.optimizer.Adam(
+                        learning_rate=1e-1, weight_decay=weight_decay)
+                elif self.attrs['optimizer'] == 'lamb':
+
+                    opt = paddle.optimizer.Lamb(
+                        learning_rate=1e-1, lamb_weight_decay=weight_decay)
+                opt.minimize(loss)
+
+            if run_ipu:
+                place = paddle.IPUPlace()
+            else:
+                place = paddle.CPUPlace()
+            exe = paddle.static.Executor(place)
+            exe.run(startup_prog)
+
+            if run_ipu:
+                feed_list = [image.name]
+                fetch_list = [loss.name]
+                ipu_strategy = paddle.static.IpuStrategy()
+                ipu_strategy.set_graph_config(is_training=True)
+                ipu_strategy.loss_scaling = self.attrs["loss_scaling"]
+                program = paddle.static.IpuCompiledProgram(
+                    main_prog, ipu_strategy=ipu_strategy).compile(feed_list,
+                                                                  fetch_list)
+            else:
+                program = main_prog
+
+            result = []
+            for epoch in range(100):
+                loss_res = exe.run(program, feed=self.feed, fetch_list=[loss])
+                result.append(loss_res)
+
+            return np.array(result)
+
+    def test(self):
+        # cpu and ipu dimenstion mismatch, cpu:(100, 1, 1), ipu:(100, 1)
+        ipu_loss = self._test_optimizer(True).flatten()
+        cpu_loss = self._test_optimizer(False).flatten()
+
+        self.assertTrue(np.allclose(ipu_loss, cpu_loss, atol=self.atol))
+
+
+@unittest.skip('do not support L2 regularization')
+class TestSGD(TestBase):
+    def set_attrs(self):
+        self.attrs = {
+            "optimizer": 'sgd',
+            "weight_decay": 0.1,
+            "loss_scaling": 2.0,
+        }
+
+
+@unittest.skip('do not support L2 regularization')
+class TestAdamCase1(TestBase):
+    def set_attrs(self):
+        self.attrs = {
+            "optimizer": 'adam',
+            "weight_decay": 0.1,
+            "loss_scaling": 3.0,
+        }
+
+
+class TestAdamCase2(TestBase):
+    def set_attrs(self):
+        self.attrs = {
+            "optimizer": 'adam',
+            "weight_decay": 0.0,
+            "loss_scaling": 4.0,
+        }
+
+
+@unittest.skip('seems cpu output wrong')
+class TestLambCase1(TestBase):
+    def set_attrs(self):
+        self.attrs = {
+            "optimizer": 'lamb',
+            "weight_decay": 0.0,
+            "loss_scaling": 5.0,
+        }
+
+
+@unittest.skip('seems cpu output wrong')
+class TestLamb(TestBase):
+    def set_attrs(self):
+        self.attrs = {
+            "optimizer": 'lamb',
+            "weight_decay": 0.1,
+            "loss_scaling": 6.0,
+        }
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_pool_avg_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_pool_avg_op_ipu.py
index e81591ad68368033bc9b5223753203ae3126c005..4288b82832edef7068b1b692d55ed85789e9c0d1 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_pool_avg_op_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_pool_avg_op_ipu.py
@@ -16,14 +16,8 @@ import unittest
 
 import numpy as np
 import paddle
-import paddle.fluid as fluid
-import paddle.fluid.compiler as compiler
-import paddle.optimizer
 import paddle.static
-from paddle.fluid.tests.unittests.ipu.op_test_ipu import (IPUOpTest,
-                                                          np_dtype_to_fluid_str)
-
-paddle.enable_static()
+from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode
 
 
 @unittest.skipIf(not paddle.is_compiled_with_ipu(),
@@ -32,23 +26,25 @@ class TestBase(IPUOpTest):
     def setUp(self):
         self.set_atol()
         self.set_training()
-        self.set_feed()
+        self.set_data_feed()
         self.set_feed_attr()
-        self.set_attrs()
+        self.set_op_attrs()
 
-    def set_feed(self):
-        self.feed = {
-            "x": np.random.uniform(size=[1, 3, 10, 10]).astype('float32'),
-        }
+    @property
+    def fp16_enabled(self):
+        return True
+
+    def set_data_feed(self):
+        data = np.random.uniform(size=[1, 3, 10, 10])
+        self.feed_fp32 = {'in_0': data.astype(np.float32)}
+        self.feed_fp16 = {'in_0': data.astype(np.float16)}
 
     def set_feed_attr(self):
-        self.feed_shape = [x.shape for x in self.feed.values()]
-        self.feed_list = list(self.feed.keys())
-        self.feed_dtype = [
-            np_dtype_to_fluid_str(x.dtype) for x in self.feed.values()
-        ]
+        self.feed_shape = [x.shape for x in self.feed_fp32.values()]
+        self.feed_list = list(self.feed_fp32.keys())
+        self.feed_dtype = [x.dtype for x in self.feed_fp32.values()]
 
-    def set_attrs(self):
+    def set_op_attrs(self):
         self.attrs = {
             "pool_size": 3,
             "pool_type": 'avg',
@@ -60,53 +56,59 @@ class TestBase(IPUOpTest):
             "data_format": 'NCHW',
         }
 
-    def _test_base(self, run_ipu=True):
-        scope = fluid.core.Scope()
+    def _test_base(self, exec_mode):
+        scope = paddle.static.Scope()
         main_prog = paddle.static.Program()
         startup_prog = paddle.static.Program()
-        SEED = self.SEED
-        main_prog.random_seed = SEED
-        startup_prog.random_seed = SEED
+        main_prog.random_seed = self.SEED
+        startup_prog.random_seed = self.SEED
 
-        with fluid.scope_guard(scope):
+        with paddle.static.scope_guard(scope):
             with paddle.static.program_guard(main_prog, startup_prog):
                 x = paddle.static.data(
                     name=self.feed_list[0],
                     shape=self.feed_shape[0],
-                    dtype=self.feed_dtype[0])
+                    dtype='float32')
+
                 out = paddle.fluid.layers.pool2d(x, **self.attrs)
 
-                fetch_list = [out.name]
+            fetch_list = [out.name]
 
-            if run_ipu:
-                place = paddle.IPUPlace()
-            else:
+            if exec_mode == ExecutionMode.CPU_FP32:
                 place = paddle.CPUPlace()
+            else:
+                place = paddle.IPUPlace()
+
             exe = paddle.static.Executor(place)
             exe.run(startup_prog)
 
-            if run_ipu:
+            if exec_mode != ExecutionMode.CPU_FP32:
                 feed_list = self.feed_list
                 ipu_strategy = paddle.static.IpuStrategy()
-                ipu_strategy.SetGraphConfig(is_training=self.is_training)
-                program = compiler.IPUCompiledProgram(
+                ipu_strategy.set_graph_config(is_training=self.is_training)
+                if exec_mode == ExecutionMode.IPU_POPART_FP16:
+                    ipu_strategy.set_precision_config(enable_fp16=True)
+                program = paddle.static.IpuCompiledProgram(
                     main_prog,
                     ipu_strategy=ipu_strategy).compile(feed_list, fetch_list)
             else:
                 program = main_prog
 
-            result = exe.run(program, feed=self.feed, fetch_list=fetch_list)
+            feed = self.feed_fp32
+            if exec_mode > ExecutionMode.IPU_FP32:
+                feed = self.feed_fp16
+
+            result = exe.run(program, feed=feed, fetch_list=fetch_list)
             return result[0]
 
     def test_base(self):
-        res0 = self._test_base(False)
-        res1 = self._test_base(True)
-
-        self.assertTrue(
-            np.allclose(
-                res0.flatten(), res1.flatten(), atol=self.atol))
+        output_dict = {}
+        for mode in ExecutionMode:
+            if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled:
+                break
+            output_dict[mode] = self._test_base(mode).flatten()
 
-        self.assertTrue(res0.shape == res1.shape)
+        self.check(output_dict)
 
 
 class TestCase1(TestBase):
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_pool_max_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_pool_max_op_ipu.py
index a7c45c6686f10e3c13462d49ab8e34fe72c4f03a..911a163b8aa9c91e159b5fc8e15041d564c648cc 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_pool_max_op_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_pool_max_op_ipu.py
@@ -16,14 +16,8 @@ import unittest
 
 import numpy as np
 import paddle
-import paddle.fluid as fluid
-import paddle.fluid.compiler as compiler
-import paddle.optimizer
 import paddle.static
-from paddle.fluid.tests.unittests.ipu.op_test_ipu import (IPUOpTest,
-                                                          np_dtype_to_fluid_str)
-
-paddle.enable_static()
+from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode
 
 
 @unittest.skipIf(not paddle.is_compiled_with_ipu(),
@@ -32,23 +26,25 @@ class TestBase(IPUOpTest):
     def setUp(self):
         self.set_atol()
         self.set_training()
-        self.set_feed()
+        self.set_data_feed()
         self.set_feed_attr()
-        self.set_attrs()
+        self.set_op_attrs()
 
-    def set_feed(self):
-        self.feed = {
-            "x": np.random.uniform(size=[1, 3, 10, 10]).astype('float32'),
-        }
+    @property
+    def fp16_enabled(self):
+        return True
+
+    def set_data_feed(self):
+        data = np.random.uniform(size=[1, 3, 10, 10])
+        self.feed_fp32 = {'in_0': data.astype(np.float32)}
+        self.feed_fp16 = {'in_0': data.astype(np.float16)}
 
     def set_feed_attr(self):
-        self.feed_shape = [x.shape for x in self.feed.values()]
-        self.feed_list = list(self.feed.keys())
-        self.feed_dtype = [
-            np_dtype_to_fluid_str(x.dtype) for x in self.feed.values()
-        ]
+        self.feed_shape = [x.shape for x in self.feed_fp32.values()]
+        self.feed_list = list(self.feed_fp32.keys())
+        self.feed_dtype = [x.dtype for x in self.feed_fp32.values()]
 
-    def set_attrs(self):
+    def set_op_attrs(self):
         self.attrs = {
             "pool_size": 3,
             "pool_type": 'max',
@@ -60,120 +56,126 @@ class TestBase(IPUOpTest):
             "data_format": 'NCHW',
         }
 
-    def _test_base(self, run_ipu=True):
-        scope = fluid.core.Scope()
+    def _test_base(self, exec_mode):
+        scope = paddle.static.Scope()
         main_prog = paddle.static.Program()
         startup_prog = paddle.static.Program()
-        SEED = self.SEED
-        main_prog.random_seed = SEED
-        startup_prog.random_seed = SEED
+        main_prog.random_seed = self.SEED
+        startup_prog.random_seed = self.SEED
 
-        with fluid.scope_guard(scope):
+        with paddle.static.scope_guard(scope):
             with paddle.static.program_guard(main_prog, startup_prog):
                 x = paddle.static.data(
                     name=self.feed_list[0],
                     shape=self.feed_shape[0],
-                    dtype=self.feed_dtype[0])
+                    dtype='float32')
+
                 out = paddle.fluid.layers.pool2d(x, **self.attrs)
 
-                fetch_list = [out.name]
+            fetch_list = [out.name]
 
-            if run_ipu:
-                place = paddle.IPUPlace()
-            else:
+            if exec_mode == ExecutionMode.CPU_FP32:
                 place = paddle.CPUPlace()
+            else:
+                place = paddle.IPUPlace()
+
             exe = paddle.static.Executor(place)
             exe.run(startup_prog)
 
-            if run_ipu:
+            if exec_mode != ExecutionMode.CPU_FP32:
                 feed_list = self.feed_list
                 ipu_strategy = paddle.static.IpuStrategy()
-                ipu_strategy.SetGraphConfig(is_training=self.is_training)
-                program = compiler.IPUCompiledProgram(
+                ipu_strategy.set_graph_config(is_training=self.is_training)
+                if exec_mode == ExecutionMode.IPU_POPART_FP16:
+                    ipu_strategy.set_precision_config(enable_fp16=True)
+                program = paddle.static.IpuCompiledProgram(
                     main_prog,
                     ipu_strategy=ipu_strategy).compile(feed_list, fetch_list)
             else:
                 program = main_prog
 
-            result = exe.run(program, feed=self.feed, fetch_list=fetch_list)
+            feed = self.feed_fp32
+            if exec_mode > ExecutionMode.IPU_FP32:
+                feed = self.feed_fp16
+
+            result = exe.run(program, feed=feed, fetch_list=fetch_list)
             return result[0]
 
     def test_base(self):
-        res0 = self._test_base(False)
-        res1 = self._test_base(True)
-
-        self.assertTrue(
-            np.allclose(
-                res0.flatten(), res1.flatten(), atol=self.atol))
+        output_dict = {}
+        for mode in ExecutionMode:
+            if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled:
+                break
+            output_dict[mode] = self._test_base(mode).flatten()
 
-        self.assertTrue(res0.shape == res1.shape)
+        self.check(output_dict)
 
 
 class TestCase1(TestBase):
-    def set_attrs(self):
-        super().set_attrs()
+    def set_op_attrs(self):
+        super().set_op_attrs()
         self.attrs['pool_size'] = 3
 
 
 class TestCase1_2(TestBase):
-    def set_attrs(self):
-        super().set_attrs()
+    def set_op_attrs(self):
+        super().set_op_attrs()
         self.attrs['pool_size'] = [3, 1]
 
 
 class TestCase2(TestBase):
-    def set_attrs(self):
-        super().set_attrs()
+    def set_op_attrs(self):
+        super().set_op_attrs()
         self.attrs['pool_stride'] = 2
 
 
 class TestCase2_2(TestBase):
-    def set_attrs(self):
-        super().set_attrs()
+    def set_op_attrs(self):
+        super().set_op_attrs()
         self.attrs['pool_stride'] = [2, 1]
 
 
 class TestCase3(TestBase):
-    def set_attrs(self):
-        super().set_attrs()
+    def set_op_attrs(self):
+        super().set_op_attrs()
         self.attrs['pool_padding'] = [1, 1]
 
 
 class TestCase3_2(TestBase):
-    def set_attrs(self):
-        super().set_attrs()
+    def set_op_attrs(self):
+        super().set_op_attrs()
         self.attrs['pool_padding'] = [1, 1, 2, 2]
 
 
 @unittest.skip('auto_pad is not currently supported')
 class TestCase3_3(TestBase):
-    def set_attrs(self):
-        super().set_attrs()
+    def set_op_attrs(self):
+        super().set_op_attrs()
         self.attrs['pool_padding'] = 'VALID'
 
 
 @unittest.skip('auto_pad is not currently supported')
 class TestCase3_4(TestBase):
-    def set_attrs(self):
-        super().set_attrs()
+    def set_op_attrs(self):
+        super().set_op_attrs()
         self.attrs['pool_padding'] = 'SAME'
 
 
 class TestCase4(TestBase):
-    def set_attrs(self):
-        super().set_attrs()
+    def set_op_attrs(self):
+        super().set_op_attrs()
         self.attrs['global_pooling'] = True
 
 
 class TestCase5(TestBase):
-    def set_attrs(self):
-        super().set_attrs()
+    def set_op_attrs(self):
+        super().set_op_attrs()
         self.attrs['ceil_mode'] = True
 
 
 class TestCase6(TestBase):
-    def set_attrs(self):
-        super().set_attrs()
+    def set_op_attrs(self):
+        super().set_op_attrs()
         self.attrs['exclusive'] = False
 
 
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_pow_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_pow_op_ipu.py
index 5059de7ba77b1baf65c511da11735a73a87aa1e8..b3562d722c4e6aaf5578e321bbb225c5e4a247b0 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_pow_op_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_pow_op_ipu.py
@@ -16,14 +16,8 @@ import unittest
 
 import numpy as np
 import paddle
-import paddle.fluid as fluid
-import paddle.fluid.compiler as compiler
-import paddle.optimizer
 import paddle.static
-from paddle.fluid.tests.unittests.ipu.op_test_ipu import (IPUOpTest,
-                                                          np_dtype_to_fluid_str)
-
-paddle.enable_static()
+from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode
 
 
 @unittest.skipIf(not paddle.is_compiled_with_ipu(),
@@ -32,124 +26,146 @@ class TestBase(IPUOpTest):
     def setUp(self):
         self.set_atol()
         self.set_training()
-        self.set_feed()
+        self.set_data_feed()
         self.set_feed_attr()
-        self.set_attrs()
+        self.set_op_attrs()
 
-    def set_feed(self):
-        self.feed = {
-            "x": np.random.uniform(size=[1, 3, 2, 2]).astype('float32'),
-        }
+    @property
+    def fp16_enabled(self):
+        return True
+
+    def set_data_feed(self):
+        data = np.random.uniform(size=[1, 3, 2, 2])
+        self.feed_fp32 = {"x": data.astype(np.float32)}
+        self.feed_fp16 = {"x": data.astype(np.float16)}
 
     def set_feed_attr(self):
-        self.feed_shape = [x.shape for x in self.feed.values()]
-        self.feed_list = list(self.feed.keys())
-        self.feed_dtype = [
-            np_dtype_to_fluid_str(x.dtype) for x in self.feed.values()
-        ]
+        self.feed_shape = [x.shape for x in self.feed_fp32.values()]
+        self.feed_list = list(self.feed_fp32.keys())
+        self.feed_dtype = [x.dtype for x in self.feed_fp32.values()]
 
-    def set_attrs(self):
+    def set_op_attrs(self):
         self.attrs = {"factor": 2.0}
 
-    def _test_base(self, run_ipu=True):
-        scope = fluid.core.Scope()
+    def _test_base(self, exec_mode):
+        scope = paddle.static.Scope()
         main_prog = paddle.static.Program()
         startup_prog = paddle.static.Program()
-        SEED = self.SEED
-        main_prog.random_seed = SEED
-        startup_prog.random_seed = SEED
+        main_prog.random_seed = self.SEED
+        startup_prog.random_seed = self.SEED
 
-        with fluid.scope_guard(scope):
+        with paddle.static.scope_guard(scope):
             with paddle.static.program_guard(main_prog, startup_prog):
                 x = paddle.static.data(
                     name=self.feed_list[0],
                     shape=self.feed_shape[0],
-                    dtype=self.feed_dtype[0])
+                    dtype='float32')
+
                 out = paddle.fluid.layers.pow(x, **self.attrs)
 
-                fetch_list = [out.name]
+            fetch_list = [out.name]
 
-            if run_ipu:
-                place = paddle.IPUPlace()
-            else:
+            if exec_mode == ExecutionMode.CPU_FP32:
                 place = paddle.CPUPlace()
+            else:
+                place = paddle.IPUPlace()
+
             exe = paddle.static.Executor(place)
             exe.run(startup_prog)
 
-            if run_ipu:
+            if exec_mode != ExecutionMode.CPU_FP32:
                 feed_list = self.feed_list
                 ipu_strategy = paddle.static.IpuStrategy()
-                ipu_strategy.SetGraphConfig(is_training=self.is_training)
-                program = compiler.IPUCompiledProgram(
+                ipu_strategy.set_graph_config(is_training=self.is_training)
+                if exec_mode == ExecutionMode.IPU_POPART_FP16:
+                    ipu_strategy.set_precision_config(enable_fp16=True)
+                program = paddle.static.IpuCompiledProgram(
                     main_prog,
                     ipu_strategy=ipu_strategy).compile(feed_list, fetch_list)
             else:
                 program = main_prog
 
-            result = exe.run(program, feed=self.feed, fetch_list=fetch_list)
+            feed = self.feed_fp32
+            if exec_mode > ExecutionMode.IPU_FP32:
+                feed = self.feed_fp16
+
+            result = exe.run(program, feed=feed, fetch_list=fetch_list)
             return result[0]
 
     def test_base(self):
-        res0 = self._test_base(False)
-        res1 = self._test_base(True)
+        output_dict = {}
+        for mode in ExecutionMode:
+            if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled:
+                break
+            output_dict[mode] = self._test_base(mode).flatten()
 
-        self.assertTrue(
-            np.allclose(
-                res0.flatten(), res1.flatten(), atol=self.atol))
-
-        self.assertTrue(res0.shape == res1.shape)
+        self.check(output_dict)
 
 
 class TestCase1(TestBase):
-    def set_feed(self):
-        self.feed = {
-            "x": np.random.uniform(size=[1, 3, 2, 2]).astype('float32'),
-            "y": np.array([2.0]).astype('float32'),
+    def set_data_feed(self):
+        data1 = np.random.uniform(size=[1, 3, 2, 2])
+        data2 = np.array([2.0])
+
+        self.feed_fp32 = {
+            "x": data1.astype(np.float32),
+            "y": data2.astype(np.float32)
+        }
+        self.feed_fp16 = {
+            "x": data1.astype(np.float16),
+            "y": data2.astype(np.float16)
         }
 
-    def set_attrs(self):
+    def set_op_attrs(self):
         self.attrs = {}
 
-    def _test_base(self, run_ipu=True):
-        scope = fluid.core.Scope()
+    def _test_base(self, exec_mode):
+        scope = paddle.static.Scope()
         main_prog = paddle.static.Program()
         startup_prog = paddle.static.Program()
-        SEED = self.SEED
-        main_prog.random_seed = SEED
-        startup_prog.random_seed = SEED
+        main_prog.random_seed = self.SEED
+        startup_prog.random_seed = self.SEED
 
-        with fluid.scope_guard(scope):
+        with paddle.static.scope_guard(scope):
             with paddle.static.program_guard(main_prog, startup_prog):
                 x = paddle.static.data(
                     name=self.feed_list[0],
                     shape=self.feed_shape[0],
-                    dtype=self.feed_dtype[0])
+                    dtype='float32')
                 factor = paddle.static.data(
                     name=self.feed_list[1],
                     shape=self.feed_shape[1],
-                    dtype=self.feed_dtype[1])
+                    dtype='float32')
+
                 out = paddle.fluid.layers.pow(x, factor=factor, **self.attrs)
 
-                fetch_list = [out.name]
+            fetch_list = [out.name]
 
-            if run_ipu:
-                place = paddle.IPUPlace()
-            else:
+            if exec_mode == ExecutionMode.CPU_FP32:
                 place = paddle.CPUPlace()
+            else:
+                place = paddle.IPUPlace()
+
             exe = paddle.static.Executor(place)
             exe.run(startup_prog)
 
-            if run_ipu:
+            if exec_mode != ExecutionMode.CPU_FP32:
                 feed_list = self.feed_list
                 ipu_strategy = paddle.static.IpuStrategy()
-                ipu_strategy.SetGraphConfig(is_training=self.is_training)
-                program = compiler.IPUCompiledProgram(
+                ipu_strategy.set_graph_config(is_training=self.is_training)
+                if exec_mode == ExecutionMode.IPU_POPART_FP16:
+                    ipu_strategy.set_precision_config(enable_fp16=True)
+                program = paddle.static.IpuCompiledProgram(
                     main_prog,
                     ipu_strategy=ipu_strategy).compile(feed_list, fetch_list)
             else:
                 program = main_prog
 
-            result = exe.run(program, feed=self.feed, fetch_list=fetch_list)
+            feed = self.feed_fp32
+            if exec_mode > ExecutionMode.IPU_FP32:
+                feed = self.feed_fp16
+
+            result = exe.run(program, feed=feed, fetch_list=fetch_list)
             return result[0]
 
 
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_print_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_print_op_ipu.py
new file mode 100644
index 0000000000000000000000000000000000000000..c9454e5945f7d61c6b4fc92ffaccd80895180e04
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/ipu/test_print_op_ipu.py
@@ -0,0 +1,143 @@
+#  Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+import paddle
+import paddle.static
+from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest
+
+
+@unittest.skipIf(not paddle.is_compiled_with_ipu(),
+                 "core is not compiled with IPU")
+class TestBase(IPUOpTest):
+    def setUp(self):
+        self.set_atol()
+        self.set_training()
+        self.set_data_feed()
+        self.set_feed_attr()
+        self.set_op_attrs()
+
+    def set_data_feed(self):
+        self.feed = {
+            "x": np.random.uniform(size=[1, 3, 3, 3]).astype('float32'),
+        }
+
+    def set_feed_attr(self):
+        self.feed_shape = [x.shape for x in self.feed.values()]
+        self.feed_list = list(self.feed.keys())
+        self.feed_dtype = [x.dtype for x in self.feed.values()]
+
+    def set_op_attrs(self):
+        self.attrs = {}
+
+    def _test_base(self, run_ipu=True):
+        scope = paddle.static.Scope()
+        main_prog = paddle.static.Program()
+        startup_prog = paddle.static.Program()
+        main_prog.random_seed = self.SEED
+        startup_prog.random_seed = self.SEED
+
+        with paddle.static.scope_guard(scope):
+            with paddle.static.program_guard(main_prog, startup_prog):
+                x = paddle.static.data(
+                    name=self.feed_list[0],
+                    shape=self.feed_shape[0],
+                    dtype=self.feed_dtype[0])
+                out = paddle.fluid.layers.conv2d(
+                    x, num_filters=3, filter_size=3)
+                out = paddle.fluid.layers.Print(out, **self.attrs)
+
+                if self.is_training:
+                    loss = paddle.mean(out)
+                    adam = paddle.optimizer.Adam(learning_rate=1e-2)
+                    adam.minimize(loss)
+                    fetch_list = [loss.name]
+                else:
+                    fetch_list = [out.name]
+
+            if run_ipu:
+                place = paddle.IPUPlace()
+            else:
+                place = paddle.CPUPlace()
+            exe = paddle.static.Executor(place)
+            exe.run(startup_prog)
+
+            if run_ipu:
+                feed_list = self.feed_list
+                ipu_strategy = paddle.static.IpuStrategy()
+                ipu_strategy.set_graph_config(is_training=self.is_training)
+                program = paddle.static.IpuCompiledProgram(
+                    main_prog,
+                    ipu_strategy=ipu_strategy).compile(feed_list, fetch_list)
+            else:
+                program = main_prog
+
+            if self.is_training:
+                result = []
+                for _ in range(self.epoch):
+                    loss_res = exe.run(program,
+                                       feed=self.feed,
+                                       fetch_list=fetch_list)
+                    result.append(loss_res[0])
+                return np.array(result)
+            else:
+                result = exe.run(program, feed=self.feed, fetch_list=fetch_list)
+                return result[0]
+
+    def test(self):
+        res0 = self._test_base(False)
+        res1 = self._test_base(True)
+
+        self.assertTrue(
+            np.allclose(
+                res0.flatten(), res1.flatten(), atol=self.atol))
+
+        self.assertTrue(res0.shape == res1.shape)
+
+
+class TestCase1(TestBase):
+    def set_op_attrs(self):
+        self.attrs = {"message": "input_data"}
+
+
+class TestTrainCase1(TestBase):
+    def set_op_attrs(self):
+        # "forward" : print forward
+        # "backward" : print forward and backward
+        # "both": print forward and backward
+        self.attrs = {"message": "input_data2", "print_phase": "both"}
+
+    def set_training(self):
+        self.is_training = True
+        self.epoch = 2
+
+
+@unittest.skip("attrs are not supported")
+class TestCase2(TestBase):
+    def set_op_attrs(self):
+        self.attrs = {
+            "first_n": 10,
+            "summarize": 10,
+            "print_tensor_name": True,
+            "print_tensor_type": True,
+            "print_tensor_shape": True,
+            "print_tensor_layout": True,
+            "print_tensor_lod": True
+        }
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_reduce_x_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_reduce_x_op_ipu.py
index ac8ad08e8b28c00555c5c78f5b8a834b0024acc7..929ee51b650946dca45fd2b8302ba10c61a99ceb 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_reduce_x_op_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_reduce_x_op_ipu.py
@@ -16,14 +16,8 @@ import unittest
 
 import numpy as np
 import paddle
-import paddle.fluid as fluid
-import paddle.fluid.compiler as compiler
-import paddle.optimizer
 import paddle.static
-from paddle.fluid.tests.unittests.ipu.op_test_ipu import (IPUOpTest,
-                                                          np_dtype_to_fluid_str)
-
-paddle.enable_static()
+from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode
 
 
 @unittest.skipIf(not paddle.is_compiled_with_ipu(),
@@ -32,125 +26,137 @@ class TestMean(IPUOpTest):
     def setUp(self):
         self.set_atol()
         self.set_training()
-        self.init_op()
+        self.set_test_op()
+
+    @property
+    def fp16_enabled(self):
+        return True
 
-    def init_op(self):
+    def set_test_op(self):
         self.op = paddle.fluid.layers.reduce_mean
 
     def set_feed_attr(self):
-        self.feed_shape = [x.shape for x in self.feed.values()]
-        self.feed_list = list(self.feed.keys())
-        self.feed_dtype = [
-            np_dtype_to_fluid_str(x.dtype) for x in self.feed.values()
-        ]
-
-    def _test_base(self, run_ipu=True):
-        scope = fluid.core.Scope()
+        self.feed_shape = [x.shape for x in self.feed_fp32.values()]
+        self.feed_list = list(self.feed_fp32.keys())
+        self.feed_dtype = [x.dtype for x in self.feed_fp32.values()]
+
+    def _test_base(self, exec_mode):
+        scope = paddle.static.Scope()
         main_prog = paddle.static.Program()
         startup_prog = paddle.static.Program()
-        SEED = self.SEED
-        main_prog.random_seed = SEED
-        startup_prog.random_seed = SEED
+        main_prog.random_seed = self.SEED
+        startup_prog.random_seed = self.SEED
 
-        with fluid.scope_guard(scope):
+        with paddle.static.scope_guard(scope):
             with paddle.static.program_guard(main_prog, startup_prog):
                 x = paddle.static.data(
                     name=self.feed_list[0],
                     shape=self.feed_shape[0],
                     dtype='float32')
+
                 out = self.op(x, **self.attrs)
 
-                fetch_list = [out.name]
+            fetch_list = [out.name]
 
-            if run_ipu:
-                place = paddle.IPUPlace()
-            else:
+            if exec_mode == ExecutionMode.CPU_FP32:
                 place = paddle.CPUPlace()
+            else:
+                place = paddle.IPUPlace()
+
             exe = paddle.static.Executor(place)
             exe.run(startup_prog)
 
-            if run_ipu:
+            if exec_mode != ExecutionMode.CPU_FP32:
                 feed_list = self.feed_list
                 ipu_strategy = paddle.static.IpuStrategy()
-                ipu_strategy.SetGraphConfig(is_training=self.is_training)
-                program = compiler.IPUCompiledProgram(
+                ipu_strategy.set_graph_config(is_training=self.is_training)
+                if exec_mode == ExecutionMode.IPU_POPART_FP16:
+                    ipu_strategy.set_precision_config(enable_fp16=True)
+                program = paddle.static.IpuCompiledProgram(
                     main_prog,
                     ipu_strategy=ipu_strategy).compile(feed_list, fetch_list)
             else:
                 program = main_prog
 
-            result = exe.run(program, feed=self.feed, fetch_list=fetch_list)
+            feed = self.feed_fp32
+            if exec_mode > ExecutionMode.IPU_FP32:
+                feed = self.feed_fp16
+
+            result = exe.run(program, feed=feed, fetch_list=fetch_list)
             return result[0]
 
     def run_test_base(self):
-        res0 = self._test_base(True)
-        res1 = self._test_base(False)
-
-        self.assertTrue(
-            np.allclose(
-                res0.flatten(), res1.flatten(), atol=self.atol))
-
-    def set_feed0(self):
-        self.feed = {}
-        self.feed["in_0"] = np.random.uniform(size=[2, 4]).astype(np.float32)
+        output_dict = {}
+        for mode in ExecutionMode:
+            if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled:
+                break
+            output_dict[mode] = self._test_base(mode).flatten()
+
+        self.check(output_dict)
+
+    def set_data_feed0(self):
+        data = np.random.uniform(size=[2, 4])
+        self.feed_fp32 = {"in_0": data.astype(np.float32)}
+        self.feed_fp16 = {"in_0": data.astype(np.float16)}
         self.set_feed_attr()
 
-    def set_feed1(self):
-        self.feed = {}
-        self.feed["in_0"] = np.random.uniform(size=[2, 2, 2]).astype(np.float32)
+    def set_data_feed1(self):
+        data = np.random.uniform(size=[2, 2, 2])
+        self.feed_fp32 = {"in_0": data.astype(np.float32)}
+        self.feed_fp16 = {"in_0": data.astype(np.float16)}
         self.set_feed_attr()
 
-    def set_attr0(self):
+    def set_op_attr0(self):
         self.attrs = {}
         self.attrs['dim'] = None
         self.attrs['keep_dim'] = False
 
     def test_case0(self):
-        self.set_feed0()
-        self.set_attr0()
+        self.set_data_feed0()
+        self.set_op_attr0()
         self.run_test_base()
 
     def test_case1(self):
-        self.set_feed0()
-        self.set_attr0()
+        self.set_data_feed0()
+        self.set_op_attr0()
         self.attrs['dim'] = 0
         self.run_test_base()
 
     def test_case2(self):
-        self.set_feed0()
-        self.set_attr0()
+        self.set_data_feed0()
+        self.set_op_attr0()
         self.attrs['dim'] = -1
         self.run_test_base()
 
     def test_case3(self):
-        self.set_feed0()
-        self.set_attr0()
+        self.set_data_feed0()
+        self.set_op_attr0()
         self.attrs['dim'] = 1
         self.run_test_base()
 
     def test_case4(self):
-        self.set_feed0()
+        self.set_data_feed0()
         self.attrs = {}
         self.attrs['dim'] = 1
         self.attrs['keep_dim'] = True
         self.run_test_base()
 
     def test_case5(self):
-        self.set_feed1()
+        self.set_data_feed1()
         self.attrs = {}
         self.attrs['dim'] = [1, 2]
         self.attrs['keep_dim'] = False
         self.run_test_base()
 
     def test_case6(self):
-        self.set_feed1()
+        self.set_data_feed1()
         self.attrs = {}
         self.attrs['dim'] = [0, 1]
         self.attrs['keep_dim'] = False
         self.run_test_base()
 
     def test_case7(self):
-        self.set_feed1()
+        self.set_data_feed1()
         self.attrs = {}
         self.attrs['dim'] = [0, 1]
         self.attrs['keep_dim'] = True
@@ -158,22 +164,22 @@ class TestMean(IPUOpTest):
 
 
 class TestMax(TestMean):
-    def init_op(self):
+    def set_test_op(self):
         self.op = paddle.fluid.layers.reduce_max
 
 
 class TestMin(TestMean):
-    def init_op(self):
+    def set_test_op(self):
         self.op = paddle.fluid.layers.reduce_min
 
 
 class TestProd(TestMean):
-    def init_op(self):
+    def set_test_op(self):
         self.op = paddle.fluid.layers.reduce_prod
 
 
 class TestSum(TestMean):
-    def init_op(self):
+    def set_test_op(self):
         self.op = paddle.fluid.layers.reduce_sum
 
 
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_reshape_inplace_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_reshape_inplace_op_ipu.py
index f312b7b69ad79b721fd768f6762a48ca68793d6d..9ddf5c7537fdcd93d71c9130ca92d3bfb4caea52 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_reshape_inplace_op_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_reshape_inplace_op_ipu.py
@@ -16,14 +16,8 @@ import unittest
 
 import numpy as np
 import paddle
-import paddle.fluid as fluid
-import paddle.fluid.compiler as compiler
-import paddle.optimizer
 import paddle.static
-from paddle.fluid.tests.unittests.ipu.op_test_ipu import (IPUOpTest,
-                                                          np_dtype_to_fluid_str)
-
-paddle.enable_static()
+from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode
 
 
 @unittest.skipIf(not paddle.is_compiled_with_ipu(),
@@ -32,76 +26,84 @@ class TestBase(IPUOpTest):
     def setUp(self):
         self.set_atol()
         self.set_training()
-        self.set_feed()
+        self.set_data_feed()
         self.set_feed_attr()
-        self.set_attrs()
+        self.set_op_attrs()
 
-    def set_feed(self):
-        self.feed = {
-            "x": np.random.uniform(size=[1, 3, 10, 10]).astype('float32'),
-        }
+    @property
+    def fp16_enabled(self):
+        return True
+
+    def set_data_feed(self):
+        data = np.random.uniform(size=[1, 3, 10, 10])
+        self.feed_fp32 = {"x": data.astype(np.float32)}
+        self.feed_fp16 = {"x": data.astype(np.float16)}
 
     def set_feed_attr(self):
-        self.feed_shape = [x.shape for x in self.feed.values()]
-        self.feed_list = list(self.feed.keys())
-        self.feed_dtype = [
-            np_dtype_to_fluid_str(x.dtype) for x in self.feed.values()
-        ]
+        self.feed_shape = [x.shape for x in self.feed_fp32.values()]
+        self.feed_list = list(self.feed_fp32.keys())
+        self.feed_dtype = [x.dtype for x in self.feed_fp32.values()]
 
-    def set_attrs(self):
+    def set_op_attrs(self):
         self.attrs = {
             "shape": [30, 10],
             "inplace": True,
         }
 
-    def _test_base(self, run_ipu=True):
-        scope = fluid.core.Scope()
+    def _test_base(self, exec_mode):
+        scope = paddle.static.Scope()
         main_prog = paddle.static.Program()
         startup_prog = paddle.static.Program()
-        SEED = self.SEED
-        main_prog.random_seed = SEED
-        startup_prog.random_seed = SEED
+        main_prog.random_seed = self.SEED
+        startup_prog.random_seed = self.SEED
 
-        with fluid.scope_guard(scope):
+        with paddle.static.scope_guard(scope):
             with paddle.static.program_guard(main_prog, startup_prog):
                 x = paddle.static.data(
                     name=self.feed_list[0],
                     shape=self.feed_shape[0],
-                    dtype=self.feed_dtype[0])
+                    dtype='float32')
+
                 add = paddle.fluid.layers.elementwise_add(x, x)
                 out = paddle.fluid.layers.reshape(add, **self.attrs)
 
-                fetch_list = [out.name]
+            fetch_list = [out.name]
 
-            if run_ipu:
-                place = paddle.IPUPlace()
-            else:
+            if exec_mode == ExecutionMode.CPU_FP32:
                 place = paddle.CPUPlace()
+            else:
+                place = paddle.IPUPlace()
+
             exe = paddle.static.Executor(place)
             exe.run(startup_prog)
 
-            if run_ipu:
+            if exec_mode != ExecutionMode.CPU_FP32:
                 feed_list = self.feed_list
                 ipu_strategy = paddle.static.IpuStrategy()
-                ipu_strategy.SetGraphConfig(is_training=self.is_training)
-                program = compiler.IPUCompiledProgram(
+                ipu_strategy.set_graph_config(is_training=self.is_training)
+                if exec_mode == ExecutionMode.IPU_POPART_FP16:
+                    ipu_strategy.set_precision_config(enable_fp16=True)
+                program = paddle.static.IpuCompiledProgram(
                     main_prog,
                     ipu_strategy=ipu_strategy).compile(feed_list, fetch_list)
             else:
                 program = main_prog
 
-            result = exe.run(program, feed=self.feed, fetch_list=fetch_list)
+            feed = self.feed_fp32
+            if exec_mode > ExecutionMode.IPU_FP32:
+                feed = self.feed_fp16
+
+            result = exe.run(program, feed=feed, fetch_list=fetch_list)
             return result[0]
 
     def test_base(self):
-        res0 = self._test_base(True)
-        res1 = self._test_base(False)
-
-        self.assertTrue(
-            np.allclose(
-                res0.flatten(), res1.flatten(), atol=self.atol))
+        output_dict = {}
+        for mode in ExecutionMode:
+            if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled:
+                break
+            output_dict[mode] = self._test_base(mode)
 
-        self.assertTrue(res0.shape == res1.shape)
+        self.check(output_dict, check_shape=True)
 
 
 class TestCase1(TestBase):
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_reshape_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_reshape_op_ipu.py
index 5163838bc0cd633f69e2e446294250686e4fe04f..119771931701c9d790eeceec9604bec756e56537 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_reshape_op_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_reshape_op_ipu.py
@@ -16,13 +16,8 @@ import unittest
 
 import numpy as np
 import paddle
-import paddle.fluid as fluid
-import paddle.fluid.compiler as compiler
-import paddle.optimizer
 import paddle.static
-from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest
-
-paddle.enable_static()
+from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode
 
 
 @unittest.skipIf(not paddle.is_compiled_with_ipu(),
@@ -31,82 +26,92 @@ class TestBase(IPUOpTest):
     def setUp(self):
         self.set_atol()
         self.set_training()
-        self.set_feed()
-        self.set_attrs()
+        self.set_data_feed()
+        self.set_feed_attr()
+        self.set_op_attrs()
 
-    def set_feed(self):
-        self.feed_shape = []
-        self.feed_shape.append([2, 4, 6])
+    @property
+    def fp16_enabled(self):
+        return True
 
-        self.feed = {}
-        self.feed["in_0"] = np.random.uniform(
-            size=self.feed_shape[0]).astype(np.float32)
+    def set_data_feed(self):
+        data = np.random.uniform(size=[2, 4, 6])
+        self.feed_fp32 = {"in_0": data.astype(np.float32)}
+        self.feed_fp16 = {"in_0": data.astype(np.float16)}
 
-        self.feed_list = list(self.feed.keys())
+    def set_feed_attr(self):
+        self.feed_shape = [x.shape for x in self.feed_fp32.values()]
+        self.feed_list = list(self.feed_fp32.keys())
 
-    def set_attrs(self):
+    def set_op_attrs(self):
         self.attrs = {}
         self.attrs['shape'] = [6, 8]
         self.attrs['inplace'] = False
 
-    def _test_base(self, run_ipu=True):
-        scope = fluid.core.Scope()
+    def _test_base(self, exec_mode):
+        scope = paddle.static.Scope()
         main_prog = paddle.static.Program()
         startup_prog = paddle.static.Program()
-        SEED = self.SEED
-        main_prog.random_seed = SEED
-        startup_prog.random_seed = SEED
+        main_prog.random_seed = self.SEED
+        startup_prog.random_seed = self.SEED
 
-        with fluid.scope_guard(scope):
+        with paddle.static.scope_guard(scope):
             with paddle.static.program_guard(main_prog, startup_prog):
                 x = paddle.static.data(
                     name=self.feed_list[0],
                     shape=self.feed_shape[0],
                     dtype='float32')
+
                 out = paddle.fluid.layers.reshape(x=x, **self.attrs)
 
-                fetch_list = [out.name]
+            fetch_list = [out.name]
 
-            if run_ipu:
-                place = paddle.IPUPlace()
-            else:
+            if exec_mode == ExecutionMode.CPU_FP32:
                 place = paddle.CPUPlace()
+            else:
+                place = paddle.IPUPlace()
+
             exe = paddle.static.Executor(place)
             exe.run(startup_prog)
 
-            if run_ipu:
+            if exec_mode != ExecutionMode.CPU_FP32:
                 feed_list = self.feed_list
                 ipu_strategy = paddle.static.IpuStrategy()
-                ipu_strategy.SetGraphConfig(is_training=self.is_training)
-                program = compiler.IPUCompiledProgram(
+                ipu_strategy.set_graph_config(is_training=self.is_training)
+                if exec_mode == ExecutionMode.IPU_POPART_FP16:
+                    ipu_strategy.set_precision_config(enable_fp16=True)
+                program = paddle.static.IpuCompiledProgram(
                     main_prog,
                     ipu_strategy=ipu_strategy).compile(feed_list, fetch_list)
             else:
                 program = main_prog
 
-            result = exe.run(program, feed=self.feed, fetch_list=fetch_list)
+            feed = self.feed_fp32
+            if exec_mode > ExecutionMode.IPU_FP32:
+                feed = self.feed_fp16
+
+            result = exe.run(program, feed=feed, fetch_list=fetch_list)
             return result[0]
 
     def test_base(self):
-        res0 = self._test_base(True)
-        res1 = self._test_base(False)
-
-        self.assertTrue(
-            np.allclose(
-                res0.flatten(), res1.flatten(), atol=self.atol))
+        output_dict = {}
+        for mode in ExecutionMode:
+            if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled:
+                break
+            output_dict[mode] = self._test_base(mode)
 
-        self.assertTrue(res0.shape == res1.shape)
+        self.check(output_dict, check_shape=True)
 
 
 class TestCase1(TestBase):
-    def set_attrs(self):
+    def set_op_attrs(self):
         self.attrs = {}
         self.attrs['shape'] = [2, 3, -1, 2]
         self.attrs['inplace'] = False
 
 
 class TestCase2(TestBase):
-    def set_attrs(self):
+    def set_op_attrs(self):
         self.attrs = {}
         self.attrs['shape'] = [-1, 0, 3, 2]
         self.attrs['inplace'] = False
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_save_load.py b/python/paddle/fluid/tests/unittests/ipu/test_save_load_ipu.py
similarity index 58%
rename from python/paddle/fluid/tests/unittests/ipu/test_save_load.py
rename to python/paddle/fluid/tests/unittests/ipu/test_save_load_ipu.py
index 24bb8e111842cb93ed35cdc796868c5a911ee36f..3a694873062080d49237299afe6a4171dc4fa242 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_save_load.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_save_load_ipu.py
@@ -12,55 +12,52 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import tempfile
 import unittest
-import shutil
 
 import numpy as np
 import paddle
-import paddle.fluid as fluid
-import paddle.fluid.compiler as compiler
-import paddle.optimizer
 import paddle.static
 from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest
 
-paddle.enable_static()
-
 
 @unittest.skipIf(not paddle.is_compiled_with_ipu(),
                  "core is not compiled with IPU")
 class TestBase(IPUOpTest):
     def setUp(self):
         self.set_atol()
-        self.set_feed()
-        self.set_attrs()
-
-    def set_feed(self):
-        self.feed_shape = []
-        self.feed_shape.append([1, 3, 10, 10])
+        self.set_data_feed()
+        self.set_feed_attr()
+        self.set_op_attrs()
 
-        self.feed = {}
-        self.feed["in_0"] = np.random.uniform(
-            size=self.feed_shape[0]).astype(np.float32)
+    def set_data_feed(self):
+        data = np.random.uniform(size=[1, 3, 10, 10])
+        self.feed_fp32 = {"in_0": data.astype(np.float32)}
+        self.feed_fp16 = {"in_0": data.astype(np.float16)}
 
-        self.feed_list = list(self.feed.keys())
+    def set_feed_attr(self):
+        self.feed_shape = [x.shape for x in self.feed_fp32.values()]
+        self.feed_list = list(self.feed_fp32.keys())
 
-    def set_attrs(self):
+    def set_op_attrs(self):
         self.attrs = {}
         self.attrs['steps'] = 100
         self.attrs['save_at_step'] = 20
         self.attrs['is_training'] = True
         self.attrs['opt_type'] = 'sgd'
+        self.attrs['enable_fp16'] = False
+        self.attrs['model_path'] = tempfile.TemporaryDirectory()
 
     def _test_base(self, save_otherwise_load):
-        scope = fluid.core.Scope()
+        scope = paddle.static.Scope()
         main_prog = paddle.static.Program()
         startup_prog = paddle.static.Program()
         main_prog.random_seed = self.SEED
         startup_prog.random_seed = self.SEED
-        generator = fluid.unique_name.UniqueNameGenerator()
+        generator = paddle.fluid.unique_name.UniqueNameGenerator()
 
-        with fluid.unique_name.guard(generator):
-            with fluid.scope_guard(scope):
+        with paddle.fluid.unique_name.guard(generator):
+            with paddle.static.scope_guard(scope):
                 with paddle.static.program_guard(main_prog, startup_prog):
                     x = paddle.static.data(
                         name=self.feed_list[0],
@@ -91,12 +88,17 @@ class TestBase(IPUOpTest):
                 exe.run(startup_prog)
 
                 if not save_otherwise_load:
-                    paddle.static.load(main_prog, "model/model")
+                    paddle.static.load(main_prog, self.attrs['model_path'].name)
 
                 ipu_strategy = paddle.static.IpuStrategy()
-                ipu_strategy.SetGraphConfig(
+                ipu_strategy.set_graph_config(
                     is_training=self.attrs['is_training'])
-                program = compiler.IPUCompiledProgram(
+                ipu_strategy.set_precision_config(
+                    enable_fp16=self.attrs['enable_fp16'])
+                ipu_strategy.set_options({
+                    'save_per_n_step': self.attrs['save_at_step']
+                })
+                program = paddle.static.IpuCompiledProgram(
                     main_prog, ipu_strategy=ipu_strategy).compile(
                         self.feed_list, fetch_list)
 
@@ -104,16 +106,17 @@ class TestBase(IPUOpTest):
                 run_steps = self.attrs['steps'] if save_otherwise_load \
                     else self.attrs['steps'] - self.attrs['save_at_step']
 
+                feed = self.feed_fp16 if self.attrs[
+                    'enable_fp16'] else self.feed_fp32
                 for i in range(run_steps):
-                    tmp = exe.run(program,
-                                  feed=self.feed,
-                                  fetch_list=fetch_list)
+                    tmp = exe.run(program, feed=feed, fetch_list=fetch_list)
 
                     # currently, we update opt state every sess.run,
                     # will optimize
                     if save_otherwise_load and \
                         i == self.attrs['save_at_step'] - 1:
-                        paddle.static.save(main_prog, "model/model")
+                        paddle.static.save(main_prog,
+                                           self.attrs['model_path'].name)
 
                     if save_otherwise_load and i >= self.attrs['save_at_step']:
                         result.append(tmp)
@@ -129,25 +132,65 @@ class TestBase(IPUOpTest):
         self.assertTrue(
             np.allclose(
                 res0.flatten(), res1.flatten(), atol=self.atol))
-        shutil.rmtree("model", True)
+        self.attrs['model_path'].cleanup()
 
 
 class TestAdam(TestBase):
-    def set_attrs(self):
+    def set_op_attrs(self):
         self.attrs = {}
         self.attrs['steps'] = 100
         self.attrs['save_at_step'] = 20
         self.attrs['is_training'] = True
         self.attrs['opt_type'] = 'adam'
+        self.attrs['enable_fp16'] = False
+        self.attrs['model_path'] = tempfile.TemporaryDirectory()
 
 
 class TestLamb(TestBase):
-    def set_attrs(self):
+    def set_op_attrs(self):
+        self.attrs = {}
+        self.attrs['steps'] = 100
+        self.attrs['save_at_step'] = 20
+        self.attrs['is_training'] = True
+        self.attrs['opt_type'] = 'lamb'
+        self.attrs['enable_fp16'] = False
+        self.attrs['model_path'] = tempfile.TemporaryDirectory()
+
+
+@unittest.skipIf(IPUOpTest.use_ipumodel(), "skip for ipumodel")
+class TestSGDFP16(TestBase):
+    def set_op_attrs(self):
+        self.attrs = {}
+        self.attrs['steps'] = 100
+        self.attrs['save_at_step'] = 20
+        self.attrs['is_training'] = True
+        self.attrs['opt_type'] = 'sgd'
+        self.attrs['enable_fp16'] = True
+        self.attrs['model_path'] = tempfile.TemporaryDirectory()
+
+
+@unittest.skipIf(IPUOpTest.use_ipumodel(), "skip for ipumodel")
+class TestAdamFP16(TestBase):
+    def set_op_attrs(self):
+        self.attrs = {}
+        self.attrs['steps'] = 100
+        self.attrs['save_at_step'] = 20
+        self.attrs['is_training'] = True
+        self.attrs['opt_type'] = 'adam'
+        self.attrs['enable_fp16'] = True
+        self.attrs['model_path'] = tempfile.TemporaryDirectory()
+
+
+@unittest.skipIf(IPUOpTest.use_ipumodel(), "skip for ipumodel")
+class TestLambFP16(TestBase):
+    def set_op_attrs(self):
         self.attrs = {}
         self.attrs['steps'] = 100
         self.attrs['save_at_step'] = 20
         self.attrs['is_training'] = True
         self.attrs['opt_type'] = 'lamb'
+        self.attrs['enable_fp16'] = True
+        self.attrs['model_path'] = tempfile.TemporaryDirectory()
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_scale_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_scale_op_ipu.py
index 6ad2a89a738b7090ce082a637c3d531cf4566f9a..49714eba8d4d1ef926847e854617e543491a46fe 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_scale_op_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_scale_op_ipu.py
@@ -16,14 +16,8 @@ import unittest
 
 import numpy as np
 import paddle
-import paddle.fluid as fluid
-import paddle.fluid.compiler as compiler
-import paddle.optimizer
 import paddle.static
-from paddle.fluid.tests.unittests.ipu.op_test_ipu import (IPUOpTest,
-                                                          np_dtype_to_fluid_str)
-
-paddle.enable_static()
+from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode
 
 
 @unittest.skipIf(not paddle.is_compiled_with_ipu(),
@@ -32,80 +26,88 @@ class TestBase(IPUOpTest):
     def setUp(self):
         self.set_atol()
         self.set_training()
-        self.set_feed()
+        self.set_data_feed()
         self.set_feed_attr()
-        self.set_attrs()
+        self.set_op_attrs()
 
-    def set_feed(self):
-        self.feed = {
-            "x": np.random.uniform(size=[1, 3, 10, 10]).astype('float32'),
-        }
+    @property
+    def fp16_enabled(self):
+        return False
+
+    def set_data_feed(self):
+        data = np.random.uniform(size=[1, 3, 10, 10])
+        self.feed_fp32 = {"x": data.astype(np.float32)}
+        self.feed_fp16 = {"x": data.astype(np.float16)}
 
     def set_feed_attr(self):
-        self.feed_shape = [x.shape for x in self.feed.values()]
-        self.feed_list = list(self.feed.keys())
-        self.feed_dtype = [
-            np_dtype_to_fluid_str(x.dtype) for x in self.feed.values()
-        ]
+        self.feed_shape = [x.shape for x in self.feed_fp32.values()]
+        self.feed_list = list(self.feed_fp32.keys())
+        self.feed_dtype = [x.dtype for x in self.feed_fp32.values()]
 
-    def set_attrs(self):
+    def set_op_attrs(self):
         self.attrs = {
             "scale": 1.0,
             "bias": 0.0,
             "bias_after_scale": True,
         }
 
-    def _test_base(self, run_ipu=True):
-        scope = fluid.core.Scope()
+    def _test_base(self, exec_mode):
+        scope = paddle.static.Scope()
         main_prog = paddle.static.Program()
         startup_prog = paddle.static.Program()
-        SEED = self.SEED
-        main_prog.random_seed = SEED
-        startup_prog.random_seed = SEED
+        main_prog.random_seed = self.SEED
+        startup_prog.random_seed = self.SEED
 
-        with fluid.scope_guard(scope):
+        with paddle.static.scope_guard(scope):
             with paddle.static.program_guard(main_prog, startup_prog):
                 x = paddle.static.data(
                     name=self.feed_list[0],
                     shape=self.feed_shape[0],
-                    dtype=self.feed_dtype[0])
+                    dtype='float32')
+
                 out = paddle.fluid.layers.scale(x, **self.attrs)
 
-                fetch_list = [out.name]
+            fetch_list = [out.name]
 
-            if run_ipu:
-                place = paddle.IPUPlace()
-            else:
+            if exec_mode == ExecutionMode.CPU_FP32:
                 place = paddle.CPUPlace()
+            else:
+                place = paddle.IPUPlace()
+
             exe = paddle.static.Executor(place)
             exe.run(startup_prog)
 
-            if run_ipu:
+            if exec_mode != ExecutionMode.CPU_FP32:
                 feed_list = self.feed_list
                 ipu_strategy = paddle.static.IpuStrategy()
-                ipu_strategy.SetGraphConfig(is_training=self.is_training)
-                program = compiler.IPUCompiledProgram(
+                ipu_strategy.set_graph_config(is_training=self.is_training)
+                if exec_mode == ExecutionMode.IPU_POPART_FP16:
+                    ipu_strategy.set_precision_config(enable_fp16=True)
+                program = paddle.static.IpuCompiledProgram(
                     main_prog,
                     ipu_strategy=ipu_strategy).compile(feed_list, fetch_list)
             else:
                 program = main_prog
 
-            result = exe.run(program, feed=self.feed, fetch_list=fetch_list)
+            feed = self.feed_fp32
+            if exec_mode > ExecutionMode.IPU_FP32:
+                feed = self.feed_fp16
+
+            result = exe.run(program, feed=feed, fetch_list=fetch_list)
             return result[0]
 
     def test_base(self):
-        res0 = self._test_base(False)
-        res1 = self._test_base(True)
-
-        self.assertTrue(
-            np.allclose(
-                res0.flatten(), res1.flatten(), atol=self.atol))
+        output_dict = {}
+        for mode in ExecutionMode:
+            if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled:
+                break
+            output_dict[mode] = self._test_base(mode).flatten()
 
-        self.assertTrue(res0.shape == res1.shape)
+        self.check(output_dict)
 
 
 class TestCase1(TestBase):
-    def set_attrs(self):
+    def set_op_attrs(self):
         self.attrs = {
             "scale": 5.0,
             "bias": 0.0,
@@ -114,7 +116,7 @@ class TestCase1(TestBase):
 
 
 class TestCase2(TestBase):
-    def set_attrs(self):
+    def set_op_attrs(self):
         self.attrs = {
             "scale": 1.0,
             "bias": 0.5,
@@ -123,7 +125,16 @@ class TestCase2(TestBase):
 
 
 class TestCase3(TestBase):
-    def set_attrs(self):
+    def set_op_attrs(self):
+        self.attrs = {
+            "scale": 5.0,
+            "bias": 0.7,
+            "bias_after_scale": True,
+        }
+
+
+class TestCase4(TestBase):
+    def set_op_attrs(self):
         self.attrs = {
             "scale": 1.0,
             "bias": 0.0,
@@ -131,59 +142,66 @@ class TestCase3(TestBase):
         }
 
 
-class TestCase4(TestBase):
-    def set_feed(self):
-        self.feed = {
-            "x": np.random.uniform(size=[3, 3, 10, 10]).astype('float32'),
-            "y": np.array([3.0]).astype('float32'),
-        }
+class TestCase5(TestBase):
+    def set_data_feed(self):
+        x = np.random.uniform(size=[3, 3, 10, 10])
+        y = np.array([3.0])
+        self.feed_fp32 = {"x": x.astype(np.float32), "y": y.astype(np.float32)}
+        self.feed_fp16 = {"x": x.astype(np.float16), "y": y.astype(np.float16)}
 
-    def set_attrs(self):
+    def set_op_attrs(self):
         self.attrs = {
             "bias": 0.0,
             "bias_after_scale": True,
         }
 
-    def _test_base(self, run_ipu=True):
-        scope = fluid.core.Scope()
+    def _test_base(self, exec_mode):
+        scope = paddle.static.Scope()
         main_prog = paddle.static.Program()
         startup_prog = paddle.static.Program()
-        SEED = self.SEED
-        main_prog.random_seed = SEED
-        startup_prog.random_seed = SEED
+        main_prog.random_seed = self.SEED
+        startup_prog.random_seed = self.SEED
 
-        with fluid.scope_guard(scope):
+        with paddle.static.scope_guard(scope):
             with paddle.static.program_guard(main_prog, startup_prog):
                 x = paddle.static.data(
                     name=self.feed_list[0],
                     shape=self.feed_shape[0],
-                    dtype=self.feed_dtype[0])
+                    dtype='float32')
                 y = paddle.static.data(
                     name=self.feed_list[1],
                     shape=self.feed_shape[1],
-                    dtype=self.feed_dtype[1])
+                    dtype='float32')
+
                 out = paddle.fluid.layers.scale(x, scale=y, **self.attrs)
 
-                fetch_list = [out.name]
+            fetch_list = [out.name]
 
-            if run_ipu:
-                place = paddle.IPUPlace()
-            else:
+            if exec_mode == ExecutionMode.CPU_FP32:
                 place = paddle.CPUPlace()
+            else:
+                place = paddle.IPUPlace()
+
             exe = paddle.static.Executor(place)
             exe.run(startup_prog)
 
-            if run_ipu:
+            if exec_mode != ExecutionMode.CPU_FP32:
                 feed_list = self.feed_list
                 ipu_strategy = paddle.static.IpuStrategy()
-                ipu_strategy.SetGraphConfig(is_training=self.is_training)
-                program = compiler.IPUCompiledProgram(
+                ipu_strategy.set_graph_config(is_training=self.is_training)
+                if exec_mode == ExecutionMode.IPU_POPART_FP16:
+                    ipu_strategy.set_precision_config(enable_fp16=True)
+                program = paddle.static.IpuCompiledProgram(
                     main_prog,
                     ipu_strategy=ipu_strategy).compile(feed_list, fetch_list)
             else:
                 program = main_prog
 
-            result = exe.run(program, feed=self.feed, fetch_list=fetch_list)
+            feed = self.feed_fp32
+            if exec_mode > ExecutionMode.IPU_FP32:
+                feed = self.feed_fp16
+
+            result = exe.run(program, feed=feed, fetch_list=fetch_list)
             return result[0]
 
 
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_set_batch_size_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_set_batch_size_ipu.py
index 93945b98ef0a26b35b28e1cf9d2bb5e88d21f308..9a18922f35331bd23ab9cd40ff6a4dea1f446ce5 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_set_batch_size_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_set_batch_size_ipu.py
@@ -16,13 +16,8 @@ import unittest
 
 import numpy as np
 import paddle
-import paddle.fluid as fluid
-import paddle.fluid.compiler as compiler
-import paddle.optimizer
 import paddle.static
-from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest
-
-paddle.enable_static()
+from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode
 
 
 @unittest.skipIf(not paddle.is_compiled_with_ipu(),
@@ -31,36 +26,46 @@ class TestBase(IPUOpTest):
     def setUp(self):
         self.set_atol()
         self.set_training()
-        self.set_feed()
-        self.set_attrs()
-
-    def set_feed(self):
-        self.feed_shape = []
-        self.feed_shape.append([-1, 3, 128, 128])
-
-        self.feed = {}
-        self.feed["in_0"] = np.random.uniform(
-            size=[2, 3, 128, 128]).astype(np.float32)
-
-        self.feed_list = list(self.feed.keys())
-
-    def set_attrs(self):
+        self.set_data_feed()
+        self.set_feed_attr()
+        self.set_op_attrs()
+
+    @property
+    def fp16_enabled(self):
+        return True
+
+    def set_atol(self):
+        self.atol = 3e-6
+        self.rtol = 1e-5
+        self.atol_fp16 = 1e-2
+        self.rtol_fp16 = 1e-3
+
+    def set_data_feed(self):
+        data = np.random.uniform(size=[2, 3, 128, 128])
+        self.feed_fp32 = {"in_0": data.astype(np.float32)}
+        self.feed_fp16 = {"in_0": data.astype(np.float16)}
+
+    def set_feed_attr(self):
+        self.feed_shape = [x.shape for x in self.feed_fp32.values()]
+        self.feed_list = list(self.feed_fp32.keys())
+
+    def set_op_attrs(self):
         self.attrs = {}
 
-    def _test_base(self, run_ipu=True):
-        scope = fluid.core.Scope()
+    def _test_base(self, exec_mode):
+        scope = paddle.static.Scope()
         main_prog = paddle.static.Program()
         startup_prog = paddle.static.Program()
-        SEED = self.SEED
-        main_prog.random_seed = SEED
-        startup_prog.random_seed = SEED
+        main_prog.random_seed = self.SEED
+        startup_prog.random_seed = self.SEED
 
-        with fluid.scope_guard(scope):
+        with paddle.static.scope_guard(scope):
             with paddle.static.program_guard(main_prog, startup_prog):
                 x = paddle.static.data(
                     name=self.feed_list[0],
                     shape=self.feed_shape[0],
                     dtype='float32')
+
                 conv1 = paddle.static.nn.conv2d(
                     x, num_filters=3, filter_size=3, bias_attr=False)
                 conv2 = paddle.static.nn.conv2d(
@@ -70,36 +75,45 @@ class TestBase(IPUOpTest):
                 conv4 = paddle.static.nn.conv2d(
                     conv3, num_filters=3, filter_size=3, bias_attr=False)
 
-                fetch_list = [conv4.name]
+            fetch_list = [conv4.name]
 
-            if run_ipu:
-                place = paddle.IPUPlace()
-            else:
+            if exec_mode == ExecutionMode.CPU_FP32:
                 place = paddle.CPUPlace()
+            else:
+                place = paddle.IPUPlace()
+
             exe = paddle.static.Executor(place)
             exe.run(startup_prog)
 
-            if run_ipu:
+            if exec_mode != ExecutionMode.CPU_FP32:
                 feed_list = self.feed_list
                 ipu_strategy = paddle.static.IpuStrategy()
-                ipu_strategy.SetGraphConfig(
-                    batch_size=2, is_training=self.is_training)
-                program = compiler.IPUCompiledProgram(
+                ipu_strategy.set_graph_config(is_training=self.is_training)
+                if exec_mode == ExecutionMode.IPU_POPART_FP16:
+                    ipu_strategy.set_precision_config(enable_fp16=True)
+                # set batch size
+                ipu_strategy.micro_batch_size = 2
+                program = paddle.static.IpuCompiledProgram(
                     main_prog,
                     ipu_strategy=ipu_strategy).compile(feed_list, fetch_list)
             else:
                 program = main_prog
 
-            result = exe.run(program, feed=self.feed, fetch_list=fetch_list)
+            feed = self.feed_fp32
+            if exec_mode > ExecutionMode.IPU_FP32:
+                feed = self.feed_fp16
+
+            result = exe.run(program, feed=feed, fetch_list=fetch_list)
             return result[0]
 
-    def test_base(self):
-        res0 = self._test_base(True)
-        res1 = self._test_base(False)
+    def test(self):
+        output_dict = {}
+        for mode in ExecutionMode:
+            if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled:
+                break
+            output_dict[mode] = self._test_base(mode).flatten()
 
-        self.assertTrue(
-            np.allclose(
-                res0.flatten(), res1.flatten(), atol=self.atol))
+        self.check(output_dict)
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_sgd_optimizer.py b/python/paddle/fluid/tests/unittests/ipu/test_sgd_optimizer.py
deleted file mode 100644
index df0e2a040bd3e55aac20e383c7aa2ff50c567de4..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/ipu/test_sgd_optimizer.py
+++ /dev/null
@@ -1,88 +0,0 @@
-#  Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import numpy as np
-import unittest
-import sys
-import paddle
-import paddle.fluid as fluid
-import paddle.fluid.compiler as compiler
-
-paddle.enable_static()
-SEED = 2021
-
-
-@unittest.skipIf(not paddle.is_compiled_with_ipu(),
-                 "core is not compiled with IPU")
-class TestSGD(unittest.TestCase):
-    def _test_sgd(self, run_ipu=True):
-        scope = fluid.core.Scope()
-        main_prog = paddle.static.Program()
-        startup_prog = paddle.static.Program()
-        main_prog.random_seed = SEED
-        startup_prog.random_seed = SEED
-        np.random.seed(SEED)
-
-        np_image = np.random.rand(1, 3, 10, 10).astype(np.float32)
-
-        with fluid.scope_guard(scope):
-            with paddle.static.program_guard(main_prog, startup_prog):
-                image = paddle.static.data(
-                    name='image', shape=[1, 3, 10, 10], dtype='float32')
-                conv1 = paddle.static.nn.conv2d(
-                    image, num_filters=3, filter_size=3, bias_attr=False)
-                loss = paddle.mean(conv1)
-
-                sgd = paddle.optimizer.SGD(learning_rate=1e-1)
-                sgd.minimize(loss)
-
-            if run_ipu:
-                place = paddle.IPUPlace()
-            else:
-                place = paddle.CPUPlace()
-            exe = paddle.static.Executor(place)
-            exe.run(startup_prog)
-
-            if run_ipu:
-                feed_list = [image.name]
-                fetch_list = [loss.name]
-                ipu_strategy = paddle.static.IpuStrategy()
-                ipu_strategy.SetGraphConfig(is_training=True)
-                program = compiler.IPUCompiledProgram(
-                    main_prog, ipu_strategy=ipu_strategy).compile(feed_list,
-                                                                  fetch_list)
-            else:
-                program = main_prog
-
-            result = []
-            for epoch in range(100):
-                loss_res = exe.run(program,
-                                   feed={"image": np_image},
-                                   fetch_list=[loss])
-                result.append(loss_res)
-
-            return np.array(result)
-
-    def test_sgd(self):
-        # cpu and ipu dimenstion mismatch, cpu:(100, 1, 1), ipu:(100, 1)
-        ipu_loss = self._test_sgd(True).flatten()
-        cpu_loss = self._test_sgd(False).flatten()
-
-        self.assertTrue(np.allclose(ipu_loss, cpu_loss, atol=1e-4))
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_slice_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_slice_op_ipu.py
index 3bdfeabce6592cd25067adfdabe8b7c74a6848c7..8881f018de3b538bd350167793133cd8460aa37b 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_slice_op_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_slice_op_ipu.py
@@ -16,14 +16,8 @@ import unittest
 
 import numpy as np
 import paddle
-import paddle.fluid as fluid
-import paddle.fluid.compiler as compiler
-import paddle.optimizer
 import paddle.static
-from paddle.fluid.tests.unittests.ipu.op_test_ipu import (IPUOpTest,
-                                                          np_dtype_to_fluid_str)
-
-paddle.enable_static()
+from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode
 
 
 @unittest.skipIf(not paddle.is_compiled_with_ipu(),
@@ -32,78 +26,88 @@ class TestBase(IPUOpTest):
     def setUp(self):
         self.set_atol()
         self.set_training()
-        self.set_feed()
+        self.set_data_feed()
         self.set_feed_attr()
-        self.set_attrs()
+        self.set_op_attrs()
+
+    @property
+    def fp16_enabled(self):
+        return True
 
-    def set_feed(self):
-        self.feed = {"x": np.random.uniform(size=[4, 5, 6]).astype('float32'), }
+    def set_data_feed(self):
+        data = np.random.uniform(size=[4, 5, 6])
+        self.feed_fp32 = {"in_0": data.astype(np.float32)}
+        self.feed_fp16 = {"in_0": data.astype(np.float16)}
 
     def set_feed_attr(self):
-        self.feed_shape = [x.shape for x in self.feed.values()]
-        self.feed_list = list(self.feed.keys())
-        self.feed_dtype = [
-            np_dtype_to_fluid_str(x.dtype) for x in self.feed.values()
-        ]
+        self.feed_shape = [x.shape for x in self.feed_fp32.values()]
+        self.feed_list = list(self.feed_fp32.keys())
+        self.feed_dtype = [x.dtype for x in self.feed_fp32.values()]
 
-    def set_attrs(self):
+    def set_op_attrs(self):
         self.attrs = {
             "axes": [0, 1, 2],
             "starts": [-3, 0, 2],
             "ends": [3, 2, 4],
         }
 
-    def _test_base(self, run_ipu=True):
-        scope = fluid.core.Scope()
+    def _test_base(self, exec_mode):
+        scope = paddle.static.Scope()
         main_prog = paddle.static.Program()
         startup_prog = paddle.static.Program()
-        SEED = self.SEED
-        main_prog.random_seed = SEED
-        startup_prog.random_seed = SEED
+        main_prog.random_seed = self.SEED
+        startup_prog.random_seed = self.SEED
 
-        with fluid.scope_guard(scope):
+        with paddle.static.scope_guard(scope):
             with paddle.static.program_guard(main_prog, startup_prog):
                 x = paddle.static.data(
                     name=self.feed_list[0],
                     shape=self.feed_shape[0],
-                    dtype=self.feed_dtype[0])
+                    dtype='float32')
+
                 out = paddle.fluid.layers.slice(x, **self.attrs)
 
-                fetch_list = [out.name]
+            fetch_list = [out.name]
 
-            if run_ipu:
-                place = paddle.IPUPlace()
-            else:
+            if exec_mode == ExecutionMode.CPU_FP32:
                 place = paddle.CPUPlace()
+            else:
+                place = paddle.IPUPlace()
+
             exe = paddle.static.Executor(place)
             exe.run(startup_prog)
 
-            if run_ipu:
+            if exec_mode != ExecutionMode.CPU_FP32:
                 feed_list = self.feed_list
                 ipu_strategy = paddle.static.IpuStrategy()
-                ipu_strategy.SetGraphConfig(is_training=self.is_training)
-                program = compiler.IPUCompiledProgram(
+                ipu_strategy.set_graph_config(is_training=self.is_training)
+                if exec_mode == ExecutionMode.IPU_POPART_FP16:
+                    ipu_strategy.set_precision_config(enable_fp16=True)
+                program = paddle.static.IpuCompiledProgram(
                     main_prog,
                     ipu_strategy=ipu_strategy).compile(feed_list, fetch_list)
             else:
                 program = main_prog
 
-            result = exe.run(program, feed=self.feed, fetch_list=fetch_list)
+            feed = self.feed_fp32
+            if exec_mode > ExecutionMode.IPU_FP32:
+                feed = self.feed_fp16
+
+            result = exe.run(program, feed=feed, fetch_list=fetch_list)
             return result[0]
 
     def test_base(self):
-        res0 = self._test_base(False)
-        res1 = self._test_base(True)
-
-        self.assertTrue(
-            np.allclose(
-                res0.flatten(), res1.flatten(), atol=self.atol))
+        output_dict = {}
+        for mode in ExecutionMode:
+            if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled:
+                break
+            output_dict[mode] = self._test_base(mode)
 
-        self.assertTrue(res0.shape == res1.shape)
+        self.check(output_dict, check_shape=True)
 
 
 class TestCase1(TestBase):
-    def set_attrs(self):
+    def set_op_attrs(self):
         self.attrs = {
             "axes": [0, 1],
             "starts": [0, 0],
@@ -113,38 +117,45 @@ class TestCase1(TestBase):
 
 @unittest.skip('dynamic graph is not support on IPU')
 class TestCase2(TestBase):
-    def set_feed(self):
-        self.feed = {
-            "x": np.random.uniform(size=[4, 5, 6]).astype('float32'),
-            "starts": np.array([0, 0, 2]).astype('int32'),
-            "ends": np.array([3, 2, 4]).astype('int32'),
+    def set_data_feed(self):
+        x = np.random.uniform(size=[4, 5, 6])
+        s = np.array([0, 0, 2])
+        e = np.array([3, 2, 4])
+        self.feed_fp32 = {
+            "x": x.astype(np.float32),
+            "starts": s.astype(np.int32),
+            "ends": e.astype(np.int32)
+        }
+        self.feed_fp16 = {
+            "x": x.astype(np.float16),
+            "starts": s.astype(np.int32),
+            "ends": e.astype(np.int32)
         }
 
-    def set_attrs(self):
+    def set_op_attrs(self):
         self.attrs = {"axes": [0, 1, 2]}
 
     def _test_base(self, run_ipu=True):
         scope = fluid.core.Scope()
         main_prog = paddle.static.Program()
         startup_prog = paddle.static.Program()
-        SEED = self.SEED
-        main_prog.random_seed = SEED
-        startup_prog.random_seed = SEED
+        main_prog.random_seed = self.SEED
+        startup_prog.random_seed = self.SEED
 
         with fluid.scope_guard(scope):
             with paddle.static.program_guard(main_prog, startup_prog):
                 x = paddle.static.data(
                     name=self.feed_list[0],
                     shape=self.feed_shape[0],
-                    dtype=self.feed_dtype[0])
+                    dtype='float32')
                 starts = paddle.static.data(
                     name=self.feed_list[1],
                     shape=self.feed_shape[1],
-                    dtype=self.feed_dtype[1])
+                    dtype='int32')
                 ends = paddle.static.data(
                     name=self.feed_list[2],
                     shape=self.feed_shape[2],
-                    dtype=self.feed_dtype[2])
+                    dtype='int32')
                 out = paddle.fluid.layers.slice(
                     x, starts=starts, ends=ends, **self.attrs)
 
@@ -160,8 +171,8 @@ class TestCase2(TestBase):
             if run_ipu:
                 feed_list = self.feed_list
                 ipu_strategy = paddle.static.IpuStrategy()
-                ipu_strategy.SetGraphConfig(is_training=self.is_training)
-                program = compiler.IPUCompiledProgram(
+                ipu_strategy.set_graph_config(is_training=self.is_training)
+                program = paddle.static.IpuCompiledProgram(
                     main_prog,
                     ipu_strategy=ipu_strategy).compile(feed_list, fetch_list)
             else:
@@ -170,6 +181,9 @@ class TestCase2(TestBase):
             result = exe.run(program, feed=self.feed, fetch_list=fetch_list)
             return result[0]
 
+    def test_base(self):
+        pass
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_softmax_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_softmax_op_ipu.py
index a4a4b83baf35e558ff1f8b0982ffc44287919dfc..25201959cecbc4941b0481bffd7e4dfee93ef6ce 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_softmax_op_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_softmax_op_ipu.py
@@ -13,16 +13,11 @@
 # limitations under the License.
 
 import unittest
+
 import numpy as np
 import paddle
-import paddle.fluid as fluid
-import paddle.fluid.compiler as compiler
-import paddle.optimizer
 import paddle.static
-from paddle.fluid.tests.unittests.ipu.op_test_ipu import (IPUOpTest,
-                                                          np_dtype_to_fluid_str)
-
-paddle.enable_static()
+from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode
 
 
 @unittest.skipIf(not paddle.is_compiled_with_ipu(),
@@ -31,76 +26,84 @@ class TestBase(IPUOpTest):
     def setUp(self):
         self.set_atol()
         self.set_training()
-        self.set_feed()
+        self.set_data_feed()
         self.set_feed_attr()
-        self.set_attrs()
+        self.set_op_attrs()
+
+    @property
+    def fp16_enabled(self):
+        return True
 
-    def set_feed(self):
-        self.feed = {
-            "x": np.random.uniform(size=[1, 3, 2, 2]).astype('float32'),
-        }
+    def set_data_feed(self):
+        data = np.random.uniform(size=[1, 3, 2, 20])
+        self.feed_fp32 = {"in_0": data.astype(np.float32)}
+        self.feed_fp16 = {"in_0": data.astype(np.float16)}
 
     def set_feed_attr(self):
-        self.feed_shape = [x.shape for x in self.feed.values()]
-        self.feed_list = list(self.feed.keys())
-        self.feed_dtype = [
-            np_dtype_to_fluid_str(x.dtype) for x in self.feed.values()
-        ]
+        self.feed_shape = [x.shape for x in self.feed_fp32.values()]
+        self.feed_list = list(self.feed_fp32.keys())
+        self.feed_dtype = [x.dtype for x in self.feed_fp32.values()]
 
-    def set_attrs(self):
+    def set_op_attrs(self):
         self.attrs = {"axis": -1}
 
-    def _test_base(self, run_ipu=True):
-        scope = fluid.core.Scope()
+    def _test_base(self, exec_mode):
+        scope = paddle.static.Scope()
         main_prog = paddle.static.Program()
         startup_prog = paddle.static.Program()
-        SEED = self.SEED
-        main_prog.random_seed = SEED
-        startup_prog.random_seed = SEED
+        main_prog.random_seed = self.SEED
+        startup_prog.random_seed = self.SEED
 
-        with fluid.scope_guard(scope):
+        with paddle.static.scope_guard(scope):
             with paddle.static.program_guard(main_prog, startup_prog):
                 x = paddle.static.data(
                     name=self.feed_list[0],
                     shape=self.feed_shape[0],
-                    dtype=self.feed_dtype[0])
+                    dtype='float32')
+
                 out = paddle.fluid.layers.softmax(x, **self.attrs)
 
-                fetch_list = [out.name]
+            fetch_list = [out.name]
 
-            if run_ipu:
-                place = paddle.IPUPlace()
-            else:
+            if exec_mode == ExecutionMode.CPU_FP32:
                 place = paddle.CPUPlace()
+            else:
+                place = paddle.IPUPlace()
+
             exe = paddle.static.Executor(place)
             exe.run(startup_prog)
 
-            if run_ipu:
+            if exec_mode != ExecutionMode.CPU_FP32:
                 feed_list = self.feed_list
                 ipu_strategy = paddle.static.IpuStrategy()
-                ipu_strategy.SetGraphConfig(is_training=self.is_training)
-                program = compiler.IPUCompiledProgram(
+                ipu_strategy.set_graph_config(is_training=self.is_training)
+                if exec_mode == ExecutionMode.IPU_POPART_FP16:
+                    ipu_strategy.set_precision_config(enable_fp16=True)
+                program = paddle.static.IpuCompiledProgram(
                     main_prog,
                     ipu_strategy=ipu_strategy).compile(feed_list, fetch_list)
             else:
                 program = main_prog
 
-            result = exe.run(program, feed=self.feed, fetch_list=fetch_list)
+            feed = self.feed_fp32
+            if exec_mode > ExecutionMode.IPU_FP32:
+                feed = self.feed_fp16
+
+            result = exe.run(program, feed=feed, fetch_list=fetch_list)
             return result[0]
 
     def test_base(self):
-        res0 = self._test_base(False)
-        res1 = self._test_base(True)
-
-        self.assertTrue(
-            np.allclose(
-                res0.flatten(), res1.flatten(), atol=self.atol))
+        output_dict = {}
+        for mode in ExecutionMode:
+            if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled:
+                break
+            output_dict[mode] = self._test_base(mode).flatten()
 
-        self.assertTrue(res0.shape == res1.shape)
+        self.check(output_dict)
 
 
 class TestCase1(TestBase):
-    def set_attrs(self):
+    def set_op_attrs(self):
         self.attrs = {"axis": 2}
 
 
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_split_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_split_op_ipu.py
new file mode 100644
index 0000000000000000000000000000000000000000..59af3a3d6ac17a87a2af413d0f5a283c69d8f8e7
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/ipu/test_split_op_ipu.py
@@ -0,0 +1,113 @@
+#  Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+import paddle
+import paddle.static
+from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode
+
+
+@unittest.skipIf(not paddle.is_compiled_with_ipu(),
+                 "core is not compiled with IPU")
+class TestBase(IPUOpTest):
+    def setUp(self):
+        self.set_atol()
+        self.set_training()
+        self.set_data_feed()
+        self.set_feed_attr()
+        self.set_op_attrs()
+
+    @property
+    def fp16_enabled(self):
+        return True
+
+    def set_data_feed(self):
+        data1 = np.random.uniform(size=[1, 3, 10, 10])
+
+        self.feed_fp32 = {'x': data1.astype(np.float32)}
+        self.feed_fp16 = {'x': data1.astype(np.float16)}
+
+    def set_feed_attr(self):
+        self.feed_shape = [x.shape for x in self.feed_fp32.values()]
+        self.feed_list = list(self.feed_fp32.keys())
+
+    def set_op_attrs(self):
+        self.attrs = {"num_or_sections": [1, 1, 1], "axis": 1}
+
+    def _test_base(self, exec_mode):
+        scope = paddle.static.Scope()
+        main_prog = paddle.static.Program()
+        startup_prog = paddle.static.Program()
+        main_prog.random_seed = self.SEED
+        startup_prog.random_seed = self.SEED
+
+        with paddle.static.scope_guard(scope):
+            with paddle.static.program_guard(main_prog, startup_prog):
+                x = paddle.static.data(
+                    name=self.feed_list[0],
+                    shape=self.feed_shape[0],
+                    dtype='float32')
+
+                out = paddle.split(x, **self.attrs)
+
+                fetch_list = [fetch.name for fetch in out]
+
+            if exec_mode == ExecutionMode.CPU_FP32:
+                place = paddle.CPUPlace()
+            else:
+                place = paddle.IPUPlace()
+
+            exe = paddle.static.Executor(place)
+            exe.run(startup_prog)
+
+            if exec_mode != ExecutionMode.CPU_FP32:
+                feed_list = self.feed_list
+                ipu_strategy = paddle.static.IpuStrategy()
+                ipu_strategy.set_graph_config(is_training=self.is_training)
+                if exec_mode == ExecutionMode.IPU_POPART_FP16:
+                    ipu_strategy.set_precision_config(enable_fp16=True)
+                program = paddle.static.IpuCompiledProgram(
+                    main_prog,
+                    ipu_strategy=ipu_strategy).compile(feed_list, fetch_list)
+            else:
+                program = main_prog
+
+            feed = self.feed_fp32
+            if exec_mode > ExecutionMode.IPU_FP32:
+                feed = self.feed_fp16
+
+            result = exe.run(program, feed=feed, fetch_list=fetch_list)
+
+            return result[0]
+
+    def test_base(self):
+        output_dict = {}
+        for mode in ExecutionMode:
+            if (mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled
+                ) or mode == ExecutionMode.IPU_POPART_FP16:
+                break
+            output_dict[mode] = self._test_base(mode).flatten()
+
+        self.check(output_dict)
+
+
+class TestCase1(TestBase):
+    def set_op_attrs(self):
+        self.attrs = {"num_or_sections": [2, 8], "axis": 2}
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_squeeze_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_squeeze_op_ipu.py
index ccd2796590838faa8980ef7d214f73a944c9220d..bdc8fb32c84722815d53f113e8a9be436e7e8b0f 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_squeeze_op_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_squeeze_op_ipu.py
@@ -13,16 +13,11 @@
 # limitations under the License.
 
 import unittest
+
 import numpy as np
 import paddle
-import paddle.fluid as fluid
-import paddle.fluid.compiler as compiler
-import paddle.optimizer
 import paddle.static
-from paddle.fluid.tests.unittests.ipu.op_test_ipu import (IPUOpTest,
-                                                          np_dtype_to_fluid_str)
-
-paddle.enable_static()
+from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode
 
 
 @unittest.skipIf(not paddle.is_compiled_with_ipu(),
@@ -31,81 +26,89 @@ class TestBase(IPUOpTest):
     def setUp(self):
         self.set_atol()
         self.set_training()
-        self.set_feed()
+        self.set_data_feed()
         self.set_feed_attr()
-        self.set_attrs()
+        self.set_op_attrs()
+
+    @property
+    def fp16_enabled(self):
+        return True
 
-    def set_feed(self):
-        self.feed = {
-            "x": np.random.uniform(size=[1, 3, 1, 5]).astype('float32'),
-        }
+    def set_data_feed(self):
+        data = np.random.uniform(size=[1, 3, 1, 5])
+        self.feed_fp32 = {"in_0": data.astype(np.float32)}
+        self.feed_fp16 = {"in_0": data.astype(np.float16)}
 
     def set_feed_attr(self):
-        self.feed_shape = [x.shape for x in self.feed.values()]
-        self.feed_list = list(self.feed.keys())
-        self.feed_dtype = [
-            np_dtype_to_fluid_str(x.dtype) for x in self.feed.values()
-        ]
+        self.feed_shape = [x.shape for x in self.feed_fp32.values()]
+        self.feed_list = list(self.feed_fp32.keys())
+        self.feed_dtype = [x.dtype for x in self.feed_fp32.values()]
 
-    def set_attrs(self):
+    def set_op_attrs(self):
         self.attrs = {"axes": [0]}
 
-    def _test_base(self, run_ipu=True):
-        scope = fluid.core.Scope()
+    def _test_base(self, exec_mode):
+        scope = paddle.static.Scope()
         main_prog = paddle.static.Program()
         startup_prog = paddle.static.Program()
-        SEED = self.SEED
-        main_prog.random_seed = SEED
-        startup_prog.random_seed = SEED
+        main_prog.random_seed = self.SEED
+        startup_prog.random_seed = self.SEED
 
-        with fluid.scope_guard(scope):
+        with paddle.static.scope_guard(scope):
             with paddle.static.program_guard(main_prog, startup_prog):
                 x = paddle.static.data(
                     name=self.feed_list[0],
                     shape=self.feed_shape[0],
-                    dtype=self.feed_dtype[0])
+                    dtype='float32')
+
                 out = paddle.fluid.layers.squeeze(x, **self.attrs)
 
-                fetch_list = [out.name]
+            fetch_list = [out.name]
 
-            if run_ipu:
-                place = paddle.IPUPlace()
-            else:
+            if exec_mode == ExecutionMode.CPU_FP32:
                 place = paddle.CPUPlace()
+            else:
+                place = paddle.IPUPlace()
+
             exe = paddle.static.Executor(place)
             exe.run(startup_prog)
 
-            if run_ipu:
+            if exec_mode != ExecutionMode.CPU_FP32:
                 feed_list = self.feed_list
                 ipu_strategy = paddle.static.IpuStrategy()
-                ipu_strategy.SetGraphConfig(is_training=self.is_training)
-                program = compiler.IPUCompiledProgram(
+                ipu_strategy.set_graph_config(is_training=self.is_training)
+                if exec_mode == ExecutionMode.IPU_POPART_FP16:
+                    ipu_strategy.set_precision_config(enable_fp16=True)
+                program = paddle.static.IpuCompiledProgram(
                     main_prog,
-                    iipu_strategy=ipu_strategy).compile(feed_list, fetch_list)
+                    ipu_strategy=ipu_strategy).compile(feed_list, fetch_list)
             else:
                 program = main_prog
 
-            result = exe.run(program, feed=self.feed, fetch_list=fetch_list)
+            feed = self.feed_fp32
+            if exec_mode > ExecutionMode.IPU_FP32:
+                feed = self.feed_fp16
+
+            result = exe.run(program, feed=feed, fetch_list=fetch_list)
             return result[0]
 
     def test_base(self):
-        res0 = self._test_base(False)
-        res1 = self._test_base(True)
-
-        self.assertTrue(
-            np.allclose(
-                res0.flatten(), res1.flatten(), atol=self.atol))
+        output_dict = {}
+        for mode in ExecutionMode:
+            if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled:
+                break
+            output_dict[mode] = self._test_base(mode)
 
-        self.assertTrue(res0.shape == res1.shape)
+        self.check(output_dict, check_shape=True)
 
 
 class TestCase1(TestBase):
-    def set_attrs(self):
+    def set_op_attrs(self):
         self.attrs = {"axes": []}
 
 
 class TestCase2(TestBase):
-    def set_attrs(self):
+    def set_op_attrs(self):
         self.attrs = {"axes": [-2]}
 
 
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_stack_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_stack_op_ipu.py
index 3d5de11b5e213e3fe68561d60c1c0dbcb6fbcbf1..c807ab9aab65e4662c149a72a0cb62349c48826e 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_stack_op_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_stack_op_ipu.py
@@ -16,14 +16,8 @@ import unittest
 
 import numpy as np
 import paddle
-import paddle.fluid as fluid
-import paddle.fluid.compiler as compiler
-import paddle.optimizer
 import paddle.static
-from paddle.fluid.tests.unittests.ipu.op_test_ipu import (IPUOpTest,
-                                                          np_dtype_to_fluid_str)
-
-paddle.enable_static()
+from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode
 
 
 @unittest.skipIf(not paddle.is_compiled_with_ipu(),
@@ -32,86 +26,102 @@ class TestBase(IPUOpTest):
     def setUp(self):
         self.set_atol()
         self.set_training()
-        self.set_feed()
+        self.set_data_feed()
         self.set_feed_attr()
-        self.set_attrs()
-
-    def set_feed(self):
-        self.feed = {
-            "x": np.random.uniform(size=[1, 2]).astype('float32'),
-            "y": np.random.uniform(size=[1, 2]).astype('float32'),
-            "z": np.random.uniform(size=[1, 2]).astype('float32'),
+        self.set_op_attrs()
+
+    @property
+    def fp16_enabled(self):
+        return True
+
+    def set_data_feed(self):
+        x = np.random.uniform(size=[1, 2])
+        y = np.random.uniform(size=[1, 2])
+        z = np.random.uniform(size=[1, 2])
+        self.feed_fp32 = {
+            "x": x.astype(np.float32),
+            "y": y.astype(np.float32),
+            "z": z.astype(np.float32)
+        }
+        self.feed_fp16 = {
+            "x": x.astype(np.float16),
+            "y": y.astype(np.float16),
+            "z": z.astype(np.float16)
         }
 
     def set_feed_attr(self):
-        self.feed_shape = [x.shape for x in self.feed.values()]
-        self.feed_list = list(self.feed.keys())
-        self.feed_dtype = [
-            np_dtype_to_fluid_str(x.dtype) for x in self.feed.values()
-        ]
+        self.feed_shape = [x.shape for x in self.feed_fp32.values()]
+        self.feed_list = list(self.feed_fp32.keys())
+        self.feed_dtype = [x.dtype for x in self.feed_fp32.values()]
 
-    def set_attrs(self):
+    def set_op_attrs(self):
         self.attrs = {"axis": 0}
 
-    def _test_base(self, run_ipu=True):
-        scope = fluid.core.Scope()
+    def _test_base(self, exec_mode):
+        scope = paddle.static.Scope()
         main_prog = paddle.static.Program()
         startup_prog = paddle.static.Program()
-        SEED = self.SEED
-        main_prog.random_seed = SEED
-        startup_prog.random_seed = SEED
+        main_prog.random_seed = self.SEED
+        startup_prog.random_seed = self.SEED
 
-        with fluid.scope_guard(scope):
+        with paddle.static.scope_guard(scope):
             with paddle.static.program_guard(main_prog, startup_prog):
                 x = paddle.static.data(
                     name=self.feed_list[0],
                     shape=self.feed_shape[0],
-                    dtype=self.feed_dtype[0])
+                    dtype='float32')
                 y = paddle.static.data(
                     name=self.feed_list[1],
                     shape=self.feed_shape[1],
-                    dtype=self.feed_dtype[1])
+                    dtype='float32')
                 z = paddle.static.data(
                     name=self.feed_list[2],
                     shape=self.feed_shape[2],
-                    dtype=self.feed_dtype[2])
+                    dtype='float32')
+
                 out = paddle.fluid.layers.stack([x, y, z], **self.attrs)
 
-                fetch_list = [out.name]
+            fetch_list = [out.name]
 
-            if run_ipu:
-                place = paddle.IPUPlace()
-            else:
+            if exec_mode == ExecutionMode.CPU_FP32:
                 place = paddle.CPUPlace()
+            else:
+                place = paddle.IPUPlace()
+
             exe = paddle.static.Executor(place)
             exe.run(startup_prog)
 
-            if run_ipu:
+            if exec_mode != ExecutionMode.CPU_FP32:
                 feed_list = self.feed_list
                 ipu_strategy = paddle.static.IpuStrategy()
-                ipu_strategy.SetGraphConfig(is_training=self.is_training)
-                program = compiler.IPUCompiledProgram(
+                ipu_strategy.set_graph_config(is_training=self.is_training)
+                if exec_mode == ExecutionMode.IPU_POPART_FP16:
+                    ipu_strategy.set_precision_config(enable_fp16=True)
+                program = paddle.static.IpuCompiledProgram(
                     main_prog,
                     ipu_strategy=ipu_strategy).compile(feed_list, fetch_list)
             else:
                 program = main_prog
 
-            result = exe.run(program, feed=self.feed, fetch_list=fetch_list)
+            feed = self.feed_fp32
+            if exec_mode > ExecutionMode.IPU_FP32:
+                feed = self.feed_fp16
+
+            result = exe.run(program, feed=feed, fetch_list=fetch_list)
             return result[0]
 
     def test_base(self):
-        res0 = self._test_base(False)
-        res1 = self._test_base(True)
-
-        self.assertTrue(
-            np.allclose(
-                res0.flatten(), res1.flatten(), atol=self.atol))
+        output_dict = {}
+        for mode in ExecutionMode:
+            if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled:
+                break
+            output_dict[mode] = self._test_base(mode)
 
-        self.assertTrue(res0.shape == res1.shape)
+        self.check(output_dict, check_shape=True)
 
 
 class TestCase1(TestBase):
-    def set_attrs(self):
+    def set_op_attrs(self):
         self.attrs = {"axis": -2}
 
 
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_sum_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_sum_op_ipu.py
index 003350cd7a01e284d28ff8904a38ab3755e07642..12351cb63d6c8b5b95ae2f26de2ea17f68759a7b 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_sum_op_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_sum_op_ipu.py
@@ -16,14 +16,8 @@ import unittest
 
 import numpy as np
 import paddle
-import paddle.fluid as fluid
-import paddle.fluid.compiler as compiler
-import paddle.optimizer
 import paddle.static
-from paddle.fluid.tests.unittests.ipu.op_test_ipu import (IPUOpTest,
-                                                          np_dtype_to_fluid_str)
-
-paddle.enable_static()
+from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode
 
 
 @unittest.skipIf(not paddle.is_compiled_with_ipu(),
@@ -32,131 +26,154 @@ class TestBase(IPUOpTest):
     def setUp(self):
         self.set_atol()
         self.set_training()
-        self.set_feed()
+        self.set_data_feed()
         self.set_feed_attr()
-        self.set_attrs()
+        self.set_op_attrs()
 
-    def set_feed(self):
-        self.feed = {
-            "x": np.random.uniform(size=[1, 3, 2, 2]).astype('float32'),
-            "y": np.random.uniform(size=[1, 3, 2, 2]).astype('float32'),
-        }
+    @property
+    def fp16_enabled(self):
+        return True
+
+    def set_data_feed(self):
+        x = np.random.uniform(size=[1, 3, 2, 2])
+        y = np.random.uniform(size=[1, 3, 2, 2])
+        self.feed_fp32 = {"x": x.astype(np.float32), "y": y.astype(np.float32)}
+        self.feed_fp16 = {"x": x.astype(np.float16), "y": y.astype(np.float16)}
 
     def set_feed_attr(self):
-        self.feed_shape = [x.shape for x in self.feed.values()]
-        self.feed_list = list(self.feed.keys())
-        self.feed_dtype = [
-            np_dtype_to_fluid_str(x.dtype) for x in self.feed.values()
-        ]
+        self.feed_shape = [x.shape for x in self.feed_fp32.values()]
+        self.feed_list = list(self.feed_fp32.keys())
+        self.feed_dtype = [x.dtype for x in self.feed_fp32.values()]
 
-    def set_attrs(self):
+    def set_op_attrs(self):
         self.attrs = {}
 
-    def _test_base(self, run_ipu=True):
-        scope = fluid.core.Scope()
+    def _test_base(self, exec_mode):
+        scope = paddle.static.Scope()
         main_prog = paddle.static.Program()
         startup_prog = paddle.static.Program()
-        SEED = self.SEED
-        main_prog.random_seed = SEED
-        startup_prog.random_seed = SEED
+        main_prog.random_seed = self.SEED
+        startup_prog.random_seed = self.SEED
 
-        with fluid.scope_guard(scope):
+        with paddle.static.scope_guard(scope):
             with paddle.static.program_guard(main_prog, startup_prog):
                 x = paddle.static.data(
                     name=self.feed_list[0],
                     shape=self.feed_shape[0],
-                    dtype=self.feed_dtype[0])
+                    dtype='float32')
                 y = paddle.static.data(
                     name=self.feed_list[1],
                     shape=self.feed_shape[1],
-                    dtype=self.feed_dtype[1])
+                    dtype='float32')
+
                 out = paddle.fluid.layers.sum([x, y], **self.attrs)
 
-                fetch_list = [out.name]
+            fetch_list = [out.name]
 
-            if run_ipu:
-                place = paddle.IPUPlace()
-            else:
+            if exec_mode == ExecutionMode.CPU_FP32:
                 place = paddle.CPUPlace()
+            else:
+                place = paddle.IPUPlace()
+
             exe = paddle.static.Executor(place)
             exe.run(startup_prog)
 
-            if run_ipu:
+            if exec_mode != ExecutionMode.CPU_FP32:
                 feed_list = self.feed_list
                 ipu_strategy = paddle.static.IpuStrategy()
-                ipu_strategy.SetGraphConfig(is_training=self.is_training)
-                program = compiler.IPUCompiledProgram(
+                ipu_strategy.set_graph_config(is_training=self.is_training)
+                if exec_mode == ExecutionMode.IPU_POPART_FP16:
+                    ipu_strategy.set_precision_config(enable_fp16=True)
+                program = paddle.static.IpuCompiledProgram(
                     main_prog,
                     ipu_strategy=ipu_strategy).compile(feed_list, fetch_list)
             else:
                 program = main_prog
 
-            result = exe.run(program, feed=self.feed, fetch_list=fetch_list)
+            feed = self.feed_fp32
+            if exec_mode > ExecutionMode.IPU_FP32:
+                feed = self.feed_fp16
+
+            result = exe.run(program, feed=feed, fetch_list=fetch_list)
             return result[0]
 
     def test_base(self):
-        res0 = self._test_base(False)
-        res1 = self._test_base(True)
-
-        self.assertTrue(
-            np.allclose(
-                res0.flatten(), res1.flatten(), atol=self.atol))
+        output_dict = {}
+        for mode in ExecutionMode:
+            if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled:
+                break
+            output_dict[mode] = self._test_base(mode)
 
-        self.assertTrue(res0.shape == res1.shape)
+        self.check(output_dict, check_shape=True)
 
 
 @unittest.skip('')
 class TestCase1(TestBase):
     def set_feed(self):
-        self.feed = {
-            "x": np.random.uniform(size=[1, 3, 2, 2]).astype('float32'),
-            "y": np.random.uniform(size=[1, 3, 2, 2]).astype('float32'),
-            "z": np.random.uniform(size=[1, 3, 2, 2]).astype('float32'),
+        x = np.random.uniform(size=[1, 3, 2, 2])
+        y = np.random.uniform(size=[1, 3, 2, 2])
+        z = np.random.uniform(size=[1, 3, 2, 2])
+        self.feed_fp32 = {
+            "x": x.astype(np.float32),
+            "y": y.astype(np.float32),
+            "z": y.astype(np.float32)
+        }
+        self.feed_fp16 = {
+            "x": x.astype(np.float16),
+            "y": y.astype(np.float16),
+            "z": y.astype(np.float16)
         }
 
-    def _test_base(self, run_ipu=True):
-        scope = fluid.core.Scope()
+    def _test_base(self, exec_mode):
+        scope = paddle.static.Scope()
         main_prog = paddle.static.Program()
         startup_prog = paddle.static.Program()
-        SEED = self.SEED
-        main_prog.random_seed = SEED
-        startup_prog.random_seed = SEED
+        main_prog.random_seed = self.SEED
+        startup_prog.random_seed = self.SEED
 
-        with fluid.scope_guard(scope):
+        with paddle.static.scope_guard(scope):
             with paddle.static.program_guard(main_prog, startup_prog):
                 x = paddle.static.data(
                     name=self.feed_list[0],
                     shape=self.feed_shape[0],
-                    dtype=self.feed_dtype[0])
+                    dtype='float32')
                 y = paddle.static.data(
                     name=self.feed_list[1],
                     shape=self.feed_shape[1],
-                    dtype=self.feed_dtype[1])
+                    dtype='float32')
                 z = paddle.static.data(
                     name=self.feed_list[2],
                     shape=self.feed_shape[2],
-                    dtype=self.feed_dtype[2])
+                    dtype='float32')
+
                 out = paddle.fluid.layers.sum([x, y, z], **self.attrs)
 
-                fetch_list = [out.name]
+            fetch_list = [out.name]
 
-            if run_ipu:
-                place = paddle.IPUPlace()
-            else:
+            if exec_mode == ExecutionMode.CPU_FP32:
                 place = paddle.CPUPlace()
+            else:
+                place = paddle.IPUPlace()
+
             exe = paddle.static.Executor(place)
             exe.run(startup_prog)
 
-            if run_ipu:
+            if exec_mode != ExecutionMode.CPU_FP32:
                 feed_list = self.feed_list
                 ipu_strategy = paddle.static.IpuStrategy()
-                ipu_strategy.SetGraphConfig(is_training=self.is_training)
-                program = compiler.IPUCompiledProgram(
+                ipu_strategy.set_graph_config(is_training=self.is_training)
+                if exec_mode == ExecutionMode.IPU_POPART_FP16:
+                    ipu_strategy.set_precision_config(enable_fp16=True)
+                program = paddle.static.IpuCompiledProgram(
                     main_prog,
                     iipu_strategy=ipu_strategy).compile(feed_list, fetch_list)
             else:
                 program = main_prog
 
+            feed = self.feed_fp32
+            if exec_mode > ExecutionMode.IPU_FP32:
+                feed = self.feed_fp16
+
             result = exe.run(program, feed=self.feed, fetch_list=fetch_list)
             return result[0]
 
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_topk_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_topk_op_ipu.py
index 9915a7a1fd89f91f71814ebbe577abcf9327cb37..ef75aee78049b511f796317954525705c4c025f9 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_topk_op_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_topk_op_ipu.py
@@ -16,130 +16,125 @@ import unittest
 
 import numpy as np
 import paddle
-import paddle.fluid as fluid
-import paddle.fluid.compiler as compiler
-import paddle.optimizer
 import paddle.static
-from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest
-
-paddle.enable_static()
+from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode
 
 
 @unittest.skipIf(not paddle.is_compiled_with_ipu(),
                  "core is not compiled with IPU")
 class TestTopKOp(IPUOpTest):
     def setUp(self):
-        self.set_ops()
         self.set_atol()
         self.set_training()
-        self.k = 3
-        self.use_K_as_const_variable = False
-
-        self.set_feed()
-        self.set_attrs()
-
-    def set_ops(self):
-        self.ops = [
-            paddle.fluid.layers.topk,
-            paddle.topk  # use top_k_v2 implementation
-        ]
-
-    def set_feed(self):
-        self.feed_shape = []
-        self.feed_shape.append([3, 5])
-
-        self.feed = {}
-        self.feed_list = []
-        self.feed["in_0"] = np.random.uniform(
-            size=self.feed_shape[0]).astype(np.float32)
-        self.feed_list.append("in_0")
-        if self.use_K_as_const_variable:
-            # self.feed["in_1"] = np.array([self.k]).astype("int32")
-            # self.feed_list.append("in_1")
-            pass
-        print("[TestTopKop] feed data:\n%s" % self.feed["in_0"])
-
-    def set_attrs(self):
-        self.attrs = {
-            # "axis": -1,
-            # "sorted": True
-        }
-        if not self.use_K_as_const_variable:
-            self.attrs["k"] = self.k
-
-    def _test_base(self, run_ipu=True, op=None, data_feed=None):
-        assert (op is not None)
-        assert (data_feed is not None)
-        scope = fluid.core.Scope()
+        self.set_data_feed()
+        self.set_feed_attr()
+        self.set_test_op()
+        self.set_op_attrs()
+
+    @property
+    def fp16_enabled(self):
+        return True
+
+    def set_test_op(self):
+        self.op = paddle.fluid.layers.topk
+
+    def set_data_feed(self):
+        data = np.random.uniform(size=[3, 5])
+        self.feed_fp32 = {"in_0": data.astype(np.float32)}
+        self.feed_fp16 = {"in_0": data.astype(np.float16)}
+
+    def set_feed_attr(self):
+        self.feed_shape = [x.shape for x in self.feed_fp32.values()]
+        self.feed_list = list(self.feed_fp32.keys())
+
+    def set_op_attrs(self):
+        self.use_k_as_const_variable = False
+        self.attrs = {}
+        if not self.use_k_as_const_variable:
+            self.attrs["k"] = 3
+
+    def _test_base(self, exec_mode):
+        scope = paddle.static.Scope()
         main_prog = paddle.static.Program()
         startup_prog = paddle.static.Program()
-        SEED = self.SEED
-        main_prog.random_seed = SEED
-        startup_prog.random_seed = SEED
+        main_prog.random_seed = self.SEED
+        startup_prog.random_seed = self.SEED
 
-        with fluid.scope_guard(scope):
+        with paddle.static.scope_guard(scope):
             with paddle.static.program_guard(main_prog, startup_prog):
                 x = paddle.static.data(
                     name=self.feed_list[0],
                     shape=self.feed_shape[0],
                     dtype='float32')
-                if not self.use_K_as_const_variable:
-                    topk_values, topk_indices = op(x, **self.attrs)
+
+                if not self.use_k_as_const_variable:
+                    topk_values, topk_indices = self.op(x, **self.attrs)
                 else:
                     # !important, popart cannot accept non const tensor
-                    # K_t = paddle.static.data(name="in_1", shape=[1], dtype='int32')
-                    K_t = fluid.layers.fill_constant(
+                    K_t = paddle.fluid.layers.fill_constant(
                         shape=[1], dtype='int32', value=self.k, name="in_2")
-                    topk_values, topk_indices = op(x, K_t, **self.attrs)
+                    topk_values, topk_indices = self.op(x, K_t, **self.attrs)
+
                 fetch_list = [topk_values.name, topk_indices.name]
 
-            if run_ipu:
-                place = paddle.IPUPlace()
-            else:
+            if exec_mode == ExecutionMode.CPU_FP32:
                 place = paddle.CPUPlace()
+            else:
+                place = paddle.IPUPlace()
+
             exe = paddle.static.Executor(place)
             exe.run(startup_prog)
 
-            if run_ipu:
+            if exec_mode != ExecutionMode.CPU_FP32:
                 feed_list = self.feed_list
                 ipu_strategy = paddle.static.IpuStrategy()
-                ipu_strategy.SetGraphConfig(is_training=self.is_training)
-                program = compiler.IPUCompiledProgram(
+                ipu_strategy.set_graph_config(is_training=self.is_training)
+                if exec_mode == ExecutionMode.IPU_POPART_FP16:
+                    ipu_strategy.set_precision_config(enable_fp16=True)
+                program = paddle.static.IpuCompiledProgram(
                     main_prog,
                     ipu_strategy=ipu_strategy).compile(feed_list, fetch_list)
             else:
                 program = main_prog
 
-            print("Running inference ...")
-            result = exe.run(program, feed=data_feed, fetch_list=fetch_list)
-            print("Complete running infrence.")
+            feed = self.feed_fp32
+            if exec_mode > ExecutionMode.IPU_FP32:
+                feed = self.feed_fp16
+
+            result = exe.run(program, feed=feed, fetch_list=fetch_list)
             return result
 
     def test_base(self):
-        for op in self.ops:
-            res0_topk_values, res0_topk_indices = self._test_base(
-                True, op=op, data_feed=self.feed)
-            res1_topk_values, res1_topk_indices = self._test_base(
-                False, op=paddle.fluid.layers.topk, data_feed=self.feed)
-
-            print("[TestTopKop] IPU res0 values:\n%s\n" % res0_topk_values)
-            print("[TestTopKop] CPU res1 values:\n%s\n" % res1_topk_values)
-            view_type = np.uint32
-            print("[TestTopKop] IPU res0 indices:\n%s\n" %
-                  res0_topk_indices.astype(view_type))
-            print("[TestTopKop] CPU res1 indices:\n%s\n" % res1_topk_indices)
-
-            self.assertTrue(
-                np.allclose(
-                    res0_topk_values.flatten(),
-                    res1_topk_values.flatten(),
-                    atol=self.atol))
-
-            self.assertTrue(
-                np.allclose(
-                    res0_topk_indices.astype(view_type).flatten(),
-                    res1_topk_indices.flatten(),
-                    atol=self.atol))
+        value_dict = {}
+        index_dict = {}
+        for mode in ExecutionMode:
+            if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled:
+                break
+            value, index = self._test_base(mode)
+            value_dict[mode] = value
+            index_dict[mode] = index
+
+        self.check(value_dict)
+        self.check(index_dict)
+
+
+class TestCase2(TestTopKOp):
+    def set_test_op(self):
+        self.op = paddle.topk
+
+
+@unittest.skip("Trying to get data as int64 but it is of type int32")
+class TestCase3(TestTopKOp):
+    def set_op_attrs(self):
+        self.use_k_as_const_variable = True
+        self.attrs = {}
+        self.k = 2
+
+
+@unittest.skip("Trying to get data as int64 but it is of type int32")
+class TestCase4(TestCase3):
+    def set_test_op(self):
+        self.op = paddle.topk
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_transpose_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_transpose_op_ipu.py
index 77d2f4131014965ef6cfba9cdf5efffa34c2dbc6..1747bde20b6a63c9bde705f3e7c998b7d2033a94 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_transpose_op_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_transpose_op_ipu.py
@@ -16,14 +16,8 @@ import unittest
 
 import numpy as np
 import paddle
-import paddle.fluid as fluid
-import paddle.fluid.compiler as compiler
-import paddle.optimizer
 import paddle.static
-from paddle.fluid.tests.unittests.ipu.op_test_ipu import (IPUOpTest,
-                                                          np_dtype_to_fluid_str)
-
-paddle.enable_static()
+from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode
 
 
 @unittest.skipIf(not paddle.is_compiled_with_ipu(),
@@ -32,86 +26,94 @@ class TestBase(IPUOpTest):
     def setUp(self):
         self.set_atol()
         self.set_training()
-        self.set_feed()
+        self.set_data_feed()
         self.set_feed_attr()
-        self.set_attrs()
+        self.set_op_attrs()
+
+    @property
+    def fp16_enabled(self):
+        return True
 
-    def set_feed(self):
-        self.feed = {
-            "x": np.random.uniform(size=[1, 3, 10, 10]).astype('float32'),
-        }
+    def set_data_feed(self):
+        data = np.random.uniform(size=[1, 3, 10, 10])
+        self.feed_fp32 = {"x": data.astype(np.float32)}
+        self.feed_fp16 = {"x": data.astype(np.float16)}
 
     def set_feed_attr(self):
-        self.feed_shape = [x.shape for x in self.feed.values()]
-        self.feed_list = list(self.feed.keys())
-        self.feed_dtype = [
-            np_dtype_to_fluid_str(x.dtype) for x in self.feed.values()
-        ]
+        self.feed_shape = [x.shape for x in self.feed_fp32.values()]
+        self.feed_list = list(self.feed_fp32.keys())
+        self.feed_dtype = [x.dtype for x in self.feed_fp32.values()]
 
-    def set_attrs(self):
+    def set_op_attrs(self):
         self.attrs = {"perm": [0, 2, 3, 1]}
 
-    def _test_base(self, run_ipu=True):
-        scope = fluid.core.Scope()
+    def _test_base(self, exec_mode):
+        scope = paddle.static.Scope()
         main_prog = paddle.static.Program()
         startup_prog = paddle.static.Program()
-        SEED = self.SEED
-        main_prog.random_seed = SEED
-        startup_prog.random_seed = SEED
+        main_prog.random_seed = self.SEED
+        startup_prog.random_seed = self.SEED
 
-        with fluid.scope_guard(scope):
+        with paddle.static.scope_guard(scope):
             with paddle.static.program_guard(main_prog, startup_prog):
                 x = paddle.static.data(
                     name=self.feed_list[0],
                     shape=self.feed_shape[0],
-                    dtype=self.feed_dtype[0])
+                    dtype='float32')
+
                 out = paddle.fluid.layers.transpose(x, **self.attrs)
 
-                fetch_list = [out.name]
+            fetch_list = [out.name]
 
-            if run_ipu:
-                place = paddle.IPUPlace()
-            else:
+            if exec_mode == ExecutionMode.CPU_FP32:
                 place = paddle.CPUPlace()
+            else:
+                place = paddle.IPUPlace()
+
             exe = paddle.static.Executor(place)
             exe.run(startup_prog)
 
-            if run_ipu:
+            if exec_mode != ExecutionMode.CPU_FP32:
                 feed_list = self.feed_list
                 ipu_strategy = paddle.static.IpuStrategy()
-                ipu_strategy.SetGraphConfig(is_training=self.is_training)
-                program = compiler.IPUCompiledProgram(
+                ipu_strategy.set_graph_config(is_training=self.is_training)
+                if exec_mode == ExecutionMode.IPU_POPART_FP16:
+                    ipu_strategy.set_precision_config(enable_fp16=True)
+                program = paddle.static.IpuCompiledProgram(
                     main_prog,
                     ipu_strategy=ipu_strategy).compile(feed_list, fetch_list)
             else:
                 program = main_prog
 
-            result = exe.run(program, feed=self.feed, fetch_list=fetch_list)
-            return result[0]
+            feed = self.feed_fp32
+            if exec_mode > ExecutionMode.IPU_FP32:
+                feed = self.feed_fp16
 
-    def test_base(self):
-        res0 = self._test_base(False)
-        res1 = self._test_base(True)
+            result = exe.run(program, feed=feed, fetch_list=fetch_list)
+            return result[0]
 
-        self.assertTrue(
-            np.allclose(
-                res0.flatten(), res1.flatten(), atol=self.atol))
+    def test(self):
+        output_dict = {}
+        for mode in ExecutionMode:
+            if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled:
+                break
+            output_dict[mode] = self._test_base(mode).flatten()
 
-        self.assertTrue(res0.shape == res1.shape)
+        self.check(output_dict, check_shape=True)
 
 
 class TestCase1(TestBase):
-    def set_attrs(self):
+    def set_op_attrs(self):
         self.attrs = {"perm": [0, 1, 2, 3]}
 
 
 class TestCase2(TestBase):
-    def set_feed(self):
-        self.feed = {
-            "x": np.random.uniform(size=[1, 2, 3, 4, 5]).astype('float32'),
-        }
+    def set_data_feed(self):
+        data = np.random.uniform(size=[1, 2, 3, 4, 5])
+        self.feed_fp32 = {"x": data.astype(np.float32)}
+        self.feed_fp16 = {"x": data.astype(np.float16)}
 
-    def set_attrs(self):
+    def set_op_attrs(self):
         self.attrs = {"perm": [4, 0, 2, 3, 1]}
 
 
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_unsqueeze_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_unsqueeze_op_ipu.py
index 75ed5a07315c775e4ea3e105a1fa4d6731666c70..e068c2e3b59083a9623ae9cf2a6025cde4fe18ea 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_unsqueeze_op_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_unsqueeze_op_ipu.py
@@ -16,14 +16,8 @@ import unittest
 
 import numpy as np
 import paddle
-import paddle.fluid as fluid
-import paddle.fluid.compiler as compiler
-import paddle.optimizer
 import paddle.static
-from paddle.fluid.tests.unittests.ipu.op_test_ipu import (IPUOpTest,
-                                                          np_dtype_to_fluid_str)
-
-paddle.enable_static()
+from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode
 
 
 @unittest.skipIf(not paddle.is_compiled_with_ipu(),
@@ -32,79 +26,89 @@ class TestBase(IPUOpTest):
     def setUp(self):
         self.set_atol()
         self.set_training()
-        self.set_feed()
+        self.set_data_feed()
         self.set_feed_attr()
-        self.set_attrs()
+        self.set_op_attrs()
+
+    @property
+    def fp16_enabled(self):
+        return True
 
-    def set_feed(self):
-        self.feed = {"x": np.random.uniform(size=[1, 2, 3]).astype('float32')}
+    def set_data_feed(self):
+        data = np.random.uniform(size=[1, 2, 3])
+        self.feed_fp32 = {"x": data.astype(np.float32)}
+        self.feed_fp16 = {"x": data.astype(np.float16)}
 
     def set_feed_attr(self):
-        self.feed_shape = [x.shape for x in self.feed.values()]
-        self.feed_list = list(self.feed.keys())
-        self.feed_dtype = [
-            np_dtype_to_fluid_str(x.dtype) for x in self.feed.values()
-        ]
+        self.feed_shape = [x.shape for x in self.feed_fp32.values()]
+        self.feed_list = list(self.feed_fp32.keys())
+        self.feed_dtype = [x.dtype for x in self.feed_fp32.values()]
 
-    def set_attrs(self):
+    def set_op_attrs(self):
         self.attrs = {"axes": 0}
 
-    def _test_base(self, run_ipu=True):
-        scope = fluid.core.Scope()
+    def _test_base(self, exec_mode):
+        scope = paddle.static.Scope()
         main_prog = paddle.static.Program()
         startup_prog = paddle.static.Program()
-        SEED = self.SEED
-        main_prog.random_seed = SEED
-        startup_prog.random_seed = SEED
+        main_prog.random_seed = self.SEED
+        startup_prog.random_seed = self.SEED
 
-        with fluid.scope_guard(scope):
+        with paddle.static.scope_guard(scope):
             with paddle.static.program_guard(main_prog, startup_prog):
                 x = paddle.static.data(
                     name=self.feed_list[0],
                     shape=self.feed_shape[0],
-                    dtype=self.feed_dtype[0])
+                    dtype='float32')
+
                 out = paddle.fluid.layers.unsqueeze(x, **self.attrs)
 
-                fetch_list = [out.name]
+            fetch_list = [out.name]
 
-            if run_ipu:
-                place = paddle.IPUPlace()
-            else:
+            if exec_mode == ExecutionMode.CPU_FP32:
                 place = paddle.CPUPlace()
+            else:
+                place = paddle.IPUPlace()
+
             exe = paddle.static.Executor(place)
             exe.run(startup_prog)
 
-            if run_ipu:
+            if exec_mode != ExecutionMode.CPU_FP32:
                 feed_list = self.feed_list
                 ipu_strategy = paddle.static.IpuStrategy()
-                ipu_strategy.SetGraphConfig(is_training=self.is_training)
-                program = compiler.IPUCompiledProgram(
+                ipu_strategy.set_graph_config(is_training=self.is_training)
+                if exec_mode == ExecutionMode.IPU_POPART_FP16:
+                    ipu_strategy.set_precision_config(enable_fp16=True)
+                program = paddle.static.IpuCompiledProgram(
                     main_prog,
                     ipu_strategy=ipu_strategy).compile(feed_list, fetch_list)
             else:
                 program = main_prog
 
-            result = exe.run(program, feed=self.feed, fetch_list=fetch_list)
+            feed = self.feed_fp32
+            if exec_mode > ExecutionMode.IPU_FP32:
+                feed = self.feed_fp16
+
+            result = exe.run(program, feed=feed, fetch_list=fetch_list)
             return result[0]
 
     def test_base(self):
-        res0 = self._test_base(False)
-        res1 = self._test_base(True)
-
-        self.assertTrue(
-            np.allclose(
-                res0.flatten(), res1.flatten(), atol=self.atol))
+        output_dict = {}
+        for mode in ExecutionMode:
+            if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled:
+                break
+            output_dict[mode] = self._test_base(mode).flatten()
 
-        self.assertTrue(res0.shape == res1.shape)
+        self.check(output_dict, check_shape=True)
 
 
 class TestCase1(TestBase):
-    def set_attrs(self):
+    def set_op_attrs(self):
         self.attrs = {"axes": -1}
 
 
 class TestCase2(TestBase):
-    def set_attrs(self):
+    def set_op_attrs(self):
         self.attrs = {"axes": [1, 2]}
 
 
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_varname_inplace.py b/python/paddle/fluid/tests/unittests/ipu/test_varname_inplace_ipu.py
similarity index 79%
rename from python/paddle/fluid/tests/unittests/ipu/test_varname_inplace.py
rename to python/paddle/fluid/tests/unittests/ipu/test_varname_inplace_ipu.py
index fabad936decb975214f0416000d77c76a2f7ddfb..5cc62432dc635bcbeca727fcb8eae5ea48849ffe 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_varname_inplace.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_varname_inplace_ipu.py
@@ -16,15 +16,8 @@ import unittest
 
 import numpy as np
 import paddle
-import paddle.fluid as fluid
-import paddle.fluid.compiler as compiler
-from paddle.fluid.executor import global_scope
-import paddle.optimizer
 import paddle.static
-from paddle.fluid.tests.unittests.ipu.op_test_ipu import (IPUOpTest,
-                                                          np_dtype_to_fluid_str)
-
-paddle.enable_static()
+from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest
 
 
 @unittest.skipIf(not paddle.is_compiled_with_ipu(),
@@ -33,11 +26,11 @@ class TestBase(IPUOpTest):
     def setUp(self):
         self.set_atol()
         self.set_training()
-        self.set_feed()
+        self.set_data_feed()
         self.set_feed_attr()
-        self.set_attrs()
+        self.set_op_attrs()
 
-    def set_feed(self):
+    def set_data_feed(self):
         self.feed = {
             "x": np.random.uniform(size=[1, 3, 10, 10]).astype('float32'),
         }
@@ -45,25 +38,22 @@ class TestBase(IPUOpTest):
     def set_feed_attr(self):
         self.feed_shape = [x.shape for x in self.feed.values()]
         self.feed_list = list(self.feed.keys())
-        self.feed_dtype = [
-            np_dtype_to_fluid_str(x.dtype) for x in self.feed.values()
-        ]
+        self.feed_dtype = [x.dtype for x in self.feed.values()]
 
-    def set_attrs(self):
+    def set_op_attrs(self):
         self.attrs = {
             "shape": [30, 10],
             "inplace": True,
         }
 
     def _test_base(self, run_ipu=True):
-        scope = fluid.core.Scope()
+        scope = paddle.static.Scope()
         main_prog = paddle.static.Program()
         startup_prog = paddle.static.Program()
-        SEED = self.SEED
-        main_prog.random_seed = SEED
-        startup_prog.random_seed = SEED
+        main_prog.random_seed = self.SEED
+        startup_prog.random_seed = self.SEED
 
-        with fluid.scope_guard(scope):
+        with paddle.static.scope_guard(scope):
             with paddle.static.program_guard(main_prog, startup_prog):
                 x = paddle.static.data(
                     name=self.feed_list[0],
@@ -76,12 +66,13 @@ class TestBase(IPUOpTest):
                 scale2 = paddle.fluid.layers.scale(scale1, scale=1.3, bias=0.5)
                 scale3 = paddle.fluid.layers.scale(scale2, scale=2, bias=0.7)
 
-                fetch_list = [scale3.name]
+            fetch_list = [scale3.name]
 
             if run_ipu:
                 place = paddle.IPUPlace()
             else:
                 place = paddle.CPUPlace()
+
             exe = paddle.static.Executor(place)
             exe.run(startup_prog)
             scale1_out = main_prog.global_block().ops[4].output("Out")[0]
@@ -92,8 +83,8 @@ class TestBase(IPUOpTest):
             if run_ipu:
                 feed_list = self.feed_list
                 ipu_strategy = paddle.static.IpuStrategy()
-                ipu_strategy.SetGraphConfig(is_training=self.is_training)
-                program = compiler.IPUCompiledProgram(
+                ipu_strategy.set_graph_config(is_training=self.is_training)
+                program = paddle.static.IpuCompiledProgram(
                     main_prog,
                     ipu_strategy=ipu_strategy).compile(feed_list, fetch_list)
             else:
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_weight_sharing_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_weight_sharing_ipu.py
new file mode 100644
index 0000000000000000000000000000000000000000..ecf1c61f52e83261cca4016b062bc327a4c062dd
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/ipu/test_weight_sharing_ipu.py
@@ -0,0 +1,126 @@
+#  Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+import paddle
+import paddle.static
+from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest
+
+
+@unittest.skipIf(not paddle.is_compiled_with_ipu(),
+                 "core is not compiled with IPU")
+class TestWeightSharing(IPUOpTest):
+    def setUp(self):
+        self.set_atol()
+        self.set_training()
+        self.set_data_feed()
+        self.set_feed_attr()
+        self.set_op_attrs()
+
+    def set_atol(self):
+        self.atol = 1e-6
+        self.rtol = 1e-5
+        self.atol_fp16 = 1e-2
+        self.rtol_fp16 = 1e-3
+
+    def set_data_feed(self):
+        x = np.random.randint(0, 768, size=(128, 1)).astype(np.int32)
+        self.feed_cpu = {"x": x.astype(np.int64)}
+        self.feed_ipu = {
+            "x": np.tile(x.astype(np.int64)[np.newaxis, :], [3, 1, 1])
+        }
+
+    def set_feed_attr(self):
+        self.feed_shape = [x.shape for x in self.feed_cpu.values()]
+        self.feed_list = list(self.feed_cpu.keys())
+
+    def set_op_attrs(self):
+        self.attrs = {}
+
+    def _test_base(self, run_ipu=True):
+        scope = paddle.static.Scope()
+        main_prog = paddle.static.Program()
+        startup_prog = paddle.static.Program()
+        main_prog.random_seed = self.SEED
+        startup_prog.random_seed = self.SEED
+
+        with paddle.static.scope_guard(scope):
+            with paddle.static.program_guard(main_prog, startup_prog):
+                x = paddle.static.data(
+                    name=self.feed_list[0],
+                    shape=self.feed_shape[0],
+                    dtype='int64')
+
+                with paddle.static.ipu_shard_guard(index=0, stage=0):
+                    y = paddle.fluid.layers.embedding(
+                        input=x,
+                        size=[768, 768],
+                        dtype='float32',
+                        param_attr=paddle.fluid.ParamAttr(
+                            name='word_embedding'),
+                        is_sparse=False)
+
+                with paddle.static.ipu_shard_guard(index=1, stage=1):
+                    z = paddle.fluid.layers.fc(
+                        input=y,
+                        size=768,
+                        param_attr=paddle.fluid.ParamAttr(name="fc"))
+
+                with paddle.static.ipu_shard_guard(index=0, stage=2):
+                    out = paddle.fluid.layers.matmul(
+                        x=z,
+                        y=main_prog.global_block().var('word_embedding'),
+                        transpose_y=True)
+
+            fetch_list = [out.name]
+
+            if run_ipu:
+                place = paddle.IPUPlace()
+            else:
+                place = paddle.CPUPlace()
+            exe = paddle.static.Executor(place)
+            exe.run(startup_prog)
+
+            if run_ipu:
+                feed_list = self.feed_list
+                ipu_strategy = paddle.static.IpuStrategy()
+                ipu_strategy.set_graph_config(
+                    num_ipus=2,
+                    is_training=self.is_training,
+                    enable_manual_shard=True)
+                ipu_strategy.set_pipelining_config(
+                    enable_pipelining=True, batches_per_step=3)
+                program = paddle.static.IpuCompiledProgram(
+                    main_prog,
+                    ipu_strategy=ipu_strategy).compile(feed_list, fetch_list)
+            else:
+                program = main_prog
+
+            feed = self.feed_ipu if run_ipu else self.feed_cpu
+            result = exe.run(program, feed=feed, fetch_list=fetch_list)
+            return result[0]
+
+    def test_base(self):
+        res0 = self._test_base(False)
+        res1 = self._test_base(True)
+
+        self.assertTrue(
+            np.allclose(
+                res0.flatten(), res1[0].flatten(), atol=self.atol))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt b/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt
index 6169509e895184e599f5620a4aa6acdde6470792..8f7b73fc0e03630f0d1c8a64ed9118f1322c5b65 100755
--- a/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt
@@ -80,7 +80,7 @@ if(WITH_NV_JETSON)
   set_tests_properties(test_trt_pool_op PROPERTIES ENVIRONMENT FLAGS_fraction_of_gpu_memory_to_use=0.1 TIMEOUT 450)
   set_tests_properties(test_trt_pool3d_op PROPERTIES ENVIRONMENT FLAGS_fraction_of_gpu_memory_to_use=0.1 TIMEOUT 450)
 else()
-  set_tests_properties(test_trt_pool_op PROPERTIES ENVIRONMENT FLAGS_fraction_of_gpu_memory_to_use=0.1 TIMEOUT 120)
+  set_tests_properties(test_trt_pool_op PROPERTIES ENVIRONMENT FLAGS_fraction_of_gpu_memory_to_use=0.1 TIMEOUT 300)
   set_tests_properties(test_trt_pool3d_op PROPERTIES ENVIRONMENT FLAGS_fraction_of_gpu_memory_to_use=0.1 TIMEOUT 45)
 endif()
 set_tests_properties(test_trt_reduce_mean_op PROPERTIES TIMEOUT 60)
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_elt_act_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_elt_act_fuse_pass.py
new file mode 100644
index 0000000000000000000000000000000000000000..893bd3833430c1059d5251ae6039c274112cbddb
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_elt_act_fuse_pass.py
@@ -0,0 +1,328 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+from inference_pass_test import InferencePassTest
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid.core import PassVersionChecker
+
+
+class ElementwiseActivationMkldnnFusePassTest(InferencePassTest):
+    act_alpha = None
+    act_beta = None
+    pass_name = 'elt_act_mkldnn_fuse_pass'
+
+    def setUp(self):
+        self.set_params()
+        with fluid.program_guard(self.main_program, self.startup_program):
+            data_A = fluid.data(
+                name="data_A", shape=[-1, 3, 100, 100], dtype="float32")
+            data_B = fluid.data(
+                name="data_B", shape=[-1, 3, 100, 100], dtype="float32")
+            elt_out = self.operand(data_A, data_B)
+            if self.act is not None:
+                if self.act_beta is not None:
+                    elt_out = self.act(elt_out, self.act_alpha, self.act_beta)
+                elif self.act_alpha is not None:
+                    elt_out = self.act(elt_out, self.act_alpha)
+                else:
+                    elt_out = self.act(elt_out)
+
+        self.feeds = {
+            "data_A": np.random.random((1, 3, 100, 100)).astype("float32"),
+            "data_B": np.random.random((1, 3, 100, 100)).astype("float32")
+        }
+        self.fetch_list = [elt_out]
+        self.enable_mkldnn = True
+
+    def set_params(self):
+        self.operand = fluid.layers.elementwise_add
+        self.act = None
+
+    def test_check_output(self):
+        use_gpu = False
+        self.check_output_with_option(use_gpu)
+
+    def test_pass_compatible(self):
+        self.assertTrue(PassVersionChecker.IsCompatible(self.pass_name))
+
+
+class ElementwiseActivationMkldnnFusePassTest_Add_Relu(
+        ElementwiseActivationMkldnnFusePassTest):
+    def set_params(self):
+        self.operand = fluid.layers.elementwise_add
+        self.act = fluid.layers.relu
+
+
+class ElementwiseActivationMkldnnFusePassTest_Add_Tanh(
+        ElementwiseActivationMkldnnFusePassTest):
+    def set_params(self):
+        self.operand = fluid.layers.elementwise_add
+        self.act = fluid.layers.tanh
+
+
+class ElementwiseActivationMkldnnFusePassTest_Add_LeakyRelu(
+        ElementwiseActivationMkldnnFusePassTest):
+    def set_params(self):
+        self.operand = fluid.layers.elementwise_add
+        self.act_alpha = 0.2
+        self.act = fluid.layers.leaky_relu
+
+
+class ElementwiseActivationMkldnnFusePassTest_Add_Swish(
+        ElementwiseActivationMkldnnFusePassTest):
+    def set_params(self):
+        self.operand = fluid.layers.elementwise_add
+        self.act_alpha = 4
+        self.act = fluid.layers.swish
+
+
+class ElementwiseActivationMkldnnFusePassTest_Add_HardSwish(
+        ElementwiseActivationMkldnnFusePassTest):
+    def set_params(self):
+        self.operand = fluid.layers.elementwise_add
+        self.act = fluid.layers.hard_swish
+
+
+class ElementwiseActivationMkldnnFusePassTest_Add_SQRT(
+        ElementwiseActivationMkldnnFusePassTest):
+    def set_params(self):
+        self.operand = fluid.layers.elementwise_add
+        self.act = fluid.layers.sqrt
+
+
+class ElementwiseActivationMkldnnFusePassTest_Add_ABS(
+        ElementwiseActivationMkldnnFusePassTest):
+    def set_params(self):
+        self.operand = fluid.layers.elementwise_add
+        self.act = fluid.layers.abs
+
+
+class ElementwiseActivationMkldnnFusePassTest_Add_Clip(
+        ElementwiseActivationMkldnnFusePassTest):
+    def set_params(self):
+        self.operand = fluid.layers.elementwise_add
+        self.act = fluid.layers.clip
+        self.act_alpha = 0.0
+        self.act_beta = 10.0
+
+
+class ElementwiseActivationMkldnnFusePassTest_Add_Gelu(
+        ElementwiseActivationMkldnnFusePassTest):
+    def set_params(self):
+        self.operand = fluid.layers.elementwise_add
+        self.act = fluid.layers.gelu
+
+
+class ElementwiseActivationMkldnnFusePassTest_Add_Gelu_Tanh(
+        ElementwiseActivationMkldnnFusePassTest):
+    def set_params(self):
+        self.operand = fluid.layers.elementwise_add
+        self.act = fluid.layers.gelu
+        self.act_alpha = True
+
+
+class ElementwiseActivationMkldnnFusePassTest_Add_Relu6(
+        ElementwiseActivationMkldnnFusePassTest):
+    def set_params(self):
+        self.operand = fluid.layers.elementwise_add
+        self.act = fluid.layers.relu6
+        self.act_alpha = 5.0
+
+
+class ElementwiseActivationMkldnnFusePassTest_Add_Sigmoid(
+        ElementwiseActivationMkldnnFusePassTest):
+    def set_params(self):
+        self.operand = fluid.layers.elementwise_add
+        self.act = fluid.layers.sigmoid
+
+
+class ElementwiseActivationMkldnnFusePassTest_Sub_Relu(
+        ElementwiseActivationMkldnnFusePassTest):
+    def set_params(self):
+        self.operand = fluid.layers.elementwise_sub
+        self.act = fluid.layers.relu
+
+
+class ElementwiseActivationMkldnnFusePassTest_Sub_Tanh(
+        ElementwiseActivationMkldnnFusePassTest):
+    def set_params(self):
+        self.operand = fluid.layers.elementwise_sub
+        self.act = fluid.layers.tanh
+
+
+class ElementwiseActivationMkldnnFusePassTest_Sub_LeakyRelu(
+        ElementwiseActivationMkldnnFusePassTest):
+    def set_params(self):
+        self.operand = fluid.layers.elementwise_sub
+        self.act_alpha = 0.2
+        self.act = fluid.layers.leaky_relu
+
+
+class ElementwiseActivationMkldnnFusePassTest_Sub_Swish(
+        ElementwiseActivationMkldnnFusePassTest):
+    def set_params(self):
+        self.operand = fluid.layers.elementwise_sub
+        self.act = fluid.layers.swish
+
+
+class ElementwiseActivationMkldnnFusePassTest_Sub_HardSwish(
+        ElementwiseActivationMkldnnFusePassTest):
+    def set_params(self):
+        self.operand = fluid.layers.elementwise_sub
+        self.act = fluid.layers.hard_swish
+
+
+class ElementwiseActivationMkldnnFusePassTest_Sub_ABS(
+        ElementwiseActivationMkldnnFusePassTest):
+    def set_params(self):
+        self.operand = fluid.layers.elementwise_sub
+        self.act = fluid.layers.abs
+
+
+class ElementwiseActivationMkldnnFusePassTest_Sub_Clip(
+        ElementwiseActivationMkldnnFusePassTest):
+    def set_params(self):
+        self.operand = fluid.layers.elementwise_sub
+        self.act = fluid.layers.clip
+        self.act_alpha = 0.0
+        self.act_beta = 10.0
+
+
+class ElementwiseActivationMkldnnFusePassTest_Sub_Gelu(
+        ElementwiseActivationMkldnnFusePassTest):
+    def set_params(self):
+        self.operand = fluid.layers.elementwise_sub
+        self.act = fluid.layers.gelu
+
+
+class ElementwiseActivationMkldnnFusePassTest_Sub_Gelu_Tanh(
+        ElementwiseActivationMkldnnFusePassTest):
+    def set_params(self):
+        self.operand = fluid.layers.elementwise_sub
+        self.act = fluid.layers.gelu
+        self.act_alpha = True
+
+
+class ElementwiseActivationMkldnnFusePassTest_Sub_Relu6(
+        ElementwiseActivationMkldnnFusePassTest):
+    def set_params(self):
+        self.operand = fluid.layers.elementwise_sub
+        self.act = fluid.layers.relu6
+        self.act_alpha = 5.0
+
+
+class ElementwiseActivationMkldnnFusePassTest_Sub_Sigmoid(
+        ElementwiseActivationMkldnnFusePassTest):
+    def set_params(self):
+        self.operand = fluid.layers.elementwise_sub
+        self.act = fluid.layers.sigmoid
+
+
+class ElementwiseActivationMkldnnFusePassTest_Mul_Relu(
+        ElementwiseActivationMkldnnFusePassTest):
+    def set_params(self):
+        self.operand = fluid.layers.elementwise_mul
+        self.act = fluid.layers.relu
+
+
+class ElementwiseActivationMkldnnFusePassTest_Mul_Tanh(
+        ElementwiseActivationMkldnnFusePassTest):
+    def set_params(self):
+        self.operand = fluid.layers.elementwise_mul
+        self.act = fluid.layers.tanh
+
+
+class ElementwiseActivationMkldnnFusePassTest_Mul_LeakyRelu(
+        ElementwiseActivationMkldnnFusePassTest):
+    def set_params(self):
+        self.operand = fluid.layers.elementwise_mul
+        self.act_alpha = 0.2
+        self.act = fluid.layers.leaky_relu
+
+
+class ElementwiseActivationMkldnnFusePassTest_Mul_Swish(
+        ElementwiseActivationMkldnnFusePassTest):
+    def set_params(self):
+        self.operand = fluid.layers.elementwise_mul
+        self.act = fluid.layers.swish
+
+
+class ElementwiseActivationMkldnnFusePassTest_Mul_HardSwish(
+        ElementwiseActivationMkldnnFusePassTest):
+    def set_params(self):
+        self.operand = fluid.layers.elementwise_mul
+        self.act = fluid.layers.hard_swish
+
+
+class ElementwiseActivationMkldnnFusePassTest_Mul_SQRT(
+        ElementwiseActivationMkldnnFusePassTest):
+    def set_params(self):
+        self.operand = fluid.layers.elementwise_mul
+        self.act = fluid.layers.sqrt
+
+
+class ElementwiseActivationMkldnnFusePassTest_Mul_ABS(
+        ElementwiseActivationMkldnnFusePassTest):
+    def set_params(self):
+        self.operand = fluid.layers.elementwise_mul
+        self.act = fluid.layers.abs
+
+
+class ElementwiseActivationMkldnnFusePassTest_Mul_Clip(
+        ElementwiseActivationMkldnnFusePassTest):
+    def set_params(self):
+        self.operand = fluid.layers.elementwise_mul
+        self.act = fluid.layers.clip
+        self.act_alpha = 0.0
+        self.act_beta = 10.0
+
+
+class ElementwiseActivationMkldnnFusePassTest_Mul_Gelu(
+        ElementwiseActivationMkldnnFusePassTest):
+    def set_params(self):
+        self.operand = fluid.layers.elementwise_mul
+        self.act = fluid.layers.gelu
+
+
+class ElementwiseActivationMkldnnFusePassTest_Mul_Gelu_Tanh(
+        ElementwiseActivationMkldnnFusePassTest):
+    def set_params(self):
+        self.operand = fluid.layers.elementwise_mul
+        self.act = fluid.layers.gelu
+        self.act_alpha = True
+
+
+class ElementwiseActivationMkldnnFusePassTest_Mul_Relu6(
+        ElementwiseActivationMkldnnFusePassTest):
+    def set_params(self):
+        self.operand = fluid.layers.elementwise_mul
+        self.act = fluid.layers.relu6
+        self.act_alpha = 5.0
+
+
+class ElementwiseActivationMkldnnFusePassTest_Mul_Sigmoid(
+        ElementwiseActivationMkldnnFusePassTest):
+    def set_params(self):
+        self.operand = fluid.layers.elementwise_mul
+        self.act = fluid.layers.sigmoid
+
+
+if __name__ == "__main__":
+    paddle.enable_static()
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_elt_act_fuse_pass_new.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_elt_act_fuse_pass_new.py
new file mode 100644
index 0000000000000000000000000000000000000000..0f5279b0edadd61467c3bebe55dfb83aea909267
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_elt_act_fuse_pass_new.py
@@ -0,0 +1,82 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from auto_scan_test import PassAutoScanTest
+from program_config import TensorConfig, ProgramConfig
+import numpy as np
+import paddle.inference as paddle_infer
+from functools import partial
+import unittest
+
+import hypothesis
+from hypothesis import given, settings, seed, example, assume
+import hypothesis.strategies as st
+
+
+class TestElementWiseAddReluFusePass(PassAutoScanTest):
+    def is_program_valid(self, program_config: ProgramConfig) -> bool:
+        return True
+
+    def sample_program_config(self, draw):
+        batch_size = draw(st.integers(min_value=1, max_value=4))
+
+        def generate_input():
+            return np.random.random(
+                [batch_size, 3, 100, 100]).astype(np.float32)
+
+        ops_config = [{
+            "op_type": "elementwise_add",
+            "op_inputs": {
+                "X": ["A"],
+                "Y": ["B"]
+            },
+            "op_outputs": {
+                "Out": ["add_output"]
+            },
+            "op_attrs": {}
+        }, {
+            "op_type": "relu",
+            "op_inputs": {
+                "X": ["add_output"]
+            },
+            "op_outputs": {
+                "Out": ["relu_output"]
+            },
+            "op_attrs": {}
+        }]
+
+        ops = self.generate_op_config(ops_config)
+
+        program_config = ProgramConfig(
+            ops=ops,
+            weights={},
+            inputs={
+                "A": TensorConfig(data_gen=partial(generate_input)),
+                "B": TensorConfig(data_gen=partial(generate_input))
+            },
+            outputs=["relu_output"])
+
+        return program_config
+
+    def sample_predictor_configs(self, program_config):
+        config = self.create_inference_config(use_mkldnn=True)
+        yield config, ["elementwise_add"], (1e-5, 1e-5)
+
+    def test(self):
+        self.run_and_statis(
+            quant=False, passes=["elt_act_mkldnn_fuse_pass"], min_success_num=4)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_conv2d_transpose.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_conv2d_transpose.py
index e21d67839eb6c0f097ca82748c1b7d03ca39d8d3..65fc35f9c56f88fec8fed6b3ecdadb10ddcc8c8f 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_conv2d_transpose.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_conv2d_transpose.py
@@ -37,6 +37,13 @@ class TrtConvertConv2dTransposeTest(TrtLayerAutoScanTest):
         if inputs['input_data'].shape[1] != weights['conv2d_weight'].shape[0]:
             return False
 
+        if attrs[0]['dilations'][0] != 1 or attrs[0]['dilations'][1] != 1:
+            return False
+
+        ver = paddle_infer.get_trt_compile_version()
+        if ver[0] * 1000 + ver[1] * 100 + ver[2] * 10 < 7000:
+            return False
+
         return True
 
     def sample_program_configs(self):
@@ -175,9 +182,9 @@ class TrtConvertConv2dTransposeTest(TrtLayerAutoScanTest):
         self.trt_param.precision = paddle_infer.PrecisionType.Half
         yield self.create_inference_config(), generate_trt_nodes_num(
             attrs, False), (1e-5, 1e-3)
-        self.trt_param.precision = paddle_infer.PrecisionType.Int8
-        yield self.create_inference_config(), generate_trt_nodes_num(
-            attrs, False), (1e-5, 1e-5)
+        # self.trt_param.precision = paddle_infer.PrecisionType.Int8
+        # yield self.create_inference_config(), generate_trt_nodes_num(
+        #     attrs, False), (1e-5, 1e-5)
 
         # for dynamic_shape
         generate_dynamic_shape(attrs)
@@ -187,41 +194,18 @@ class TrtConvertConv2dTransposeTest(TrtLayerAutoScanTest):
         self.trt_param.precision = paddle_infer.PrecisionType.Half
         yield self.create_inference_config(), generate_trt_nodes_num(
             attrs, True), (1e-5, 1e-3)
-        self.trt_param.precision = paddle_infer.PrecisionType.Int8
-        yield self.create_inference_config(), generate_trt_nodes_num(
-            attrs, True), (1e-5, 1e-5)
+        # self.trt_param.precision = paddle_infer.PrecisionType.Int8
+        # yield self.create_inference_config(), generate_trt_nodes_num(
+        #     attrs, True), (1e-5, 1e-5)
 
     def add_skip_trt_case(self):
         def teller1(program_config, predictor_config):
-            if program_config.ops[0].attrs[
-                    'padding_algorithm'] == "SAME" or program_config.ops[
-                        0].attrs['padding_algorithm'] == "VALID":
-                return True
-            return False
-
-        self.add_skip_case(
-            teller1, SkipReasons.TRT_NOT_IMPLEMENTED,
-            "When padding_algorithm is 'SAME' or 'VALID', Trt dose not support. In this case, trt build error is caused by scale op."
-        )
-
-        def teller2(program_config, predictor_config):
-            if program_config.ops[0].attrs['dilations'][
-                    0] != 1 or program_config.ops[0].attrs['dilations'][1] != 1:
-                return True
-            return False
-
-        self.add_skip_case(
-            teller2, SkipReasons.TRT_NOT_IMPLEMENTED,
-            "When dilations's element is not equal 1, there are different behaviors between Trt and Paddle."
-        )
-
-        def teller3(program_config, predictor_config):
             if self.trt_param.precision == paddle_infer.PrecisionType.Int8:
                 return True
             return False
 
         self.add_skip_case(
-            teller3, SkipReasons.TRT_NOT_IMPLEMENTED,
+            teller1, SkipReasons.TRT_NOT_IMPLEMENTED,
             "When precisionType is int8 without relu op, output is different between Trt and Paddle."
         )
 
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_deformable_conv.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_deformable_conv.py
index 9d29034d7fe18d069dd7bc3b4651a4c97d13976a..c692e92861bc6cc6bae2354e287c2554639af825 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_deformable_conv.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_deformable_conv.py
@@ -147,7 +147,7 @@ class TrtConvertDeformableConvTest(TrtLayerAutoScanTest):
             if len(attrs[0]['paddings']) == 4:
                 return 1, 2
             else:
-                return 1, 2
+                return 1, 4
 
         attrs = [
             program_config.ops[i].attrs
@@ -160,20 +160,8 @@ class TrtConvertDeformableConvTest(TrtLayerAutoScanTest):
         yield self.create_inference_config(), generate_trt_nodes_num(
             attrs, False), 1e-5
 
-    def add_skip_trt_case(self):
-        def teller1(program_config, predictor_config):
-            if len(program_config.ops[0].attrs["strides"]) != 2:
-                return False
-
-            return True
-
-        self.add_skip_case(
-            teller1, SkipReasons.TRT_NOT_IMPLEMENTED,
-            "In deformable conv, length of Attr(strides) should be 2.")
-
     def test(self):
         self.trt_param.workspace_size = 1 << 28
-        self.add_skip_trt_case()
         self.run_test()
 
 
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_depthwise_conv2d_transpose.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_depthwise_conv2d_transpose.py
index 66a007f64b69c01fe6bb197f634625a526c3a796..5f77e7de0df423228d97372e6ccc42dfe458cf9a 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_depthwise_conv2d_transpose.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_depthwise_conv2d_transpose.py
@@ -40,6 +40,13 @@ class TrtConvertDepthwiseConv2dTransposeTest(TrtLayerAutoScanTest):
         if inputs['input_data'].shape[1] != attrs[0]['groups']:
             return False
 
+        if attrs[0]['dilations'][0] != 1 or attrs[0]['dilations'][1] != 1:
+            return False
+
+        ver = paddle_infer.get_trt_compile_version()
+        if ver[0] * 1000 + ver[1] * 100 + ver[2] * 10 < 7000:
+            return False
+
         return True
 
     def sample_program_configs(self):
@@ -139,9 +146,9 @@ class TrtConvertDepthwiseConv2dTransposeTest(TrtLayerAutoScanTest):
         self.trt_param.precision = paddle_infer.PrecisionType.Half
         yield self.create_inference_config(), generate_trt_nodes_num(
             attrs, False), (1e-5, 1e-3)
-        self.trt_param.precision = paddle_infer.PrecisionType.Int8
-        yield self.create_inference_config(), generate_trt_nodes_num(
-            attrs, False), (1e-5, 1e-5)
+        # self.trt_param.precision = paddle_infer.PrecisionType.Int8
+        # yield self.create_inference_config(), generate_trt_nodes_num(
+        #     attrs, False), (1e-5, 1e-5)
 
         # for dynamic_shape
         generate_dynamic_shape(attrs)
@@ -151,41 +158,18 @@ class TrtConvertDepthwiseConv2dTransposeTest(TrtLayerAutoScanTest):
         self.trt_param.precision = paddle_infer.PrecisionType.Half
         yield self.create_inference_config(), generate_trt_nodes_num(
             attrs, True), (1e-5, 1e-5)
-        self.trt_param.precision = paddle_infer.PrecisionType.Int8
-        yield self.create_inference_config(), generate_trt_nodes_num(
-            attrs, True), (1e-5, 1e-5)
+        # self.trt_param.precision = paddle_infer.PrecisionType.Int8
+        # yield self.create_inference_config(), generate_trt_nodes_num(
+        #     attrs, True), (1e-5, 1e-5)
 
     def add_skip_trt_case(self):
         def teller1(program_config, predictor_config):
-            if program_config.ops[0].attrs[
-                    'padding_algorithm'] == "SAME" or program_config.ops[
-                        0].attrs['padding_algorithm'] == "VALID":
-                return True
-            return False
-
-        self.add_skip_case(
-            teller1, SkipReasons.TRT_NOT_IMPLEMENTED,
-            "When padding_algorithm is 'SAME' or 'VALID', Trt dose not support. In this case, trt build error is caused by scale op."
-        )
-
-        def teller2(program_config, predictor_config):
-            if program_config.ops[0].attrs['dilations'][
-                    0] != 1 or program_config.ops[0].attrs['dilations'][1] != 1:
-                return True
-            return False
-
-        self.add_skip_case(
-            teller2, SkipReasons.TRT_NOT_IMPLEMENTED,
-            "When dilations's element is not equal 1, there are different behaviors between Trt and Paddle."
-        )
-
-        def teller3(program_config, predictor_config):
             if self.trt_param.precision == paddle_infer.PrecisionType.Int8:
                 return True
             return False
 
         self.add_skip_case(
-            teller3, SkipReasons.TRT_NOT_IMPLEMENTED,
+            teller1, SkipReasons.TRT_NOT_IMPLEMENTED,
             "When precisionType is int8 without relu op, output is different between Trt and Paddle."
         )
 
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_emb_eltwise_layernorm.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_emb_eltwise_layernorm.py
index 356a2c942df0d8cc5d1f016a3b2f4a284227990f..1eecf9c0497a196666c4b30af721c7b68d77b0fd 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_emb_eltwise_layernorm.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_emb_eltwise_layernorm.py
@@ -244,28 +244,16 @@ class TrtConvertEmbEltwiseLayernormTest1(TrtLayerAutoScanTest):
         self.trt_param.precision = paddle_infer.PrecisionType.Float32
         yield self.create_inference_config(), (0, 5), 1e-5
         self.trt_param.precision = paddle_infer.PrecisionType.Half
-        yield self.create_inference_config(), (0, 5), 1e-5
+        yield self.create_inference_config(), (0, 5), 2e-2
 
         # for dynamic_shape
         generate_dynamic_shape(attrs)
         self.trt_param.precision = paddle_infer.PrecisionType.Float32
         yield self.create_inference_config(), (1, 4), 1e-5
         self.trt_param.precision = paddle_infer.PrecisionType.Half
-        yield self.create_inference_config(), (1, 4), 1e-5
-
-    def add_skip_trt_case(self):
-        def teller1(program_config, predictor_config):
-            if self.trt_param.precision == paddle_infer.PrecisionType.Half and len(
-                    self.dynamic_shape.min_input_shape) != 0:
-                return True
-            return False
-
-        self.add_skip_case(
-            teller1, SkipReasons.TRT_NOT_IMPLEMENTED,
-            "The output has diff between gpu and trt when dynamic fp16 mode.")
+        yield self.create_inference_config(), (1, 4), 2e-2
 
     def test(self):
-        self.add_skip_trt_case()
         self.run_test()
 
 
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_gather.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_gather.py
index 9bcbbf95990f2046edd982870240c9d669f920ee..852bb2ffa8412353ec152d76f9d6843f1654e862 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_gather.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_gather.py
@@ -138,7 +138,7 @@ class TrtConvertGatherTest(TrtLayerAutoScanTest):
                     "index_data": [1]
                 }
                 self.dynamic_shape.max_input_shape = {
-                    "input_data": [128, 256, 128, 256],
+                    "input_data": [128, 256, 64, 128],
                     "index_data": [4]
                 }
                 self.dynamic_shape.opt_input_shape = {
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_multihead_matmul.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_multihead_matmul.py
index 2d2072d277e9c50e74aea27ed82144d979566cbc..97a94ef348a678bdf59c0058746f19b78ae27f9b 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_multihead_matmul.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_multihead_matmul.py
@@ -451,10 +451,394 @@ class TrtConvertMultiHeadMatmulTest(TrtLayerAutoScanTest):
             "The output has diff between gpu and trt when dynamic fp32 mode and batch size > 2."
         )
 
+        def teller3(program_config, predictor_config):
+            if self.trt_param.precision == paddle_infer.PrecisionType.Int8:
+                return True
+            return False
+
+        self.add_skip_case(
+            teller3, SkipReasons.TRT_NOT_IMPLEMENTED,
+            "The output has diff between gpu and trt in int8 mode.")
+
     def test(self):
         self.add_skip_trt_case()
         self.run_test()
 
 
+class TrtConvertMultiHeadMatmulTestInt8(TrtConvertMultiHeadMatmulTest):
+    def sample_program_configs(self):
+        def generate_input1(batch, dim1):
+            return np.random.random((batch, dim1, 768)).astype(np.float32)
+
+        def generate_input2(shape):
+            return np.random.random(shape).astype(np.float32)
+
+        def generate_weight1():
+            return np.random.random((768, 768)).astype(np.float32)
+
+        def generate_weight2():
+            return np.random.random(768).astype(np.float32)
+
+        for batch in [1, 2, 4]:
+            self.batch = batch
+            for reshape_shape in [[0, 0, 12, 64]]:
+                for dim1 in [128]:
+                    input2_shapes = [[batch, reshape_shape[2], dim1, dim1],
+                                     [batch, 1, 1, dim1]]
+                    for input2_shape in input2_shapes:
+                        for axis in [0]:
+                            dics = [{
+                                "x_num_col_dims": 2,
+                                "y_num_col_dims": 1,
+                                "enable_int8": True,
+                                "X_scale": 1.0,
+                                "weight_scale": [1.0],
+                            }, {
+                                "axis": 2,
+                                "out_threshold": 1.0,
+                            }, {
+                                "shape": reshape_shape
+                            }, {
+                                "axis": [0, 2, 1, 3]
+                            }, {
+                                "x_num_col_dims": 2,
+                                "y_num_col_dims": 1,
+                                "enable_int8": True,
+                                "X_scale": 1.0,
+                                "weight_scale": [1.0],
+                            }, {
+                                "axis": 2,
+                                "out_threshold": 1.0,
+                            }, {
+                                "shape": reshape_shape
+                            }, {
+                                "axis": [0, 2, 1, 3]
+                            }, {
+                                "x_num_col_dims": 2,
+                                "y_num_col_dims": 1,
+                                "enable_int8": True,
+                                "X_scale": 1.0,
+                                "weight_scale": [1.0],
+                            }, {
+                                "axis": 2,
+                                "out_threshold": 1.0,
+                            }, {
+                                "shape": reshape_shape
+                            }, {
+                                "axis": [0, 2, 1, 3]
+                            }, {
+                                "scale": 0.125,
+                                "bias": 0.0,
+                                "bias_after_scale": True
+                            }, {
+                                "alpha": 1.0,
+                                "transpose_X": False,
+                                "transpose_Y": True,
+                                "fused_reshape_X": [],
+                                "fused_reshape_Y": [],
+                                "fused_transpose_X": [],
+                                "fused_transpose_Y": [],
+                                "fused_reshape_Out": [],
+                                "fused_transpose_Out": []
+                            }, {
+                                "axis": axis
+                            }, {
+                                "axis": -1,
+                                "is_test": True
+                            }, {
+                                "seed": 0,
+                                "dropout_prob": 0.10000000149011612,
+                                "dropout_implementation": "upscale_in_train",
+                                "fix_seed": False,
+                                "is_test": True
+                            }, {
+                                "alpha": 1.0,
+                                "transpose_X": False,
+                                "transpose_Y": False,
+                                "fused_reshape_X": [],
+                                "fused_reshape_Y": [],
+                                "fused_transpose_X": [],
+                                "fused_transpose_Y": [],
+                                "fused_reshape_Out": [],
+                                "fused_transpose_Out": []
+                            }, {
+                                "axis": [0, 2, 1, 3]
+                            }, {
+                                "shape": [0, 0, 768]
+                            }, {
+                                "x_num_col_dims": 2,
+                                "y_num_col_dims": 1
+                            }]
+
+                            ops_config = [
+                                {
+                                    "op_type": "mul",
+                                    "op_inputs": {
+                                        "X": ["input_data1"],
+                                        "Y": ["mul1_weight"]
+                                    },
+                                    "op_outputs": {
+                                        "Out": ["mul1_output"]
+                                    },
+                                    "op_attrs": dics[0]
+                                },
+                                {
+                                    "op_type": "elementwise_add",
+                                    "op_inputs": {
+                                        "X": ["mul1_output"],
+                                        "Y": ["elementwise_add1_weight"]
+                                    },
+                                    "op_outputs": {
+                                        "Out": ["elementwise_add1_output"]
+                                    },
+                                    "op_attrs": dics[1]
+                                },
+                                {
+                                    "op_type": "reshape2",
+                                    "op_inputs": {
+                                        "X": ["elementwise_add1_output"],
+                                    },
+                                    "op_outputs": {
+                                        "Out": ["reshape21_output"],
+                                        "XShape": ["reshape21_output_xshape"]
+                                    },
+                                    "op_attrs": dics[2]
+                                },
+                                {
+                                    "op_type": "transpose2",
+                                    "op_inputs": {
+                                        "X": ["reshape21_output"]
+                                    },
+                                    "op_outputs": {
+                                        "Out": ["transpose21_output"],
+                                        "XShape":
+                                        ["transpose21_output_xshape"]
+                                    },
+                                    "op_attrs": dics[3]
+                                },
+                                {
+                                    "op_type": "mul",
+                                    "op_inputs": {
+                                        "X": ["input_data1"],
+                                        "Y": ["mul2_weight"]
+                                    },
+                                    "op_outputs": {
+                                        "Out": ["mul2_output"]
+                                    },
+                                    "op_attrs": dics[4]
+                                },
+                                {
+                                    "op_type": "elementwise_add",
+                                    "op_inputs": {
+                                        "X": ["mul2_output"],
+                                        "Y": ["elementwise_add2_weight"]
+                                    },
+                                    "op_outputs": {
+                                        "Out": ["elementwise_add2_output"]
+                                    },
+                                    "op_attrs": dics[5]
+                                },
+                                {
+                                    "op_type": "reshape2",
+                                    "op_inputs": {
+                                        "X": ["elementwise_add2_output"]
+                                    },
+                                    "op_outputs": {
+                                        "Out": ["reshape22_output"],
+                                        "XShape": ["reshape22_output_xshape"]
+                                    },
+                                    "op_attrs": dics[6]
+                                },
+                                {
+                                    "op_type": "transpose2",
+                                    "op_inputs": {
+                                        "X": ["reshape22_output"]
+                                    },
+                                    "op_outputs": {
+                                        "Out": ["transpose22_output"],
+                                        "XShape":
+                                        ["transpose22_output_xshape"]
+                                    },
+                                    "op_attrs": dics[7]
+                                },
+                                {
+                                    "op_type": "mul",
+                                    "op_inputs": {
+                                        "X": ["input_data1"],
+                                        "Y": ["mul3_weight"]
+                                    },
+                                    "op_outputs": {
+                                        "Out": ["mul3_output"]
+                                    },
+                                    "op_attrs": dics[8]
+                                },
+                                {
+                                    "op_type": "elementwise_add",
+                                    "op_inputs": {
+                                        "X": ["mul3_output"],
+                                        "Y": ["elementwise_add3_weight"]
+                                    },
+                                    "op_outputs": {
+                                        "Out": ["elementwise_add3_output"]
+                                    },
+                                    "op_attrs": dics[9]
+                                },
+                                {
+                                    "op_type": "reshape2",
+                                    "op_inputs": {
+                                        "X": ["elementwise_add3_output"]
+                                    },
+                                    "op_outputs": {
+                                        "Out": ["reshape23_output"],
+                                        "XShape": ["reshape23_output_xshape"]
+                                    },
+                                    "op_attrs": dics[10]
+                                },
+                                {
+                                    "op_type": "transpose2",
+                                    "op_inputs": {
+                                        "X": ["reshape23_output"]
+                                    },
+                                    "op_outputs": {
+                                        "Out": ["transpose23_output"],
+                                        "XShape":
+                                        ["transpose23_output_xshape"]
+                                    },
+                                    "op_attrs": dics[11]
+                                },
+                                {
+                                    "op_type": "scale",
+                                    "op_inputs": {
+                                        "X": ["transpose23_output"],
+                                    },
+                                    "op_outputs": {
+                                        "Out": ["scale_output"]
+                                    },
+                                    "op_attrs": dics[12]
+                                },
+                                {
+                                    "op_type": "matmul",
+                                    "op_inputs": {
+                                        "X": ["scale_output"],
+                                        "Y": ["transpose22_output"],
+                                    },
+                                    "op_outputs": {
+                                        "Out": ["matmul1_output"]
+                                    },
+                                    "op_attrs": dics[13]
+                                },
+                                {
+                                    "op_type": "elementwise_add",
+                                    "op_inputs": {
+                                        "X": ["matmul1_output"],
+                                        "Y": ["input_data2"]
+                                    },
+                                    "op_outputs": {
+                                        "Out": ["elementwise_add4_output"]
+                                    },
+                                    "op_attrs": dics[14]
+                                },
+                                {
+                                    "op_type": "softmax",
+                                    "op_inputs": {
+                                        "X": ["elementwise_add4_output"]
+                                    },
+                                    "op_outputs": {
+                                        "Out": ["softmax_output"]
+                                    },
+                                    "op_attrs": dics[15]
+                                },
+                                {
+                                    "op_type": "dropout",
+                                    "op_inputs": {
+                                        "X": ["softmax_output"],
+                                    },
+                                    "op_outputs": {
+                                        "Out": ["dropout3_output"]
+                                    },
+                                    "op_attrs": dics[16]
+                                },
+                                {
+                                    "op_type": "matmul",
+                                    "op_inputs": {
+                                        "X": ["dropout3_output"],
+                                        "Y": ["transpose21_output"],
+                                    },
+                                    "op_outputs": {
+                                        "Out": ["matmul2_output"]
+                                    },
+                                    "op_attrs": dics[17]
+                                },
+                                {
+                                    "op_type": "transpose2",
+                                    "op_inputs": {
+                                        "X": ["matmul2_output"]
+                                    },
+                                    "op_outputs": {
+                                        "Out": ["transpose24_output"],
+                                        "XShape":
+                                        ["transpose24_output_xshape"]
+                                    },
+                                    "op_attrs": dics[18]
+                                },
+                                {
+                                    "op_type": "reshape2",
+                                    "op_inputs": {
+                                        "X": ["transpose24_output"]
+                                    },
+                                    "op_outputs": {
+                                        "Out": ["reshape24_output"],
+                                        "XShape": ["reshape24_output_xshape"]
+                                    },
+                                    "op_attrs": dics[19]
+                                },
+                                # In order to fuse ops with 
+                                # multihead_matmul_fuse_pass_v2, the last op
+                                # must be mul.
+                                {
+                                    "op_type": "mul",
+                                    "op_inputs": {
+                                        "X": ["reshape24_output"],
+                                        "Y": ["mul4_weight"]
+                                    },
+                                    "op_outputs": {
+                                        "Out": ["mul4_output"]
+                                    },
+                                    "op_attrs": dics[20]
+                                }
+                            ]
+                            ops = self.generate_op_config(ops_config)
+
+                            program_config = ProgramConfig(
+                                ops=ops,
+                                weights={
+                                    "mul1_weight": TensorConfig(
+                                        data_gen=partial(generate_weight1)),
+                                    "mul2_weight": TensorConfig(
+                                        data_gen=partial(generate_weight1)),
+                                    "mul3_weight": TensorConfig(
+                                        data_gen=partial(generate_weight1)),
+                                    "mul4_weight": TensorConfig(
+                                        data_gen=partial(generate_weight1)),
+                                    "elementwise_add1_weight": TensorConfig(
+                                        data_gen=partial(generate_weight2)),
+                                    "elementwise_add2_weight": TensorConfig(
+                                        data_gen=partial(generate_weight2)),
+                                    "elementwise_add3_weight": TensorConfig(
+                                        data_gen=partial(generate_weight2)),
+                                },
+                                inputs={
+                                    "input_data1": TensorConfig(
+                                        data_gen=partial(generate_input1, batch,
+                                                         dim1)),
+                                    "input_data2": TensorConfig(
+                                        data_gen=partial(generate_input2,
+                                                         input2_shape)),
+                                },
+                                outputs=["mul4_output"])
+
+                            yield program_config
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/mlu/CMakeLists.txt b/python/paddle/fluid/tests/unittests/mlu/CMakeLists.txt
index c17790bd3200e2bac9841c1198572af6e1740420..340378225261ff829e4faa334a471cea8e88f4d9 100644
--- a/python/paddle/fluid/tests/unittests/mlu/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/mlu/CMakeLists.txt
@@ -13,13 +13,19 @@ if (WITH_MLU)
     endforeach(TEST_OP)
 
     if(WITH_CNCL)
-	foreach(TEST_OP ${TEST_DIST_OPS})
+        foreach(TEST_OP ${TEST_DIST_OPS})
             py_test_modules(${TEST_OP} MODULES ${TEST_OP})
         endforeach(TEST_OP)
         bash_test_modules(test_launch_async_mlu START_BASH test_launch_async_mlu.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR})
-	bash_test_modules(test_launch_cloud_mlu START_BASH test_launch_cloud_mlu.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR})
-	bash_test_modules(test_launch_nproc_mlu START_BASH test_launch_nproc_mlu.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR})
+        bash_test_modules(test_launch_cloud_mlu START_BASH test_launch_cloud_mlu.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR})
+        bash_test_modules(test_launch_nproc_mlu START_BASH test_launch_nproc_mlu.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR})
+        bash_test_modules(test_c_comm_init_op_mlu START_BASH test_c_comm_init_op_mlu.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR})
         set_tests_properties(test_collective_broadcast PROPERTIES TIMEOUT 120)
-	set_tests_properties(test_collective_allreduce PROPERTIES TIMEOUT 120)
+        set_tests_properties(test_collective_allreduce PROPERTIES TIMEOUT 120)
+	set_tests_properties(test_collective_allgather PROPERTIES TIMEOUT 120)
+        set_tests_properties(test_collective_broadcast_api_mlu PROPERTIES TIMEOUT 120)
+        set_tests_properties(test_collective_allreduce_api_mlu PROPERTIES TIMEOUT 120)
+	set_tests_properties(test_collective_allgather_api_mlu PROPERTIES TIMEOUT 120)
+        set_tests_properties(test_c_comm_init_op_mlu PROPERTIES TIMEOUT 120)
     endif(WITH_CNCL)
 endif()
diff --git a/python/paddle/fluid/tests/unittests/mlu/c_comm_init_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/c_comm_init_op_mlu.py
new file mode 100644
index 0000000000000000000000000000000000000000..e91f28e3b1db8687957d57e5fd22ef4a791cd2cb
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/mlu/c_comm_init_op_mlu.py
@@ -0,0 +1,71 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import os
+import paddle.fluid.core as core
+import paddle.fluid as fluid
+from paddle.distributed.fleet.base.private_helper_function import wait_server_ready
+import paddle
+
+paddle.enable_static()
+
+
+class TestCCommInitOp(unittest.TestCase):
+    def setUp(self):
+        self.endpoints = os.getenv("PADDLE_TRAINER_ENDPOINTS").split(',')
+        self.current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT")
+        self.nranks = len(self.endpoints)
+        self.rank = self.endpoints.index(self.current_endpoint)
+        self.mlu_id = int(os.getenv("FLAGS_selected_mlus"))
+        self.place = fluid.MLUPlace(self.mlu_id)
+        self.exe = fluid.Executor(self.place)
+        self.endpoints.remove(self.current_endpoint)
+        self.other_endpoints = self.endpoints
+        if self.rank == 0:
+            wait_server_ready(self.other_endpoints)
+
+    def test_specifying_devices(self):
+        program = fluid.Program()
+        block = program.global_block()
+        cncl_id_var = block.create_var(
+            name=fluid.unique_name.generate('cncl_id'),
+            persistable=True,
+            type=fluid.core.VarDesc.VarType.RAW)
+        block.append_op(
+            type='c_gen_cncl_id',
+            inputs={},
+            outputs={'Out': cncl_id_var},
+            attrs={
+                'rank': self.rank,
+                'endpoint': self.current_endpoint,
+                'other_endpoints': self.other_endpoints
+            })
+        block.append_op(
+            type='c_comm_init',
+            inputs={'X': cncl_id_var},
+            outputs={},
+            attrs={
+                'nranks': self.nranks,
+                'rank': self.rank,
+                'ring_id': 0,
+                'device_id': self.mlu_id
+            })
+        self.exe.run(program)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/mlu/collective_allgather_api.py b/python/paddle/fluid/tests/unittests/mlu/collective_allgather_api.py
new file mode 100755
index 0000000000000000000000000000000000000000..50ae6b1a169d7704e55cf2486e396e6e3b0663f0
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/mlu/collective_allgather_api.py
@@ -0,0 +1,55 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import numpy as np
+import argparse
+import os
+import sys
+import signal
+import time
+import socket
+from contextlib import closing
+from six import string_types
+import math
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.profiler as profiler
+import paddle.fluid.unique_name as nameGen
+from paddle.fluid import core
+import unittest
+from multiprocessing import Process
+import paddle.fluid.layers as layers
+from functools import reduce
+from test_collective_api_base_mlu import TestCollectiveAPIRunnerBase, runtime_main
+
+paddle.enable_static()
+
+
+class TestCollectiveAllgatherAPI(TestCollectiveAPIRunnerBase):
+    def __init__(self):
+        self.global_ring_id = 0
+
+    def get_model(self, main_prog, startup_program, rank):
+        with fluid.program_guard(main_prog, startup_program):
+            tensor_list = []
+            tindata = layers.data(
+                name="tindata", shape=[10, 1000], dtype='float32')
+            paddle.distributed.all_gather(tensor_list, tindata)
+            return tensor_list
+
+
+if __name__ == "__main__":
+    runtime_main(TestCollectiveAllgatherAPI, "allgather")
diff --git a/python/paddle/fluid/tests/unittests/mlu/collective_allgather_op.py b/python/paddle/fluid/tests/unittests/mlu/collective_allgather_op.py
new file mode 100755
index 0000000000000000000000000000000000000000..1058514f9ca24c6c9be8e6e48499c81c158a82ca
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/mlu/collective_allgather_op.py
@@ -0,0 +1,71 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import numpy as np
+import argparse
+import os
+import sys
+import signal
+import time
+from contextlib import closing
+from six import string_types
+import math
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.profiler as profiler
+import paddle.fluid.unique_name as nameGen
+from paddle.fluid import core
+import unittest
+from multiprocessing import Process
+import paddle.fluid.layers as layers
+from functools import reduce
+from test_collective_base_mlu import TestCollectiveRunnerBase, runtime_main
+
+paddle.enable_static()
+
+
+class TestCollectiveAllgather(TestCollectiveRunnerBase):
+    def __init__(self):
+        self.global_ring_id = 0
+
+    def get_model(self, main_prog, startup_program):
+        ring_id = 0
+        nranks = 2
+        with fluid.program_guard(main_prog, startup_program):
+            tindata = layers.data(
+                name="tindata", shape=[10, 1000], dtype='float32')
+            toutdata = main_prog.current_block().create_var(
+                name="outofgather",
+                dtype='float32',
+                type=core.VarDesc.VarType.LOD_TENSOR,
+                persistable=False,
+                stop_gradient=False)
+            main_prog.global_block().append_op(
+                type="c_allgather",
+                inputs={'X': tindata},
+                attrs={'ring_id': ring_id,
+                       'nranks': nranks},
+                outputs={'Out': toutdata})
+            main_prog.global_block().append_op(
+                type="c_sync_comm_stream",
+                inputs={'X': toutdata},
+                outputs={'Out': toutdata},
+                attrs={'ring_id': ring_id})
+            return toutdata
+
+
+if __name__ == "__main__":
+    runtime_main(TestCollectiveAllgather, "allgather", 0)
diff --git a/python/paddle/fluid/tests/unittests/mlu/collective_allreduce_api.py b/python/paddle/fluid/tests/unittests/mlu/collective_allreduce_api.py
new file mode 100644
index 0000000000000000000000000000000000000000..ebe4e71d22fdee38779766177031bfcea64c6974
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/mlu/collective_allreduce_api.py
@@ -0,0 +1,54 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import numpy as np
+import argparse
+import os
+import sys
+import signal
+import time
+import socket
+from contextlib import closing
+from six import string_types
+import math
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.profiler as profiler
+import paddle.fluid.unique_name as nameGen
+from paddle.fluid import core
+import unittest
+from multiprocessing import Process
+import paddle.fluid.layers as layers
+from functools import reduce
+from test_collective_api_base_mlu import TestCollectiveAPIRunnerBase, runtime_main
+
+paddle.enable_static()
+
+
+class TestCollectiveAllreduceAPI(TestCollectiveAPIRunnerBase):
+    def __init__(self):
+        self.global_ring_id = 0
+
+    def get_model(self, main_prog, startup_program, rank):
+        with fluid.program_guard(main_prog, startup_program):
+            tindata = layers.data(
+                name="tindata", shape=[10, 1000], dtype='float32')
+            paddle.distributed.all_reduce(tindata)
+            return [tindata]
+
+
+if __name__ == "__main__":
+    runtime_main(TestCollectiveAllreduceAPI, "allreduce")
diff --git a/python/paddle/fluid/tests/unittests/mlu/collective_broadcast_api.py b/python/paddle/fluid/tests/unittests/mlu/collective_broadcast_api.py
new file mode 100644
index 0000000000000000000000000000000000000000..2002909ea2eec5e85b91fad0c53ed2c480a01ef0
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/mlu/collective_broadcast_api.py
@@ -0,0 +1,54 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import numpy as np
+import argparse
+import os
+import sys
+import signal
+import time
+import socket
+from contextlib import closing
+from six import string_types
+import math
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.profiler as profiler
+import paddle.fluid.unique_name as nameGen
+from paddle.fluid import core
+import unittest
+from multiprocessing import Process
+import paddle.fluid.layers as layers
+from functools import reduce
+from test_collective_api_base_mlu import TestCollectiveAPIRunnerBase, runtime_main
+
+paddle.enable_static()
+
+
+class TestCollectiveBroadcastAPI(TestCollectiveAPIRunnerBase):
+    def __init__(self):
+        self.global_ring_id = 0
+
+    def get_model(self, main_prog, startup_program, rank):
+        with fluid.program_guard(main_prog, startup_program):
+            tindata = layers.data(
+                name="tindata", shape=[10, 1000], dtype="float32")
+            paddle.distributed.broadcast(tindata, src=1)
+            return [tindata]
+
+
+if __name__ == "__main__":
+    runtime_main(TestCollectiveBroadcastAPI, "broadcast")
diff --git a/python/paddle/fluid/tests/unittests/mlu/test_amp_check_finite_and_scale_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_amp_check_finite_and_scale_op_mlu.py
new file mode 100644
index 0000000000000000000000000000000000000000..57fa56acd687582fa67c1592a7d5c505ca6cce06
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/mlu/test_amp_check_finite_and_scale_op_mlu.py
@@ -0,0 +1,145 @@
+#  Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import numpy as np
+import unittest
+import sys
+sys.path.append("..")
+from op_test import OpTest
+import paddle
+
+paddle.enable_static()
+SEED = 2022
+
+
+class TestCheckFiniteAndUnscaleOp(OpTest):
+    def setUp(self):
+        self.set_mlu()
+        self.op_type = "check_finite_and_unscale"
+        self.init_dtype()
+        self.init_test_case()
+
+    def init_test_case(self):
+        x = np.random.random((129, 129)).astype(self.dtype)
+        scale = np.random.random((1)).astype(self.dtype)
+
+        self.inputs = {'X': [('x0', x)], 'Scale': scale}
+        self.outputs = {
+            'FoundInfinite': np.array([0]),
+            'Out': [('out0', x / scale)],
+        }
+
+    def set_mlu(self):
+        self.__class__.use_mlu = True
+        self.place = paddle.device.MLUPlace(0)
+
+    def init_dtype(self):
+        self.dtype = np.float32
+
+    def test_check_output(self):
+        self.check_output_with_place(self.place)
+
+
+class TestCheckFiniteAndUnscaleOpWithNan(TestCheckFiniteAndUnscaleOp):
+    def init_test_case(self):
+        x = np.random.random((129, 129)).astype(self.dtype)
+        x[128][128] = np.nan
+        scale = np.random.random((1)).astype(self.dtype)
+
+        self.inputs = {'X': [('x0', x)], 'Scale': scale}
+        self.outputs = {
+            'FoundInfinite': np.array([1]),
+            'Out': [('out0', x)],
+        }
+
+    def test_check_output(self):
+        # When input contains nan, do not check the output, 
+        # since the output may be nondeterministic and will be discarded.
+        self.check_output_with_place(self.place, no_check_set=['Out'])
+
+
+class TestCheckFiniteAndUnscaleOpWithInf(TestCheckFiniteAndUnscaleOp):
+    def init_test_case(self):
+        x = np.random.random((129, 129)).astype(self.dtype)
+        x[128][128] = np.inf
+        scale = np.random.random((1)).astype(self.dtype)
+
+        self.inputs = {'X': [('x0', x)], 'Scale': scale}
+        self.outputs = {
+            'FoundInfinite': np.array([1]),
+            'Out': [('out0', x)],
+        }
+
+    def test_check_output(self):
+        # When input contains inf, do not check the output, 
+        # since the output may be nondeterministic and will be discarded.
+        self.check_output_with_place(self.place, no_check_set=['Out'])
+
+
+class TestCheckFiniteAndUnscaleOpMultiInput(TestCheckFiniteAndUnscaleOp):
+    def init_test_case(self):
+        x0 = np.random.random((129, 129)).astype(self.dtype)
+        x1 = np.random.random((129, 129)).astype(self.dtype)
+        scale = np.random.random((1)).astype(self.dtype)
+
+        self.inputs = {'X': [('x0', x0), ('x1', x1)], 'Scale': scale}
+        self.outputs = {
+            'FoundInfinite': np.array([0]),
+            'Out': [('out0', x0 / scale), ('out1', x1 / scale)],
+        }
+
+
+class TestCheckFiniteAndUnscaleOpMultiInputWithNan(TestCheckFiniteAndUnscaleOp):
+    def init_test_case(self):
+        x0 = np.random.random((129, 129)).astype(self.dtype)
+        x0[128][128] = np.nan
+        x1 = np.random.random((129, 129)).astype(self.dtype)
+        scale = np.random.random((1)).astype(self.dtype)
+
+        self.inputs = {'X': [('x0', x0), ('x1', x1)], 'Scale': scale}
+        self.outputs = {
+            'FoundInfinite': np.array([1]),
+            'Out': [('out0', x0 / scale), ('out1', x1 / scale)],
+        }
+
+    def test_check_output(self):
+        # When input contains inf, do not check the output, 
+        # since the output may be nondeterministic and will be discarded.
+        self.check_output_with_place(self.place, no_check_set=['Out'])
+
+
+class TestCheckFiniteAndUnscaleOpMultiInputWithInf(TestCheckFiniteAndUnscaleOp):
+    def init_test_case(self):
+        x0 = np.random.random((129, 129)).astype(self.dtype)
+        x0[128][128] = np.nan
+        x1 = np.random.random((129, 129)).astype(self.dtype)
+        x1[128][128] = np.inf
+        scale = np.random.random((1)).astype(self.dtype)
+
+        self.inputs = {'X': [('x0', x0), ('x1', x1)], 'Scale': scale}
+        self.outputs = {
+            'FoundInfinite': np.array([1]),
+            'Out': [('out0', x0 / scale), ('out1', x1 / scale)],
+        }
+
+    def test_check_output(self):
+        # When input contains inf, do not check the output, 
+        # since the output may be nondeterministic and will be discarded.
+        self.check_output_with_place(self.place, no_check_set=['Out'])
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/mlu/test_c_comm_init_op_mlu.sh b/python/paddle/fluid/tests/unittests/mlu/test_c_comm_init_op_mlu.sh
new file mode 100644
index 0000000000000000000000000000000000000000..97f21798c115456852d0752f22f7534bbd3ab0d1
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/mlu/test_c_comm_init_op_mlu.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -e
+# use default values
+# FIXME: random fails on Unknown command lines -c (or -m).
+launch_py=${PADDLE_BINARY_DIR}/python/paddle/distributed/launch.py
+MLU_VISIBLE_DEVICES=0,1 python ${launch_py} c_comm_init_op_mlu.py
diff --git a/python/paddle/fluid/tests/unittests/mlu/test_collective_allgather.py b/python/paddle/fluid/tests/unittests/mlu/test_collective_allgather.py
new file mode 100644
index 0000000000000000000000000000000000000000..09166e15aac8191c39047aee1fb12cacf0195d55
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/mlu/test_collective_allgather.py
@@ -0,0 +1,55 @@
+#   Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import sys
+import unittest
+import numpy as np
+import paddle
+
+from test_collective_base_mlu import TestDistBase
+
+paddle.enable_static()
+
+
+class TestCAllgatherOp(TestDistBase):
+    def _setup_config(self):
+        pass
+
+    def test_allgather_fp32(self):
+        self.check_with_place("collective_allgather_op.py", "allgather",
+                              "float32")
+
+    def test_allgather_fp16(self):
+        self.check_with_place("collective_allgather_op.py", "allgather",
+                              "float16")
+
+    def test_allgather_int32(self):
+        self.check_with_place("collective_allgather_op.py", "allgather",
+                              "int32")
+
+    def test_allgather_int16(self):
+        self.check_with_place("collective_allgather_op.py", "allgather",
+                              "int16")
+
+    def test_allgather_int8(self):
+        self.check_with_place("collective_allgather_op.py", "allgather", "int8")
+
+    def test_allgather_uint8(self):
+        self.check_with_place("collective_allgather_op.py", "allgather",
+                              "uint8")
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/mlu/test_collective_allgather_api_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_collective_allgather_api_mlu.py
new file mode 100755
index 0000000000000000000000000000000000000000..576c310cc3ac2957cd659c248774a34ece422e87
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/mlu/test_collective_allgather_api_mlu.py
@@ -0,0 +1,43 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import unittest
+import numpy as np
+import paddle
+
+from test_collective_api_base_mlu import TestDistBase
+
+paddle.enable_static()
+
+
+class TestCollectiveAllgatherAPI(TestDistBase):
+    def _setup_config(self):
+        pass
+
+    def test_allgather_cncl_fp16(self):
+        self.check_with_place("collective_allgather_api.py", "allgather",
+                              "float16")
+
+    def test_allgather_cncl_fp32(self):
+        self.check_with_place("collective_allgather_api.py", "allgather",
+                              "float32")
+
+    def test_allgather_cncl_int32(self):
+        self.check_with_place("collective_allgather_api.py", "allgather",
+                              "int32")
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/mlu/test_collective_allreduce_api_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_collective_allreduce_api_mlu.py
new file mode 100644
index 0000000000000000000000000000000000000000..447498b9022d4b544eeff644f8a5dcd8afb4a800
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/mlu/test_collective_allreduce_api_mlu.py
@@ -0,0 +1,43 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import unittest
+import numpy as np
+import paddle
+
+from test_collective_api_base_mlu import TestDistBase
+
+paddle.enable_static()
+
+
+class TestCollectiveAllreduceAPI(TestDistBase):
+    def _setup_config(self):
+        pass
+
+    def test_allreduce_cncl_fp16(self):
+        self.check_with_place("collective_allreduce_api.py", "allreduce",
+                              "float16")
+
+    def test_allreduce_cncl_fp32(self):
+        self.check_with_place("collective_allreduce_api.py", "allreduce",
+                              "float32")
+
+    def test_allreduce_cncl_int32(self):
+        self.check_with_place("collective_allreduce_api.py", "allreduce",
+                              "int32")
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/mlu/test_collective_api_base_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_collective_api_base_mlu.py
new file mode 100644
index 0000000000000000000000000000000000000000..3c1cf7d2d1b2ba010511f2056759bae72328c16f
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/mlu/test_collective_api_base_mlu.py
@@ -0,0 +1,229 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import numpy as np
+import unittest
+import os
+import sys
+import subprocess
+import pickle
+from contextlib import closing
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid import core
+
+
+def DataTypeCast(date_type):
+    np_data_type = None
+
+    if date_type == "float16":
+        np_data_type = np.float16
+    elif date_type == "float32":
+        np_data_type = np.float32
+    elif date_type == "int32":
+        np_data_type = np.int32
+    else:
+        raise ValueError("This data type is not support!")
+
+    return np_data_type
+
+
+class TestCollectiveAPIRunnerBase(object):
+    def get_model(self, train_prog, startup_prog, rank, indata=None):
+        raise NotImplementedError(
+            "get model should be implemented by child class.")
+
+    def run_trainer(self, args):
+        train_prog = fluid.Program()
+        startup_prog = fluid.Program()
+        endpoints = args["endpoints"].split(",")
+        rank = args["trainerid"]
+        current_endpoint = args["currentendpoint"]
+        nranks = 2
+        paddle.distributed.init_parallel_env()
+        device_id = int(os.getenv("FLAGS_selected_mlus", "0"))
+        place = fluid.MLUPlace(device_id)
+        np.random.seed(os.getpid())
+        np_data_type = DataTypeCast(args["data_type"])
+        indata = np.random.random((10, 1000)).astype(np_data_type)
+        if args['static_mode']:
+            result = self.get_model(train_prog, startup_prog, rank)
+            exe = fluid.Executor(place)
+            exe.run(startup_prog)
+            fetch_list = []
+            for elem in result:
+                fetch_list.append(elem.name)
+            out = exe.run(train_prog,
+                          feed={'tindata': indata},
+                          fetch_list=fetch_list)
+        else:
+            out = self.get_model(train_prog, startup_prog, rank, indata)
+            #print(out, sys.stderr)
+        sys.stdout.buffer.write(pickle.dumps(out))
+
+
+def runtime_main(test_class, col_type):
+    args = {}
+    model = test_class()
+    args["trainerid"] = int(os.getenv("PADDLE_TRAINER_ID"))
+    args["trainernum"] = int(os.getenv("PADDLE_TRAINERS_NUM"))
+    args["endpoints"] = os.getenv('PADDLE_TRAINER_ENDPOINTS')
+    args["currentendpoint"] = os.getenv("PADDLE_CURRENT_ENDPOINT")
+    args["col_type"] = col_type
+    args["backend"] = os.getenv("BACKEND")
+    args["path_id"] = int(os.getenv("PATH_ID"))
+    args["static_mode"] = int(os.getenv("STATIC_MODE"))
+    args["data_type"] = os.getenv("DATA_TYPE")
+    model.run_trainer(args)
+
+
+import paddle.compat as cpt
+import socket
+from contextlib import closing
+
+
+class TestDistBase(unittest.TestCase):
+    def setUp(self):
+        self._port_set = set()
+        self._trainers = 2
+        self._ps_endpoints = "127.0.0.1:%s,127.0.0.1:%s" % (
+            self._find_free_port(), self._find_free_port())
+        self._python_interp = sys.executable
+
+    def _find_free_port(self):
+        def __free_port():
+            with closing(socket.socket(socket.AF_INET,
+                                       socket.SOCK_STREAM)) as s:
+                s.bind(('', 0))
+                return s.getsockname()[1]
+
+        while True:
+            port = __free_port()
+            if port not in self._port_set:
+                self._port_set.add(port)
+                return port
+
+    def _run_cluster(self, model_file, envs):
+        worker_endpoints = self._ps_endpoints.split(",")
+        w0_ep, w1_ep = worker_endpoints
+        #print("w0_ep:",w0_ep," w1_ep:",w1_ep)
+        env0 = {
+            "FLAGS_selected_mlus": "0",
+            "PADDLE_TRAINER_ID": "0",
+            "PADDLE_TRAINERS_NUM": "2",
+            "PADDLE_TRAINER_ENDPOINTS": self._ps_endpoints,
+            "PADDLE_CURRENT_ENDPOINT": w0_ep
+        }
+
+        env1 = {
+            "FLAGS_selected_mlus": "1",
+            "PADDLE_TRAINER_ID": "1",
+            "PADDLE_TRAINERS_NUM": "2",
+            "PADDLE_TRAINER_ENDPOINTS": self._ps_endpoints,
+            "PADDLE_CURRENT_ENDPOINT": w1_ep
+        }
+        #update environment
+        env0.update(envs)
+        env1.update(envs)
+        if os.getenv('WITH_COVERAGE', 'OFF') == 'ON':
+            tr_cmd = "%s -m coverage run --branch -p %s"
+        else:
+            tr_cmd = "%s %s"
+        tr0_cmd = tr_cmd % (self._python_interp, model_file)
+        tr1_cmd = tr_cmd % (self._python_interp, model_file)
+        tr0_pipe = open("/tmp/tr0_err_%d.log" % os.getpid(), "w")
+        tr1_pipe = open("/tmp/tr1_err_%d.log" % os.getpid(), "w")
+        #print(tr0_cmd) 
+        tr0_proc = subprocess.Popen(
+            tr0_cmd.strip().split(),
+            stdout=subprocess.PIPE,
+            stderr=tr0_pipe,
+            env=env0)
+
+        tr1_proc = subprocess.Popen(
+            tr0_cmd.strip().split(),
+            stdout=subprocess.PIPE,
+            stderr=tr1_pipe,
+            env=env1)
+
+        tr0_out, tr0_err = tr0_proc.communicate()
+        tr1_out, tr1_err = tr1_proc.communicate()
+        sys.stderr.write('trainer 0 stderr: %s\n' % tr0_err)
+        sys.stderr.write('trainer 1 stderr: %s\n' % tr1_err)
+        # close trainer file
+        tr0_pipe.close()
+        tr1_pipe.close()
+        with open("/tmp/tr0_err_%d.log" % os.getpid(), "r") as f:
+            sys.stderr.write('trainer 0 stderr file: %s\n' % f.read())
+        with open("/tmp/tr1_err_%d.log" % os.getpid(), "r") as f:
+            sys.stderr.write('trainer 1 stderr file: %s\n' % f.read())
+        return pickle.loads(tr0_out), pickle.loads(
+            tr1_out), tr0_proc.pid, tr1_proc.pid
+
+    def check_with_place(self,
+                         model_file,
+                         col_type,
+                         data_type,
+                         path_id="0",
+                         static_mode="1",
+                         check_error_log=False,
+                         need_envs={}):
+        required_envs = {
+            "FLAGS_fraction_of_gpu_memory_to_use": "0.15",
+            "FLAGS_eager_delete_tensor_gb": "0.0",
+            "PATH": os.getenv("PATH"),
+            "PYTHONPATH": os.getenv("PYTHONPATH", ""),
+            "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""),
+            "LD_PRELOAD": os.getenv("LD_PRELOAD", ""),
+            "FLAGS_call_stack_level": "2",
+            "GLOG_v": "3",
+            "STATIC_MODE": static_mode,
+            "PADDLE_WITH_GLOO": '0',
+            "BACKEND": "cncl",
+            "PATH_ID": path_id,
+            "DATA_TYPE": data_type
+        }
+        required_envs.update(need_envs)
+        if check_error_log:
+            required_envs["GLOG_v"] = "3"
+            required_envs["GLOG_logtostderr"] = "1"
+            required_envs["GLOO_LOG_LEVEL"] = "TRACE"
+        tr0_out, tr1_out, pid0, pid1 = self._run_cluster(model_file,
+                                                         required_envs)
+        np_data_type = DataTypeCast(data_type)
+        np.random.seed(pid0)
+        input1 = np.random.random((10, 1000)).astype(np_data_type)
+        np.random.seed(pid1)
+        input2 = np.random.random((10, 1000)).astype(np_data_type)
+        if col_type == "broadcast":
+            need_result = input2
+            self.assertTrue(np.allclose(tr0_out, need_result))
+            self.assertTrue(np.allclose(tr1_out, need_result))
+        elif col_type == "allreduce":
+            need_result = input1 + input2
+            self.assertTrue(
+                np.allclose(
+                    tr0_out, need_result, rtol=1e-05, atol=1e-05))
+            self.assertTrue(
+                np.allclose(
+                    tr1_out, need_result, rtol=1e-05, atol=1e-05))
+        elif col_type == "allgather":
+            need_result = np.vstack((input1, input2))
+            tr_out0 = np.vstack((tr0_out[0], tr0_out[1]))
+            tr_out1 = np.vstack((tr1_out[0], tr1_out[1]))
+            self.assertTrue(np.allclose(tr_out0, need_result))
+            self.assertTrue(np.allclose(tr_out1, need_result))
+        else:
+            pass
diff --git a/python/paddle/fluid/tests/unittests/mlu/test_collective_base_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_collective_base_mlu.py
index 4692c893d00b4595c1927f01c7e1b55dd6935c70..9c2e2205eb8761f93f4d16ab8934c9f1f256d92d 100644
--- a/python/paddle/fluid/tests/unittests/mlu/test_collective_base_mlu.py
+++ b/python/paddle/fluid/tests/unittests/mlu/test_collective_base_mlu.py
@@ -270,5 +270,9 @@ class TestDistBase(unittest.TestCase):
             self.assertTrue(
                 np.allclose(
                     tr1_out, need_result, rtol=1e-05, atol=1e-05))
+        elif col_type == "allgather":
+            need_result = np.vstack((input1, input2))
+            self.assertTrue(np.allclose(tr0_out, need_result))
+            self.assertTrue(np.allclose(tr1_out, need_result))
         else:
             pass
diff --git a/python/paddle/fluid/tests/unittests/mlu/test_collective_broadcast_api_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_collective_broadcast_api_mlu.py
new file mode 100644
index 0000000000000000000000000000000000000000..95919f33328691269ce2ffcdfa9b771dd9c6cf96
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/mlu/test_collective_broadcast_api_mlu.py
@@ -0,0 +1,43 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import unittest
+import numpy as np
+import paddle
+
+from test_collective_api_base_mlu import TestDistBase
+
+paddle.enable_static()
+
+
+class TestCollectiveBroadcastAPI(TestDistBase):
+    def _setup_config(self):
+        pass
+
+    def test_broadcast_cncl_fp16(self):
+        self.check_with_place("collective_broadcast_api.py", "broadcast",
+                              "float16")
+
+    def test_broadcast_cncl_fp32(self):
+        self.check_with_place("collective_broadcast_api.py", "broadcast",
+                              "float32")
+
+    def test_broadcast_cncl_int32(self):
+        self.check_with_place("collective_broadcast_api.py", "broadcast",
+                              "int32")
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/mlu/test_merged_momentum_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_merged_momentum_op_mlu.py
new file mode 100644
index 0000000000000000000000000000000000000000..f3699da15b5356d2bf25341261be0c237e037ce5
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/mlu/test_merged_momentum_op_mlu.py
@@ -0,0 +1,373 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import sys
+sys.path.append('..')
+import unittest
+import paddle
+import numpy as np
+from paddle.fluid.layer_helper import LayerHelper
+from collections import OrderedDict
+
+
+def run_momentum_op(params,
+                    grads,
+                    velocitys,
+                    master_params,
+                    learning_rate,
+                    place,
+                    multi_precision,
+                    mu=0.9,
+                    rescale_grad=0.01,
+                    use_merged=False):
+    assert len(params) == len(grads)
+    assert len(params) == len(velocitys)
+    if multi_precision:
+        assert len(params) == len(master_params)
+    op_type = 'merged_momentum' if use_merged else 'momentum'
+    main = paddle.static.Program()
+    startup = paddle.static.Program()
+    with paddle.static.program_guard(main, startup):
+        helper = LayerHelper(op_type, **locals())
+        attrs = {
+            'mu': mu,
+            'multi_precision': multi_precision,
+            'rescale_grad': rescale_grad,
+        }
+
+        param_vars = [
+            helper.create_variable(
+                persistable=True, shape=p.shape, dtype=p.dtype) for p in params
+        ]
+        grad_vars = [
+            helper.create_variable(
+                shape=g.shape, dtype=g.dtype) for g in grads
+        ]
+        velocity_vars = [
+            helper.create_variable(
+                persistable=True, shape=v.shape, dtype=v.dtype)
+            for v in velocitys
+        ]
+        lr_var = helper.create_variable(
+            persistable=True,
+            shape=learning_rate.shape,
+            dtype=learning_rate.dtype)
+
+        feed_dict = OrderedDict()
+
+        feed_dict.update(
+            OrderedDict([(p_var.name, p_val)
+                         for p_var, p_val in zip(param_vars, params)]))
+        feed_dict.update(
+            OrderedDict([(v_var.name, v_val)
+                         for v_var, v_val in zip(velocity_vars, velocitys)]))
+        fetch_list = list(feed_dict.keys())
+
+        feed_dict.update(
+            OrderedDict([(g_var.name, g_val)
+                         for g_var, g_val in zip(grad_vars, grads)]))
+        feed_dict.update({lr_var.name: learning_rate})
+
+        if multi_precision:
+            master_param_vars = [
+                helper.create_variable(
+                    persistable=True, shape=p.shape, dtype=p.dtype)
+                for p in master_params
+            ]
+            feed_dict.update(
+                OrderedDict([(mp_var.name, mp_val)
+                             for mp_var, mp_val in zip(master_param_vars,
+                                                       master_params)]))
+            # CPUPlace does not use MasterParam
+            if isinstance(place, paddle.CUDAPlace):
+                fetch_list = fetch_list + [
+                    mp_var.name for mp_var in master_param_vars
+                ]
+        else:
+            master_param_vars = None
+
+        if not use_merged:
+            for i, (p, g,
+                    v) in enumerate(zip(param_vars, grad_vars, velocity_vars)):
+                inputs = {
+                    'Param': p,
+                    'Grad': g,
+                    'Velocity': v,
+                    'LearningRate': lr_var,
+                }
+                outputs = {'ParamOut': p, 'VelocityOut': v}
+                if multi_precision:
+                    inputs['MasterParam'] = master_param_vars[i]
+                    outputs['MasterParamOut'] = master_param_vars[i]
+                helper.append_op(
+                    type=op_type, inputs=inputs, outputs=outputs, attrs=attrs)
+        else:
+            inputs = {
+                'Param': param_vars,
+                'Grad': grad_vars,
+                'Velocity': velocity_vars,
+                'LearningRate': lr_var,
+            }
+            outputs = {'ParamOut': param_vars, 'VelocityOut': velocity_vars}
+            if multi_precision:
+                inputs['MasterParam'] = master_param_vars
+                outputs['MasterParamOut'] = master_param_vars
+            helper.append_op(
+                type=op_type, inputs=inputs, outputs=outputs, attrs=attrs)
+
+    exe = paddle.static.Executor(place)
+    with paddle.static.scope_guard(paddle.static.Scope()):
+        exe.run(startup)
+        return exe.run(main, feed=feed_dict, fetch_list=fetch_list)
+
+
+def run_momentum_op2(params,
+                     grads,
+                     velocitys,
+                     master_params,
+                     learning_rate,
+                     place,
+                     multi_precision,
+                     mu=0.9,
+                     rescale_grad=0.01,
+                     use_merged=False,
+                     use_nesterov=True):
+    assert len(params) == len(grads)
+    assert len(params) == len(velocitys)
+    if multi_precision:
+        assert len(params) == len(master_params)
+    op_type = 'merged_momentum' if use_merged else 'momentum'
+    main = paddle.static.Program()
+    startup = paddle.static.Program()
+    with paddle.static.program_guard(main, startup):
+        helper = LayerHelper(op_type, **locals())
+
+        param_vars = [
+            helper.create_variable(
+                persistable=True, shape=p.shape, dtype=p.dtype) for p in params
+        ]
+        grad_vars = [
+            helper.create_variable(
+                shape=g.shape, dtype=g.dtype) for g in grads
+        ]
+        velocity_vars = [
+            helper.create_variable(
+                persistable=True, shape=v.shape, dtype=v.dtype)
+            for v in velocitys
+        ]
+        lr_var = helper.create_variable(
+            persistable=True,
+            shape=learning_rate.shape,
+            dtype=learning_rate.dtype)
+
+        feed_dict = OrderedDict()
+
+        feed_dict.update(
+            OrderedDict([(p_var.name, p_val)
+                         for p_var, p_val in zip(param_vars, params)]))
+        feed_dict.update(
+            OrderedDict([(v_var.name, v_val)
+                         for v_var, v_val in zip(velocity_vars, velocitys)]))
+        fetch_list = list(feed_dict.keys())
+
+        feed_dict.update(
+            OrderedDict([(g_var.name, g_val)
+                         for g_var, g_val in zip(grad_vars, grads)]))
+        feed_dict.update({lr_var.name: learning_rate})
+
+        if multi_precision:
+            master_param_vars = [
+                helper.create_variable(
+                    persistable=True, shape=p.shape, dtype=p.dtype)
+                for p in master_params
+            ]
+            feed_dict.update(
+                OrderedDict([(mp_var.name, mp_val)
+                             for mp_var, mp_val in zip(master_param_vars,
+                                                       master_params)]))
+            # CPUPlace does not use MasterParam
+            if isinstance(place, paddle.CUDAPlace):
+                fetch_list = fetch_list + [
+                    mp_var.name for mp_var in master_param_vars
+                ]
+        else:
+            master_param_vars = None
+
+        if not use_merged:
+            for i, (p, g,
+                    v) in enumerate(zip(param_vars, grad_vars, velocity_vars)):
+                inputs = {
+                    'Param': p,
+                    'Grad': g,
+                    'Velocity': v,
+                    'LearningRate': lr_var,
+                }
+                outputs = {'ParamOut': p, 'VelocityOut': v}
+                if multi_precision:
+                    inputs['MasterParam'] = master_param_vars[i]
+                    outputs['MasterParamOut'] = master_param_vars[i]
+                attrs = {
+                    'mu': mu,
+                    'multi_precision': multi_precision,
+                    'rescale_grad': rescale_grad,
+                    'use_nesterov': use_nesterov,
+                    'regularization_method': 'l2_decay',
+                    'regularization_coeff': 2.0,
+                }
+                helper.append_op(
+                    type=op_type, inputs=inputs, outputs=outputs, attrs=attrs)
+        else:
+            inputs = {
+                'Param': param_vars,
+                'Grad': grad_vars,
+                'Velocity': velocity_vars,
+                'LearningRate': lr_var,
+            }
+            outputs = {'ParamOut': param_vars, 'VelocityOut': velocity_vars}
+            if multi_precision:
+                inputs['MasterParam'] = master_param_vars
+                outputs['MasterParamOut'] = master_param_vars
+            attrs = {
+                'mu': mu,
+                'multi_precision': multi_precision,
+                'rescale_grad': rescale_grad,
+                'use_nesterov': use_nesterov,
+                'regularization_method':
+                ['l2_decay' for i in range(len(param_vars))],
+                'regularization_coeff': [2.0 for i in range(len(param_vars))],
+            }
+            helper.append_op(
+                type=op_type, inputs=inputs, outputs=outputs, attrs=attrs)
+
+    exe = paddle.static.Executor(place)
+    with paddle.static.scope_guard(paddle.static.Scope()):
+        exe.run(startup)
+        return exe.run(main, feed=feed_dict, fetch_list=fetch_list)
+
+
+class TestMergedMomentum(unittest.TestCase):
+    def setUp(self):
+        paddle.enable_static()
+        self.shapes = [[3, 4], [2, 7], [5, 6], [7, 8]]
+        self.seed = 10
+        self.place = paddle.device.MLUPlace(0)
+        self.__class__.use_mlu = True
+
+    def gen_rand_data(self, shapes, dtype):
+        return [np.random.random(s).astype(dtype) for s in shapes]
+
+    def prepare_data(self, shapes, multi_precision, seed, place):
+        np.random.seed(seed)
+        mp_dtype = np.float32
+        dtype = np.float32
+        params = self.gen_rand_data(shapes, dtype)
+        grads = self.gen_rand_data(shapes, dtype)
+        velocitys = self.gen_rand_data(shapes, mp_dtype)
+        learning_rate = self.gen_rand_data([[1]], mp_dtype)[0]
+        if multi_precision:
+            master_params = [p.astype(mp_dtype) for p in params]
+        else:
+            master_params = None
+        return params, grads, velocitys, master_params, learning_rate
+
+    def check_with_place(self, place, multi_precision):
+        params, grads, velocitys, master_params, learning_rate = self.prepare_data(
+            self.shapes, multi_precision, self.seed, place)
+
+        def run_op(use_merged):
+            # MLU Momentum Op does not support rescale_grad 
+            rescale_grad = 1.0
+            return run_momentum_op(
+                params,
+                grads,
+                velocitys,
+                master_params,
+                learning_rate,
+                place,
+                multi_precision,
+                rescale_grad=rescale_grad,
+                use_merged=use_merged)
+
+        outs1 = run_op(True)
+        outs2 = run_op(False)
+        self.assertEqual(len(outs1), len(outs2))
+        for i, (out1, out2) in enumerate(zip(outs1, outs2)):
+            self.assertTrue(np.allclose(out1, out2, atol=1e-7))
+
+    def test_main(self):
+        self.check_with_place(self.place, multi_precision=False)
+
+
+class TestMergedMomentum2(unittest.TestCase):
+    def setUp(self):
+        paddle.enable_static()
+        self.shapes = [[3, 4], [2, 7], [5, 6], [7, 8]]
+        self.seed = 10
+        self.place = paddle.device.MLUPlace(0)
+        self.__class__.use_mlu = True
+
+    def gen_rand_data(self, shapes, dtype):
+        return [np.random.random(s).astype(dtype) for s in shapes]
+
+    def prepare_data(self, shapes, multi_precision, seed, place):
+        np.random.seed(seed)
+        mp_dtype = np.float32
+        dtype = np.float32  # np.float16
+        params = self.gen_rand_data(shapes, dtype)
+        grads = self.gen_rand_data(shapes, dtype)
+        velocitys = self.gen_rand_data(shapes, mp_dtype)
+        learning_rate = self.gen_rand_data([[1]], mp_dtype)[0]
+        if multi_precision:
+            master_params = [p.astype(mp_dtype) for p in params]
+        else:
+            master_params = None
+        return params, grads, velocitys, master_params, learning_rate
+
+    def check_with_place(self, place, multi_precision):
+        params, grads, velocitys, master_params, learning_rate = self.prepare_data(
+            self.shapes, multi_precision, self.seed, place)
+
+        def run_op(use_nesterov, use_merged):
+            # MLU Momentum Op does not support rescale_grad 
+            rescale_grad = 1.0
+            return run_momentum_op2(
+                params,
+                grads,
+                velocitys,
+                master_params,
+                learning_rate,
+                place,
+                multi_precision,
+                rescale_grad=rescale_grad,
+                use_merged=use_merged,
+                use_nesterov=use_nesterov)
+
+        outs1 = run_op(use_nesterov=True, use_merged=True)
+        outs2 = run_op(use_nesterov=True, use_merged=False)
+        self.assertEqual(len(outs1), len(outs2))
+        for i, (out1, out2) in enumerate(zip(outs1, outs2)):
+            self.assertTrue(np.allclose(out1, out2, atol=1e-7))
+
+        outs3 = run_op(use_nesterov=False, use_merged=True)
+        outs4 = run_op(use_nesterov=False, use_merged=False)
+        self.assertEqual(len(outs3), len(outs4))
+        for j, (out3, out4) in enumerate(zip(outs3, outs4)):
+            self.assertTrue(np.allclose(out3, out4, atol=1e-7))
+
+    def test_main(self):
+        self.check_with_place(self.place, multi_precision=False)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/npu/process_group_hccl.py b/python/paddle/fluid/tests/unittests/npu/process_group_hccl.py
new file mode 100644
index 0000000000000000000000000000000000000000..37a24885be1bf646fb7fac92a3d9cece5ef17cfe
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/npu/process_group_hccl.py
@@ -0,0 +1,249 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import random
+import numpy as np
+import os
+import shutil
+
+import paddle
+from paddle.fluid import core
+from datetime import timedelta
+import paddle.fluid.core as core
+from paddle.fluid.framework import _test_eager_guard
+from paddle.fluid.dygraph.parallel import ParallelEnv
+
+
+def init_process_group(strategy=None):
+    nranks = ParallelEnv().nranks
+    rank = ParallelEnv().local_rank
+    is_master = True if rank == 0 else False
+    store = paddle.fluid.core.TCPStore("127.0.0.1", 6173, is_master, nranks)
+    pg_group = core.ProcessGroupHCCL(store, rank, nranks)
+
+    return pg_group
+
+
+class TestProcessGroupFp32(unittest.TestCase):
+    def setUp(self):
+        paddle.seed(2022)
+        random.seed(2022)
+        np.random.seed(2022)
+        self.config()
+
+    def config(self):
+        self.dtype = "float32"
+        self.shape = (2, 10, 5)
+
+    def test_create_process_group_nccl(self):
+        with _test_eager_guard():
+            paddle.set_device('npu:%d' %
+                              paddle.distributed.ParallelEnv().dev_id)
+
+            pg = init_process_group()
+
+            x = np.random.random(self.shape).astype(self.dtype)
+            tensor_x = paddle.to_tensor(x)
+            y = np.random.random(self.shape).astype(self.dtype)
+            tensor_y = paddle.to_tensor(y)
+
+            sum_result = tensor_x + tensor_y
+            if pg.rank() == 0:
+                task = pg.allreduce(tensor_x)
+                task.wait()
+                assert np.array_equal(tensor_x, sum_result)
+            else:
+                task = pg.allreduce(tensor_y)
+                task.wait()
+                assert np.array_equal(tensor_y, sum_result)
+
+            print("test allreduce sum api ok")
+
+            x = np.random.random(self.shape).astype(self.dtype)
+            tensor_x = paddle.to_tensor(x)
+            y = np.random.random(self.shape).astype(self.dtype)
+            tensor_y = paddle.to_tensor(y)
+
+            max_result = paddle.maximum(tensor_x, tensor_y)
+
+            if pg.rank() == 0:
+                task = pg.allreduce(tensor_x, core.ReduceOp.MAX)
+                task.wait()
+                assert np.array_equal(tensor_x, max_result)
+            else:
+                task = pg.allreduce(tensor_y, core.ReduceOp.MAX)
+                task.wait()
+                assert np.array_equal(tensor_y, max_result)
+
+            print("test allreduce max api ok")
+
+            # test broadcast
+            # rank 0
+            x = np.random.random(self.shape).astype(self.dtype)
+            tensor_x = paddle.to_tensor(x)
+            # rank 1
+            y = np.random.random(self.shape).astype(self.dtype)
+            tensor_y = paddle.to_tensor(y)
+
+            broadcast_result = paddle.assign(tensor_x)
+            if pg.rank() == 0:
+                task = pg.broadcast(tensor_x, 0)
+                task.synchronize()
+                paddle.device.cuda.synchronize()
+                assert task.is_completed()
+                assert np.array_equal(broadcast_result, tensor_x)
+            else:
+                task = pg.broadcast(tensor_y, 0)
+                task.synchronize()
+                paddle.device.cuda.synchronize()
+                assert task.is_completed()
+                assert np.array_equal(broadcast_result, tensor_y)
+
+            print("test broadcast api ok")
+
+            # test barrier
+            # rank 0
+            if pg.rank() == 0:
+                task = pg.barrier()
+                task.wait()
+            # rank 1
+            else:
+                task = pg.barrier()
+                task.wait()
+
+            print("test barrier api ok\n")
+            exit(0)
+
+            # test allgather
+            # rank 0
+            x = np.random.random(self.shape).astype(self.dtype)
+            y = np.random.random(self.shape).astype(self.dtype)
+            tensor_x = paddle.to_tensor(x)
+            tensor_y = paddle.to_tensor(y)
+            out_shape = list(self.shape)
+            out_shape[0] *= 2
+            out = np.random.random(out_shape).astype(self.dtype)
+            tensor_out = paddle.to_tensor(out)
+            if pg.rank() == 0:
+                task = pg.all_gather(tensor_x, tensor_out)
+                task.wait()
+                paddle.device.cuda.synchronize()
+            # rank 1
+            else:
+                task = pg.all_gather(tensor_y, tensor_out)
+                task.wait()
+                paddle.device.cuda.synchronize()
+            out_1 = paddle.slice(tensor_out, [0], [0], [out_shape[0] // 2])
+            out_2 = paddle.slice(tensor_out, [0], [out_shape[0] // 2],
+                                 [out_shape[0]])
+            assert np.array_equal(tensor_x, out_1)
+            assert np.array_equal(tensor_y, out_2)
+            print("test allgather api ok\n")
+
+            # test alltoall
+            # rank 0
+            x = np.random.random(self.shape).astype(self.dtype)
+            y = np.random.random(self.shape).astype(self.dtype)
+            out1 = np.random.random(self.shape).astype(self.dtype)
+            out2 = np.random.random(self.shape).astype(self.dtype)
+            tensor_x = paddle.to_tensor(x)
+            tensor_y = paddle.to_tensor(y)
+            tensor_out1 = paddle.to_tensor(out1)
+            tensor_out2 = paddle.to_tensor(out2)
+            raw_tensor_x_2 = paddle.slice(tensor_x, [0], [self.shape[0] // 2],
+                                          [self.shape[0]])
+            raw_tensor_y_1 = paddle.slice(tensor_y, [0], [0],
+                                          [self.shape[0] // 2])
+            if pg.rank() == 0:
+                task = pg.alltoall(tensor_x, tensor_out1)
+                task.wait()
+                paddle.device.cuda.synchronize()
+            # rank 1
+            else:
+                task = pg.alltoall(tensor_y, tensor_out2)
+                task.wait()
+                paddle.device.cuda.synchronize()
+            out1_2 = paddle.slice(tensor_out1, [0], [self.shape[0] // 2],
+                                  [self.shape[0]])
+            out2_1 = paddle.slice(tensor_out2, [0], [0], [self.shape[0] // 2])
+            if pg.rank() == 0:
+                assert np.array_equal(out1_2.numpy(), raw_tensor_y_1.numpy())
+            else:
+                assert np.array_equal(out2_1, raw_tensor_x_2)
+            print("test alltoall api ok\n")
+
+            # test Reduce
+            # rank 0
+            x = np.random.random(self.shape).astype(self.dtype)
+            y = np.random.random(self.shape).astype(self.dtype)
+            tensor_x = paddle.to_tensor(x)
+            tensor_y = paddle.to_tensor(y)
+            sum_result = tensor_x + tensor_y
+            if pg.rank() == 0:
+                task = pg.reduce(tensor_x, 0)
+                task.wait()
+                paddle.device.cuda.synchronize()
+            # rank 1
+            else:
+                task = pg.reduce(tensor_y, 0)
+                task.wait()
+                paddle.device.cuda.synchronize()
+            if pg.rank() == 0:
+                assert np.array_equal(tensor_x, sum_result)
+            print("test reduce sum api ok\n")
+
+            # test Scatter
+            # rank 0
+            in_shape = list(self.shape)
+            in_shape[0] *= 2
+            x = np.random.random(in_shape).astype(self.dtype)
+            y = np.random.random(self.shape).astype(self.dtype)
+            tensor_x = paddle.to_tensor(x)
+            tensor_y = paddle.to_tensor(y)
+            if pg.rank() == 0:
+                task = pg.scatter(tensor_x, tensor_y, 0)
+                task.wait()
+                paddle.device.cuda.synchronize()
+            # rank 1
+            else:
+                task = pg.scatter(tensor_x, tensor_y, 0)
+                task.wait()
+                paddle.device.cuda.synchronize()
+            out1 = paddle.slice(tensor_x, [0], [0], [self.shape[0]])
+            out2 = paddle.slice(tensor_x, [0], [self.shape[0]],
+                                [self.shape[0] * 2])
+            if pg.rank() == 0:
+                assert np.array_equal(tensor_y, out1)
+            else:
+                assert np.array_equal(tensor_y, out2)
+            print("test scatter api ok\n")
+
+
+class TestProcessGroupFp16(TestProcessGroupFp32):
+    def setUp(self):
+        paddle.seed(2022)
+        random.seed(2022)
+        np.random.seed(2022)
+        self.config()
+
+    def config(self):
+        self.dtype = "float16"
+        self.shape = (4, 20, 20)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/npu/test_collective_process_group_hccl.py b/python/paddle/fluid/tests/unittests/npu/test_collective_process_group_hccl.py
new file mode 100644
index 0000000000000000000000000000000000000000..9b2c6fae15eb4b19ba8d2dbe3e6c72a8c8ae9f4c
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/npu/test_collective_process_group_hccl.py
@@ -0,0 +1,29 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import sys
+sys.path.append("..")
+from test_parallel_dygraph_dataparallel import TestMultipleGpus
+
+
+class TestProcessGroup(TestMultipleGpus):
+    def test_process_group_nccl(self):
+        self.run_mnist_2gpu('process_group_hccl.py')
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py
index 628791afef5f66cd8eeddae7685d7a7ffdb6dd08..457f20ac5b06be0afb6929dc74148eb42b3ce4db 100644
--- a/python/paddle/fluid/tests/unittests/op_test.py
+++ b/python/paddle/fluid/tests/unittests/op_test.py
@@ -29,6 +29,7 @@ from copy import copy
 
 import paddle
 import paddle.fluid as fluid
+from paddle.fluid.framework import _dygraph_tracer
 import paddle.fluid.core as core
 from paddle.fluid.framework import _in_eager_mode
 from paddle.fluid.framework import _test_eager_guard
@@ -49,6 +50,7 @@ from paddle.fluid.tests.unittests.white_list import (
     no_check_set_white_list,
     op_threshold_white_list,
     no_grad_set_white_list, )
+from paddle.fluid.dygraph.dygraph_to_static.utils import parse_arg_and_kwargs
 
 
 def check_out_dtype(api_fn, in_specs, expect_dtypes, target_index=0, **configs):
@@ -395,6 +397,7 @@ class OpTest(unittest.TestCase):
             hasattr(self, "attrs") and "use_xpu" in self.attrs and
             self.attrs["use_xpu"] == True)
 
+    # set the self.output_dtype .
     def infer_dtype_from_inputs_outputs(self, inputs, outputs):
         def is_np_data(input):
             return isinstance(input, (np.ndarray, np.generic))
@@ -679,6 +682,133 @@ class OpTest(unittest.TestCase):
         else:
             return var_dict
 
+    def _check_api_outs_by_dygraph_outs(self, api_outs, dygraph_outs, place):
+        """ for quick verify, here we take a simplest strategy:
+                1. we only check variable in api_outs.
+                2. we simply check the numpy (tensor) .
+                3. we set atol and rtol as 1e-5, because they are unrelated to dtype.
+        """
+        for name in api_outs:
+            np_api = np.array(api_outs[name])
+            np_dyg = np.array(dygraph_outs[name])
+            self.assertTrue(
+                np.allclose(
+                    np_api, np_dyg, equal_nan=False),
+                "Output (" + name + ") has diff at " + str(place) + "\nExpect "
+                + str(np_dyg) + "\n" + "But Got" + str(np_api) + " in class " +
+                self.__class__.__name__)
+
+    def _calc_python_api_output(self, place):
+        def prepare_python_api_arguments(api, op_proto_ins, op_proto_attrs,
+                                         kernel_sig):
+            """ map from `op proto inputs and attrs` to `api input list and api attrs dict`
+            """
+
+            class Empty:
+                pass
+
+            def is_empty(a):
+                return isinstance(a, Empty)
+
+            def get_default(idx, all_params_number, defaults):
+                related_idx = idx - all_params_number + len(defaults)
+                assert related_idx >= 0, "%d-th arguments don't have default value" % idx
+                return defaults[related_idx]
+
+            def remove_name(x):
+                if isinstance(x, list): return [i for i in x if i != 'name']
+                if isinstance(x, dict):
+                    return {k: v for k, v in x.items() if k != 'name'}
+                assert False, "Only support list or dict."
+
+            def to_defaults_list(params, defaults):
+                return [defaults[p] for p in params if p in defaults]
+
+            # NOTE(xiongkun): why don't use input arguments dicts ? 
+            # Because we don't know the python api name of each arguments.
+            # using parse_arg_and_kwargs, we can get the all api information we need.
+            api_params, api_defaults = [
+                remove_name(item) for item in parse_arg_and_kwargs(api)
+            ]
+            api_defaults = to_defaults_list(api_params, api_defaults)
+            inputs_sig, attrs_sig, outputs_sig = kernel_sig
+            inputs_and_attrs = inputs_sig + attrs_sig
+            assert (
+                len(api_params) == len(inputs_and_attrs)
+            ), "inputs and attrs length must equals to python api length. (May be output is in argument list?)"
+            input_arguments = [op_proto_ins[name] for name in inputs_sig] + [
+                op_proto_attrs[name] if name in op_proto_attrs else Empty()
+                for name in attrs_sig
+            ]
+            results = []
+            for idx, arg in enumerate(input_arguments):
+                if is_empty(arg):
+                    results.append(
+                        get_default(idx, len(input_arguments), api_defaults))
+                else:
+                    results.append(arg)
+            return results
+
+        def construct_output_dict_by_kernel_sig(ret_tuple, output_sig):
+            if not isinstance(ret_tuple, (tuple, list)):
+                ret_tuple = [ret_tuple]
+            assert len(output_sig) == len(
+                ret_tuple), "expect %d outputs, but get %d outputs" % (
+                    len(output_sig), len(ret_tuple))
+            return {a: b for a, b in zip(output_sig, ret_tuple)}
+
+        def assumption_assert_and_transform(args, inp_num):
+            """
+            transform inputs by the following rules:
+                1. [Tensor] -> Tensor
+                2. [Tensor, Tensor, ...] -> list of Tensors
+
+            only support "X" is list of Tensor, currently don't support other structure like dict.
+            """
+            for inp in args[:inp_num]:
+                assert isinstance(
+                    inp, list
+                ), "currently only support `X` is [Tensor], don't support other structure."
+            args = [
+                inp[0] if len(inp) == 1 else inp for inp in args[:inp_num]
+            ] + args[inp_num:]
+            return args
+
+        def cal_python_api(python_api, args, kernel_sig):
+            inputs_sig, attrs_sig, outputs_sig = kernel_sig
+            args = assumption_assert_and_transform(args, len(inputs_sig))
+            ret_tuple = python_api(*args)
+            return construct_output_dict_by_kernel_sig(ret_tuple, outputs_sig)
+
+        with fluid.dygraph.base.guard(place=place):
+            block = fluid.default_main_program().global_block()
+            op_proto = OpProtoHolder.instance().get_op_proto(self.op_type)
+            # prepare input variable
+            inputs = self.append_input_output_for_dygraph(op_proto, self.inputs,
+                                                          True, False, block)
+            # prepare output variable
+            outputs = self.append_input_output_for_dygraph(
+                op_proto, self.outputs, False, False, block)
+
+            # prepare attrbutes
+            attrs_outputs = {}
+            if hasattr(self, "attrs"):
+                for attrs_name in self.attrs:
+                    if self.attrs[attrs_name] is not None:
+                        attrs_outputs[attrs_name] = self.attrs[attrs_name]
+
+            kernel_sig = _dygraph_tracer()._get_kernel_signature(
+                self.op_type, inputs, outputs, attrs_outputs)
+
+            assert hasattr(
+                self, "python_api"
+            ), "Please set the `self.python_api` if you want to compare python api output."
+            args = prepare_python_api_arguments(self.python_api, inputs,
+                                                attrs_outputs, kernel_sig)
+            """ we directly return the cal_python_api value because the value is already tensor. 
+            """
+            return cal_python_api(self.python_api, args, kernel_sig)
+
     def _calc_dygraph_output(self, place, parallel=False, no_check_set=None):
         self.__class__.op_type = self.op_type  # for ci check, please not delete it for now
         with fluid.dygraph.base.guard(place=place):
@@ -699,6 +829,7 @@ class OpTest(unittest.TestCase):
                 for attrs_name in self.attrs:
                     if self.attrs[attrs_name] is not None:
                         attrs_outputs[attrs_name] = self.attrs[attrs_name]
+
             block.append_op(
                 type=self.op_type,
                 inputs=inputs,
@@ -1150,10 +1281,17 @@ class OpTest(unittest.TestCase):
         if check_dygraph:
             dygraph_outs = self._calc_dygraph_output(
                 place, no_check_set=no_check_set)
+
         if check_eager:
             with _test_eager_guard():
                 eager_dygraph_outs = self._calc_dygraph_output(
                     place, no_check_set=no_check_set)
+            # we only check end2end api when check_eager=True
+            if hasattr(self, "python_api"):
+                api_outs = self._calc_python_api_output(place)
+                self._check_api_outs_by_dygraph_outs(api_outs, dygraph_outs,
+                                                     place)
+
         outs, fetch_list = self._calc_output(place, no_check_set=no_check_set)
 
         for out_name, out_dup in Operator.get_op_outputs(self.op_type):
diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_dataparallel_in_eager_mode.py b/python/paddle/fluid/tests/unittests/parallel_dygraph_dataparallel_in_eager_mode.py
new file mode 100644
index 0000000000000000000000000000000000000000..8ff68a1ce0d69307547db2fd1f83526094c9bfcf
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/parallel_dygraph_dataparallel_in_eager_mode.py
@@ -0,0 +1,127 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import division
+from __future__ import print_function
+
+import unittest
+import os
+import numpy as np
+import random
+
+import paddle
+import paddle.nn as nn
+from paddle.fluid.dygraph.nn import Linear
+import paddle.fluid.core as core
+from paddle.fluid.framework import _test_eager_guard
+import paddle.distributed as dist
+from paddle.fluid.dygraph.parallel import ParallelEnv
+from paddle.optimizer import SGD
+from paddle.fluid.initializer import NumpyArrayInitializer
+
+
+def init_process_group(strategy=None):
+    nranks = ParallelEnv().nranks
+    rank = ParallelEnv().local_rank
+    is_master = True if rank == 0 else False
+    store = paddle.fluid.core.TCPStore("127.0.0.1", 6172, is_master, nranks)
+    group = core.ProcessGroupNCCL(store, rank, nranks)
+    return group
+
+
+class LinearModel(nn.Layer):
+    def __init__(self, attr_list):
+        super(LinearModel, self).__init__()
+        self._linear1 = paddle.nn.Linear(
+            50, 30, weight_attr=attr_list[0], bias_attr=False)
+        self._linear2 = paddle.nn.Linear(
+            30, 10, weight_attr=attr_list[1], bias_attr=False)
+        self._linear3 = paddle.nn.Linear(
+            10, 10, weight_attr=attr_list[2], bias_attr=False)
+
+    def forward(self, x):
+        output = self._linear1(x)
+        output = self._linear2(output)
+        output = self._linear3(output)
+        return output
+
+
+class TestDistTraning(unittest.TestCase):
+    def test_multiple_gpus(self):
+        process_group = init_process_group()
+        self.generate_reducer("float32", process_group)
+        self.generate_reducer("float16", process_group)
+
+    def generate_reducer(self, dtype, process_group):
+        dev_id = ParallelEnv().dev_id
+        np.random.seed(2022 + dev_id)
+        paddle.set_default_dtype(dtype)
+
+        w_1 = paddle.ParamAttr(initializer=NumpyArrayInitializer(
+            np.random.rand(50, 30).astype(dtype)))
+        w_2 = paddle.ParamAttr(initializer=NumpyArrayInitializer(
+            np.random.rand(30, 10).astype(dtype)))
+        w_3 = paddle.ParamAttr(initializer=NumpyArrayInitializer(
+            np.random.rand(10, 10).astype(dtype)))
+
+        attr_list = [w_1, w_2, w_3]
+        inp = np.random.rand(10, 50).astype(dtype)
+
+        # original reducer
+        params_a = self.model_train(attr_list, inp)
+
+        # refactored reducer in eager mode
+        with _test_eager_guard():
+            params_b = self.model_train(
+                attr_list, inp, process_group=process_group)
+
+        for i in range(len(params_a)):
+            np.testing.assert_allclose(params_a[i].numpy(), params_b[i].numpy())
+
+    def model_train(self, attr_list, inp, process_group=None):
+        model = LinearModel(attr_list)
+        model = paddle.DataParallel(model, process_group=process_group)
+        optimizer = SGD(learning_rate=0.0003, parameters=model.parameters())
+
+        x = paddle.to_tensor(inp)
+        x.stop_gradient = False
+
+        for step in range(10):
+            y = model(x)
+            loss = y.mean()
+
+            loss.backward()
+            optimizer.step()
+            optimizer.clear_grad()
+
+        return model.parameters()
+
+
+class TestCatchErrors1(unittest.TestCase):
+    def test_multiple_gpus(self):
+        linear = paddle.nn.Linear(2, 4)
+        with _test_eager_guard():
+            self.assertRaises(RuntimeError, paddle.DataParallel, linear)
+
+
+class TestCatchErrors2(unittest.TestCase):
+    def test_multiple_gpus(self):
+        with _test_eager_guard():
+            linear = paddle.nn.Linear(2, 4)
+            self.assertRaises(RuntimeError, paddle.DataParallel, linear)
+
+
+if __name__ == '__main__':
+    dist.init_parallel_env()
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/process_group_gloo.py b/python/paddle/fluid/tests/unittests/process_group_gloo.py
new file mode 100644
index 0000000000000000000000000000000000000000..c62c4615f74707796946137d3b44efc3cc8aeee9
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/process_group_gloo.py
@@ -0,0 +1,194 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import random
+import numpy as np
+import os
+import shutil
+
+import paddle
+from paddle.fluid import core
+import datetime
+from datetime import timedelta
+import paddle.fluid.core as core
+from paddle.fluid.framework import _test_eager_guard
+from paddle.fluid.dygraph.parallel import ParallelEnv
+
+
+class TestProcessGroupFp32(unittest.TestCase):
+    def setUp(self):
+        paddle.seed(2022)
+        random.seed(2022)
+        np.random.seed(2022)
+        self.config()
+
+    def config(self):
+        self.dtype = "float32"
+        self.shape = (2, 10, 5)
+
+    def test_create_process_group_gloo(self):
+        with _test_eager_guard():
+            nranks = ParallelEnv().nranks
+            rank = ParallelEnv().local_rank
+            is_master = True if rank == 0 else False
+            store = paddle.fluid.core.TCPStore("127.0.0.1", 6172, is_master,
+                                               nranks, datetime.timedelta(0))
+            gloo_store = paddle.fluid.core.GlooStore(store)
+            opt = paddle.fluid.core.GlooOptions()
+            pg = paddle.fluid.core.ProcessGroupGloo(gloo_store, rank, nranks)
+
+            # test allreduce sum
+            # rank 0
+            paddle.device.set_device('cpu')
+            x = np.random.random(self.shape).astype(self.dtype)
+            tensor_x = paddle.to_tensor(x)
+            # rank 1
+            y = np.random.random(self.shape).astype(self.dtype)
+            tensor_y = paddle.to_tensor(y)
+
+            sum_result = x + y
+            if rank == 0:
+                task = pg.allreduce(tensor_x)
+                task.wait()
+                assert np.array_equal(tensor_x, sum_result)
+            else:
+                task = pg.allreduce(tensor_y)
+                task.wait()
+                assert np.array_equal(tensor_y, sum_result)
+
+            print("test allreduce sum api ok")
+
+            # test allreduce max
+            # rank 0
+            x = np.random.random(self.shape).astype(self.dtype)
+            tensor_x = paddle.to_tensor(x)
+            # rank 1
+            y = np.random.random(self.shape).astype(self.dtype)
+            tensor_y = paddle.to_tensor(y)
+
+            max_result = paddle.maximum(tensor_x, tensor_y)
+
+            if rank == 0:
+                task = pg.allreduce(tensor_x, core.ReduceOp.MAX)
+                task.wait()
+                assert np.array_equal(tensor_x, max_result)
+            else:
+                task = pg.allreduce(tensor_y, core.ReduceOp.MAX)
+                task.wait()
+                assert np.array_equal(tensor_y, max_result)
+
+            print("test allreduce max api ok")
+
+            # test broadcast
+            # rank 0
+            x = np.random.random(self.shape).astype(self.dtype)
+            tensor_x = paddle.to_tensor(x)
+            # rank 1
+            y = np.random.random(self.shape).astype(self.dtype)
+            tensor_y = paddle.to_tensor(y)
+
+            broadcast_result = paddle.assign(tensor_x)
+            if rank == 0:
+                task = pg.broadcast(tensor_x, 0)
+                assert np.array_equal(broadcast_result, tensor_x)
+            else:
+                task = pg.broadcast(tensor_y, 0)
+                assert np.array_equal(broadcast_result, tensor_y)
+            print("test broadcast api ok")
+
+            # test barrier
+            # rank 0
+            if pg.rank() == 0:
+                task = pg.barrier()
+                task.wait()
+            # rank 1
+            else:
+                task = pg.barrier()
+                task.wait()
+
+            print("test barrier api ok\n")
+
+            # test allgather
+            # rank 0
+            x = np.random.random(self.shape).astype(self.dtype)
+            y = np.random.random(self.shape).astype(self.dtype)
+            tensor_x = paddle.to_tensor(x)
+            tensor_y = paddle.to_tensor(y)
+            out_shape = list(self.shape)
+            out_shape[0] *= 2
+            out = np.random.random(out_shape).astype(self.dtype)
+            tensor_out = paddle.to_tensor(out)
+            if pg.rank() == 0:
+                task = pg.all_gather(tensor_x, tensor_out)
+                task.wait()
+                paddle.device.cuda.synchronize()
+            # rank 1
+            else:
+                task = pg.all_gather(tensor_y, tensor_out)
+                task.wait()
+            out_1 = paddle.slice(tensor_out, [0], [0], [out_shape[0] // 2])
+            out_2 = paddle.slice(tensor_out, [0], [out_shape[0] // 2],
+                                 [out_shape[0]])
+            assert np.array_equal(tensor_x, out_1)
+            assert np.array_equal(tensor_y, out_2)
+            print("test allgather api ok\n")
+
+            # test Reduce
+            # rank 0
+            x = np.random.random(self.shape).astype(self.dtype)
+            y = np.random.random(self.shape).astype(self.dtype)
+            tensor_x = paddle.to_tensor(x)
+            tensor_y = paddle.to_tensor(y)
+            sum_result = tensor_x + tensor_y
+            if pg.rank() == 0:
+                task = pg.reduce(tensor_x, 0)
+                task.wait()
+            # rank 1
+            else:
+                task = pg.reduce(tensor_y, 0)
+                task.wait()
+            if pg.rank() == 0:
+                assert np.array_equal(tensor_x, sum_result)
+            print("test reduce sum api ok\n")
+
+            # test Scatter
+            # rank 0
+            in_shape = list(self.shape)
+            in_shape[0] *= 2
+            x = np.random.random(in_shape).astype(self.dtype)
+            y = np.random.random(self.shape).astype(self.dtype)
+            tensor_x = paddle.to_tensor(x)
+            tensor_y = paddle.to_tensor(y)
+            if pg.rank() == 0:
+                task = pg.scatter(tensor_x, tensor_y, 0)
+                task.wait()
+            # rank 1
+            else:
+                task = pg.scatter(tensor_x, tensor_y, 0)
+                task.wait()
+            out1 = paddle.slice(tensor_x, [0], [0], [self.shape[0]])
+            out2 = paddle.slice(tensor_x, [0], [self.shape[0]],
+                                [self.shape[0] * 2])
+            if pg.rank() == 0:
+                assert np.array_equal(tensor_y, out1)
+            else:
+                assert np.array_equal(tensor_y, out2)
+            print("test scatter api ok\n")
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/process_group_nccl.py b/python/paddle/fluid/tests/unittests/process_group_nccl.py
index 8ec5d13c569fe85fab0f67a7961e183a2084dcf7..b1da0777feb3de1b1d6bb59a868802f736afb8e7 100644
--- a/python/paddle/fluid/tests/unittests/process_group_nccl.py
+++ b/python/paddle/fluid/tests/unittests/process_group_nccl.py
@@ -27,22 +27,13 @@ import paddle.fluid.core as core
 from paddle.fluid.framework import _test_eager_guard
 from paddle.fluid.dygraph.parallel import ParallelEnv
 
-ProcessGroupStrategy = core.ProcessGroupStrategy
-
 
 def init_process_group(strategy=None):
-    # this will remove
-    if strategy is None:
-        strategy = ProcessGroupStrategy()
-        strategy.nranks = ParallelEnv().nranks
-        strategy.local_rank = ParallelEnv().local_rank
-        strategy.trainer_endpoints = ParallelEnv().trainer_endpoints
-        strategy.current_endpoint = ParallelEnv().current_endpoint
-    if strategy.nranks < 2:
-        return
-
-    pg_group = core.ProcessGroupNCCL(strategy, strategy.local_rank,
-                                     strategy.nranks)
+    nranks = ParallelEnv().nranks
+    rank = ParallelEnv().local_rank
+    is_master = True if rank == 0 else False
+    store = paddle.fluid.core.TCPStore("127.0.0.1", 6173, is_master, nranks)
+    pg_group = core.ProcessGroupNCCL(store, rank, nranks)
 
     return pg_group
 
@@ -144,23 +135,109 @@ class TestProcessGroupFp32(unittest.TestCase):
 
             print("test barrier api ok\n")
 
-            # test send/recv
+            # test allgather
+            # rank 0
+            x = np.random.random(self.shape).astype(self.dtype)
+            y = np.random.random(self.shape).astype(self.dtype)
+            tensor_x = paddle.to_tensor(x)
+            tensor_y = paddle.to_tensor(y)
+            out_shape = list(self.shape)
+            out_shape[0] *= 2
+            out = np.random.random(out_shape).astype(self.dtype)
+            tensor_out = paddle.to_tensor(out)
+            if pg.rank() == 0:
+                task = pg.all_gather(tensor_x, tensor_out)
+                task.wait()
+                paddle.device.cuda.synchronize()
+            # rank 1
+            else:
+                task = pg.all_gather(tensor_y, tensor_out)
+                task.wait()
+                paddle.device.cuda.synchronize()
+            out_1 = paddle.slice(tensor_out, [0], [0], [out_shape[0] // 2])
+            out_2 = paddle.slice(tensor_out, [0], [out_shape[0] // 2],
+                                 [out_shape[0]])
+            assert np.array_equal(tensor_x, out_1)
+            assert np.array_equal(tensor_y, out_2)
+            print("test allgather api ok\n")
+
+            # test alltoall
+            # rank 0
+            x = np.random.random(self.shape).astype(self.dtype)
+            y = np.random.random(self.shape).astype(self.dtype)
+            out1 = np.random.random(self.shape).astype(self.dtype)
+            out2 = np.random.random(self.shape).astype(self.dtype)
+            tensor_x = paddle.to_tensor(x)
+            tensor_y = paddle.to_tensor(y)
+            tensor_out1 = paddle.to_tensor(out1)
+            tensor_out2 = paddle.to_tensor(out2)
+            raw_tensor_x_2 = paddle.slice(tensor_x, [0], [self.shape[0] // 2],
+                                          [self.shape[0]])
+            raw_tensor_y_1 = paddle.slice(tensor_y, [0], [0],
+                                          [self.shape[0] // 2])
+            if pg.rank() == 0:
+                task = pg.alltoall(tensor_x, tensor_out1)
+                task.wait()
+                paddle.device.cuda.synchronize()
+            # rank 1
+            else:
+                task = pg.alltoall(tensor_y, tensor_out2)
+                task.wait()
+                paddle.device.cuda.synchronize()
+            out1_2 = paddle.slice(tensor_out1, [0], [self.shape[0] // 2],
+                                  [self.shape[0]])
+            out2_1 = paddle.slice(tensor_out2, [0], [0], [self.shape[0] // 2])
+            if pg.rank() == 0:
+                assert np.array_equal(out1_2.numpy(), raw_tensor_y_1.numpy())
+            else:
+                assert np.array_equal(out2_1, raw_tensor_x_2)
+            print("test alltoall api ok\n")
+
+            # test Reduce
             # rank 0
             x = np.random.random(self.shape).astype(self.dtype)
+            y = np.random.random(self.shape).astype(self.dtype)
+            tensor_x = paddle.to_tensor(x)
+            tensor_y = paddle.to_tensor(y)
+            sum_result = tensor_x + tensor_y
+            if pg.rank() == 0:
+                task = pg.reduce(tensor_x, 0)
+                task.wait()
+                paddle.device.cuda.synchronize()
+            # rank 1
+            else:
+                task = pg.reduce(tensor_y, 0)
+                task.wait()
+                paddle.device.cuda.synchronize()
+            if pg.rank() == 0:
+                assert np.array_equal(tensor_x, sum_result)
+            print("test reduce sum api ok\n")
+
+            # test Scatter
+            # rank 0
+            in_shape = list(self.shape)
+            in_shape[0] *= 2
+            x = np.random.random(in_shape).astype(self.dtype)
+            y = np.random.random(self.shape).astype(self.dtype)
             tensor_x = paddle.to_tensor(x)
+            tensor_y = paddle.to_tensor(y)
             if pg.rank() == 0:
-                task = pg.send(tensor_x, dst=1)
+                task = pg.scatter(tensor_x, tensor_y, 0)
                 task.wait()
                 paddle.device.cuda.synchronize()
             # rank 1
             else:
-                y = np.random.random(self.shape).astype(self.dtype)
-                tensor_y = paddle.to_tensor(y)
-                task = pg.recv(tensor_y, src=0)
+                task = pg.scatter(tensor_x, tensor_y, 0)
                 task.wait()
                 paddle.device.cuda.synchronize()
-                assert np.array_equal(tensor_x, tensor_y)
-                print("test send/recv api ok\n")
+            out1 = paddle.slice(tensor_x, [0], [0], [self.shape[0]])
+            out2 = paddle.slice(tensor_x, [0], [self.shape[0]],
+                                [self.shape[0] * 2])
+            if pg.rank() == 0:
+                assert np.array_equal(tensor_y, out1)
+            else:
+                assert np.array_equal(tensor_y, out2)
+            print("test scatter api ok\n")
 
 
 class TestProcessGroupFp16(TestProcessGroupFp32):
diff --git a/python/paddle/fluid/tests/unittests/ps/ps_dnn_trainer.py b/python/paddle/fluid/tests/unittests/ps/ps_dnn_trainer.py
index bc87fc255a59bf1155a4d046c8e0ce68fb1a54f0..0fd64b0d923051cb3d3b9bf9eb9388d062a54ff0 100755
--- a/python/paddle/fluid/tests/unittests/ps/ps_dnn_trainer.py
+++ b/python/paddle/fluid/tests/unittests/ps/ps_dnn_trainer.py
@@ -382,6 +382,7 @@ class DnnTrainer(object):
             ps_optimizer = ParameterServerOptimizer(inner_optimizer)
             ps_optimizer._set_basic_info(loss, self.role_maker, inner_optimizer,
                                          user_defined_strategy)
+            ps_optimizer._set_origin_programs([loss])
             ps_optimizer._init_ps_pass_context(loss, startup_program)
             _main = ps_optimizer.pass_ctx._attrs['cloned_main']
 
diff --git a/python/paddle/fluid/tests/unittests/static_model_parallel_fused_attention.py b/python/paddle/fluid/tests/unittests/static_model_parallel_fused_attention.py
new file mode 100644
index 0000000000000000000000000000000000000000..b57f26776234eb65a57cc65df2ccd5a6a38a2144
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/static_model_parallel_fused_attention.py
@@ -0,0 +1,297 @@
+#   Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import numpy as np
+
+import paddle
+import paddle.fluid as fluid
+from test_dist_base import TestDistRunnerBase, runtime_main
+import paddle.distributed.fleet as fleet
+import paddle.incubate.nn.functional as incubate_f
+
+from paddle.fluid.data_feeder import check_variable_and_dtype, check_dtype
+from paddle.fluid.dygraph.layers import Layer
+from paddle.fluid.layer_helper import LayerHelper
+from paddle.fluid import core
+from paddle.nn.initializer import Constant
+
+paddle.enable_static()
+
+
+def _set_var_distributed(var):
+    if var is None:
+        return
+
+    var.is_distributed = True
+
+    # NOTE: use current_block and find_var_recursive to support while_loop
+    startup_block = paddle.static.default_startup_program().current_block()
+    main_block = paddle.static.default_main_program().current_block()
+    startup_block._find_var_recursive(var.name).is_distributed = True
+    main_block._find_var_recursive(var.name).is_distributed = True
+
+
+class ParallelFusedMultiHeadAttention(Layer):
+    def __init__(self,
+                 embed_dim,
+                 num_heads,
+                 dropout_rate=0.5,
+                 attn_dropout_rate=0.5,
+                 kdim=None,
+                 vdim=None,
+                 normalize_before=False,
+                 need_weights=False,
+                 qkv_weight_attr=None,
+                 qkv_bias_attr=None,
+                 linear_weight_attr=None,
+                 linear_bias_attr=None,
+                 pre_ln_scale_attr=None,
+                 pre_ln_bias_attr=None,
+                 ln_scale_attr=None,
+                 ln_bias_attr=None,
+                 epsilon=1e-5,
+                 nranks=1,
+                 ring_id=-1,
+                 name=None):
+        super(ParallelFusedMultiHeadAttention, self).__init__()
+
+        assert embed_dim > 0, ("Expected embed_dim to be greater than 0, "
+                               "but recieved {}".format(embed_dim))
+        assert num_heads > 0, ("Expected nhead to be greater than 0, "
+                               "but recieved {}".format(num_heads))
+
+        self.normalize_before = normalize_before
+        self._dtype = self._helper.get_default_dtype()
+        self._epsilon = epsilon
+        self._ring_id = ring_id
+
+        self.embed_dim = embed_dim
+        self.num_heads = num_heads
+        self.head_dim = embed_dim // num_heads
+        self.kdim = kdim
+        self.vdim = vdim
+        self.need_weights = need_weights
+        assert self.head_dim * num_heads == embed_dim, "embed_dim must be divisible by num_heads"
+        assert need_weights == False, "Only support need_weight is False now."
+
+        # tensor model parallel
+        assert num_heads % nranks == 0
+        num_heads = num_heads // nranks
+
+        self.qkv_weight = self.create_parameter(
+            shape=[3, num_heads, self.head_dim, embed_dim],
+            attr=qkv_weight_attr,
+            dtype=self._dtype,
+            is_bias=False)
+        self.qkv_bias = self.create_parameter(
+            shape=[3, num_heads, self.head_dim],
+            attr=qkv_bias_attr,
+            dtype=self._dtype,
+            is_bias=True)
+        self.linear_weight = self.create_parameter(
+            shape=[num_heads * self.head_dim, embed_dim],
+            attr=linear_weight_attr,
+            dtype=self._dtype,
+            is_bias=False)
+        self.linear_bias = self.create_parameter(
+            shape=[embed_dim],
+            attr=linear_bias_attr,
+            dtype=self._dtype,
+            is_bias=True)
+
+        # tensor model parallel
+        if nranks > 1:
+            assert ring_id != -1
+            # column parallel
+            _set_var_distributed(self.qkv_weight)
+            _set_var_distributed(self.qkv_bias)
+            # row parallel
+            _set_var_distributed(self.linear_weight)
+
+        if normalize_before:
+            self.pre_ln_scale = self.create_parameter(
+                attr=pre_ln_scale_attr,
+                shape=[embed_dim],
+                default_initializer=Constant(value=1.0))
+            self.pre_ln_bias = self.create_parameter(
+                attr=pre_ln_bias_attr, shape=[embed_dim], is_bias=True)
+            self.ln_scale = None
+            self.ln_bias = None
+        else:
+            self.pre_ln_scale = None
+            self.pre_ln_bias = None
+            self.ln_scale = self.create_parameter(
+                attr=ln_scale_attr,
+                shape=[embed_dim],
+                default_initializer=Constant(value=1.0))
+            self.ln_bias = self.create_parameter(
+                attr=ln_bias_attr, shape=[embed_dim], is_bias=True)
+
+        self.dropout_rate = dropout_rate
+        self.attn_dropout_rate = attn_dropout_rate
+
+        self.name = name
+
+    def forward(self, query, key=None, value=None, attn_mask=None, cache=None):
+        out = incubate_f.fused_multi_head_attention(
+            x=query,
+            qkv_weight=self.qkv_weight,
+            linear_weight=self.linear_weight,
+            pre_layer_norm=self.normalize_before,
+            pre_ln_scale=self.pre_ln_scale,
+            pre_ln_bias=self.pre_ln_bias,
+            ln_scale=self.ln_scale,
+            ln_bias=self.ln_bias,
+            pre_ln_epsilon=self._epsilon,
+            qkv_bias=self.qkv_bias,
+            linear_bias=self.linear_bias,
+            attn_mask=attn_mask,
+            dropout_rate=self.dropout_rate,
+            attn_dropout_rate=self.attn_dropout_rate,
+            ln_epsilon=self._epsilon,
+            training=self.training,
+            ring_id=self._ring_id,
+            name=self.name)
+        return out
+
+
+def get_param_attr(weight, bias):
+    weight_attr = paddle.ParamAttr(
+        initializer=fluid.initializer.NumpyArrayInitializer(weight))
+    bias_attr = paddle.ParamAttr(
+        initializer=fluid.initializer.NumpyArrayInitializer(bias))
+    return weight_attr, bias_attr
+
+
+DTYPE = "float32"
+MODEL_PARALLEL_SIZE = 2
+n_head = 2 * MODEL_PARALLEL_SIZE
+d_key = 4
+hidden = n_head * d_key
+
+
+def create_model(data, rank):
+    np.random.seed(2021)
+    pre_ln_w = np.random.uniform(-1, 1, size=(hidden, )).astype(DTYPE)
+    pre_ln_b = np.random.uniform(-1, 1, size=(hidden, )).astype(DTYPE)
+    qkv_w = np.random.uniform(
+        -1, 1, size=(3, n_head, d_key, hidden)).astype(DTYPE)
+    qkv_b = np.random.uniform(-1, 1, size=(3, n_head, d_key)).astype(DTYPE)
+    linear_w = np.random.uniform(
+        -1, 1, size=(n_head * d_key, hidden)).astype(DTYPE)
+    linear_b = np.random.uniform(-1, 1, size=(hidden, )).astype(DTYPE)
+
+    data.stop_gradient = False
+    if rank is not None:
+        start = 0 if rank == 0 else n_head // MODEL_PARALLEL_SIZE
+        end = start + n_head // MODEL_PARALLEL_SIZE
+        col_qkv_w = qkv_w[:, start:end, :, :]
+        col_qkv_b = qkv_b[:, start:end, :]
+        row_linear_w = linear_w[(start * d_key):(end * d_key), :]
+
+        pre_ln_w_attr, pre_ln_b_attr = get_param_attr(pre_ln_w, pre_ln_b)
+        qkv_w_attr, qkv_b_attr = get_param_attr(col_qkv_w, col_qkv_b)
+        linear_w_attr, linear_b_attr = get_param_attr(row_linear_w, linear_b)
+
+        attn = ParallelFusedMultiHeadAttention(
+            hidden,
+            n_head,
+            dropout_rate=0.0,
+            attn_dropout_rate=0.0,
+            normalize_before=False,
+            qkv_weight_attr=qkv_w_attr,
+            qkv_bias_attr=qkv_b_attr,
+            linear_weight_attr=linear_w_attr,
+            linear_bias_attr=linear_b_attr,
+            pre_ln_scale_attr=pre_ln_w_attr,
+            pre_ln_bias_attr=pre_ln_b_attr,
+            ln_scale_attr=pre_ln_w_attr,
+            ln_bias_attr=pre_ln_b_attr,
+            nranks=MODEL_PARALLEL_SIZE,
+            ring_id=0)
+        result = attn(data)
+    else:
+        pre_ln_w_attr, pre_ln_b_attr = get_param_attr(pre_ln_w, pre_ln_b)
+        qkv_w_attr, qkv_b_attr = get_param_attr(qkv_w, qkv_b)
+        linear_w_attr, linear_b_attr = get_param_attr(linear_w, linear_b)
+
+        attn = ParallelFusedMultiHeadAttention(
+            hidden,
+            n_head,
+            dropout_rate=0.0,
+            attn_dropout_rate=0.0,
+            normalize_before=False,
+            qkv_weight_attr=qkv_w_attr,
+            qkv_bias_attr=qkv_b_attr,
+            linear_weight_attr=linear_w_attr,
+            linear_bias_attr=linear_b_attr,
+            pre_ln_scale_attr=pre_ln_w_attr,
+            pre_ln_bias_attr=pre_ln_b_attr,
+            ln_scale_attr=pre_ln_w_attr,
+            ln_bias_attr=pre_ln_b_attr)
+        result = attn(data)
+
+    predict = paddle.sum(result)
+    return predict
+
+
+class TestModelParallel(TestDistRunnerBase):
+    def get_model(self, batch_size=2, use_dgc=False, dist_strategy=None):
+        # Input data
+        seq_len = 2
+        data_in = fluid.data(
+            name='data_in', shape=[batch_size, seq_len, hidden], dtype=DTYPE)
+
+        if dist_strategy:
+            data_loader = fluid.io.DataLoader.from_generator(
+                feed_list=[data_in],
+                capacity=64,
+                use_double_buffer=False,
+                iterable=False)
+
+        if dist_strategy:
+            fleet.init(is_collective=True)
+            strategy = fleet.DistributedStrategy()
+            strategy.tensor_parallel = True
+            strategy.tensor_parallel_configs = {'tensor_parallel_degree': 2}
+
+        rank = fleet.worker_index() if dist_strategy else None
+        avg_cost = create_model(data_in, rank)
+        opt = fluid.optimizer.SGD(0.1)
+
+        if dist_strategy:
+            dist_opt = fleet.distributed_optimizer(
+                optimizer=opt, strategy=strategy)
+            dist_opt.minimize(avg_cost)
+        else:
+            opt.minimize(avg_cost)
+
+        def gen_data():
+            np.random.seed(2021)
+            while True:
+                data = [np.random.random([seq_len, hidden]).astype(DTYPE)]
+                yield data
+
+        train_reader = paddle.batch(gen_data, batch_size=batch_size)
+
+        if dist_strategy:
+            return None, avg_cost, train_reader, None, None, None, data_loader
+        else:
+            return None, avg_cost, train_reader, None, None, None
+
+
+if __name__ == "__main__":
+    runtime_main(TestModelParallel)
diff --git a/python/paddle/fluid/tests/unittests/static_model_parallel_fused_feedforward.py b/python/paddle/fluid/tests/unittests/static_model_parallel_fused_feedforward.py
new file mode 100644
index 0000000000000000000000000000000000000000..5f467da6a6465467dbf0c64122b6933df92a4cbc
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/static_model_parallel_fused_feedforward.py
@@ -0,0 +1,384 @@
+#   Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import numpy as np
+
+import paddle
+import paddle.fluid as fluid
+from test_dist_base import TestDistRunnerBase, runtime_main
+import paddle.distributed.fleet as fleet
+
+from paddle.fluid.data_feeder import check_variable_and_dtype, check_dtype
+from paddle.fluid.dygraph.layers import Layer
+from paddle.fluid.layer_helper import LayerHelper
+from paddle.nn.initializer import Constant
+
+paddle.enable_static()
+
+DTYPE = "float32"
+MODEL_PARALLEL_SIZE = 2
+IN_SIZE = 2 * MODEL_PARALLEL_SIZE
+OUT_SIZE = 2 * MODEL_PARALLEL_SIZE
+
+
+def fused_feedforward(x,
+                      linear1_weight,
+                      linear2_weight,
+                      linear1_bias=None,
+                      linear2_bias=None,
+                      ln1_scale=None,
+                      ln1_bias=None,
+                      ln2_scale=None,
+                      ln2_bias=None,
+                      dropout1_rate=0.5,
+                      dropout2_rate=0.5,
+                      activation="relu",
+                      ln1_epsilon=1e-5,
+                      ln2_epsilon=1e-5,
+                      pre_layer_norm=False,
+                      training=True,
+                      mode='upscale_in_train',
+                      ring_id=-1,
+                      name=None):
+    seed = None
+    if mode not in ('downscale_in_infer', 'upscale_in_train'):
+        raise ValueError(
+            "mode argument should be 'downscale_in_infer' or 'upscale_in_train'")
+    mode = 'downgrade_in_infer' if mode == 'downscale_in_infer' else mode  #semantic transfer
+
+    helper = LayerHelper("fused_feedforward")
+    dtype = x.dtype
+    check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'],
+                             'fused_feedforward')
+    check_dtype(dtype, 'dtype', ['float16', 'float32', 'float64'],
+                'fused_feedforward')
+
+    out = helper.create_variable_for_type_inference(x.dtype)
+    dropout1_mask = helper.create_variable_for_type_inference(
+        'uint8', stop_gradient=True)
+    dropout2_mask = helper.create_variable_for_type_inference(
+        'uint8', stop_gradient=True)
+    ln1_mean = helper.create_variable_for_type_inference(
+        x.dtype, stop_gradient=True)
+    ln1_variance = helper.create_variable_for_type_inference(
+        x.dtype, stop_gradient=True)
+    ln2_mean = helper.create_variable_for_type_inference(
+        x.dtype, stop_gradient=True)
+    ln2_variance = helper.create_variable_for_type_inference(
+        x.dtype, stop_gradient=True)
+    linear1_out = helper.create_variable_for_type_inference(
+        x.dtype, stop_gradient=True)
+    ln1_out = helper.create_variable_for_type_inference(
+        x.dtype, stop_gradient=True)
+    dropout1_out = helper.create_variable_for_type_inference(
+        x.dtype, stop_gradient=True)
+    dropout2_out = helper.create_variable_for_type_inference(
+        x.dtype, stop_gradient=True)
+
+    if (seed is None or seed == 0) and helper.main_program.random_seed != 0:
+        seed = helper.main_program.random_seed
+
+    helper.append_op(
+        type='fused_feedforward',
+        inputs={
+            'X': x,
+            'Linear1Weight': linear1_weight,
+            'Linear1Bias': linear1_bias,
+            'Linear2Weight': linear2_weight,
+            'Linear2Bias': linear2_bias,
+            'Ln1Scale': ln1_scale,
+            'Ln1Bias': ln1_bias,
+            'Ln2Scale': ln2_scale,
+            'Ln2Bias': ln2_bias,
+        },
+        outputs={
+            'Out': out,
+            'Dropout1Mask': dropout1_mask,
+            'Dropout2Mask': dropout2_mask,
+            'Ln1Mean': ln1_mean,
+            'Ln1Variance': ln1_variance,
+            'Ln2Mean': ln2_mean,
+            'Ln2Variance': ln2_variance,
+            'Linear1Out': linear1_out,
+            'Ln1Out': ln1_out,
+            'Dropout1Out': dropout1_out,
+            'Dropout2Out': dropout2_out,
+        },
+        attrs={
+            'dropout1_rate': dropout1_rate,
+            'dropout2_rate': dropout2_rate,
+            'act_method': activation,
+            'pre_layer_norm': pre_layer_norm,
+            'ln1_epsilon': ln1_epsilon,
+            'ln2_epsilon': ln2_epsilon,
+            'dropout1_is_test': not training,
+            'dropout2_is_test': not training,
+            'dropout1_fix_seed': seed is not None,
+            'dropout2_fix_seed': seed is not None,
+            'dropout1_seed': seed if seed is not None else 0,
+            'dropout2_seed': seed if seed is not None else 0,
+            'dropout1_implementation': mode,
+            'dropout2_implementation': mode,
+            'ring_id': ring_id,
+        })
+    return out
+
+
+def _set_var_distributed(var):
+    if var is None:
+        return
+
+    var.is_distributed = True
+
+    # NOTE: use current_block and find_var_recursive to support while_loop
+    startup_block = paddle.static.default_startup_program().current_block()
+    main_block = paddle.static.default_main_program().current_block()
+    startup_block._find_var_recursive(var.name).is_distributed = True
+    main_block._find_var_recursive(var.name).is_distributed = True
+
+
+class ParallelFusedFeedForward(Layer):
+    def __init__(self,
+                 d_model,
+                 dim_feedforward,
+                 dropout_rate=0.1,
+                 epsilon=1e-05,
+                 activation="relu",
+                 act_dropout_rate=None,
+                 normalize_before=False,
+                 linear1_weight_attr=None,
+                 linear1_bias_attr=None,
+                 linear2_weight_attr=None,
+                 linear2_bias_attr=None,
+                 ln1_scale_attr=None,
+                 ln1_bias_attr=None,
+                 ln2_scale_attr=None,
+                 ln2_bias_attr=None,
+                 nranks=1,
+                 ring_id=-1,
+                 name=None):
+        super(ParallelFusedFeedForward, self).__init__()
+        assert d_model > 0, (
+            "Expected d_model to be greater than 0, but recieved {}".format(
+                d_model))
+        assert dim_feedforward > 0, (
+            "Expected dim_feedforward to be greater than 0, but recieved {}".
+            format(dim_feedforward))
+
+        self._dtype = self._helper.get_default_dtype()
+        self._d_model = d_model
+
+        assert dim_feedforward % nranks == 0
+        dim_feedforward = dim_feedforward // nranks
+        self._dim_feedforward = dim_feedforward
+        self._dropout_rate = dropout_rate
+        self._act_dropout_rate = dropout_rate if act_dropout_rate is None else act_dropout_rate
+        self._act_method = activation
+        self._normalize_before = normalize_before
+        self._epsilon = epsilon
+        self._ring_id = ring_id
+
+        self._linear1_weight = self.create_parameter(
+            shape=[d_model, dim_feedforward],
+            attr=linear1_weight_attr,
+            dtype=self._dtype,
+            is_bias=False)
+        self._linear1_bias = self.create_parameter(
+            shape=[dim_feedforward],
+            attr=linear1_bias_attr,
+            dtype=self._dtype,
+            is_bias=True)
+
+        self._linear2_weight = self.create_parameter(
+            shape=[dim_feedforward, d_model],
+            attr=linear2_weight_attr,
+            dtype=self._dtype,
+            is_bias=False)
+
+        self._linear2_bias = self.create_parameter(
+            shape=[d_model],
+            attr=linear2_bias_attr,
+            dtype=self._dtype,
+            is_bias=True)
+
+        if nranks > 1:
+            assert ring_id != -1
+            # column parallel
+            _set_var_distributed(self._linear1_weight)
+            _set_var_distributed(self._linear1_bias)
+            _set_var_distributed(self._linear2_weight)
+
+        if normalize_before:
+            self._ln1_scale = self.create_parameter(
+                shape=[d_model],
+                attr=ln1_scale_attr,
+                is_bias=False,
+                default_initializer=Constant(1.0))
+            self._ln1_bias = self.create_parameter(
+                shape=[d_model], attr=ln1_bias_attr, is_bias=True)
+            self._ln2_scale = None
+            self._ln2_bias = None
+        else:
+            self._ln1_bias = None
+            self._ln2_bias = None
+            self._ln2_scale = self.create_parameter(
+                shape=[d_model],
+                attr=ln2_scale_attr,
+                is_bias=False,
+                default_initializer=Constant(1.0))
+            self._ln2_bias = self.create_parameter(
+                shape=[d_model], attr=ln2_bias_attr, is_bias=True)
+
+        self.name = name
+
+    def forward(self, src, cache=None):
+        out = fused_feedforward(
+            src,
+            self._linear1_weight,
+            self._linear2_weight,
+            self._linear1_bias,
+            self._linear2_bias,
+            self._ln1_scale,
+            self._ln1_bias,
+            self._ln2_scale,
+            self._ln2_bias,
+            dropout1_rate=self._act_dropout_rate,
+            dropout2_rate=self._dropout_rate,
+            activation=self._act_method,
+            ln1_epsilon=self._epsilon,
+            ln2_epsilon=self._epsilon,
+            pre_layer_norm=self._normalize_before,
+            training=self.training,
+            ring_id=self._ring_id,
+            name=self.name)
+        return out
+
+
+def get_param_attr(weight, bias):
+    weight_attr = paddle.ParamAttr(
+        initializer=fluid.initializer.NumpyArrayInitializer(weight))
+    bias_attr = paddle.ParamAttr(
+        initializer=fluid.initializer.NumpyArrayInitializer(bias))
+    return weight_attr, bias_attr
+
+
+def create_model(data, rank):
+    np.random.seed(2021)
+    ln_w = np.random.uniform(-1, 1, size=(IN_SIZE, )).astype(DTYPE)
+    ln_b = np.random.uniform(-1, 1, size=(IN_SIZE, )).astype(DTYPE)
+    w0 = np.random.uniform(-1, 1, size=(IN_SIZE, OUT_SIZE)).astype(DTYPE)
+    b0 = np.random.uniform(-1, 1, size=(OUT_SIZE, )).astype(DTYPE)
+    w1 = np.random.uniform(-1, 1, size=(OUT_SIZE, IN_SIZE)).astype(DTYPE)
+    b1 = np.random.uniform(-1, 1, size=(IN_SIZE, )).astype(DTYPE)
+    data.stop_gradient = False
+    if rank is not None:
+        start = 0 if rank == 0 else OUT_SIZE // MODEL_PARALLEL_SIZE
+        end = start + OUT_SIZE // MODEL_PARALLEL_SIZE
+        col_w0 = w0[:, start:end]
+        col_b0 = b0[start:end]
+        row_w1 = w1[start:end, :]
+
+        ln_w_attr, ln_b_attr = get_param_attr(ln_w, ln_b)
+        w0_attr, b0_attr = get_param_attr(col_w0, col_b0)
+        w1_attr, b1_attr = get_param_attr(row_w1, b1)
+
+        ffn = ParallelFusedFeedForward(
+            IN_SIZE,
+            OUT_SIZE,
+            dropout_rate=0.0,
+            activation='gelu',
+            normalize_before=True,
+            linear1_weight_attr=w0_attr,
+            linear1_bias_attr=b0_attr,
+            linear2_weight_attr=w1_attr,
+            linear2_bias_attr=b1_attr,
+            ln1_scale_attr=ln_w_attr,
+            ln1_bias_attr=ln_b_attr,
+            nranks=MODEL_PARALLEL_SIZE,
+            ring_id=0)
+        #ffn.eval()
+        result = ffn(data)
+    else:
+        ln_w_attr, ln_b_attr = get_param_attr(ln_w, ln_b)
+        w0_attr, b0_attr = get_param_attr(w0, b0)
+        w1_attr, b1_attr = get_param_attr(w1, b1)
+
+        ffn = ParallelFusedFeedForward(
+            IN_SIZE,
+            OUT_SIZE,
+            dropout_rate=0.0,
+            activation='gelu',
+            normalize_before=True,
+            linear1_weight_attr=w0_attr,
+            linear1_bias_attr=b0_attr,
+            linear2_weight_attr=w1_attr,
+            linear2_bias_attr=b1_attr,
+            ln1_scale_attr=ln_w_attr,
+            ln1_bias_attr=ln_b_attr)
+        #ffn.eval()
+        result = ffn(data)
+
+    predict = paddle.sum(result)
+    return predict
+
+
+class TestModelParallel(TestDistRunnerBase):
+    def get_model(self, batch_size=2, use_dgc=False, dist_strategy=None):
+        # Input data
+        seq_len = 2
+        data_in = fluid.data(
+            name='data_in', shape=[batch_size, seq_len, IN_SIZE], dtype=DTYPE)
+
+        if dist_strategy:
+            data_loader = fluid.io.DataLoader.from_generator(
+                feed_list=[data_in],
+                capacity=64,
+                use_double_buffer=False,
+                iterable=False)
+
+        if dist_strategy:
+            fleet.init(is_collective=True)
+            strategy = fleet.DistributedStrategy()
+            strategy.tensor_parallel = True
+            strategy.tensor_parallel_configs = {'tensor_parallel_degree': 2}
+
+        rank = fleet.worker_index() if dist_strategy else None
+        avg_cost = create_model(data_in, rank)
+        opt = fluid.optimizer.SGD(0.1)
+
+        if dist_strategy:
+            dist_opt = fleet.distributed_optimizer(
+                optimizer=opt, strategy=strategy)
+            dist_opt.minimize(avg_cost)
+        else:
+            opt.minimize(avg_cost)
+
+        def gen_data():
+            np.random.seed(2021)
+            while True:
+                data = [np.random.random([seq_len, IN_SIZE]).astype(DTYPE)]
+                yield data
+
+        train_reader = paddle.batch(gen_data, batch_size=batch_size)
+
+        if dist_strategy:
+            return None, avg_cost, train_reader, None, None, None, data_loader
+        else:
+            return None, avg_cost, train_reader, None, None, None
+
+
+if __name__ == "__main__":
+    runtime_main(TestModelParallel)
diff --git a/python/paddle/fluid/tests/unittests/test_activation_op.py b/python/paddle/fluid/tests/unittests/test_activation_op.py
index d3d8fdd703148c184b6a06c5382655c3ed5ba044..5c40b898d2325ba97c42807ed77be91dc76aa623 100755
--- a/python/paddle/fluid/tests/unittests/test_activation_op.py
+++ b/python/paddle/fluid/tests/unittests/test_activation_op.py
@@ -183,6 +183,34 @@ class TestSigmoid(TestActivation):
         self.check_grad(['X'], 'Out', max_relative_error=0.01)
 
 
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
+class TestSigmoidBF16(OpTest):
+    def setUp(self):
+        self.op_type = "sigmoid"
+        self.init_dtype()
+
+        np.random.seed(1024)
+        x = np.random.uniform(-1, 1, [11, 17]).astype(np.float32)
+        out = 1 / (1 + np.exp(-x))
+
+        self.inputs = {
+            'X': OpTest.np_dtype_to_fluid_dtype(convert_float_to_uint16(x))
+        }
+        self.outputs = {'Out': convert_float_to_uint16(out)}
+
+    def init_dtype(self):
+        self.dtype = np.uint16
+
+    def test_check_output(self):
+        place = core.CUDAPlace(0)
+        self.check_output_with_place(place)
+
+    def test_check_grad(self):
+        place = core.CUDAPlace(0)
+        self.check_grad_with_place(place, ['X'], 'Out')
+
+
 class TestSilu(TestActivation):
     def setUp(self):
         self.op_type = "silu"
@@ -945,6 +973,34 @@ class TestSqrt(TestActivation, TestParameter):
         self.check_grad(['X'], 'Out')
 
 
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
+class TestSqrtBF16(OpTest):
+    def setUp(self):
+        self.op_type = "sqrt"
+        self.init_dtype()
+
+        np.random.seed(1023)
+        x = np.random.uniform(0.1, 1, [11, 17]).astype(np.float32)
+        out = np.sqrt(x)
+
+        self.inputs = {
+            'X': OpTest.np_dtype_to_fluid_dtype(convert_float_to_uint16(x))
+        }
+        self.outputs = {'Out': convert_float_to_uint16(out)}
+
+    def init_dtype(self):
+        self.dtype = np.uint16
+
+    def test_check_output(self):
+        place = core.CUDAPlace(0)
+        self.check_output_with_place(place)
+
+    def test_check_grad(self):
+        place = core.CUDAPlace(0)
+        self.check_grad_with_place(place, ['X'], 'Out')
+
+
 class TestRsqrt(TestActivation):
     def setUp(self):
         self.op_type = "rsqrt"
@@ -983,7 +1039,7 @@ class TestAbs(TestActivation):
     def test_check_grad(self):
         if self.dtype == np.float16:
             return
-        self.check_grad(['X'], 'Out')
+        self.check_grad(['X'], 'Out', check_eager=True)
 
 
 class TestCeil(TestActivation):
@@ -2195,6 +2251,34 @@ class TestSquare(TestActivation):
         self.check_grad(['X'], 'Out', max_relative_error=0.007)
 
 
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
+class TestSquareBF16(OpTest):
+    def setUp(self):
+        self.op_type = "square"
+        self.init_dtype()
+
+        np.random.seed(1024)
+        x = np.random.uniform(0.1, 1, [11, 17]).astype(np.float32)
+        out = np.square(x)
+
+        self.inputs = {
+            'X': OpTest.np_dtype_to_fluid_dtype(convert_float_to_uint16(x))
+        }
+        self.outputs = {'Out': convert_float_to_uint16(out)}
+
+    def init_dtype(self):
+        self.dtype = np.uint16
+
+    def test_check_output(self):
+        place = core.CUDAPlace(0)
+        self.check_output_with_place(place)
+
+    def test_check_grad(self):
+        place = core.CUDAPlace(0)
+        self.check_grad_with_place(place, ['X'], 'Out', numeric_grad_delta=0.5)
+
+
 class TestPow(TestActivation):
     def setUp(self):
         self.op_type = "pow"
@@ -2433,6 +2517,35 @@ class TestSoftplus(TestActivation):
         self.check_grad(['X'], 'Out')
 
 
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
+class TestSoftplusBF16(OpTest):
+    def setUp(self):
+        self.op_type = "softplus"
+        self.init_dtype()
+
+        beta = 2
+        threshold = 15
+
+        np.random.seed(1024)
+        x = np.random.uniform(-1, 1, [10, 12]).astype(np.float32)
+        out = ref_softplus(x, beta, threshold)
+        self.inputs = {'X': convert_float_to_uint16(x)}
+        self.attrs = {'beta': beta, "threshold": threshold}
+        self.outputs = {'Out': convert_float_to_uint16(out)}
+
+    def init_dtype(self):
+        self.dtype = np.uint16
+
+    def test_check_output(self):
+        place = core.CUDAPlace(0)
+        self.check_output_with_place(place)
+
+    def test_check_grad(self):
+        place = core.CUDAPlace(0)
+        self.check_grad_with_place(place, ['X'], 'Out', numeric_grad_delta=0.05)
+
+
 class TestSoftplusAPI(unittest.TestCase):
     # test paddle.nn.Softplus, paddle.nn.functional.softplus
     def setUp(self):
diff --git a/python/paddle/fluid/tests/unittests/test_apply_pass_to_program.py b/python/paddle/fluid/tests/unittests/test_apply_pass_to_program.py
index 4552d600bafd748633b45b9d25293026c5b0cf2e..2b281d7d6f7c5f186fd73ab3152a2e7652ae6ce1 100644
--- a/python/paddle/fluid/tests/unittests/test_apply_pass_to_program.py
+++ b/python/paddle/fluid/tests/unittests/test_apply_pass_to_program.py
@@ -162,6 +162,7 @@ class TestIRPassBase(unittest.TestCase):
         for k, v in self.get_strategy().items():
             setattr(build_strategy, k, v)
         self.check_before_applied(main2, startup2)
+
         apply_build_strategy(main2, startup2, build_strategy,
                              {"use_cuda": self.use_cuda})
         self.check_after_applied(main2, startup2)
diff --git a/python/paddle/fluid/tests/unittests/test_batch_norm_op.py b/python/paddle/fluid/tests/unittests/test_batch_norm_op.py
index cce13a8bf3b74a7641710e853ff6c48e86ccba63..b02df024518a86fd152b89ae890b92f0f6df3b32 100644
--- a/python/paddle/fluid/tests/unittests/test_batch_norm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_batch_norm_op.py
@@ -320,7 +320,7 @@ class TestBatchNormOpInference(unittest.TestCase):
 
     def test_check_output(self):
         places = [core.CPUPlace()]
-        if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"):
+        if core.is_compiled_with_cuda():
             places.append(core.CUDAPlace(0))
 
         for place in places:
@@ -342,13 +342,13 @@ class TestFP16BatchNormOpInference(TestBatchNormOpInference):
 
     def test_check_output(self):
         places = []
-        if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"):
+        if core.is_compiled_with_cuda():
             place = core.CUDAPlace(0)
             if core.is_float16_supported(place):
                 places.append(place)
-
         for place in places:
-            for data_format in ["NCHW", "NHWC"]:
+            #for data_format in ["NCHW", "NHWC"]:
+            for data_format in ["NCHW"]:
                 self.check_with_place(place, data_format, self.dtype,
                                       [2, 3, 4, 5])
                 self.check_with_place(place, data_format, self.dtype, [2, 3])
@@ -517,7 +517,7 @@ class TestBatchNormOpTraining(unittest.TestCase):
 
         places = [core.CPUPlace()]
 
-        if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"):
+        if core.is_compiled_with_cuda():
             places.append(core.CUDAPlace(0))
 
         for place in places:
@@ -657,7 +657,7 @@ class TestDygraphBatchNormAPIError(unittest.TestCase):
 class TestDygraphBatchNormTrainableStats(unittest.TestCase):
     def test_dygraph(self):
         places = [fluid.CPUPlace()]
-        if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"):
+        if core.is_compiled_with_cuda():
             places.append(fluid.CUDAPlace(0))
         for p in places:
             shape = [4, 10, 4, 4]
@@ -678,7 +678,7 @@ class TestDygraphBatchNormTrainableStats(unittest.TestCase):
 
     def test_static(self):
         places = [fluid.CPUPlace()]
-        if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"):
+        if core.is_compiled_with_cuda():
             places.append(fluid.CUDAPlace(0))
         for p in places:
             exe = fluid.Executor(p)
@@ -716,4 +716,6 @@ class TestDygraphBatchNormOpenReserveSpace(unittest.TestCase):
 
 
 if __name__ == '__main__':
+    import paddle
+    paddle.enable_static()
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py b/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py
index 6a6f85a48320681b430ab9d6a9363c28cf5c912e..c9abac8fb7946d51987645e11673f5d64a06c4ce 100644
--- a/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py
+++ b/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py
@@ -28,7 +28,7 @@ import paddle
 class TestBatchNorm(unittest.TestCase):
     def test_name(self):
         places = [fluid.CPUPlace()]
-        if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"):
+        if core.is_compiled_with_cuda():
             places.append(fluid.CUDAPlace(0))
         for p in places:
             with fluid.dygraph.guard(p):
@@ -36,7 +36,7 @@ class TestBatchNorm(unittest.TestCase):
 
     def test_error(self):
         places = [fluid.CPUPlace()]
-        if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"):
+        if core.is_compiled_with_cuda():
             places.append(fluid.CUDAPlace(0))
         for p in places:
             #paddle.disable_static()
@@ -83,7 +83,7 @@ class TestBatchNorm(unittest.TestCase):
 
     def test_dygraph(self):
         places = [fluid.CPUPlace()]
-        if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"):
+        if core.is_compiled_with_cuda():
             places.append(fluid.CUDAPlace(0))
         for p in places:
             shape = [4, 10, 4, 4]
@@ -135,7 +135,7 @@ class TestBatchNorm(unittest.TestCase):
 
     def test_static(self):
         places = [fluid.CPUPlace()]
-        if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"):
+        if core.is_compiled_with_cuda():
             places.append(fluid.CUDAPlace(0))
         for p in places:
             exe = fluid.Executor(p)
@@ -177,7 +177,7 @@ class TestBatchNormChannelLast(unittest.TestCase):
         else:
             paddle.set_default_dtype("float64")
         self.places = [fluid.CPUPlace()]
-        if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"):
+        if core.is_compiled_with_cuda():
             self.places.append(fluid.CUDAPlace(0))
 
     def tearDown(self):
@@ -247,7 +247,7 @@ class TestBatchNormChannelLast(unittest.TestCase):
 class TestBatchNormUseGlobalStats(unittest.TestCase):
     def setUp(self):
         self.places = [fluid.CPUPlace()]
-        if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"):
+        if core.is_compiled_with_cuda():
             self.places.append(fluid.CUDAPlace(0))
         self.init_test()
 
@@ -300,4 +300,6 @@ class TestBatchNormUseGlobalStatsCase3(TestBatchNormUseGlobalStats):
 
 
 if __name__ == '__main__':
+    import paddle
+    paddle.enable_static()
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_collective_process_group.py b/python/paddle/fluid/tests/unittests/test_collective_process_group.py
index 6ae5424a882daea54145a31612f61909871fe05c..58baa0a2fa9443289f24a7e2f23e18fae4877f95 100644
--- a/python/paddle/fluid/tests/unittests/test_collective_process_group.py
+++ b/python/paddle/fluid/tests/unittests/test_collective_process_group.py
@@ -22,6 +22,9 @@ class TestProcessGroup(TestMultipleGpus):
     def test_process_group_nccl(self):
         self.run_mnist_2gpu('process_group_nccl.py')
 
+    def test_process_group_gloo(self):
+        self.run_mnist_2gpu('process_group_gloo.py')
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_concat_op.py b/python/paddle/fluid/tests/unittests/test_concat_op.py
index 10b7e13dcc334dbc6b2f7b4c614cf888168c34ab..4feca1b92505b60713df245578896a9880b7cf06 100644
--- a/python/paddle/fluid/tests/unittests/test_concat_op.py
+++ b/python/paddle/fluid/tests/unittests/test_concat_op.py
@@ -25,6 +25,7 @@ import paddle
 class TestConcatOp(OpTest):
     def setUp(self):
         self.op_type = "concat"
+        self.python_api = paddle.concat
         self.dtype = self.get_dtype()
         self.init_test_data()
         self.inputs = {'X': [('x0', self.x0), ('x1', self.x1), ('x2', self.x2)]}
diff --git a/python/paddle/fluid/tests/unittests/test_conv1d_layer.py b/python/paddle/fluid/tests/unittests/test_conv1d_layer.py
index dc460cb16f68c14df2cd7f7f087c602b945ffc7d..ca77177125fcdddf198e6783939bf84b4ccd9b0e 100644
--- a/python/paddle/fluid/tests/unittests/test_conv1d_layer.py
+++ b/python/paddle/fluid/tests/unittests/test_conv1d_layer.py
@@ -230,4 +230,5 @@ def load_tests(loader, standard_tests, pattern):
 
 
 if __name__ == '__main__':
+    paddle.enable_static()
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_layer.py b/python/paddle/fluid/tests/unittests/test_conv2d_layer.py
index f933d5bf7a48f14b0f4cb4f7ce274744f28c4c24..892fa649a6c5b31b54db8204acc76a7cc8794136 100644
--- a/python/paddle/fluid/tests/unittests/test_conv2d_layer.py
+++ b/python/paddle/fluid/tests/unittests/test_conv2d_layer.py
@@ -18,6 +18,7 @@ import paddle.fluid.dygraph as dg
 import paddle.nn.functional as F
 import paddle.fluid.initializer as I
 import unittest
+import paddle
 
 
 def _reverse_repeat_list(t, n):
@@ -284,4 +285,5 @@ def load_tests(loader, standard_tests, pattern):
 
 
 if __name__ == '__main__':
+    paddle.enable_static()
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_op.py
index 8ea4e369d323619db35ebf41d4dd053fc5ffe4d9..6a9f7a47f66cce3879e813836745b3e609affd50 100644
--- a/python/paddle/fluid/tests/unittests/test_conv2d_op.py
+++ b/python/paddle/fluid/tests/unittests/test_conv2d_op.py
@@ -16,6 +16,7 @@ from __future__ import print_function
 
 import unittest
 import numpy as np
+import paddle
 
 import paddle
 import paddle.fluid.core as core
@@ -603,7 +604,7 @@ class TestWithInput1x1Filter1x1(TestConv2DOp):
         self.groups = 3
 
 
-#----------------Conv2DCUDNN----------------
+# #----------------Conv2DCUDNN----------------
 
 create_test_cudnn_class(TestConv2DOp)
 create_test_cudnn_class(TestWithPad)
@@ -1001,4 +1002,5 @@ create_test_cudnn_channel_last_fp16_class(
     TestWithDilation_AsyPadding, grad_check=False)
 
 if __name__ == '__main__':
+    paddle.enable_static()
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_conv3d_op.py b/python/paddle/fluid/tests/unittests/test_conv3d_op.py
index 5f23d04dde51cc21e66098ee6e37027bf82d7537..8cf779ccfdd4292f4cb6cbe74bf58b8ee7b37411 100644
--- a/python/paddle/fluid/tests/unittests/test_conv3d_op.py
+++ b/python/paddle/fluid/tests/unittests/test_conv3d_op.py
@@ -20,6 +20,7 @@ import numpy as np
 import paddle.fluid.core as core
 from op_test import OpTest
 import paddle.fluid as fluid
+import paddle
 
 
 def conv3d_forward_naive(input,
@@ -1001,4 +1002,5 @@ class TestConv3DAPI_Error(unittest.TestCase):
 
 
 if __name__ == '__main__':
+    paddle.enable_static()
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_conv_nn_grad.py b/python/paddle/fluid/tests/unittests/test_conv_nn_grad.py
index 81c6aa1fd17d9ad16a1d24f32e5f55b3b71ca629..784d89b93f9859253a5722232954e7db1080afed 100644
--- a/python/paddle/fluid/tests/unittests/test_conv_nn_grad.py
+++ b/python/paddle/fluid/tests/unittests/test_conv_nn_grad.py
@@ -16,6 +16,7 @@ from __future__ import print_function
 
 import unittest
 import numpy as np
+import paddle
 
 import paddle.fluid as fluid
 import paddle.fluid.layers as layers
@@ -44,7 +45,6 @@ class TestConvDoubleGradCheck(unittest.TestCase):
 
     def test_grad(self):
         places = [fluid.CPUPlace()]
-        places = []
 
         if core.is_compiled_with_cuda():
             places.append(fluid.CUDAPlace(0))
@@ -120,7 +120,8 @@ class TestConv3DDoubleGradCheck(unittest.TestCase):
             [x] + w, y, x_init=[x_arr] + w_arr, place=place, eps=eps)
 
     def test_grad(self):
-        places = [fluid.CPUPlace()]
+        #places = [fluid.CPUPlace()]
+        places = []
         if core.is_compiled_with_cuda():
             places.append(fluid.CUDAPlace(0))
         for p in places:
@@ -503,4 +504,5 @@ class TestDepthWiseConvDoubleGradCheck(unittest.TestCase):
 
 
 if __name__ == "__main__":
+    paddle.enable_static()
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_custom_grad_input.py b/python/paddle/fluid/tests/unittests/test_custom_grad_input.py
index bc280a01890d4a54f76026ccee31666c5f0ff2a8..83a25b71626e1b84ae0f85eeccee5423205dc978 100644
--- a/python/paddle/fluid/tests/unittests/test_custom_grad_input.py
+++ b/python/paddle/fluid/tests/unittests/test_custom_grad_input.py
@@ -20,6 +20,7 @@ import numpy as np
 import paddle
 import paddle.fluid.dygraph as dg
 from op_test import OpTest
+from paddle.fluid.framework import _test_eager_guard
 
 
 class TestTensorBackward(unittest.TestCase):
@@ -29,7 +30,7 @@ class TestTensorBackward(unittest.TestCase):
         if paddle.is_compiled_with_cuda():
             self._places.append(paddle.CUDAPlace(0))
 
-    def test_tensor_backward(self):
+    def func_tensor_backward(self):
         for dtype in self._dtypes:
             x = np.random.random([2, 100]).astype(dtype)
             y = np.random.random([100, 2]).astype(dtype)
@@ -48,6 +49,11 @@ class TestTensorBackward(unittest.TestCase):
 
                     self.assertTrue(np.allclose(x_grad, x_tensor.grad.numpy()))
 
+    def test_tensor_backward(self):
+        with _test_eager_guard():
+            self.func_tensor_backward()
+        self.func_tensor_backward()
+
 
 class TestBackwardAPI(unittest.TestCase):
     def setUp(self):
@@ -56,7 +62,7 @@ class TestBackwardAPI(unittest.TestCase):
         if paddle.is_compiled_with_cuda():
             self._places.append(paddle.CUDAPlace(0))
 
-    def test_backward_api(self):
+    def func_backward_api(self):
         for dtype in self._dtypes:
             x = np.random.random([2, 2]).astype(dtype)
             y = np.random.random([2, 2]).astype(dtype)
@@ -78,7 +84,12 @@ class TestBackwardAPI(unittest.TestCase):
                     self.assertTrue(
                         np.allclose(x_grad * 2, x_tensor.grad.numpy()))
 
-    def test_backward_single_tensor(self):
+    def test_backward_api(self):
+        with _test_eager_guard():
+            self.func_backward_api()
+        self.func_backward_api()
+
+    def func_backward_single_tensor(self):
         for dtype in self._dtypes:
             x = np.random.random([2, 2]).astype(dtype)
             y = np.random.random([2, 2]).astype(dtype)
@@ -97,7 +108,12 @@ class TestBackwardAPI(unittest.TestCase):
 
                     self.assertTrue(np.allclose(x_grad, x_tensor.grad.numpy()))
 
-    def test_backward_none_grad_tensor(self):
+    def test_backward_single_tensor(self):
+        with _test_eager_guard():
+            self.func_backward_single_tensor()
+        self.func_backward_single_tensor()
+
+    def func_backward_none_grad_tensor(self):
         for dtype in self._dtypes:
             x = np.random.random([2, 2]).astype(dtype)
             y = np.random.random([2, 2]).astype(dtype)
@@ -115,7 +131,12 @@ class TestBackwardAPI(unittest.TestCase):
 
                     self.assertTrue(np.allclose(x_grad, x_tensor.grad.numpy()))
 
-    def test_backward_accumulator_with_init_grad(self):
+    def test_backward_none_grad_tensor(self):
+        with _test_eager_guard():
+            self.func_backward_none_grad_tensor()
+        self.func_backward_none_grad_tensor()
+
+    def func_backward_accumulator_with_init_grad(self):
         for dtype in self._dtypes:
             x = np.random.random([10, ]).astype(dtype)
             y_grad = np.random.random([10, ]).astype(dtype)
@@ -134,11 +155,14 @@ class TestBackwardAPI(unittest.TestCase):
 
                     y = x**2
                     z = x**3
-                    x_grad = 2 * x_tensor * (
-                        y_grad_tensor + 3 * y_tensor * y_tensor * z_grad_tensor)
+                    x_grad = 2 * x * (y_grad + 3 * y * y * z_grad)
 
-                    self.assertTrue(
-                        np.allclose(x_grad.numpy(), x_tensor.grad.numpy()))
+                    self.assertTrue(np.allclose(x_grad, x_tensor.grad.numpy()))
+
+    def test_backward_accumulator_with_init_grad(self):
+        with _test_eager_guard():
+            self.func_backward_accumulator_with_init_grad()
+        self.func_backward_accumulator_with_init_grad()
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_diag_v2.py b/python/paddle/fluid/tests/unittests/test_diag_v2.py
index 0371fa054282bb009889c90c9de4da58894fad8f..9f727608f816c4e818f50f12d4d5cc1fccf04bdb 100644
--- a/python/paddle/fluid/tests/unittests/test_diag_v2.py
+++ b/python/paddle/fluid/tests/unittests/test_diag_v2.py
@@ -44,6 +44,10 @@ class TestDiagV2Op(OpTest):
         paddle.enable_static()
         self.check_output(check_eager=True)
 
+    def test_check_grad(self):
+        paddle.enable_static()
+        self.check_grad(['X'], 'Out', check_eager=True)
+
     def init_config(self):
         pass
 
@@ -62,14 +66,14 @@ class TestDiagV2OpCase2(TestDiagV2Op):
 
 class TestDiagV2OpCase3(TestDiagV2Op):
     def init_config(self):
-        self.x = np.random.randint(-10, 10, size=(10, 10))
+        self.x = np.random.randint(-10, 10, size=(10, 10)).astype("float64")
         self.out = np.diag(self.x, self.offset)
 
 
 class TestDiagV2OpCase4(TestDiagV2Op):
     def init_config(self):
         self.x = np.random.rand(100)
-        self.padding_value = 8
+        self.padding_value = 2
         n = self.x.size
         self.out = self.padding_value * np.ones((n, n)) + np.diag(
             self.x, self.offset) - np.diag(self.padding_value * np.ones(n))
diff --git a/python/paddle/fluid/tests/unittests/test_diagonal_op.py b/python/paddle/fluid/tests/unittests/test_diagonal_op.py
index 4dab7c0df40763ac48d3a685b22edc7fba143ae9..b4854aea52a70bd5307193377c85ab229d949e1a 100644
--- a/python/paddle/fluid/tests/unittests/test_diagonal_op.py
+++ b/python/paddle/fluid/tests/unittests/test_diagonal_op.py
@@ -124,6 +124,25 @@ class TestDiagonalAPI(unittest.TestCase):
         self.assertEqual(np.allclose(out.numpy(), out_ref, rtol=1e-08), True)
         paddle.enable_static()
 
+    def test_api_eager(self):
+        paddle.disable_static(self.place)
+        with _test_eager_guard():
+            x_tensor = paddle.to_tensor(self.x)
+            out = paddle.diagonal(x_tensor)
+            out2 = paddle.diagonal(x_tensor, offset=0, axis1=2, axis2=1)
+            out3 = paddle.diagonal(x_tensor, offset=1, axis1=0, axis2=1)
+            out4 = paddle.diagonal(x_tensor, offset=0, axis1=1, axis2=2)
+        out_ref = np.diagonal(self.x)
+        self.assertEqual(np.allclose(out.numpy(), out_ref, rtol=1e-08), True)
+        out2_ref = np.diagonal(self.x, offset=0, axis1=2, axis2=1)
+        self.assertEqual(np.allclose(out2.numpy(), out2_ref, rtol=1e-08), True)
+        out3_ref = np.diagonal(self.x, offset=1, axis1=0, axis2=1)
+        self.assertEqual(np.allclose(out3.numpy(), out3_ref, rtol=1e-08), True)
+        out4_ref = np.diagonal(self.x, offset=0, axis1=1, axis2=2)
+        self.assertEqual(np.allclose(out4.numpy(), out4_ref, rtol=1e-08), True)
+
+        paddle.enable_static()
+
     def test_api_eager_dygraph(self):
         with _test_eager_guard():
             self.test_api_dygraph()
diff --git a/python/paddle/fluid/tests/unittests/test_dropout_op.py b/python/paddle/fluid/tests/unittests/test_dropout_op.py
index f670f7c38097b39723e22d15d152590fa19c607d..fd2f642b770d646e74168800bbe8820534581354 100644
--- a/python/paddle/fluid/tests/unittests/test_dropout_op.py
+++ b/python/paddle/fluid/tests/unittests/test_dropout_op.py
@@ -933,5 +933,65 @@ class TestDropoutWithDeterminateSeedGenerator(unittest.TestCase):
             self.check_static_result(place=place)
 
 
+class TestDropoutBackward(unittest.TestCase):
+    def setUp(self):
+        np.random.seed(123)
+        self.places = [fluid.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            self.places.append(fluid.CUDAPlace(0))
+
+    def cal_grad_upscale_train(self, mask, prob):
+        return mask.astype("float32") / (1 - prob)
+
+    def cal_grad_downscale_in_infer(self, mask):
+        return mask.astype("float32")
+
+    def test_backward_downscale_in_infer(self):
+        for place in self.places:
+            with fluid.dygraph.guard(place):
+
+                input = paddle.uniform([40, 40], dtype="float32")
+                input.stop_gradient = False
+                out, mask = core.ops.dropout(input, 'dropout_prob', 0.5)
+                out.backward()
+
+                self.assertTrue(
+                    np.array_equal(input.gradient(
+                    ), self.cal_grad_downscale_in_infer(mask.numpy())))
+
+    def test_backward_upscale_train(self):
+        for place in self.places:
+            with fluid.dygraph.guard(place):
+
+                prob = 0.5
+                input = paddle.uniform([40, 40], dtype="float32")
+                input.stop_gradient = False
+                out, mask = core.ops.dropout(input, 'dropout_prob', prob,
+                                             "dropout_implementation",
+                                             "upscale_in_train")
+                out.backward()
+
+                self.assertTrue(
+                    np.allclose(input.gradient(
+                    ), self.cal_grad_upscale_train(mask.numpy(), prob)))
+
+    def test_backward_upscale_train_2(self):
+        for place in self.places:
+            with fluid.dygraph.guard(place):
+
+                prob = 0.3
+                input = paddle.uniform([40, 40], dtype="float32")
+                input.stop_gradient = False
+                out, mask = core.ops.dropout(input, 'dropout_prob', prob,
+                                             "dropout_implementation",
+                                             "upscale_in_train")
+                out.backward()
+
+                self.assertTrue(
+                    np.allclose(input.gradient(
+                    ), self.cal_grad_upscale_train(mask.numpy(), prob)))
+
+
 if __name__ == '__main__':
+    paddle.enable_static()
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_dygraph_group_sharded_api.py b/python/paddle/fluid/tests/unittests/test_dygraph_group_sharded_api.py
new file mode 100644
index 0000000000000000000000000000000000000000..7c296c7e40e981c63ac8fad76b1c3d3b5b0d9098
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_dygraph_group_sharded_api.py
@@ -0,0 +1,31 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import paddle.fluid as fluid
+
+from test_parallel_dygraph_dataparallel import TestMultipleGpus
+
+
+class TestDygraphGroupSharded(TestMultipleGpus):
+
+    # check group sharded logic as well as the accuracy with single mode
+    def test_dygraph_group_sharded(self):
+        self.run_mnist_2gpu('dygraph_group_sharded_api.py')
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_eager_run_program.py b/python/paddle/fluid/tests/unittests/test_eager_run_program.py
new file mode 100644
index 0000000000000000000000000000000000000000..fc6a5d60ecae46c6b53b005963a26d92297607f3
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_eager_run_program.py
@@ -0,0 +1,120 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import numpy as np
+from paddle import _C_ops
+from paddle.fluid.framework import _test_eager_guard, Variable
+from paddle.fluid import core
+from paddle.fluid.layers.utils import _hash_with_id
+import paddle.compat as cpt
+
+import unittest
+
+
+def _append_backward_desc(main_program, outs):
+    # make sure all status of is_test are False in train mode.
+    program = main_program.clone()
+    targets = []
+    for out in outs:
+        if isinstance(out, Variable):
+            targets.append(program.global_block().var(out.name))
+
+    if targets:
+        paddle.fluid.backward.gradients(targets=targets, inputs=[])
+
+    return program
+
+
+# def _set_grad_type(params, train_program):
+#     # NOTE: if user set sparse gradient mode, the param's gradient
+#     # will be SelectedRows, not LoDTensor. But tracer will just
+#     # set param grad VarBase by forward VarBase(LoDTensor)
+#     # If we don't change grad_var type here, RunProgramOp need
+#     # transform SelectedRows to LoDTensor forcibly, it may not
+#     # be user wanted result.
+#     for param in params:
+#         grad_name = param.name + core.grad_var_suffix()
+#         grad_var = train_program.desc.block(0).find_var(
+#             cpt.to_bytes(grad_name))
+#         # NOTE: cannot find var desc maybe no problem, such as in batch_norm
+#         if grad_var is None:
+#             continue
+#         param._set_grad_type(grad_var.type())
+
+
+def _create_out(var):
+    assert isinstance(var, Variable)
+    var_desc = var.desc
+    varbase = None
+    if not core._in_eager_mode():
+        var_base = core.VarBase(var_desc.dtype(),
+                                var_desc.shape(),
+                                var_desc.name(), var_desc.type(), False)
+    else:
+        var_base = core.eager.Tensor(var_desc.dtype(),
+                                     var_desc.shape(),
+                                     var_desc.name(), var_desc.type(), False)
+    return var_base
+
+
+class TestRunProgram(unittest.TestCase):
+    def test_eager(self):
+        paddle.set_device('cpu')
+        paddle.enable_static()
+        # step 1: construct program
+        x = paddle.static.data(shape=[2, 4], name='x')
+        x.stop_gradient = False
+        y = paddle.static.data(shape=[4, 2], name='y')
+        y.stop_gradient = False
+        out = paddle.matmul(x, y)
+
+        main_program = paddle.static.default_main_program()
+        program = _append_backward_desc(main_program, [out])
+
+        paddle.disable_static('cpu')
+        # step 2: call run_program in eager mode
+        with _test_eager_guard():
+            x_t = paddle.ones([2, 4])
+            x_t.name = "x"
+            x_t.stop_gradient = False
+            y_t = paddle.ones([4, 2])
+            y_t.name = "y"
+            y_t.stop_gradient = False
+
+            fake_var = paddle.zeros([1])
+            fake_var.name = 'Fake_var'
+
+            out_t = _create_out(out)
+
+            scope = core.Scope()
+            attrs = ('global_block', program.desc.block(0), 'start_op_index', 0,
+                     'end_op_index', main_program.desc.block(0).op_size(),
+                     'is_test', False, 'program_id', _hash_with_id(program))
+
+            _C_ops.run_program([x_t, y_t], [fake_var], [out_t], [scope],
+                               [fake_var], *attrs)
+
+            loss = paddle.mean(out_t)
+            loss.backward()
+
+            self.assertTrue(np.array_equal(np.ones([2, 2]) * 4, out_t.numpy()))
+            self.assertTrue(
+                np.array_equal(np.ones([2, 4]) * 0.5, x_t.grad.numpy()))
+            self.assertTrue(
+                np.array_equal(np.ones([4, 2]) * 0.5, y_t.grad.numpy()))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_egr_python_api.py b/python/paddle/fluid/tests/unittests/test_egr_python_api.py
index 252482fa6d270edbc1bec3a0d6023933521d7f7e..27aec284de4cdebb5ebb9191bfb67d48c1b327f5 100644
--- a/python/paddle/fluid/tests/unittests/test_egr_python_api.py
+++ b/python/paddle/fluid/tests/unittests/test_egr_python_api.py
@@ -50,7 +50,7 @@ class EagerScaleTestCase(unittest.TestCase):
             data_eager.retain_grads()
 
             out_eager = core.eager.scale(data_eager, 1.0, 0.9, True, True)
-            self.assertFalse(data_eager.grad._is_initialized())
+            self.assertIsNone(data_eager.grad)
             out_eager.backward(grad_eager, False)
             self.assertTrue(data_eager.grad._is_initialized())
             self.assertTrue(np.array_equal(data_eager.grad.numpy(), input_data))
@@ -72,7 +72,7 @@ class EagerScaleTestCase(unittest.TestCase):
             data_eager.retain_grads()
 
             out_eager = core.eager.scale(data_eager, 1.0, 0.9, True, True)
-            self.assertFalse(data_eager.grad._is_initialized())
+            self.assertIsNone(data_eager.grad)
             with self.assertRaisesRegexp(
                     AssertionError,
                     "The type of grad_tensor must be paddle.Tensor"):
@@ -632,13 +632,13 @@ class EagerVariablePropertiesAndMethodsTestCase(unittest.TestCase):
             tensor2.persistable = True
             tensor2.stop_gradient = False
             if core.is_compiled_with_cuda():
-                tensor3 = tensor2._copy_to(True, core.CUDAPlace(0))
+                tensor3 = tensor2._copy_to(core.CUDAPlace(0), True)
                 self.assertTrue(np.array_equal(tensor3.numpy(), arr2))
                 self.assertTrue(tensor3.persistable, True)
                 self.assertTrue(tensor3.stop_gradient, True)
                 self.assertTrue(tensor3.place.is_gpu_place())
             else:
-                tensor3 = tensor2._copy_to(True, core.CPUPlace())
+                tensor3 = tensor2._copy_to(core.CPUPlace(), True)
                 self.assertTrue(np.array_equal(tensor3.numpy(), arr2))
                 self.assertTrue(tensor3.persistable, True)
                 self.assertTrue(tensor3.stop_gradient, True)
@@ -771,13 +771,13 @@ class EagerVariablePropertiesAndMethodsTestCase(unittest.TestCase):
             self.assertTrue(np.array_equal(egr_tensor.numpy(), ori_arr))
             ori_place = egr_tensor.place
 
-            new_arr = np.random.rand(4, 4, 16, 32).astype('float32')
+            new_arr = np.random.rand(4, 16, 16, 32).astype('float32')
             self.assertFalse(np.array_equal(egr_tensor.numpy(), new_arr))
 
-            egr_tensor._set_value(new_arr)
+            egr_tensor.set_value(new_arr)
             self.assertEqual(egr_tensor.stop_gradient, True)
             self.assertTrue(egr_tensor.place._equals(ori_place))
-            self.assertEqual(egr_tensor.shape, [4, 4, 16, 32])
+            self.assertEqual(egr_tensor.shape, [4, 16, 16, 32])
             self.assertTrue(np.array_equal(egr_tensor.numpy(), new_arr))
 
 
@@ -880,7 +880,7 @@ class EagerParamBaseUsageTestCase(unittest.TestCase):
             new_weight = np.ones([1, 3]).astype('float32')
             self.assertFalse(np.array_equal(linear.weight.numpy(), new_weight))
 
-            linear.weight._set_value(new_weight)
+            linear.weight.set_value(new_weight)
             self.assertTrue(np.array_equal(linear.weight.numpy(), new_weight))
             self.assertTrue(linear.weight.place._equals(ori_place))
 
diff --git a/python/paddle/fluid/tests/unittests/test_expand_v2_op.py b/python/paddle/fluid/tests/unittests/test_expand_v2_op.py
index aee6ca249f535b9c06c00a6806ac491be16cd4b3..a204c26c1b823fa228dba63dc8351db0adf31708 100644
--- a/python/paddle/fluid/tests/unittests/test_expand_v2_op.py
+++ b/python/paddle/fluid/tests/unittests/test_expand_v2_op.py
@@ -231,4 +231,5 @@ class TestExpandV2API(unittest.TestCase):
 
 
 if __name__ == "__main__":
+    paddle.enable_static()
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_fill_any_like_op.py b/python/paddle/fluid/tests/unittests/test_fill_any_like_op.py
index 5bc2d1cda180bac91ada2a041903888b94e26827..95537d4332739be13ee1705dafeea6d3f0ac2762 100644
--- a/python/paddle/fluid/tests/unittests/test_fill_any_like_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fill_any_like_op.py
@@ -21,7 +21,7 @@ from paddle.fluid import Program, program_guard
 import paddle.compat as cpt
 import unittest
 import numpy as np
-from op_test import OpTest
+from op_test import OpTest, convert_float_to_uint16
 
 
 class TestFillAnyLikeOp(OpTest):
@@ -47,6 +47,25 @@ class TestFillAnyLikeOpFloat32(TestFillAnyLikeOp):
         self.value = 0.0
 
 
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
+class TestFillAnyLikeOpBfloat16(OpTest):
+    def setUp(self):
+        self.op_type = "fill_any_like"
+        self.dtype = np.uint16
+        self.value = 0.0
+        self.inputs = {'X': np.random.random((219, 232)).astype(np.float32)}
+        self.attrs = {'value': self.value, 'dtype': core.VarDesc.VarType.BF16}
+        self.outputs = {
+            'Out':
+            convert_float_to_uint16(self.value * np.ones_like(self.inputs["X"]))
+        }
+
+    def test_check_output(self):
+        place = core.CUDAPlace(0)
+        self.check_output_with_place(place)
+
+
 class TestFillAnyLikeOpValue1(TestFillAnyLikeOp):
     def init(self):
         self.value = 1.0
@@ -79,19 +98,6 @@ class TestFillAnyLikeOpType(TestFillAnyLikeOp):
         }
 
 
-class TestFillAnyLikeOpOverflow(TestFillAnyLikeOp):
-    def init(self):
-        self.value = 1e100
-
-    def test_check_output(self):
-        exception = None
-        try:
-            self.check_output(check_dygraph=False)
-        except ValueError as ex:
-            exception = ex
-        self.assertIsNotNone(exception)
-
-
 class TestFillAnyLikeOpFloat16(TestFillAnyLikeOp):
     def init(self):
         self.dtype = np.float16
diff --git a/python/paddle/fluid/tests/unittests/test_fill_constant_op.py b/python/paddle/fluid/tests/unittests/test_fill_constant_op.py
index 822c952893e11861cb1febf2d29b09e73ffa486e..15071b2b6aa69fe95eb40acc9db3c94ee81c0256 100644
--- a/python/paddle/fluid/tests/unittests/test_fill_constant_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fill_constant_op.py
@@ -83,6 +83,27 @@ class TestFillConstantOp4(OpTest):
         self.check_output()
 
 
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
+class TestFillConstantBF16Op(OpTest):
+    def setUp(self):
+        '''Test fill_constant op with specified value
+        '''
+        self.op_type = "fill_constant"
+        self.dtype = np.uint16
+        self.inputs = {}
+        self.attrs = {
+            'shape': [123, 92],
+            'value': 3.8,
+            'dtype': core.VarDesc.VarType.BF16
+        }
+        self.outputs = {'Out': convert_float_to_uint16(np.full((123, 92), 3.8))}
+
+    def test_check_output(self):
+        place = core.CUDAPlace(0)
+        self.check_output_with_place(place)
+
+
 class TestFillConstantOpWithSelectedRows(unittest.TestCase):
     def check_with_place(self, place):
         scope = core.Scope()
diff --git a/python/paddle/fluid/tests/unittests/test_fold_op.py b/python/paddle/fluid/tests/unittests/test_fold_op.py
index 14a59b413383f81959e9854d5735a87f7ff728cc..44b94cd3b66eec55841760a14fd95df0e930f36e 100644
--- a/python/paddle/fluid/tests/unittests/test_fold_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fold_op.py
@@ -174,6 +174,15 @@ class TestFoldOpError(unittest.TestCase):
                     x, output_sizes=[6, 6], kernel_sizes=[2, 2],
                     strides=[1, 1])
 
+            def test_output_size_2():
+                # out_size must GT 1
+                x = paddle.randn(shape=[2, 6, 6], dtype="float32")
+                out = fold(
+                    x,
+                    output_sizes=[0.1, 0.2],
+                    kernel_sizes=[2, 2],
+                    strides=[1, 1])
+
             def test_block_h_w():
                 # test_block_h_w GT 0
                 x = paddle.randn(shape=[2, 1, 1], dtype="float32")
@@ -196,6 +205,7 @@ class TestFoldOpError(unittest.TestCase):
             self.assertRaises(AssertionError, test_dilations_shape)
             self.assertRaises(AssertionError, test_strides_shape)
             self.assertRaises(ValueError, test_output_size)
+            self.assertRaises(ValueError, test_output_size_2)
             self.assertRaises(ValueError, test_block_h_w)
             self.assertRaises(ValueError, test_GT_0)
 
diff --git a/python/paddle/fluid/tests/unittests/test_full_like_op.py b/python/paddle/fluid/tests/unittests/test_full_like_op.py
index be6abb17c3c316d30bdcfa5539ecd1e2549280a5..3ae2e9ff6bdaf7d5a4161eb70914191ace7df417 100644
--- a/python/paddle/fluid/tests/unittests/test_full_like_op.py
+++ b/python/paddle/fluid/tests/unittests/test_full_like_op.py
@@ -62,6 +62,15 @@ class TestFullOp(unittest.TestCase):
         self.assertTrue((out.numpy() == out_numpy).all(), True)
         paddle.enable_static()
 
+    def test_full_like_fill_inf(self):
+        paddle.disable_static()
+        input = paddle.arange(6, 10, dtype='float32')
+        out = paddle.full_like(input, fill_value=float('inf'))
+        out_numpy = np.random.random((4)).astype("float32")
+        out_numpy.fill(float('inf'))
+        self.assertTrue((out.numpy() == out_numpy).all(), True)
+        paddle.enable_static()
+
 
 class TestFullOpError(unittest.TestCase):
     def test_errors(self):
diff --git a/python/paddle/fluid/tests/unittests/test_function_hook.py b/python/paddle/fluid/tests/unittests/test_function_hook.py
index d45ef528261f394ff54e94714aa40d345b9aa458..55981b01c408435d2824ca46dc62b9b4d0c15651 100644
--- a/python/paddle/fluid/tests/unittests/test_function_hook.py
+++ b/python/paddle/fluid/tests/unittests/test_function_hook.py
@@ -20,6 +20,7 @@ import numpy as np
 
 import paddle.fluid.core as core
 from paddle import _C_ops
+from paddle.fluid.framework import _test_eager_guard
 
 
 class TestCapture:
@@ -41,7 +42,7 @@ def grad_hook(grad):
 
 
 class TestBakcwardFunctionHookError(unittest.TestCase):
-    def test_hook(self):
+    def func_hook(self):
         input_data = np.ones([4, 4]).astype('float32')
 
         x = paddle.to_tensor(input_data.astype(np.float32), stop_gradient=False)
@@ -58,6 +59,12 @@ class TestBakcwardFunctionHookError(unittest.TestCase):
 
         assert test_cap.list == [1, 2, 1]
 
+    def test_hook(self):
+        # _register_void_function_post_hook do not support in eager mode
+        with _test_eager_guard():
+            pass
+        self.func_hook()
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_functional_conv2d.py b/python/paddle/fluid/tests/unittests/test_functional_conv2d.py
index cec48724da2fe08d0eefcdf0d2df3c54c9aa363d..8e0a744ecdbdabac1a248ed1f0a0d08934749e55 100644
--- a/python/paddle/fluid/tests/unittests/test_functional_conv2d.py
+++ b/python/paddle/fluid/tests/unittests/test_functional_conv2d.py
@@ -534,4 +534,5 @@ class TestFunctionalConv2DErrorCase13(TestFunctionalConv2DErrorCase12):
 
 
 if __name__ == "__main__":
+    paddle.enable_static()
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_functional_conv3d.py b/python/paddle/fluid/tests/unittests/test_functional_conv3d.py
index 8ccaf30cbdb34bcd215bd6b76431c7a6acfeaa3e..6c208160658820d57456e46f86b76659f4e4f80d 100644
--- a/python/paddle/fluid/tests/unittests/test_functional_conv3d.py
+++ b/python/paddle/fluid/tests/unittests/test_functional_conv3d.py
@@ -509,4 +509,5 @@ class TestFunctionalConv3DErrorCase12(TestFunctionalConv3DErrorCase11):
 
 
 if __name__ == "__main__":
+    paddle.enable_static()
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_fuse_gemm_epilogue_pass.py b/python/paddle/fluid/tests/unittests/test_fuse_gemm_epilogue_pass.py
new file mode 100644
index 0000000000000000000000000000000000000000..7f3180e21d8c63dd3fbc87d58c01f43422a01bcb
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_fuse_gemm_epilogue_pass.py
@@ -0,0 +1,392 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2022 NVIDIA Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Test cases for role makers."""
+
+from __future__ import print_function
+import paddle
+import os
+import unittest
+import numpy as np
+import paddle.fluid.core as core
+
+
+def compare(ref, res, atol, rtol):
+
+    ref = np.array(ref).flatten()
+    res = np.array(res).flatten()
+
+    tmp_ref = ref.astype(np.float)
+    tol = atol + rtol * abs(tmp_ref)
+
+    diff = abs(res - ref)
+
+    indices = np.transpose(np.where(diff > tol))
+    if len(indices) == 0:
+        return True
+    return False
+
+
+def verify_node_count(graph, node_name, target_count):
+    count = 0
+    for node in graph.nodes():
+        if node.name() == node_name:
+            count += 1
+    return count == target_count
+
+
+class MultiFCLayer(paddle.nn.Layer):
+    def __init__(self, hidden, Activation):
+        super(MultiFCLayer, self).__init__()
+        self.linear1 = paddle.nn.Linear(hidden, hidden)
+        self.linear2 = paddle.nn.Linear(hidden, hidden)
+        self.linear3 = paddle.nn.Linear(hidden, hidden)
+
+        self.relu1 = Activation()
+        self.relu2 = Activation()
+        self.relu3 = Activation()
+
+    def forward(self, x, matmul_y, ele_y):
+        output = self.linear1(x)
+        output = self.relu1(output)
+        output = self.linear2(output)
+
+        output1 = paddle.matmul(output, matmul_y)
+        output = self.linear3(output)
+        output = self.relu2(output)
+
+        output = paddle.matmul(output, matmul_y)
+        output = paddle.add(output, ele_y)
+        output = self.relu3(output)
+        output = paddle.add(output, output1)
+        return output
+
+
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
+class TestFuseGemmEpilogueFWDBase(unittest.TestCase):
+    def setUp(self):
+        self.batch = 64
+        self.seqlen = 128
+        self.hidden = 768
+
+        paddle.enable_static()
+
+        self.main_prog = paddle.static.Program()
+        self.startup_prog = paddle.static.Program()
+
+        with paddle.static.program_guard(self.main_prog, self.startup_prog):
+            data = paddle.static.data(
+                name="_data",
+                shape=[-1, self.seqlen, self.hidden],
+                dtype='float32')
+            matmul_y = paddle.static.data(
+                name="_matmul_y",
+                shape=[1, self.hidden, self.hidden],
+                dtype='float32')
+            ele_y = paddle.static.data(
+                name="_ele_y", shape=[self.hidden, ], dtype='float32')
+
+            multi_layer = MultiFCLayer(self.hidden, self._get_act_type()[0])
+            with paddle.static.amp.fp16_guard():
+                out = multi_layer(data, matmul_y, ele_y)
+                self.loss = paddle.mean(out)
+
+        self.data_arr = np.random.random(
+            (self.batch, self.seqlen, self.hidden)).astype("float32") - 0.5
+        self.matmul_y_arr = np.random.random(
+            (1, self.hidden, self.hidden)).astype("float32") - 0.5
+        self.ele_y_arr = np.random.random(
+            (self.hidden, )).astype("float32") - 0.5
+
+        self.place = paddle.CUDAPlace(0)
+        self.exe = paddle.static.Executor(self.place)
+        self.exe.run(self.startup_prog)
+
+        self._pre_test_hooks()
+
+        self.feed = {
+            "_data": self.data_arr,
+            "_matmul_y": self.matmul_y_arr,
+            "_ele_y": self.ele_y_arr
+        }
+        self.reference = self.exe.run(self.main_prog,
+                                      feed=self.feed,
+                                      fetch_list=[self.loss.name])
+
+    @unittest.skipIf(not core.is_compiled_with_cuda(),
+                     "core is not compiled with CUDA")
+    def _test_output(self):
+        build_strategy = paddle.static.BuildStrategy()
+        build_strategy.fuse_gemm_epilogue = True
+        program = paddle.static.CompiledProgram(self.main_prog)
+        program = program.with_data_parallel(
+            loss_name=self.loss.name,
+            build_strategy=build_strategy,
+            places=paddle.static.cuda_places())
+
+        result = self.exe.run(program,
+                              feed=self.feed,
+                              fetch_list=[self.loss.name])
+        self.assertTrue(
+            compare(self.reference, result, self.atol, self.rtol),
+            "[{}] outputs are miss-matched.".format(type(self).__name__))
+        self.assertTrue(
+            verify_node_count(program._graph, "fused_gemm_epilogue", 3),
+            "[{}] The number of fused_gemm_epilogue is miss-matched in the computing graph.".
+            format(type(self).__name__))
+        act_fwd_name = self._get_act_type()[1]
+        self.assertTrue(
+            verify_node_count(program._graph, act_fwd_name, 1),
+            "[{}] The number of {} is miss-matched in the computing graph.".
+            format(type(self).__name__, act_fwd_name))
+
+    def _pre_test_hooks(self):
+        self.atol = 1e-4
+        self.rtol = 1e-3
+
+    def _get_act_type(self):
+        return paddle.nn.ReLU, "relu"
+
+
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
+class TestFuseGemmEpilogueReluFWDFP32(TestFuseGemmEpilogueFWDBase):
+    def _pre_test_hooks(self):
+        self.atol = 1e-3
+        self.rtol = 1e-2
+
+    def _get_act_type(self):
+        return paddle.nn.ReLU, "relu"
+
+    def test_output(self):
+        self._test_output()
+
+
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
+class TestFuseGemmEpilogueReluFWDFP16(TestFuseGemmEpilogueReluFWDFP32):
+    def _pre_test_hooks(self):
+        self.atol = 1e-3
+        self.rtol = 1e-2
+
+        fp16_var_list = paddle.static.amp.cast_model_to_fp16(self.main_prog)
+        paddle.static.amp.cast_parameters_to_fp16(
+            self.place, self.main_prog, to_fp16_var_names=fp16_var_list)
+
+        self.data_arr = self.data_arr.astype("float16")
+        self.matmul_y_arr = self.matmul_y_arr.astype("float16")
+        self.ele_y_arr = self.ele_y_arr.astype("float16")
+
+
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
+class TestFuseGemmEpilogueGeluFWDFP32(TestFuseGemmEpilogueFWDBase):
+    def _pre_test_hooks(self):
+        self.atol = 1e-4
+        self.rtol = 1e-3
+
+    def _get_act_type(self):
+        return paddle.nn.GELU, "gelu"
+
+    def test_output(self):
+        self._test_output()
+
+
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
+class TestFuseGemmEpilogueGeluFWDFP16(TestFuseGemmEpilogueGeluFWDFP32):
+    def _pre_test_hooks(self):
+        self.atol = 1e-3
+        self.rtol = 1e-2
+
+        fp16_var_list = paddle.static.amp.cast_model_to_fp16(self.main_prog)
+        paddle.static.amp.cast_parameters_to_fp16(
+            self.place, self.main_prog, to_fp16_var_names=fp16_var_list)
+
+        self.data_arr = self.data_arr.astype("float16")
+        self.matmul_y_arr = self.matmul_y_arr.astype("float16")
+        self.ele_y_arr = self.ele_y_arr.astype("float16")
+
+
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
+class TestFuseGemmEpilogueBWDBase(unittest.TestCase):
+    def setUp(self):
+        self.batch = 64
+        self.seqlen = 128
+        self.hidden = 768
+
+        paddle.enable_static()
+
+        self.main_prog = paddle.static.Program()
+        self.startup_prog = paddle.static.Program()
+
+        with paddle.static.program_guard(self.main_prog, self.startup_prog):
+            data = paddle.static.data(
+                name="_data",
+                shape=[-1, self.seqlen, self.hidden],
+                dtype='float32')
+            matmul_y = paddle.static.data(
+                name="_matmul_y",
+                shape=[1, self.hidden, self.hidden],
+                dtype='float32')
+            ele_y = paddle.static.data(
+                name="_ele_y", shape=[self.hidden, ], dtype='float32')
+
+            multi_layer = MultiFCLayer(self.hidden, self._get_act_type()[0])
+            with paddle.static.amp.fp16_guard():
+                out = multi_layer(data, matmul_y, ele_y)
+                self.loss = paddle.mean(out)
+                paddle.static.append_backward(loss=self.loss)
+
+        self.data_arr = np.random.random(
+            (self.batch, self.seqlen, self.hidden)).astype("float32") - 0.5
+        self.matmul_y_arr = np.random.random(
+            (1, self.hidden, self.hidden)).astype("float32") - 0.5
+        self.ele_y_arr = np.random.random(
+            (self.hidden, )).astype("float32") - 0.5
+
+        self.place = paddle.CUDAPlace(0)
+        self.exe = paddle.static.Executor(self.place)
+        self.exe.run(self.startup_prog)
+
+        self._pre_test_hooks()
+
+        self.feed = {
+            "_data": self.data_arr,
+            "_matmul_y": self.matmul_y_arr,
+            "_ele_y": self.ele_y_arr
+        }
+
+        self.fetch = [
+            self.loss.name,
+            '{}.w_0@GRAD'.format(multi_layer.linear1.full_name()),
+            '{}.b_0@GRAD'.format(multi_layer.linear1.full_name()),
+            '{}.w_0@GRAD'.format(multi_layer.linear2.full_name()),
+            '{}.b_0@GRAD'.format(multi_layer.linear2.full_name()),
+            '{}.w_0@GRAD'.format(multi_layer.linear3.full_name()),
+            '{}.b_0@GRAD'.format(multi_layer.linear3.full_name())
+        ]
+        self.outs_ref = self.exe.run(self.main_prog,
+                                     feed=self.feed,
+                                     fetch_list=self.fetch)
+
+    @unittest.skipIf(not core.is_compiled_with_cuda(),
+                     "core is not compiled with CUDA")
+    def _test_output(self):
+        build_strategy = paddle.static.BuildStrategy()
+        build_strategy.fuse_gemm_epilogue = True
+        program = paddle.static.CompiledProgram(self.main_prog)
+        program = program.with_data_parallel(
+            loss_name=self.loss.name,
+            build_strategy=build_strategy,
+            places=paddle.static.cuda_places())
+
+        outs_res = self.exe.run(program, feed=self.feed, fetch_list=self.fetch)
+
+        for ref, res in zip(self.outs_ref, outs_res):
+            self.assertTrue(
+                compare(ref, res, self.atol, self.rtol),
+                "[{}] output is miss-matched.".format(type(self).__name__))
+
+        self.assertTrue(
+            verify_node_count(program._graph, "fused_gemm_epilogue", 3),
+            "[{}] The number of fused_gemm_epilogue is miss-matched in the computing graph.".
+            format(type(self).__name__))
+        self.assertTrue(
+            verify_node_count(program._graph, "fused_gemm_epilogue_grad", 3),
+            "[{}] The number of fused_gemm_epilogue_grad is miss-matched in the computing graph.".
+            format(type(self).__name__))
+        _, act_fwd_name, act_bwd_name = self._get_act_type()
+        self.assertTrue(
+            verify_node_count(program._graph, act_fwd_name, 1),
+            "[{}] The number of {} is miss-matched in the computing graph.".
+            format(type(self).__name__, act_fwd_name))
+        self.assertTrue(
+            verify_node_count(program._graph, act_bwd_name, 2),
+            "[{}] The number of {} is miss-matched in the computing graph.".
+            format(type(self).__name__, act_bwd_name))
+
+    def _pre_test_hooks(self):
+        self.atol = 1e-4
+        self.rtol = 1e-3
+
+    def _get_act_type(self):
+        return paddle.nn.ReLU, "relu", "relu_grad"
+
+
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
+class TestFuseGemmEpilogueReLUBWDFP32(TestFuseGemmEpilogueBWDBase):
+    def _pre_test_hooks(self):
+        self.atol = 1e-4
+        self.rtol = 1e-3
+
+    def _get_act_type(self):
+        return paddle.nn.ReLU, "relu", "relu_grad"
+
+    def test_output(self):
+        self._test_output()
+
+
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
+class TestFuseGemmEpilogueReLUBWDFP16(TestFuseGemmEpilogueReLUBWDFP32):
+    def _pre_test_hooks(self):
+        self.atol = 1e-3
+        self.rtol = 1e-2
+
+        fp16_var_list = paddle.static.amp.cast_model_to_fp16(self.main_prog)
+        paddle.static.amp.cast_parameters_to_fp16(
+            self.place, self.main_prog, to_fp16_var_names=fp16_var_list)
+
+        self.data_arr = self.data_arr.astype("float16")
+        self.matmul_y_arr = self.matmul_y_arr.astype("float16")
+        self.ele_y_arr = self.ele_y_arr.astype("float16")
+
+
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
+class TestFuseGemmEpilogueGeLUBWDFP32(TestFuseGemmEpilogueBWDBase):
+    def _pre_test_hooks(self):
+        self.atol = 5e-4
+        self.rtol = 1e-3
+
+    def _get_act_type(self):
+        return paddle.nn.GELU, "gelu", "gelu_grad"
+
+    def test_output(self):
+        self._test_output()
+
+
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
+class TestFuseGemmEpilogueGeLUBWDFP16(TestFuseGemmEpilogueGeLUBWDFP32):
+    def _pre_test_hooks(self):
+        self.atol = 1e-3
+        self.rtol = 1e-2
+
+        fp16_var_list = paddle.static.amp.cast_model_to_fp16(self.main_prog)
+        paddle.static.amp.cast_parameters_to_fp16(
+            self.place, self.main_prog, to_fp16_var_names=fp16_var_list)
+
+        self.data_arr = self.data_arr.astype("float16")
+        self.matmul_y_arr = self.matmul_y_arr.astype("float16")
+        self.ele_y_arr = self.ele_y_arr.astype("float16")
+
+
+if __name__ == "__main__":
+    np.random.seed(0)
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_fuse_relu_depthwise_conv_pass.py b/python/paddle/fluid/tests/unittests/test_fuse_relu_depthwise_conv_pass.py
index 9b739ebdfb23c680a86a54a3fa00398805ee8968..d391b04aa4772efbf7fadb7a9556aafd445197db 100644
--- a/python/paddle/fluid/tests/unittests/test_fuse_relu_depthwise_conv_pass.py
+++ b/python/paddle/fluid/tests/unittests/test_fuse_relu_depthwise_conv_pass.py
@@ -117,4 +117,5 @@ class TestMNIST(TestParallelExecutorBase):
 
 
 if __name__ == '__main__':
+    paddle.enable_static()
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_fused_attention_op.py b/python/paddle/fluid/tests/unittests/test_fused_attention_op.py
index 443703aa937d8aead8307b892961e7054ede6ed4..a3ae2a20dba23ef39510e962b148d40364f85e72 100644
--- a/python/paddle/fluid/tests/unittests/test_fused_attention_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fused_attention_op.py
@@ -70,10 +70,12 @@ class TestFusedAttentionOp(OpTest):
         self.attn_mask_type = np.float64
         self.pre_layer_norm = False
         self.has_attn_mask = True
+        self.has_cache_kv = False
         self.training = True
 
         self.batch_size = 8
         self.query_length = 128
+        self.cache_length = 128
         self.head_dim = 64
         self.num_heads = 16
         self.embed_dim = self.head_dim * self.num_heads
@@ -88,10 +90,22 @@ class TestFusedAttentionOp(OpTest):
     def generate_input_data(self):
         self.query = np.random.rand(self.batch_size, self.query_length,
                                     self.embed_dim).astype(self.x_type)
+        out_seq_len = self.key_length
+        if self.has_cache_kv:
+            assert self.training is False, ValueError(
+                'cache_kv can only used in inference')
+            self.cache_kv = np.random.rand(2, self.batch_size, self.num_heads,
+                                           self.cache_length,
+                                           self.head_dim).astype(self.x_type)
+            out_seq_len += self.cache_length
+        else:
+            self.cache_kv = None
+
         if self.has_attn_mask:
+            # [B, n_head, seq_len, out_seq_len]
             self.attn_mask = np.ones(
                 (self.batch_size, self.num_heads, self.query_length,
-                 self.key_length),
+                 out_seq_len),
                 dtype=self.attn_mask_type)
             if self.attn_mask_type == np.int64:
                 self.attn_mask = np.tril(self.attn_mask)
@@ -110,6 +124,11 @@ class TestFusedAttentionOp(OpTest):
     def GetBaselineOut(self):
         paddle.disable_static(place=paddle.CUDAPlace(0))
         tensor_query = paddle.to_tensor(self.query, stop_gradient=False)
+
+        cache_kv = None
+        if self.has_cache_kv:
+            cache_kv = paddle.to_tensor(self.cache_kv, stop_gradient=False)
+
         if self.has_attn_mask:
             attn_mask = paddle.to_tensor(self.attn_mask, stop_gradient=False)
         else:
@@ -130,6 +149,18 @@ class TestFusedAttentionOp(OpTest):
         v = tensor.reshape(x=v, shape=[0, 0, self.num_heads, self.head_dim])
         v_out = tensor.transpose(x=v, perm=[0, 2, 1, 3])
 
+        if self.has_cache_kv:
+            # [1, B, n_head, cache_seq_len, head_dim]
+            cache_k, cache_v = paddle.split(cache_kv, 2)
+            cache_k = paddle.squeeze(cache_k, axis=0)
+            cache_v = paddle.squeeze(cache_v, axis=0)
+            # [B, n_head, cache_seq_len + seq_len, head_dim]
+            # out_seq_len = cache_seq_len + seq_len
+            k_out = paddle.concat([cache_k, k_out], axis=-2)
+            v_out = paddle.concat([cache_v, v_out], axis=-2)
+
+        # [B, n_head, seq_len, head_dim] * [B, n_head, out_seq_len, head_dim]
+        # --> [B, n_head, seq_len, out_seq_len]
         qk_out = layers.matmul(
             x=q_out, y=k_out, transpose_y=True, alpha=self.head_dim**-0.5)
 
@@ -146,6 +177,8 @@ class TestFusedAttentionOp(OpTest):
                 self.dropout_prob,
                 training=self.training,
                 mode="upscale_in_train")
+            # [B, n_head, seq_len, out_seq_len] * [B, n_head, out_seq_len, head_dim]
+            # --> [B, n_head, seq_len, head_dim]
             qktv_out = tensor.matmul(dropout_out, v_out)
         else:
             qktv_out = tensor.matmul(softmax_out, v_out)
@@ -160,6 +193,10 @@ class TestFusedAttentionOp(OpTest):
             final_out = self.norm1(residual_out)
         else:
             final_out = residual_out
+
+        if self.has_cache_kv:
+            return final_out
+
         paddle.autograd.backward(
             [final_out], [paddle.to_tensor(self.dout)], retain_graph=True)
         return final_out, tensor_query.grad
@@ -206,6 +243,9 @@ class TestFusedAttentionOp(OpTest):
             (3, self.num_heads, self.head_dim, self.embed_dim))
 
         x = paddle.to_tensor(self.query, stop_gradient=False)
+        cache_kv = None
+        if self.has_cache_kv:
+            cache_kv = paddle.to_tensor(self.cache_kv, stop_gradient=False)
         if self.has_attn_mask:
             attn_mask = paddle.to_tensor(self.attn_mask, stop_gradient=False)
         else:
@@ -219,8 +259,12 @@ class TestFusedAttentionOp(OpTest):
         final_out = incubate_f.fused_multi_head_attention(
             x, qkv_weight_tensor, out_linear_weight, self.pre_layer_norm,
             ln1_scale, ln1_bias, ln2_scale, ln2_bias, epsilon, qkv_bias_tensor,
-            out_linear_bias, attn_mask, self.dropout_prob,
+            out_linear_bias, cache_kv, attn_mask, self.dropout_prob,
             self.attn_dropout_prob, ln2_epsilon)
+
+        if self.has_cache_kv:
+            return final_out[0], final_out[1]
+
         paddle.autograd.backward(
             [final_out], [paddle.to_tensor(self.dout)], retain_graph=True)
         return final_out, x.grad
@@ -236,114 +280,27 @@ class TestFusedAttentionOp(OpTest):
 
 class TestFusedAttentionOpBiasIsNone(TestFusedAttentionOp):
     def config(self):
-        self.x_type = np.float32
-        self.attn_mask_type = np.float64
-        self.pre_layer_norm = False
-        self.has_attn_mask = True
-        self.training = True
-
-        self.batch_size = 8
-        self.query_length = 128
-        self.head_dim = 64
-        self.num_heads = 16
-        self.embed_dim = self.head_dim * self.num_heads
-
-        self.dropout_prob = 0.0
-        self.attn_dropout_prob = 0.0
-        self.weight_attr = None
+        super().config()
         self.bias_attr = False
-        self.kdim, self.vdim = self.embed_dim, self.embed_dim
-        self.key_length, self.value_length = self.query_length, self.query_length
-
-    def test_fused_attention_op(self):
-        final_out_ref, x_grad_ref = self.GetBaselineOut()
-        final_out, x_grad = self.GetFusedAttentionOut()
-        np.testing.assert_allclose(
-            final_out_ref, final_out.numpy(), rtol=1e-5, atol=1e-4)
-        np.testing.assert_allclose(
-            x_grad_ref, x_grad.numpy(), rtol=1e-5, atol=1e-4)
 
 
 class TestFusedAttentionOpPreLn(TestFusedAttentionOp):
     def config(self):
-        self.x_type = np.float32
-        self.attn_mask_type = np.float64
+        super().config()
         self.pre_layer_norm = True
-        self.has_attn_mask = True
-        self.training = True
-
-        self.batch_size = 8
-        self.query_length = 128
-        self.head_dim = 64
-        self.num_heads = 16
-        self.embed_dim = self.head_dim * self.num_heads
-
-        self.dropout_prob = 0.0
-        self.attn_dropout_prob = 0.0
-        self.weight_attr = None
-        self.bias_attr = None
-        self.kdim, self.vdim = self.embed_dim, self.embed_dim
-        self.key_length, self.value_length = self.query_length, self.query_length
-
-    def test_fused_attention_op(self):
-        final_out_ref, x_grad_ref = self.GetBaselineOut()
-        final_out, x_grad = self.GetFusedAttentionOut()
-        np.testing.assert_allclose(
-            final_out_ref, final_out.numpy(), rtol=1e-5, atol=1e-4)
-        np.testing.assert_allclose(
-            x_grad_ref, x_grad.numpy(), rtol=1e-5, atol=1e-4)
 
 
 class TestFusedAttentionOpNoneAttnMask(TestFusedAttentionOp):
     def config(self):
-        self.x_type = np.float32
-        self.attn_mask_type = np.float64
+        super().config()
         self.pre_layer_norm = True
         self.has_attn_mask = False
-        self.training = True
-
-        self.batch_size = 8
-        self.query_length = 128
-        self.head_dim = 64
-        self.num_heads = 16
-        self.embed_dim = self.head_dim * self.num_heads
-
-        self.dropout_prob = 0.0
-        self.attn_dropout_prob = 0.0
-        self.weight_attr = None
-        self.bias_attr = None
-        self.kdim, self.vdim = self.embed_dim, self.embed_dim
-        self.key_length, self.value_length = self.query_length, self.query_length
-
-    def test_fused_attention_op(self):
-        final_out_ref, x_grad_ref = self.GetBaselineOut()
-        final_out, x_grad = self.GetFusedAttentionOut()
-        np.testing.assert_allclose(
-            final_out_ref, final_out.numpy(), rtol=1e-5, atol=1e-4)
-        np.testing.assert_allclose(
-            x_grad_ref, x_grad.numpy(), rtol=1e-5, atol=1e-4)
 
 
 class TestFusedAttentionOpFp16(TestFusedAttentionOp):
     def config(self):
+        super().config()
         self.x_type = np.float16
-        self.attn_mask_type = np.float64
-        self.pre_layer_norm = False
-        self.has_attn_mask = True
-        self.training = True
-
-        self.batch_size = 8
-        self.query_length = 128
-        self.head_dim = 64
-        self.num_heads = 16
-        self.embed_dim = self.head_dim * self.num_heads
-
-        self.dropout_prob = 0.0
-        self.attn_dropout_prob = 0.0
-        self.weight_attr = None
-        self.bias_attr = None
-        self.kdim, self.vdim = self.embed_dim, self.embed_dim
-        self.key_length, self.value_length = self.query_length, self.query_length
 
     def test_fused_attention_op(self):
         final_out_ref, x_grad_ref = self.GetBaselineOut()
@@ -354,5 +311,21 @@ class TestFusedAttentionOpFp16(TestFusedAttentionOp):
             x_grad_ref, x_grad.numpy(), rtol=1e-5, atol=1e-1)
 
 
+class TestFusedAttentionOpCacheKV(TestFusedAttentionOp):
+    def config(self):
+        super().config()
+        self.has_cache_kv = True
+        self.training = False
+        self.query_length = 1
+        self.key_length, self.value_length = 1, 1
+
+    def test_fused_attention_op(self):
+        with paddle.no_grad():
+            final_out_ref = self.GetBaselineOut()
+            final_out, cache_kv_out = self.GetFusedAttentionOut()
+            np.testing.assert_allclose(
+                final_out_ref, final_out.numpy(), rtol=1e-5, atol=1e-4)
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_fused_gemm_epilogue_grad_op.py b/python/paddle/fluid/tests/unittests/test_fused_gemm_epilogue_grad_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..2ea1bf2e9cb8105280a4f2635279518d125a4312
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_fused_gemm_epilogue_grad_op.py
@@ -0,0 +1,239 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2022 NVIDIA Corporation. All rights reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+import paddle
+import paddle.fluid.core as core
+from op_test import OpTest, skip_check_grad_ci
+
+
+def get_outputs(DOut, X, Y):
+    DX = np.dot(DOut, Y.T)
+    DY = np.dot(X.T, DOut)
+    DBias = np.sum(DOut, axis=0)
+
+    return DX, DY, DBias
+
+
+@skip_check_grad_ci(reason="no grap op")
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
+class TestFuseGemmEpilogueGradOpDXYBiasFP16(OpTest):
+    def setUp(self):
+        self.op_type = "fused_gemm_epilogue_grad"
+        self.place = core.CUDAPlace(0)
+        self.init_dtype_type()
+
+        self.inputs = {
+            'DOut': np.random.random((8, 128)).astype(self.dtype) - 0.5,
+            'X': np.random.random((8, 4)).astype(self.dtype) - 0.5,
+            'Y': np.random.random((4, 128)).astype(self.dtype) - 0.5
+        }
+
+        self.attrs = {"activation": 'none'}
+
+        DX, DY, DBias = get_outputs(self.inputs['DOut'], self.inputs['X'],
+                                    self.inputs['Y'])
+        self.outputs = {'DX': DX, 'DY': DY, 'DBias': DBias}
+
+    def init_dtype_type(self):
+        self.dtype = np.float16
+        self.atol = 1e-3
+
+    def test_check_output(self):
+        if self.dtype == np.float16 and not core.is_float16_supported(
+                self.place):
+            return
+        self.check_output_with_place(self.place, atol=self.atol)
+
+
+@skip_check_grad_ci(reason="no grap op")
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
+class TestFuseGemmEpilogueGradOpDXYBiasFP32(
+        TestFuseGemmEpilogueGradOpDXYBiasFP16):
+    def init_dtype_type(self):
+        self.dtype = np.single
+        self.atol = 1e-6
+
+
+@skip_check_grad_ci(reason="no grap op")
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
+class TestFuseGemmEpilogueGradOpDXYBiasFP64(
+        TestFuseGemmEpilogueGradOpDXYBiasFP16):
+    def init_dtype_type(self):
+        self.dtype = np.double
+        self.atol = 1e-6
+
+
+@skip_check_grad_ci(reason="no grap op")
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
+class TestFuseGemmEpilogueGradOpDYBiasFP16(OpTest):
+    def setUp(self):
+        self.op_type = "fused_gemm_epilogue_grad"
+        self.place = core.CUDAPlace(0)
+        self.init_dtype_type()
+
+        self.inputs = {
+            'DOut': np.random.random((8, 128)).astype(self.dtype) - 0.5,
+            'X': np.random.random((8, 4)).astype(self.dtype) - 0.5,
+            'Y': np.random.random((4, 128)).astype(self.dtype) - 0.5
+        }
+
+        self.attrs = {"activation": 'none'}
+
+        _, DY, DBias = get_outputs(self.inputs['DOut'], self.inputs['X'],
+                                   self.inputs['Y'])
+        self.outputs = {'DY': DY, 'DBias': DBias}
+
+    def init_dtype_type(self):
+        self.dtype = np.float16
+        self.atol = 1e-3
+
+    def test_check_output(self):
+        if self.dtype == np.float16 and not core.is_float16_supported(
+                self.place):
+            return
+        self.check_output_with_place(self.place, atol=self.atol)
+
+
+@skip_check_grad_ci(reason="no grap op")
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
+class TestFuseGemmEpilogueGradOpDYBiasFP32(
+        TestFuseGemmEpilogueGradOpDYBiasFP16):
+    def init_dtype_type(self):
+        self.dtype = np.single
+        self.atol = 1e-6
+
+
+@skip_check_grad_ci(reason="no grap op")
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
+class TestFuseGemmEpilogueGradOpDYBiasFP64(
+        TestFuseGemmEpilogueGradOpDYBiasFP16):
+    def init_dtype_type(self):
+        self.dtype = np.double
+        self.atol = 1e-6
+
+
+@skip_check_grad_ci(reason="no grap op")
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
+class TestFuseGemmEpilogueGradOpDYFP16(OpTest):
+    def setUp(self):
+        self.op_type = "fused_gemm_epilogue_grad"
+        self.place = core.CUDAPlace(0)
+        self.init_dtype_type()
+
+        self.inputs = {
+            'DOut': np.random.random((8, 128)).astype(self.dtype) - 0.5,
+            'X': np.random.random((8, 4)).astype(self.dtype) - 0.5,
+            'Y': np.random.random((4, 128)).astype(self.dtype) - 0.5
+        }
+
+        self.attrs = {"activation": 'none'}
+
+        _, DY, _ = get_outputs(self.inputs['DOut'], self.inputs['X'],
+                               self.inputs['Y'])
+        self.outputs = {'DY': DY}
+
+    def init_dtype_type(self):
+        self.dtype = np.float16
+        self.atol = 1e-3
+
+    def test_check_output(self):
+        if self.dtype == np.float16 and not core.is_float16_supported(
+                self.place):
+            return
+        self.check_output_with_place(self.place, atol=self.atol)
+
+
+@skip_check_grad_ci(reason="no grap op")
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
+class TestFuseGemmEpilogueGradOpDYFP32(TestFuseGemmEpilogueGradOpDYFP16):
+    def init_dtype_type(self):
+        self.dtype = np.single
+        self.atol = 1e-6
+
+
+@skip_check_grad_ci(reason="no grap op")
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
+class TestFuseGemmEpilogueGradOpDYFP64(TestFuseGemmEpilogueGradOpDYFP16):
+    def init_dtype_type(self):
+        self.dtype = np.double
+        self.atol = 1e-6
+
+
+@skip_check_grad_ci(reason="no grap op")
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
+class TestFuseGemmEpilogueGradOpDXYFP16(OpTest):
+    def setUp(self):
+        self.op_type = "fused_gemm_epilogue_grad"
+        self.place = core.CUDAPlace(0)
+        self.init_dtype_type()
+
+        self.inputs = {
+            'DOut': np.random.random((8, 128)).astype(self.dtype) - 0.5,
+            'X': np.random.random((8, 4)).astype(self.dtype) - 0.5,
+            'Y': np.random.random((4, 128)).astype(self.dtype) - 0.5
+        }
+
+        self.attrs = {"activation": 'none'}
+
+        DX, DY, _ = get_outputs(self.inputs['DOut'], self.inputs['X'],
+                                self.inputs['Y'])
+        self.outputs = {'DX': DX, 'DY': DY}
+
+    def init_dtype_type(self):
+        self.dtype = np.float16
+        self.atol = 1e-3
+
+    def test_check_output(self):
+        if self.dtype == np.float16 and not core.is_float16_supported(
+                self.place):
+            return
+        self.check_output_with_place(self.place, atol=self.atol)
+
+
+@skip_check_grad_ci(reason="no grap op")
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
+class TestFuseGemmEpilogueGradOpDXYFP32(TestFuseGemmEpilogueGradOpDXYFP16):
+    def init_dtype_type(self):
+        self.dtype = np.single
+        self.atol = 1e-6
+
+
+@skip_check_grad_ci(reason="no grap op")
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
+class TestFuseGemmEpilogueGradOpDXYFP64(TestFuseGemmEpilogueGradOpDXYFP16):
+    def init_dtype_type(self):
+        self.dtype = np.double
+        self.atol = 1e-6
+
+
+if __name__ == "__main__":
+    np.random.seed(0)
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_fused_gemm_epilogue_op.py b/python/paddle/fluid/tests/unittests/test_fused_gemm_epilogue_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..f826898f9e5dd601b54eaeb1c54216414a70246b
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_fused_gemm_epilogue_op.py
@@ -0,0 +1,450 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2022 NVIDIA Corporation. All rights reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+import paddle
+import paddle.fluid.core as core
+from op_test import OpTest, skip_check_grad_ci
+
+
+def gelu(x):
+    y_ref = 0.5 * x * (
+        1.0 + np.tanh(np.sqrt(2 / np.pi) * (x + 0.044715 * np.power(x, 3))))
+    return y_ref.astype(x.dtype)
+
+
+def relu(x):
+    mask = x > 0
+    return x * mask
+
+
+def get_output(X, Y, bias, act):
+    out = np.dot(X, Y) + bias
+    if act == 'relu':
+        return relu(out)
+    elif act == 'gelu':
+        return gelu(out)
+    else:
+        return out
+
+
+@skip_check_grad_ci(reason="no grap op")
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
+class TestFuseGemmEpilogueOpReluMMFP16(OpTest):
+    def setUp(self):
+        self.op_type = "fused_gemm_epilogue"
+        self.place = core.CUDAPlace(0)
+        self.init_dtype_type()
+
+        self.inputs = {
+            'X': np.random.random((8, 4)).astype(self.dtype) - 0.5,
+            'Y': np.random.random((4, 128)).astype(self.dtype) - 0.5,
+            'Bias': np.random.random((128, )).astype(self.dtype) - 0.5
+        }
+        self.outputs = {
+            'Out': get_output(self.inputs['X'], self.inputs['Y'],
+                              self.inputs['Bias'], 'relu')
+        }
+        self.attrs = {"activation": 'relu'}
+
+    def init_dtype_type(self):
+        self.dtype = np.float16
+        self.atol = 1e-3
+
+    def test_check_output(self):
+        if self.dtype == np.float16 and not core.is_float16_supported(
+                self.place):
+            return
+        self.check_output_with_place(self.place, atol=self.atol)
+
+
+@skip_check_grad_ci(reason="no grap op")
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
+class TestFuseGemmEpilogueOpReluMMFP32(TestFuseGemmEpilogueOpReluMMFP16):
+    def init_dtype_type(self):
+        self.dtype = np.single
+        self.atol = 1e-6
+
+
+@skip_check_grad_ci(reason="no grap op")
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
+class TestFuseGemmEpilogueOpReluMMFP64(TestFuseGemmEpilogueOpReluMMFP16):
+    def init_dtype_type(self):
+        self.dtype = np.double
+        self.atol = 1e-6
+
+
+@skip_check_grad_ci(reason="no grap op")
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
+class TestFuseGemmEpilogueOpReluMTMFP16(OpTest):
+    def setUp(self):
+        self.op_type = "fused_gemm_epilogue"
+        self.place = core.CUDAPlace(0)
+        self.init_dtype_type()
+
+        self.inputs = {
+            'X': np.random.random((4, 8)).astype(self.dtype) - 0.5,
+            'Y': np.random.random((4, 128)).astype(self.dtype) - 0.5,
+            'Bias': np.random.random((128, )).astype(self.dtype) - 0.5
+        }
+        self.outputs = {
+            'Out': get_output(self.inputs['X'].T, self.inputs['Y'],
+                              self.inputs['Bias'], 'relu')
+        }
+        self.attrs = {'trans_x': True, "activation": 'relu'}
+
+    def init_dtype_type(self):
+        self.dtype = np.float16
+        self.atol = 1e-3
+
+    def test_check_output(self):
+        if self.dtype == np.float16 and not core.is_float16_supported(
+                self.place):
+            return
+        self.check_output_with_place(self.place, atol=self.atol)
+
+
+@skip_check_grad_ci(reason="no grap op")
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
+class TestFuseGemmEpilogueOpReluMTMFP32(TestFuseGemmEpilogueOpReluMTMFP16):
+    def init_dtype_type(self):
+        self.dtype = np.single
+        self.atol = 1e-6
+
+
+@skip_check_grad_ci(reason="no grap op")
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
+class TestFuseGemmEpilogueOpReluMTMFP64(TestFuseGemmEpilogueOpReluMTMFP16):
+    def init_dtype_type(self):
+        self.dtype = np.double
+        self.atol = 1e-6
+
+
+@skip_check_grad_ci(reason="no grap op")
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
+class TestFuseGemmEpilogueOpReluMMTFP16(OpTest):
+    def setUp(self):
+        self.op_type = "fused_gemm_epilogue"
+        self.place = core.CUDAPlace(0)
+        self.init_dtype_type()
+
+        self.inputs = {
+            'X': np.random.random((8, 4)).astype(self.dtype) - 0.5,
+            'Y': np.random.random((128, 4)).astype(self.dtype) - 0.5,
+            'Bias': np.random.random((128, )).astype(self.dtype) - 0.5
+        }
+        self.outputs = {
+            'Out': get_output(self.inputs['X'], self.inputs['Y'].T,
+                              self.inputs['Bias'], 'relu')
+        }
+        self.attrs = {'trans_y': True, "activation": 'relu'}
+
+    def init_dtype_type(self):
+        self.dtype = np.float16
+        self.atol = 1e-3
+
+    def test_check_output(self):
+        if self.dtype == np.float16 and not core.is_float16_supported(
+                self.place):
+            return
+        self.check_output_with_place(self.place, atol=self.atol)
+
+
+@skip_check_grad_ci(reason="no grap op")
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
+class TestFuseGemmEpilogueOpReluMMTFP32(TestFuseGemmEpilogueOpReluMMTFP16):
+    def init_dtype_type(self):
+        self.dtype = np.single
+        self.atol = 1e-6
+
+
+@skip_check_grad_ci(reason="no grap op")
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
+class TestFuseGemmEpilogueOpReluMMTFP64(TestFuseGemmEpilogueOpReluMMTFP16):
+    def init_dtype_type(self):
+        self.dtype = np.double
+        self.atol = 1e-6
+
+
+@skip_check_grad_ci(reason="no grap op")
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
+class TestFuseGemmEpilogueOpReluMTMTFP16(OpTest):
+    def setUp(self):
+        self.op_type = "fused_gemm_epilogue"
+        self.place = core.CUDAPlace(0)
+        self.init_dtype_type()
+
+        self.inputs = {
+            'X': np.random.random((4, 8)).astype(self.dtype) - 0.5,
+            'Y': np.random.random((128, 4)).astype(self.dtype) - 0.5,
+            'Bias': np.random.random((128, )).astype(self.dtype) - 0.5
+        }
+        self.outputs = {
+            'Out': get_output(self.inputs['X'].T, self.inputs['Y'].T,
+                              self.inputs['Bias'], 'relu')
+        }
+        self.attrs = {'trans_x': True, 'trans_y': True, "activation": 'relu'}
+
+    def init_dtype_type(self):
+        self.dtype = np.float16
+        self.atol = 1e-3
+
+    def test_check_output(self):
+        if self.dtype == np.float16 and not core.is_float16_supported(
+                self.place):
+            return
+        self.check_output_with_place(self.place, atol=self.atol)
+
+
+@skip_check_grad_ci(reason="no grap op")
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
+class TestFuseGemmEpilogueOpReluMTMTFP32(TestFuseGemmEpilogueOpReluMTMTFP16):
+    def init_dtype_type(self):
+        self.dtype = np.single
+        self.atol = 1e-6
+
+
+@skip_check_grad_ci(reason="no grap op")
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
+class TestFuseGemmEpilogueOpReluMTMTFP64(TestFuseGemmEpilogueOpReluMTMTFP16):
+    def init_dtype_type(self):
+        self.dtype = np.double
+        self.atol = 1e-6
+
+
+@skip_check_grad_ci(reason="no grap op")
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
+class TestFuseGemmEpilogueOpReluMMFP16MultiDimX(OpTest):
+    def setUp(self):
+        self.op_type = "fused_gemm_epilogue"
+        self.place = core.CUDAPlace(0)
+        self.init_dtype_type()
+
+        self.inputs = {
+            'X': np.random.random((2, 2, 8, 4)).astype(self.dtype) - 0.5,
+            'Y': np.random.random((4, 128)).astype(self.dtype) - 0.5,
+            'Bias': np.random.random((128, )).astype(self.dtype) - 0.5
+        }
+        self.outputs = {
+            'Out': get_output(self.inputs['X'].reshape(
+                (-1, 4)), self.inputs['Y'], self.inputs['Bias'],
+                              'relu').reshape((2, 2, 8, 128))
+        }
+        self.attrs = {"activation": 'relu'}
+
+    def init_dtype_type(self):
+        self.dtype = np.float16
+        self.atol = 1e-3
+
+    def test_check_output(self):
+        if self.dtype == np.float16 and not core.is_float16_supported(
+                self.place):
+            return
+        self.check_output_with_place(self.place, atol=self.atol)
+
+
+@skip_check_grad_ci(reason="no grap op")
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
+class TestFuseGemmEpilogueOpReluMMFP32MultiDimX(
+        TestFuseGemmEpilogueOpReluMMFP16MultiDimX):
+    def init_dtype_type(self):
+        self.dtype = np.single
+        self.atol = 1e-6
+
+
+@skip_check_grad_ci(reason="no grap op")
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
+class TestFuseGemmEpilogueOpReluMMFP64MultiDimX(
+        TestFuseGemmEpilogueOpReluMMFP16MultiDimX):
+    def init_dtype_type(self):
+        self.dtype = np.double
+        self.atol = 1e-6
+
+
+@skip_check_grad_ci(reason="no grap op")
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
+class TestFuseGemmEpilogueOpReluMTMFP16MultiDimX(OpTest):
+    def setUp(self):
+        self.op_type = "fused_gemm_epilogue"
+        self.place = core.CUDAPlace(0)
+        self.init_dtype_type()
+
+        self.inputs = {
+            'X': np.random.random((4, 2, 2, 8)).astype(self.dtype) - 0.5,
+            'Y': np.random.random((4, 128)).astype(self.dtype) - 0.5,
+            'Bias': np.random.random((128, )).astype(self.dtype) - 0.5
+        }
+        self.outputs = {
+            'Out': get_output(self.inputs['X'].reshape(
+                (4, -1)).T, self.inputs['Y'], self.inputs['Bias'],
+                              'relu').reshape((2, 2, 8, 128))
+        }
+        self.attrs = {'trans_x': True, "activation": 'relu'}
+
+    def init_dtype_type(self):
+        self.dtype = np.float16
+        self.atol = 1e-3
+
+    def test_check_output(self):
+        if self.dtype == np.float16 and not core.is_float16_supported(
+                self.place):
+            return
+        self.check_output_with_place(self.place, atol=self.atol)
+
+
+@skip_check_grad_ci(reason="no grap op")
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
+class TestFuseGemmEpilogueOpReluMTMFP32MultiDimX(
+        TestFuseGemmEpilogueOpReluMTMFP16MultiDimX):
+    def init_dtype_type(self):
+        self.dtype = np.single
+        self.atol = 1e-6
+
+
+@skip_check_grad_ci(reason="no grap op")
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
+class TestFuseGemmEpilogueOpReluMTMFP64MultiDimX(
+        TestFuseGemmEpilogueOpReluMTMFP16MultiDimX):
+    def init_dtype_type(self):
+        self.dtype = np.double
+        self.atol = 1e-6
+
+
+@skip_check_grad_ci(reason="no grap op")
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
+class TestFuseGemmEpilogueOpGeluMMFP16(OpTest):
+    def setUp(self):
+        self.op_type = "fused_gemm_epilogue"
+        self.place = core.CUDAPlace(0)
+        self.init_dtype_type()
+
+        self.inputs = {
+            'X': np.random.random((8, 4)).astype(self.dtype) - 0.5,
+            'Y': np.random.random((4, 128)).astype(self.dtype) - 0.5,
+            'Bias': np.random.random((128, )).astype(self.dtype) - 0.5
+        }
+
+        self.attrs = {"activation": 'gelu'}
+
+        self.outputs = {
+            'Out': get_output(self.inputs['X'], self.inputs['Y'],
+                              self.inputs['Bias'], 'gelu')
+        }
+
+    def init_dtype_type(self):
+        self.dtype = np.float16
+        self.atol = 1e-3
+
+    def test_check_output(self):
+        if self.dtype == np.float16 and not core.is_float16_supported(
+                self.place):
+            return
+        self.check_output_with_place(self.place, atol=self.atol)
+
+
+@skip_check_grad_ci(reason="no grap op")
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
+class TestFuseGemmEpilogueOpGeluMMFP32(TestFuseGemmEpilogueOpGeluMMFP16):
+    def init_dtype_type(self):
+        self.dtype = np.single
+        self.atol = 1e-6
+
+
+@skip_check_grad_ci(reason="no grap op")
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
+class TestFuseGemmEpilogueOpGeluMMFP64(TestFuseGemmEpilogueOpGeluMMFP16):
+    def init_dtype_type(self):
+        self.dtype = np.double
+        self.atol = 1e-6
+
+
+@skip_check_grad_ci(reason="no grap op")
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
+class TestFuseGemmEpilogueOpNoneMMFP16(OpTest):
+    def setUp(self):
+        self.op_type = "fused_gemm_epilogue"
+        self.place = core.CUDAPlace(0)
+        self.init_dtype_type()
+
+        self.inputs = {
+            'X': np.random.random((8, 4)).astype(self.dtype) - 0.5,
+            'Y': np.random.random((4, 128)).astype(self.dtype) - 0.5,
+            'Bias': np.random.random((128, )).astype(self.dtype) - 0.5
+        }
+
+        self.attrs = {"activation": 'none'}
+
+        self.outputs = {
+            'Out': get_output(self.inputs['X'], self.inputs['Y'],
+                              self.inputs['Bias'], 'none')
+        }
+
+    def init_dtype_type(self):
+        self.dtype = np.float16
+        self.atol = 1e-3
+
+    def test_check_output(self):
+        if self.dtype == np.float16 and not core.is_float16_supported(
+                self.place):
+            return
+        self.check_output_with_place(self.place, atol=self.atol)
+
+
+@skip_check_grad_ci(reason="no grap op")
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
+class TestFuseGemmEpilogueOpNoneMMFP32(TestFuseGemmEpilogueOpNoneMMFP16):
+    def init_dtype_type(self):
+        self.dtype = np.single
+        self.atol = 1e-6
+
+
+@skip_check_grad_ci(reason="no grap op")
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
+class TestFuseGemmEpilogueOpNoneMMFP64(TestFuseGemmEpilogueOpNoneMMFP16):
+    def init_dtype_type(self):
+        self.dtype = np.double
+        self.atol = 1e-6
+
+
+if __name__ == "__main__":
+    np.random.seed(0)
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_gaussian_random_op.py b/python/paddle/fluid/tests/unittests/test_gaussian_random_op.py
index 31caf4bd6be984e2d1579f7756c8ba0979db832f..738441a46d377ec920c911c2712bcdc7bab6dbf0 100644
--- a/python/paddle/fluid/tests/unittests/test_gaussian_random_op.py
+++ b/python/paddle/fluid/tests/unittests/test_gaussian_random_op.py
@@ -22,7 +22,7 @@ import paddle.fluid as fluid
 import paddle.fluid.core as core
 from paddle.fluid.op import Operator
 from paddle.fluid.executor import Executor
-from paddle.fluid.tests.unittests.op_test import OpTest
+from paddle.fluid.tests.unittests.op_test import OpTest, convert_uint16_to_float
 import paddle
 
 
@@ -65,6 +65,50 @@ class TestGaussianRandomOp(OpTest):
             "hist: " + str(hist) + " hist2: " + str(hist2))
 
 
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
+class TestGaussianRandomBF16Op(OpTest):
+    def setUp(self):
+        self.op_type = "gaussian_random"
+        self.set_attrs()
+        self.inputs = {}
+        self.use_mkldnn = False
+        self.attrs = {
+            "shape": [123, 92],
+            "mean": self.mean,
+            "std": self.std,
+            "seed": 10,
+            "dtype": paddle.fluid.core.VarDesc.VarType.BF16,
+            "use_mkldnn": self.use_mkldnn
+        }
+        paddle.seed(10)
+
+        self.outputs = {'Out': np.zeros((123, 92), dtype='float32')}
+
+    def set_attrs(self):
+        self.mean = 1.0
+        self.std = 2.
+
+    def test_check_output(self):
+        self.check_output_with_place_customized(
+            self.verify_output, place=core.CUDAPlace(0))
+
+    def verify_output(self, outs):
+        outs = convert_uint16_to_float(outs)
+        self.assertEqual(outs[0].shape, (123, 92))
+        hist, _ = np.histogram(outs[0], range=(-3, 5))
+        hist = hist.astype("float32")
+        hist /= float(outs[0].size)
+        data = np.random.normal(size=(123, 92), loc=1, scale=2)
+        hist2, _ = np.histogram(data, range=(-3, 5))
+        hist2 = hist2.astype("float32")
+        hist2 /= float(outs[0].size)
+        self.assertTrue(
+            np.allclose(
+                hist, hist2, rtol=0, atol=0.05),
+            "hist: " + str(hist) + " hist2: " + str(hist2))
+
+
 class TestMeanStdAreInt(TestGaussianRandomOp):
     def set_attrs(self):
         self.mean = 1
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_deepcf.py b/python/paddle/fluid/tests/unittests/test_imperative_deepcf.py
index 04a0e5e4cd10f7ece370e879986056d508c894ff..3e222e3c658ecd105811f3694a25d20f1826bcda 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_deepcf.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_deepcf.py
@@ -24,6 +24,7 @@ import paddle.fluid.core as core
 from test_imperative_base import new_program_scope
 from paddle.fluid.dygraph.base import to_variable
 from paddle.fluid.dygraph import Linear
+from paddle.fluid.framework import _test_eager_guard
 
 # Can use Amusic dataset as the DeepCF describes.
 DATA_PATH = os.environ.get('DATA_PATH', '')
@@ -294,9 +295,42 @@ class TestDygraphDeepCF(unittest.TestCase):
                     sys.stderr.write('dynamic loss: %s %s\n' %
                                      (slice, dy_loss2))
 
+        with fluid.dygraph.guard():
+            with _test_eager_guard():
+                paddle.seed(seed)
+                paddle.framework.random._manual_program_seed(seed)
+                fluid.default_startup_program().random_seed = seed
+                fluid.default_main_program().random_seed = seed
+
+                deepcf = DeepCF(num_users, num_items, matrix)
+                adam = fluid.optimizer.AdamOptimizer(
+                    0.01, parameter_list=deepcf.parameters())
+
+                for e in range(NUM_EPOCHES):
+                    sys.stderr.write('epoch %d\n' % e)
+                    for slice in range(0, BATCH_SIZE * NUM_BATCHES, BATCH_SIZE):
+                        if slice + BATCH_SIZE >= users_np.shape[0]:
+                            break
+                        prediction = deepcf(
+                            to_variable(users_np[slice:slice + BATCH_SIZE]),
+                            to_variable(items_np[slice:slice + BATCH_SIZE]))
+                        loss = fluid.layers.reduce_sum(
+                            fluid.layers.log_loss(prediction,
+                                                  to_variable(
+                                                      labels_np[slice:slice +
+                                                                BATCH_SIZE])))
+                        loss.backward()
+                        adam.minimize(loss)
+                        deepcf.clear_gradients()
+                        eager_loss = loss.numpy()
+                        sys.stderr.write('eager loss: %s %s\n' %
+                                         (slice, eager_loss))
+
         self.assertEqual(static_loss, dy_loss)
         self.assertEqual(static_loss, dy_loss2)
+        self.assertEqual(static_loss, eager_loss)
 
 
 if __name__ == '__main__':
+    paddle.enable_static()
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_ocr_attention_model.py b/python/paddle/fluid/tests/unittests/test_imperative_ocr_attention_model.py
index 973c559857974a66ee1c06fc02305adf29c8f0ad..09868520b4c286c0d6b1f7c1cae160be5796199e 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_ocr_attention_model.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_ocr_attention_model.py
@@ -22,6 +22,7 @@ from paddle.fluid import core
 from paddle.fluid.dygraph.nn import Conv2D, Pool2D, Linear, BatchNorm, Embedding, GRUUnit
 from paddle.fluid.dygraph.base import to_variable
 from test_imperative_base import new_program_scope
+from paddle.fluid.framework import _test_eager_guard
 
 
 class Config(object):
@@ -371,7 +372,7 @@ class OCRAttention(fluid.dygraph.Layer):
 
 
 class TestDygraphOCRAttention(unittest.TestCase):
-    def test_while_op(self):
+    def test_ocr_test(self):
         seed = 90
         epoch_num = 1
         if core.is_compiled_with_cuda():
@@ -400,7 +401,7 @@ class TestDygraphOCRAttention(unittest.TestCase):
                 i * Config.max_length,
                 dtype='int64').reshape([1, Config.max_length])))
 
-        with fluid.dygraph.guard():
+        def run_dygraph():
             fluid.set_flags({'FLAGS_sort_sum_gradient': True})
             paddle.seed(seed)
             paddle.framework.random._manual_program_seed(seed)
@@ -452,6 +453,16 @@ class TestDygraphOCRAttention(unittest.TestCase):
                     for param in ocr_attention.parameters():
                         dy_param_value[param.name] = param.numpy()
 
+            return dy_out, dy_param_init_value, dy_param_value
+
+        with fluid.dygraph.guard():
+            dy_out, dy_param_init_value, dy_param_value = run_dygraph()
+
+        with fluid.dygraph.guard():
+            with _test_eager_guard():
+                eager_out, eager_param_init_value, eager_param_value = run_dygraph(
+                )
+
         with new_program_scope():
             paddle.seed(seed)
             paddle.framework.random._manual_program_seed(seed)
@@ -537,6 +548,17 @@ class TestDygraphOCRAttention(unittest.TestCase):
         for key, value in six.iteritems(static_param_value):
             self.assertTrue(np.allclose(value, dy_param_value[key], rtol=1e-05))
 
+        # check eager here
+        self.assertTrue(np.allclose(static_out, eager_out))
+
+        for key, value in six.iteritems(static_param_init_value):
+            self.assertTrue(np.array_equal(value, eager_param_init_value[key]))
+
+        for key, value in six.iteritems(static_param_value):
+            self.assertTrue(
+                np.allclose(
+                    value, eager_param_value[key], rtol=1e-05))
+
 
 if __name__ == '__main__':
     paddle.enable_static()
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_recurrent_usage.py b/python/paddle/fluid/tests/unittests/test_imperative_recurrent_usage.py
index d0b3adc490945377a4dd05f9c414cd5c35c7fae5..f12ca0a93ffd9441761c2da866c2c811a30c6e68 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_recurrent_usage.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_recurrent_usage.py
@@ -16,9 +16,11 @@ from __future__ import print_function
 
 import unittest
 import paddle.fluid as fluid
+import paddle
 import paddle.fluid.core as core
 from paddle.fluid.dygraph.nn import Embedding
 import paddle.fluid.framework as framework
+from paddle.fluid.framework import _test_eager_guard
 from paddle.fluid.optimizer import SGDOptimizer
 from paddle.fluid.dygraph.base import to_variable
 from test_imperative_base import new_program_scope
@@ -60,6 +62,25 @@ class TestRecurrentFeed(unittest.TestCase):
                 original_in1.stop_gradient = True
                 rt.clear_gradients()
 
+        with fluid.dygraph.guard():
+            with _test_eager_guard():
+                fluid.default_startup_program().random_seed = seed
+                fluid.default_main_program().random_seed = seed
+                original_in1 = to_variable(original_np1)
+                original_in2 = to_variable(original_np2)
+                original_in1.stop_gradient = False
+                original_in2.stop_gradient = False
+                rt = RecurrentTest("RecurrentTest")
+
+                for i in range(3):
+                    sum_out, out = rt(original_in1, original_in2)
+                    original_in1 = out
+                    eager_sum_out_value = sum_out.numpy()
+                    sum_out.backward()
+                    eager_dyout = out.gradient()
+                    original_in1.stop_gradient = True
+                    rt.clear_gradients()
+
         with new_program_scope():
             fluid.default_startup_program().random_seed = seed
             fluid.default_main_program().random_seed = seed
@@ -88,8 +109,11 @@ class TestRecurrentFeed(unittest.TestCase):
                 original_np1 = static_out_value
 
         self.assertTrue(np.array_equal(static_sum_out, sum_out_value))
+        self.assertTrue(np.array_equal(static_sum_out, eager_sum_out_value))
         self.assertTrue(np.array_equal(static_dout, dyout))
+        self.assertTrue(np.array_equal(static_dout, eager_dyout))
 
 
 if __name__ == '__main__':
+    paddle.enable_static()
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_reinforcement.py b/python/paddle/fluid/tests/unittests/test_imperative_reinforcement.py
index a89628c594de9f7babd6a82ba178ba00829fccfe..08320d04d99964e61378076f05d71154940cd578 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_reinforcement.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_reinforcement.py
@@ -27,6 +27,7 @@ from paddle.fluid.dygraph.nn import Conv2D, Pool2D, Linear
 import paddle.fluid.dygraph.nn as nn
 from paddle.fluid.dygraph.base import to_variable
 from test_imperative_base import new_program_scope
+from paddle.fluid.framework import _test_eager_guard
 
 
 class Policy(fluid.dygraph.Layer):
@@ -63,7 +64,7 @@ class TestImperativeMnist(unittest.TestCase):
         mask_list = [[0, 1]]
         mask = np.array(mask_list).astype("float32")
 
-        with fluid.dygraph.guard():
+        def run_dygraph():
             paddle.seed(seed)
             paddle.framework.random._manual_program_seed(seed)
 
@@ -104,6 +105,16 @@ class TestImperativeMnist(unittest.TestCase):
             for param in policy.parameters():
                 dy_param_value[param.name] = param.numpy()
 
+            return dy_out, dy_param_init_value, dy_param_value
+
+        with fluid.dygraph.guard():
+            dy_out, dy_param_init_value, dy_param_value = run_dygraph()
+
+        with fluid.dygraph.guard():
+            with _test_eager_guard():
+                eager_out, eager_param_init_value, eager_param_value = run_dygraph(
+                )
+
         with new_program_scope():
             paddle.seed(seed)
             paddle.framework.random._manual_program_seed(seed)
@@ -171,6 +182,16 @@ class TestImperativeMnist(unittest.TestCase):
         for key, value in six.iteritems(static_param_value):
             self.assertTrue(np.equal(value, dy_param_value[key]).all())
 
+        # check eager
+        for key, value in six.iteritems(static_param_init_value):
+            self.assertTrue(np.equal(value, eager_param_init_value[key]).all())
+
+        self.assertTrue(np.equal(static_out, eager_out).all())
+
+        for key, value in six.iteritems(static_param_value):
+            self.assertTrue(np.equal(value, eager_param_value[key]).all())
+
 
 if __name__ == '__main__':
+    paddle.enable_static()
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_se_resnext.py b/python/paddle/fluid/tests/unittests/test_imperative_se_resnext.py
index 8f8890557ad12328c88eedc25cb95a91f885eaf4..3fbb7f4cf7bb734892d55ff6440b51dd71ca2f3e 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_se_resnext.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_se_resnext.py
@@ -24,6 +24,7 @@ from paddle.fluid.layer_helper import LayerHelper
 from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm, Linear
 from paddle.fluid.dygraph.base import to_variable
 from test_imperative_base import new_program_scope
+from paddle.fluid.framework import _test_eager_guard
 
 if fluid.is_compiled_with_cuda():
     fluid.set_flags({'FLAGS_cudnn_deterministic': True})
@@ -310,7 +311,8 @@ class TestImperativeResneXt(unittest.TestCase):
         batch_size = train_parameters["batch_size"]
         batch_num = 1
         epoch_num = 1
-        with fluid.dygraph.guard():
+
+        def run_dygraph():
             paddle.seed(seed)
             paddle.framework.random._manual_program_seed(seed)
 
@@ -371,6 +373,17 @@ class TestImperativeResneXt(unittest.TestCase):
                     for param in se_resnext.parameters():
                         dy_param_value[param.name] = param.numpy()
 
+                    return dy_out, dy_param_init_value, dy_param_value, dy_grad_value
+
+        with fluid.dygraph.guard():
+            dy_out, dy_param_init_value, dy_param_value, dy_grad_value = run_dygraph(
+            )
+
+        with fluid.dygraph.guard():
+            with _test_eager_guard():
+                eager_out, eager_param_init_value, eager_param_value, eager_grad_value = run_dygraph(
+                )
+
         with new_program_scope():
             paddle.seed(seed)
             paddle.framework.random._manual_program_seed(seed)
@@ -479,6 +492,32 @@ class TestImperativeResneXt(unittest.TestCase):
             self.assertTrue(np.isfinite(value.all()))
             self.assertFalse(np.isnan(value.any()))
 
+        # check eager
+        self.assertTrue(
+            np.allclose(static_out, eager_out),
+            "\nstatic_out: {}\neager_out: {}".format(static_out, eager_out))
+
+        self.assertEqual(
+            len(eager_param_init_value), len(static_param_init_value))
+
+        for key, value in six.iteritems(static_param_init_value):
+            self.assertTrue(np.allclose(value, eager_param_init_value[key]))
+
+        self.assertEqual(len(eager_grad_value), len(static_grad_value))
+
+        for key, value in six.iteritems(static_grad_value):
+            self.assertTrue(
+                np.allclose(value, eager_grad_value[key]),
+                "\nstatic_grad_value: {}\neager_grad_value: {}".format(
+                    value, eager_grad_value[key]))
+
+        self.assertEqual(len(eager_param_value), len(static_param_value))
+        for key, value in six.iteritems(static_param_value):
+            self.assertTrue(
+                np.allclose(value, eager_param_value[key]),
+                "\nstatic_param_value: {}\neagear_param_value: {}".format(
+                    value, eager_param_value[key]))
+
 
 if __name__ == '__main__':
     paddle.enable_static()
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_star_gan_with_gradient_penalty.py b/python/paddle/fluid/tests/unittests/test_imperative_star_gan_with_gradient_penalty.py
index 33f304ef33d67004585a01142d24b17bfd4908bc..0a08aa4ba12693e2216ae1b131ea41a5abaabd2a 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_star_gan_with_gradient_penalty.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_star_gan_with_gradient_penalty.py
@@ -617,4 +617,5 @@ class TestStarGANWithGradientPenalty(unittest.TestCase):
 
 
 if __name__ == '__main__':
+    paddle.enable_static()
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_transformer_sorted_gradient.py b/python/paddle/fluid/tests/unittests/test_imperative_transformer_sorted_gradient.py
index 3f129cae44abb6e8e9b4d329558aea5167c96675..010c8aeccacd6550a5b8963c63f3d28af898550e 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_transformer_sorted_gradient.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_transformer_sorted_gradient.py
@@ -21,6 +21,7 @@ from paddle.fluid import Embedding, LayerNorm, Linear, Layer
 from paddle.fluid.dygraph import to_variable, guard
 from paddle.fluid.dygraph import TracedLayer
 from test_imperative_base import new_program_scope
+from paddle.fluid.framework import _test_eager_guard
 from paddle.fluid import core
 import numpy as np
 import six
@@ -949,8 +950,7 @@ class TestDygraphTransformerSortGradient(unittest.TestCase):
     def transformer_sort_gradient_float32(self, is_sparse):
         seed = 90
 
-        with guard():
-            fluid.set_flags({'FLAGS_sort_sum_gradient': True})
+        def run_dygraph():
             # NOTE(xiongkun03): In new executor, the inplace strategy is on by default, which will cause result of sumop have some differences. So we disable inplace.
             fluid.set_flags({'FLAGS_new_executor_use_inplace': False})
             paddle.seed(seed)
@@ -998,7 +998,7 @@ class TestDygraphTransformerSortGradient(unittest.TestCase):
 
             for i in range(batch_num):
                 enc_inputs, dec_inputs, label, weights = create_data()
-                if i % 2 == 0:
+                if False:
                     outs, traced_layer = TracedLayer.trace(
                         transformer, [enc_inputs, dec_inputs, label, weights])
 
@@ -1036,6 +1036,14 @@ class TestDygraphTransformerSortGradient(unittest.TestCase):
             dy_predict_value = dy_predict.numpy()
             dy_token_num_value = dy_token_num.numpy()
 
+            return dy_avg_cost_value, dy_sum_cost_value, dy_predict_value, dy_token_num_value, \
+                dy_param_init, dy_param_updated
+
+        with guard():
+            fluid.set_flags({'FLAGS_sort_sum_gradient': True})
+            dy_avg_cost_value, dy_sum_cost_value, dy_predict_value, dy_token_num_value, \
+                dy_param_init, dy_param_updated = run_dygraph()
+
         with new_program_scope():
             paddle.seed(seed)
             paddle.framework.random._manual_program_seed(seed)
@@ -1122,6 +1130,28 @@ class TestDygraphTransformerSortGradient(unittest.TestCase):
         for key, value in six.iteritems(static_param_updated):
             self.assertTrue(np.array_equal(value, dy_param_updated[key]))
 
+        # check eager result
+        with guard():
+            fluid.set_flags({'FLAGS_sort_sum_gradient': False})
+            dy_avg_cost_value, dy_sum_cost_value, dy_predict_value, dy_token_num_value, \
+                dy_param_init, dy_param_updated = run_dygraph()
+
+        with guard():
+            with _test_eager_guard():
+                eager_avg_cost_value, eager_sum_cost_value, eager_predict_value, eager_token_num_value, \
+                    eager_param_init, eager_param_updated = run_dygraph()
+        self.assertTrue(np.allclose(dy_avg_cost_value, eager_avg_cost_value))
+        self.assertTrue(np.allclose(dy_sum_cost_value, eager_sum_cost_value))
+
+        self.assertTrue(np.allclose(dy_predict_value, eager_predict_value))
+        self.assertTrue(np.allclose(dy_token_num_value, eager_token_num_value))
+
+        for key, value in six.iteritems(static_param_init):
+            self.assertTrue(np.array_equal(value, eager_param_init[key]))
+        for key, value in six.iteritems(dy_param_updated):
+            self.assertTrue(np.allclose(value, eager_param_updated[key]))
+
 
 if __name__ == '__main__':
+    paddle.enable_static()
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_inplace_abn_op.py b/python/paddle/fluid/tests/unittests/test_inplace_abn_op.py
index 077496200d988fafc67bb6f85892adc99c170daf..67f6b91021472ec3f678010f22bafb615cd8eb2c 100644
--- a/python/paddle/fluid/tests/unittests/test_inplace_abn_op.py
+++ b/python/paddle/fluid/tests/unittests/test_inplace_abn_op.py
@@ -23,6 +23,7 @@ import paddle.fluid as fluid
 from paddle.fluid.layer_helper import LayerHelper
 from paddle.fluid import compiler
 import paddle.fluid.unique_name as unique_name
+import paddle
 
 
 class TestInplaceANBOpTraining(unittest.TestCase):
@@ -138,14 +139,14 @@ class TestInplaceANBOpTraining(unittest.TestCase):
                 outs[0].name if not only_forward else None,
                 build_strategy=build_strategy,
                 exec_strategy=exec_strategy)
-            bn_fetches = exe.run(program=comp_prog1,
+            bn_fetches = exe.run(program=main,
                                  feed={'input': data},
                                  fetch_list=fetch_name)
             fetch_outs.append(bn_fetches)
             fetch_names.append(fetch_name)
 
-        for bn_val, inplace_abn_val, name1, name2 in zip(*(fetch_outs +
-                                                           fetch_names)):
+        for bn_val, inplace_abn_val, name1, name2 in zip(*(
+                fetch_outs + fetch_names)):
             self.assertTrue(
                 np.allclose(
                     bn_val, inplace_abn_val, atol=1e-2),
@@ -156,6 +157,7 @@ class TestInplaceANBOpTraining(unittest.TestCase):
 
     def test_op(self):
         use_cudas = [False, True] if core.is_compiled_with_cuda() else [False]
+        #use_cudas = [False]
         for use_cuda in use_cudas:
             place = core.CUDAPlace(0) if use_cuda else core.CPUPlace()
             layouts = ["NCHW", "NHWC"]
@@ -186,4 +188,5 @@ class TestInplaceANBOpTraining(unittest.TestCase):
 
 
 if __name__ == '__main__':
+    paddle.enable_static()
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_newprofiler.py b/python/paddle/fluid/tests/unittests/test_newprofiler.py
new file mode 100755
index 0000000000000000000000000000000000000000..12fb0fa61b005f49e6b258414cd5e3e10b0dae92
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_newprofiler.py
@@ -0,0 +1,129 @@
+#   Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+
+import paddle
+import paddle.profiler as profiler
+
+
+class TestProfiler(unittest.TestCase):
+    def test_profiler(self):
+        def my_trace_back(prof):
+            profiler.export_chrome_tracing('./test_profiler_chrometracing/')(
+                prof)
+            profiler.export_protobuf('./test_profiler_pb/')(prof)
+
+        x_value = np.random.randn(2, 3, 3)
+        x = paddle.to_tensor(
+            x_value, stop_gradient=False, place=paddle.CPUPlace())
+        y = x / 2.0
+        ones_like_y = paddle.ones_like(y)
+        with profiler.Profiler(targets=[profiler.ProfilerTarget.CPU], ) as prof:
+            y = x / 2.0
+        prof = None
+        with profiler.Profiler(
+                targets=[profiler.ProfilerTarget.CPU],
+                scheduler=(1, 2)) as prof:
+            with profiler.RecordEvent(name='test'):
+                y = x / 2.0
+        prof = None
+        with profiler.Profiler(
+                targets=[profiler.ProfilerTarget.CPU],
+                scheduler=profiler.make_scheduler(
+                    closed=0, ready=1, record=1, repeat=1),
+                on_trace_ready=my_trace_back) as prof:
+            y = x / 2.0
+        prof = None
+        with profiler.Profiler(
+                targets=[profiler.ProfilerTarget.CPU],
+                scheduler=profiler.make_scheduler(
+                    closed=0, ready=0, record=2, repeat=1),
+                on_trace_ready=my_trace_back) as prof:
+            for i in range(3):
+                y = x / 2.0
+                prof.step()
+        prof = None
+        with profiler.Profiler(
+                targets=[profiler.ProfilerTarget.CPU],
+                scheduler=lambda x: profiler.ProfilerState.RECORD_AND_RETURN,
+                on_trace_ready=my_trace_back) as prof:
+            for i in range(2):
+                y = x / 2.0
+                prof.step()
+
+        def my_sheduler(num_step):
+            if num_step % 5 < 2:
+                return profiler.ProfilerState.RECORD_AND_RETURN
+            elif num_step % 5 < 3:
+                return profiler.ProfilerState.READY
+            elif num_step % 5 < 4:
+                return profiler.ProfilerState.RECORD
+            else:
+                return profiler.ProfilerState.CLOSED
+
+        def my_sheduler1(num_step):
+            if num_step % 5 < 2:
+                return profiler.ProfilerState.RECORD
+            elif num_step % 5 < 3:
+                return profiler.ProfilerState.READY
+            elif num_step % 5 < 4:
+                return profiler.ProfilerState.RECORD
+            else:
+                return profiler.ProfilerState.CLOSED
+
+        prof = None
+        with profiler.Profiler(
+                targets=[profiler.ProfilerTarget.CPU],
+                scheduler=lambda x: profiler.ProfilerState.RECORD_AND_RETURN,
+                on_trace_ready=my_trace_back) as prof:
+            for i in range(2):
+                y = x / 2.0
+                prof.step()
+        prof = None
+        with profiler.Profiler(
+                targets=[profiler.ProfilerTarget.CPU],
+                scheduler=my_sheduler,
+                on_trace_ready=my_trace_back) as prof:
+            for i in range(5):
+                y = x / 2.0
+                prof.step()
+        prof = None
+        with profiler.Profiler(
+                targets=[profiler.ProfilerTarget.CPU],
+                scheduler=my_sheduler1) as prof:
+            for i in range(5):
+                y = x / 2.0
+                prof.step()
+        prof = None
+        with profiler.Profiler(
+                targets=[profiler.ProfilerTarget.CPU],
+                scheduler=profiler.make_scheduler(
+                    closed=1, ready=1, record=2, repeat=1, skip_first=1),
+                on_trace_ready=my_trace_back) as prof:
+            for i in range(5):
+                y = x / 2.0
+                paddle.grad(outputs=y, inputs=[x], grad_outputs=ones_like_y)
+                prof.step()
+
+        prof.export(path='./test_profiler_pb.pb', format='pb')
+        prof.summary()
+        result = profiler.utils.load_profiler_result('./test_profiler_pb.pb')
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_newprofiler_helper.py b/python/paddle/fluid/tests/unittests/test_newprofiler_helper.py
new file mode 100755
index 0000000000000000000000000000000000000000..05e792003545688cfef3f1c7bd48ecf4d27daafe
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_newprofiler_helper.py
@@ -0,0 +1,137 @@
+#   Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+
+import paddle.profiler.statistic_helper as statistic_helper
+
+
+class TestStatisticHelper(unittest.TestCase):
+    def test_sum_ranges_case1(self):
+        src = [(1, 3), (4, 10), (11, 15)]
+        self.assertEqual(statistic_helper.sum_ranges(src), 12)
+
+    def test_sum_ranges_case2(self):
+        src = [(3, 3), (5, 5), (7, 7)]
+        self.assertEqual(statistic_helper.sum_ranges(src), 0)
+
+    def test_merge_self_ranges_case1(self):
+        src = [(1, 5), (2, 7), (4, 9), (14, 19)]
+        dst = statistic_helper.merge_self_ranges(src)
+        self.assertEqual(dst, [(1, 9), (14, 19)])
+        src = [(4, 9), (14, 19), (1, 5), (2, 7)]
+        dst = statistic_helper.merge_self_ranges(src)
+        self.assertEqual(dst, [(1, 9), (14, 19)])
+
+    def test_merge_self_ranges_case2(self):
+        src = [(1, 1), (2, 3), (4, 7), (5, 12)]
+        dst = statistic_helper.merge_self_ranges(src)
+        self.assertEqual(dst, [(1, 1), (2, 3), (4, 12)])
+        src = [(5, 12), (1, 1), (2, 3), (4, 7)]
+        dst = statistic_helper.merge_self_ranges(src)
+        self.assertEqual(dst, [(1, 1), (2, 3), (4, 12)])
+
+    def test_merge_ranges_case1(self):
+        src1 = [(1, 2), (5, 7), (9, 14)]
+        src2 = [(1, 2), (4, 9), (13, 15)]
+        dst = statistic_helper.merge_ranges(src1, src2)
+        self.assertEqual(dst, [(1, 2), (4, 15)])
+        dst = statistic_helper.merge_ranges(src1, src2, True)
+        self.assertEqual(dst, [(1, 2), (4, 15)])
+        src1 = []
+        src2 = []
+        dst = statistic_helper.merge_ranges(src1, src2, True)
+        self.assertEqual(dst, [])
+        src1 = [(1, 2), (3, 5)]
+        src2 = []
+        dst = statistic_helper.merge_ranges(src1, src2, True)
+        self.assertEqual(dst, src1)
+        src1 = []
+        src2 = [(1, 2), (3, 5)]
+        dst = statistic_helper.merge_ranges(src1, src2, True)
+        self.assertEqual(dst, src2)
+        src1 = [(3, 4), (1, 2), (17, 19)]
+        src2 = [(6, 9), (13, 15)]
+        dst = statistic_helper.merge_ranges(src1, src2)
+        self.assertEqual(dst, [(1, 2), (3, 4), (6, 9), (13, 15), (17, 19)])
+        dst = statistic_helper.merge_ranges(src2, src1)
+        self.assertEqual(dst, [(1, 2), (3, 4), (6, 9), (13, 15), (17, 19)])
+        src1 = [(1, 2), (5, 9), (12, 13)]
+        src2 = [(6, 8), (9, 15)]
+        dst = statistic_helper.merge_ranges(src1, src2)
+        self.assertEqual(dst, [(1, 2), (5, 15)])
+        dst = statistic_helper.merge_ranges(src2, src1)
+        self.assertEqual(dst, [(1, 2), (5, 15)])
+
+    def test_merge_ranges_case2(self):
+        src1 = [(3, 4), (1, 2), (9, 14)]
+        src2 = [(6, 9), (13, 15)]
+        dst = statistic_helper.merge_ranges(src1, src2)
+        self.assertEqual(dst, [(1, 2), (3, 4), (6, 15)])
+        src2 = [(9, 14), (1, 2), (5, 7)]
+        src1 = [(4, 9), (1, 2), (13, 15)]
+        dst = statistic_helper.merge_ranges(src1, src2)
+        self.assertEqual(dst, [(1, 2), (4, 15)])
+
+    def test_intersection_ranges_case1(self):
+        src1 = [(1, 7), (9, 12), (14, 18)]
+        src2 = [(3, 8), (10, 13), (15, 19)]
+        dst = statistic_helper.intersection_ranges(src1, src2)
+        self.assertEqual(dst, [(3, 7), (10, 12), (15, 18)])
+        dst = statistic_helper.intersection_ranges(src1, src2, True)
+        self.assertEqual(dst, [(3, 7), (10, 12), (15, 18)])
+        src1 = []
+        src2 = []
+        dst = statistic_helper.intersection_ranges(src1, src2, True)
+        self.assertEqual(dst, [])
+        src1 = [(3, 7), (10, 12)]
+        src2 = [(2, 9), (11, 13), (15, 19)]
+        dst = statistic_helper.intersection_ranges(src1, src2)
+        self.assertEqual(dst, [(3, 7), (11, 12)])
+        dst = statistic_helper.intersection_ranges(src2, src1)
+        self.assertEqual(dst, [(3, 7), (11, 12)])
+
+    def test_intersection_ranges_case2(self):
+        src2 = [(9, 12), (1, 7), (14, 18)]
+        src1 = [(10, 13), (3, 8), (15, 19), (20, 22)]
+        dst = statistic_helper.intersection_ranges(src1, src2)
+        self.assertEqual(dst, [(3, 7), (10, 12), (15, 18)])
+        src2 = [(1, 7), (14, 18), (21, 23)]
+        src1 = [(6, 9), (10, 13)]
+        dst = statistic_helper.intersection_ranges(src1, src2, True)
+        self.assertEqual(dst, [(6, 7)])
+
+    def test_subtract_ranges_case1(self):
+        src1 = [(1, 10), (12, 15)]
+        src2 = [(3, 7), (9, 11)]
+        dst = statistic_helper.subtract_ranges(src1, src2, True)
+        self.assertEqual(dst, [(1, 3), (7, 9), (12, 15)])
+        src1 = [(1, 10), (12, 15)]
+        src2 = []
+        dst = statistic_helper.subtract_ranges(src1, src2, True)
+        self.assertEqual(dst, src1)
+        dst = statistic_helper.subtract_ranges(src2, src1, True)
+        self.assertEqual(dst, src2)
+
+    def test_subtract_ranges_case2(self):
+        src2 = [(12, 15), (1, 10)]
+        src1 = [(9, 11), (3, 7)]
+        dst = statistic_helper.subtract_ranges(src1, src2)
+        self.assertEqual(dst, [(10, 11)])
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_norm_nn_grad.py b/python/paddle/fluid/tests/unittests/test_norm_nn_grad.py
index fe8c181b7904989d75d85520aaf6f658ac673d29..49fe397644dc6e79d1b0d436b31c706d77906b6a 100644
--- a/python/paddle/fluid/tests/unittests/test_norm_nn_grad.py
+++ b/python/paddle/fluid/tests/unittests/test_norm_nn_grad.py
@@ -21,6 +21,7 @@ import paddle.fluid as fluid
 import paddle.fluid.layers as layers
 import paddle.fluid.core as core
 import gradient_checker
+import paddle
 
 from decorator_helper import prog_scope
 
@@ -167,4 +168,5 @@ class TestBatchNormDoubleGradCheckCase6(TestBatchNormDoubleGradCheckCase5):
 
 
 if __name__ == "__main__":
+    paddle.enable_static()
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_one_hot_v2_op.py b/python/paddle/fluid/tests/unittests/test_one_hot_v2_op.py
index 66de1b309797fb53316a46b436e5cccf11410216..fac258192112dbff5353c581ad8e276cc5e375c0 100644
--- a/python/paddle/fluid/tests/unittests/test_one_hot_v2_op.py
+++ b/python/paddle/fluid/tests/unittests/test_one_hot_v2_op.py
@@ -22,7 +22,8 @@ import paddle
 import paddle.fluid as fluid
 import paddle.fluid.core as core
 import paddle.fluid.framework as framework
-from paddle.fluid.framework import Program, program_guard
+from paddle.framework import _in_eager_mode
+from paddle.fluid.framework import Program, program_guard, _test_eager_guard
 
 
 class TestOneHotOp(OpTest):
@@ -45,7 +46,7 @@ class TestOneHotOp(OpTest):
         self.outputs = {'Out': (out, x_lod)}
 
     def test_check_output(self):
-        self.check_output(check_dygraph=False)
+        self.check_output()
 
 
 class TestOneHotOp_attr(OpTest):
@@ -68,7 +69,7 @@ class TestOneHotOp_attr(OpTest):
         self.outputs = {'Out': (out, x_lod)}
 
     def test_check_output(self):
-        self.check_output(check_dygraph=False)
+        self.check_output()
 
 
 class TestOneHotOp_default_dtype(OpTest):
@@ -91,7 +92,7 @@ class TestOneHotOp_default_dtype(OpTest):
         self.outputs = {'Out': (out, x_lod)}
 
     def test_check_output(self):
-        self.check_output(check_dygraph=False)
+        self.check_output()
 
 
 class TestOneHotOp_default_dtype_attr(OpTest):
@@ -114,7 +115,7 @@ class TestOneHotOp_default_dtype_attr(OpTest):
         self.outputs = {'Out': (out, x_lod)}
 
     def test_check_output(self):
-        self.check_output(check_dygraph=False)
+        self.check_output()
 
 
 class TestOneHotOp_out_of_range(OpTest):
@@ -132,7 +133,7 @@ class TestOneHotOp_out_of_range(OpTest):
         self.outputs = {'Out': (out, x_lod)}
 
     def test_check_output(self):
-        self.check_output(check_dygraph=False)
+        self.check_output()
 
 
 class TestOneHotOp_exception(unittest.TestCase):
@@ -190,6 +191,12 @@ class TestOneHotOpApi(unittest.TestCase):
             one_hot_label = fluid.one_hot(
                 input=fluid.dygraph.to_variable(label), depth=depth)
 
+            one_hot_label = paddle.nn.functional.one_hot(
+                fluid.dygraph.to_variable(label), depth)
+            with _test_eager_guard():
+                one_hot_label = paddle.nn.functional.one_hot(
+                    paddle.to_tensor(label), depth)
+
     def _run(self, depth):
         label = fluid.layers.data(name="label", shape=[1], dtype="int64")
         one_hot_label = fluid.one_hot(input=label, depth=depth)
diff --git a/python/paddle/fluid/tests/unittests/test_paddle_multiprocessing.py b/python/paddle/fluid/tests/unittests/test_paddle_multiprocessing.py
new file mode 100644
index 0000000000000000000000000000000000000000..1e31356a6bc81c1684a3620d36b66ed441add40b
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_paddle_multiprocessing.py
@@ -0,0 +1,199 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import gc
+import sys
+import unittest
+import time
+import paddle
+import paddle.incubate.multiprocessing as mp
+
+REPEAT = 20
+HAS_SHM_FILES = os.path.isdir('/dev/shm')
+
+
+def fill_tensor(queue, event):
+    data = queue.get()
+    with paddle.no_grad():
+        data[0][:] = 5
+        data[1][:] = 5
+
+    event.set()
+
+
+def send_tensor(queue, event, device, dtype):
+    tensor = paddle.ones([5, 5], dtype=dtype)
+    queue.put(tensor)
+    queue.put(tensor)
+    event.wait()
+
+
+def send_parambase(queue, event, device, dtype):
+    tensor = paddle.nn.Layer().create_parameter(
+        [5, 5],
+        dtype=dtype,
+        default_initializer=paddle.nn.initializer.Constant(value=1.0))
+    queue.put(tensor)
+    queue.put(tensor)
+    event.wait()
+
+
+class leak_checker(object):
+    def __init__(self, test_case):
+        self.checked_pids = [os.getpid()]
+        self.test_case = test_case
+
+    def __enter__(self):
+        self.next_fds = self._get_next_fds(10)
+        return self
+
+    def __exit__(self, *args):
+        if args[0] is None:
+            self.test_case.assertFalse(self.has_shm_files())
+        return False
+
+    def check_pid(self, pid):
+        self.checked_pids.append(pid)
+
+    def _get_next_fds(self, n=1):
+        fds = [os.dup(0) for i in range(n)]
+        for fd in fds:
+            os.close(fd)
+        return fds
+
+    def has_shm_files(self, wait=True):
+        if not HAS_SHM_FILES:
+            return False
+        result = self._has_shm_files()
+        if result and wait:
+            time.sleep(0.5)
+            return self._has_shm_files()
+        return result
+
+    def _has_shm_files(self):
+        gc.collect()
+        names = ['paddle_' + str(pid) for pid in self.checked_pids]
+        for filename in os.listdir('/dev/shm'):
+            for name in names:
+                if filename.startswith(name):
+                    print("have", filename)
+                    return True
+        return False
+
+
+class TestMultiprocessingBase(unittest.TestCase):
+    def get_tensor(self, device="cpu"):
+        self.device = device.lower()
+        place = None
+        tensor = paddle.zeros([5, 5], dtype="float32")
+        return tensor
+
+    def get_parameter(self):
+        w = paddle.nn.Layer().create_parameter(
+            [10, 10],
+            default_initializer=paddle.nn.initializer.Constant(value=0.0))
+        return w
+
+    def _test_empty(self, dtype="float32"):
+        q = mp.Queue()
+        empty = paddle.to_tensor([], dtype=dtype)
+        q.put(empty)
+        out = q.get(timeout=1)
+        self.assertEqual(str(out), str(empty))
+
+    def _test_sharing(self,
+                      ctx=mp,
+                      device='cpu',
+                      dtype="float32",
+                      repeat=1,
+                      param=False):
+        def test_fill():
+            if param:
+                x = self.get_parameter()
+                y = (x[:, 1]).detach()
+            else:
+                x = self.get_tensor()
+                y = x[:, 1]
+
+            data = [x, y]
+
+            queue = ctx.Queue()
+            event = ctx.Event()
+            queue.put(data)
+
+            process = ctx.Process(target=fill_tensor, args=(queue, event))
+            process.daemon = True
+            lc.check_pid(process.pid)
+            process.start()
+
+            event.wait(30)
+
+            self.assertTrue(event.is_set())
+            self.assertTrue(data[0].equal(5).all())
+            self.assertTrue(data[1].equal(5).all())
+
+            process.join(1 if device != "gpu" else 10)
+            self.assertFalse(process.is_alive())
+
+        def test_receive():
+            queue = ctx.Queue()
+            event = ctx.Event()
+
+            process = ctx.Process(
+                target=send_parambase if param else send_tensor,
+                args=(queue, event, device, dtype))
+            process.daemon = True
+            lc.check_pid(process.pid)
+            process.start()
+
+            t1 = queue.get()
+            t2 = queue.get()
+            self.assertTrue(t1.equal(1).all())
+            del t1, t2
+
+            event.set()
+            process.join(1 if device != "gpu" else 10)
+            self.assertFalse(process.is_alive())
+
+        with leak_checker(self) as lc:
+            for _ in range(repeat):
+                test_fill()
+                test_receive()
+
+
+class TestMultiprocessingCpu(TestMultiprocessingBase):
+    def test_pass_tensor(self):
+        paddle.set_device("cpu")
+        self._test_sharing(repeat=REPEAT)
+
+    def test_pass_parambase(self):
+        paddle.set_device("cpu")
+        self._test_sharing(repeat=1, param=True)
+
+    def test_pass_empty(self):
+        paddle.set_device("cpu")
+        self._test_empty()
+
+
+class TestMultiprocessingGpu(TestMultiprocessingBase):
+    @unittest.skipIf(not paddle.fluid.core.is_compiled_with_cuda(),
+                     "core is not compiled with CUDA")
+    def test_pass_tensor(self):
+        paddle.set_device("gpu")
+        self._test_sharing(mp.get_context("spawn"), "gpu")
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_dataparallel.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_dataparallel.py
index edf9aed04f5e0a1df3ceb1ec0add27251e2264a5..802fcc96288f6114e81f080ef17e527b2e7ad2bd 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_dataparallel.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_dataparallel.py
@@ -200,5 +200,10 @@ class TestDataParallelWithPyLayer(TestMultipleGpus):
         self.run_mnist_2gpu('parallel_dygraph_dataparallel_with_pylayer.py')
 
 
+class TestDataParallelInEagerMode(TestMultipleGpus):
+    def test_multiple_gpus_dynamic(self):
+        self.run_mnist_2gpu('parallel_dygraph_dataparallel_in_eager_mode.py')
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_profiler_statistic.py b/python/paddle/fluid/tests/unittests/test_profiler_statistic.py
new file mode 100644
index 0000000000000000000000000000000000000000..838ccae37cfa5fb7dbdedcb5d39655cb62ad429f
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_profiler_statistic.py
@@ -0,0 +1,199 @@
+#   Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import paddle
+import paddle.profiler as profiler
+
+
+class HostPythonNode:
+    def __init__(self, name, type, start_ns, end_ns, process_id, thread_id):
+        self.name = name
+        self.type = type
+        self.start_ns = start_ns
+        self.end_ns = end_ns
+        self.process_id = process_id
+        self.thread_id = thread_id
+        self.children_node = []
+        self.runtime_node = []
+        self.device_node = []
+
+
+class DevicePythonNode:
+    def __init__(self, name, type, start_ns, end_ns, device_id, context_id,
+                 stream_id):
+        self.name = name
+        self.type = type
+        self.start_ns = start_ns
+        self.end_ns = end_ns
+        self.device_id = device_id
+        self.context_id = context_id
+        self.stream_id = stream_id
+
+
+class TestProfilerStatistic(unittest.TestCase):
+    def test_statistic_case1(self):
+        root_node = HostPythonNode('Root Node',
+                                   profiler.TracerEventType.UserDefined, 0,
+                                   float('inf'), 1000, 1001)
+        profilerstep_node = HostPythonNode('ProfileStep#1',
+                                           profiler.TracerEventType.ProfileStep,
+                                           0, 400, 1000, 1001)
+        dataloader_node = HostPythonNode(
+            'Dataloader', profiler.TracerEventType.Forward, 5, 15, 1000, 1001)
+        mobilenet_node = HostPythonNode(
+            'MobileNet', profiler.TracerEventType.Forward, 20, 50, 1000, 1001)
+        yolonet_node = HostPythonNode(
+            'Yolov3Net', profiler.TracerEventType.Forward, 50, 100, 1000, 1001)
+        backward_node = HostPythonNode('Gradient Backward',
+                                       profiler.TracerEventType.Backward, 120,
+                                       200, 1000, 1001)
+        optimization_node = HostPythonNode(
+            'Optimization', profiler.TracerEventType.Optimization, 220, 300,
+            1000, 1001)
+        conv2d_node = HostPythonNode(
+            'conv2d', profiler.TracerEventType.Operator, 25, 40, 1000, 1001)
+        sync_batch_norm_node = HostPythonNode('sync_batch_norm',
+                                              profiler.TracerEventType.Operator,
+                                              60, 100, 1000, 1001)
+        conv2d_infer_shape = HostPythonNode(
+            'conv2d::infer_shape', profiler.TracerEventType.OperatorInner, 25,
+            30, 1000, 1001)
+        conv2d_compute = HostPythonNode('conv2d::compute',
+                                        profiler.TracerEventType.OperatorInner,
+                                        30, 40, 1000, 1001)
+        conv2d_launchkernel = HostPythonNode(
+            'cudalaunchkernel', profiler.TracerEventType.CudaRuntime, 30, 35,
+            1000, 1001)
+        conv2d_MemCpy = HostPythonNode('AsyncMemcpy',
+                                       profiler.TracerEventType.UserDefined, 35,
+                                       40, 1000, 1001)
+        conv2d_cudaMemCpy = HostPythonNode('cudaMemcpy',
+                                           profiler.TracerEventType.CudaRuntime,
+                                           35, 40, 1000, 1001)
+        conv2d_kernel = DevicePythonNode(
+            'conv2d_kernel', profiler.TracerEventType.Kernel, 35, 50, 0, 0, 0)
+        conv2d_memcpy = DevicePythonNode(
+            'conv2d_memcpy', profiler.TracerEventType.Memcpy, 50, 60, 0, 0, 0)
+        sync_batch_norm_infer_shape = HostPythonNode(
+            'sync_batch_norm::infer_shape',
+            profiler.TracerEventType.OperatorInner, 60, 70, 1000, 1001)
+        sync_batch_norm_compute = HostPythonNode(
+            'sync_batch_norm::compute', profiler.TracerEventType.OperatorInner,
+            80, 100, 1000, 1001)
+        sync_batch_norm_launchkernel = HostPythonNode(
+            'cudalaunchkernel', profiler.TracerEventType.CudaRuntime, 80, 90,
+            1000, 1001)
+        sync_batch_norm_MemCpy = HostPythonNode(
+            'AsyncMemcpy', profiler.TracerEventType.UserDefined, 90, 100, 1000,
+            1001)
+        sync_batch_norm_cudaMemCpy = HostPythonNode(
+            'cudaMemcpy', profiler.TracerEventType.CudaRuntime, 90, 100, 1000,
+            1001)
+        sync_batch_norm_kernel = DevicePythonNode(
+            'sync_batch_norm_kernel', profiler.TracerEventType.Kernel, 95, 155,
+            0, 0, 0)
+        sync_batch_norm_memcpy = DevicePythonNode(
+            'sync_batch_norm_memcpy', profiler.TracerEventType.Memcpy, 150, 200,
+            0, 0, 1)
+        root_node.children_node.append(profilerstep_node)
+        profilerstep_node.children_node.extend([
+            dataloader_node, mobilenet_node, yolonet_node, backward_node,
+            optimization_node
+        ])
+        mobilenet_node.children_node.append(conv2d_node)
+        yolonet_node.children_node.append(sync_batch_norm_node)
+        conv2d_node.children_node.extend(
+            [conv2d_infer_shape, conv2d_compute, conv2d_MemCpy])
+        conv2d_compute.runtime_node.append(conv2d_launchkernel)
+        conv2d_MemCpy.runtime_node.append(conv2d_cudaMemCpy)
+        conv2d_launchkernel.device_node.append(conv2d_kernel)
+        conv2d_cudaMemCpy.device_node.append(conv2d_memcpy)
+        sync_batch_norm_node.children_node.extend([
+            sync_batch_norm_infer_shape, sync_batch_norm_compute,
+            sync_batch_norm_MemCpy
+        ])
+        sync_batch_norm_compute.runtime_node.append(
+            sync_batch_norm_launchkernel)
+        sync_batch_norm_MemCpy.runtime_node.append(sync_batch_norm_cudaMemCpy)
+        sync_batch_norm_launchkernel.device_node.append(sync_batch_norm_kernel)
+        sync_batch_norm_cudaMemCpy.device_node.append(sync_batch_norm_memcpy)
+        thread_tree = {'thread1001': root_node}
+        extra_info = {
+            'Process Cpu Utilization': '1.02',
+            'System Cpu Utilization': '0.68'
+        }
+        statistic_data = profiler.profiler_statistic.StatisticData(thread_tree,
+                                                                   extra_info)
+        time_range_summary = statistic_data.time_range_summary
+        event_summary = statistic_data.event_summary
+
+        self.assertEqual(
+            time_range_summary.get_cpu_range_sum(
+                profiler.TracerEventType.ProfileStep), 400)
+        self.assertEqual(
+            time_range_summary.get_cpu_range_sum(
+                profiler.TracerEventType.Forward), 90)
+        self.assertEqual(
+            time_range_summary.get_cpu_range_sum(
+                profiler.TracerEventType.Backward), 80)
+        self.assertEqual(
+            time_range_summary.get_cpu_range_sum(
+                profiler.TracerEventType.Optimization), 80)
+        self.assertEqual(
+            time_range_summary.get_cpu_range_sum(
+                profiler.TracerEventType.Operator), 55)
+        self.assertEqual(
+            time_range_summary.get_cpu_range_sum(
+                profiler.TracerEventType.OperatorInner), 45)
+        self.assertEqual(
+            time_range_summary.get_cpu_range_sum(
+                profiler.TracerEventType.CudaRuntime), 30)
+        self.assertEqual(
+            time_range_summary.get_gpu_range_sum(
+                0, profiler.TracerEventType.Kernel), 75)
+        self.assertEqual(
+            time_range_summary.get_gpu_range_sum(
+                0, profiler.TracerEventType.Memcpy), 60)
+        self.assertEqual(
+            time_range_summary.get_cpu_range_sum(
+                profiler.TracerEventType.UserDefined), 15)
+        self.assertEqual(len(event_summary.items), 2)
+        self.assertEqual(len(event_summary.userdefined_items), 0)
+        self.assertEqual(len(event_summary.model_perspective_items), 3)
+        self.assertEqual(len(event_summary.memory_manipulation_items), 1)
+        self.assertEqual(event_summary.items['conv2d'].cpu_time, 15)
+        self.assertEqual(event_summary.items['conv2d'].gpu_time, 25)
+        self.assertEqual(
+            event_summary.model_perspective_items['Forward'].cpu_time, 90)
+        self.assertEqual(
+            event_summary.model_perspective_items['Forward'].gpu_time, 135)
+        self.assertEqual(
+            event_summary.model_perspective_items['Backward'].gpu_time, 0)
+        self.assertEqual(
+            event_summary.memory_manipulation_items['AsyncMemcpy'].cpu_time, 15)
+        self.assertEqual(
+            event_summary.memory_manipulation_items['AsyncMemcpy'].gpu_time, 60)
+        print(
+            profiler.profiler_statistic._build_table(
+                statistic_data,
+                sorted_by=profiler.SortedKeys.CPUTotal,
+                op_detail=True,
+                thread_sep=False,
+                time_unit='ms'))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_program_prune_backward.py b/python/paddle/fluid/tests/unittests/test_program_prune_backward.py
index b01c7cf179955d89746555e3d085361784193b8c..a1a3b31a9766e093973ed927f90eebb989e1263b 100755
--- a/python/paddle/fluid/tests/unittests/test_program_prune_backward.py
+++ b/python/paddle/fluid/tests/unittests/test_program_prune_backward.py
@@ -24,6 +24,7 @@ from simple_nets import init_data, simple_fc_net, fc_with_batchnorm
 import seresnext_net
 from test_parallel_executor_transformer import transformer, get_feed_data_reader, DeviceType
 from fake_reader import fake_imdb_reader
+import paddle
 
 
 def lstm_net(use_feed):
@@ -309,4 +310,5 @@ class TestProgramPruneBackward(unittest.TestCase):
 
 
 if __name__ == '__main__':
+    paddle.enable_static()
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_reshape_op.py b/python/paddle/fluid/tests/unittests/test_reshape_op.py
index c860d6972fb762f5c9cde1fb17c36bfb415a3f8a..40481b097827cbc7b66d72403d24fd41853af0a7 100755
--- a/python/paddle/fluid/tests/unittests/test_reshape_op.py
+++ b/python/paddle/fluid/tests/unittests/test_reshape_op.py
@@ -507,4 +507,5 @@ class TestReshapeZeroTensor(unittest.TestCase):
 
 
 if __name__ == "__main__":
+    paddle.enable_static()
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_run.py b/python/paddle/fluid/tests/unittests/test_run.py
new file mode 100644
index 0000000000000000000000000000000000000000..8fe5fb9bb9455aa58d84bda03f9e9e16038a3be0
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_run.py
@@ -0,0 +1,174 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import subprocess
+import sys, os
+import json
+import shutil
+
+import random
+
+from os import listdir
+from os.path import isfile, join
+
+pyname = 'train.py'
+colpyfile = '''# train.py for unitest
+import os
+env = os.environ.copy()
+assert "PADDLE_MASTER" in env
+assert "PADDLE_GLOBAL_SIZE" in env
+assert "PADDLE_LOCAL_SIZE" in env
+assert "PADDLE_GLOBAL_RANK" in env
+assert "PADDLE_LOCAL_RANK" in env
+'''
+
+pspyfile = '''# train.py for unitest
+import os
+env = os.environ.copy()
+assert "PADDLE_PSERVERS_IP_PORT_LIST" in env
+assert "PADDLE_TRAINER_ENDPOINTS" in env
+#assert "PADDLE_PSERVER_ENDPOINTS" in env
+#assert "PADDLE_TRAINER_ENDPOINTS" in env
+#assert "PADDLE_ROLE" in env
+#assert "PADDLE_RANK" in env
+'''
+
+
+def write_file(name, ct):
+    with open(name, "w") as f:
+        f.write(ct)
+
+
+def get_files(pth, prefix):
+    return [
+        f for f in listdir(pth) if isfile(join(pth, f)) and f.startswith(prefix)
+    ]
+
+
+class Collective_Test(unittest.TestCase):
+    def setUp(self):
+        write_file(pyname, colpyfile)
+
+    def pdrun(self, args, env=None):
+        cmd = [sys.executable.split('/')[-1], "-m", "paddle.distributed.run"]
+        if args:
+            cmd.extend(args.split(" "))
+        cmd.extend([pyname])
+        proc = subprocess.Popen(cmd, env)
+        return proc
+
+    '''
+    def test_collective_1(self):
+        args = "--id test1"
+        p = self.pdrun(args)
+        p.wait()
+        self.assertTrue(p.poll() == 0)
+
+    '''
+
+    def test_collective_2(self):
+        if os.path.exists('./log'):
+            shutil.rmtree('./log')
+
+        args = "--id test2 --devices 0,1,2"
+        p = self.pdrun(args)
+        p.wait()
+        self.assertTrue(p.poll() == 0)
+
+        c = get_files('log', 'test2')
+        self.assertTrue(len(c) == 4)
+
+    def test_collective_3(self):
+        if os.path.exists('./log'):
+            shutil.rmtree('./log')
+
+        port = random.randrange(6000, 8000)
+        args = "--id test3 --devices 0,1 --master 127.0.0.1:{} --np 2".format(
+            port)
+        p1 = self.pdrun(args)
+        p2 = self.pdrun(args)
+        p1.wait()
+        p2.wait()
+        self.assertTrue(p1.poll() == 0)
+        self.assertTrue(p2.poll() == 0)
+
+        c = get_files('log', 'test3')
+        self.assertTrue(len(c) == 6)
+
+
+class PS_Test(unittest.TestCase):
+    def setUp(self):
+        write_file(pyname, pspyfile)
+
+    def pdrun(self, args, env=None):
+        cmd = [sys.executable.split('/')[-1], "-m", "paddle.distributed.run"]
+        if args:
+            cmd.extend(args.split(" "))
+        cmd.extend([pyname])
+        proc = subprocess.Popen(cmd, env)
+        return proc
+
+    '''
+    def test_ps_1(self):
+        args = "--mode ps"
+        p = self.pdrun(args)
+        p.wait()
+        self.assertTrue(p.poll() == 0)
+
+    def test_ps_2(self):
+        if os.path.exists('./log'):
+            shutil.rmtree('./log')
+
+        args = "--id ps2 --server_num=2 --trainer_num=2"
+        p = self.pdrun(args)
+        p.wait()
+        self.assertTrue(p.poll() == 0)
+
+        c = get_files('log', 'ps2')
+        self.assertTrue(len(c) == 5)
+    '''
+
+    def test_ps_3(self):
+        if os.path.exists('./log'):
+            shutil.rmtree('./log')
+
+        port = random.randrange(6000, 8000)
+        args = "--id ps3 --master 127.0.0.1:{} --np 2 --server_num=1 --trainer_num=1".format(
+            port)
+        p1 = self.pdrun(args)
+        p2 = self.pdrun(args)
+        p1.wait()
+        p2.wait()
+        self.assertTrue(p1.poll() == 0)
+        self.assertTrue(p2.poll() == 0)
+
+        c = get_files('log', 'ps3')
+        self.assertTrue(len(c) == 6)
+
+    def test_ps_4(self):
+        if os.path.exists('./log'):
+            shutil.rmtree('./log')
+
+        args = "--id ps4 --servers 127.0.0.1:8900,127.0.0.1:8901 --trainers 127.0.0.1:8902,127.0.0.1:8903"
+        p1 = self.pdrun(args)
+        p1.wait()
+        self.assertTrue(p1.poll() == 0)
+
+        c = get_files('log', 'ps4')
+        self.assertTrue(len(c) == 5)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_selu_op.py b/python/paddle/fluid/tests/unittests/test_selu_op.py
index e71adae8d9b6eb5cdfdbd648a3cfe653cfed3d3d..f16198817945ab826004d51f1264a9cbe8fc22a9 100644
--- a/python/paddle/fluid/tests/unittests/test_selu_op.py
+++ b/python/paddle/fluid/tests/unittests/test_selu_op.py
@@ -42,6 +42,7 @@ def ref_selu(x,
 class SeluTest(OpTest):
     def setUp(self):
         self.op_type = "selu"
+        self.python_api = paddle.nn.functional.selu
         self.x_shape = [3, 5, 5, 10]
         self.dtype = np.float64
         self.init_x_shape()
diff --git a/python/paddle/fluid/tests/unittests/test_static_model_parallel_fused_attention.py b/python/paddle/fluid/tests/unittests/test_static_model_parallel_fused_attention.py
new file mode 100644
index 0000000000000000000000000000000000000000..e4ce8e8170fa187b223fe48aac6124fa0b736e17
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_static_model_parallel_fused_attention.py
@@ -0,0 +1,45 @@
+#   Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import unittest
+from test_dist_base import TestDistBase
+
+import os
+import paddle
+
+paddle.enable_static()
+flag_name = os.path.splitext(__file__)[0]
+
+
+class TestStaticModelParallel(TestDistBase):
+    def _setup_config(self):
+        self._sync_mode = True
+        self._use_reduce = False
+        self._use_reader_alloc = False
+        self._nccl_comm_num = 1
+        self._pipeline_mode = True
+
+    def test_dist_static_model_parallel_fused_feedforward(self):
+        import paddle.fluid as fluid
+        if fluid.core.is_compiled_with_cuda():
+            self.check_with_place(
+                "static_model_parallel_fused_attention.py",
+                delta=1e-5,
+                check_error_log=True,
+                log_name=flag_name)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_static_model_parallel_fused_feedforward.py b/python/paddle/fluid/tests/unittests/test_static_model_parallel_fused_feedforward.py
new file mode 100644
index 0000000000000000000000000000000000000000..1a6b637e1b45ed51dbefd2f8a6ac4bcf4d1f4e59
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_static_model_parallel_fused_feedforward.py
@@ -0,0 +1,45 @@
+#   Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import unittest
+from test_dist_base import TestDistBase
+
+import os
+import paddle
+
+paddle.enable_static()
+flag_name = os.path.splitext(__file__)[0]
+
+
+class TestStaticModelParallel(TestDistBase):
+    def _setup_config(self):
+        self._sync_mode = True
+        self._use_reduce = False
+        self._use_reader_alloc = False
+        self._nccl_comm_num = 1
+        self._pipeline_mode = True
+
+    def test_dist_static_model_parallel_fused_feedforward(self):
+        import paddle.fluid as fluid
+        if fluid.core.is_compiled_with_cuda():
+            self.check_with_place(
+                "static_model_parallel_fused_feedforward.py",
+                delta=1e-5,
+                check_error_log=True,
+                log_name=flag_name)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_tensor_register_hook.py b/python/paddle/fluid/tests/unittests/test_tensor_register_hook.py
index 3238876b89414b89d09a8b4161ef9e5ba2450261..aac8b6a99b649176d29224e22c3d3258d96c194e 100644
--- a/python/paddle/fluid/tests/unittests/test_tensor_register_hook.py
+++ b/python/paddle/fluid/tests/unittests/test_tensor_register_hook.py
@@ -533,12 +533,8 @@ class TestTensorRegisterHook(unittest.TestCase):
             size=[self.batch_size, self.in_size]).astype('float32')
         data_t = paddle.to_tensor(data)
 
-        if _in_eager_mode():
-            with self.assertRaises(TypeError):
-                out = jit_net(data_t)
-        else:
-            with self.assertRaises(AssertionError):
-                out = jit_net(data_t)
+        with self.assertRaises(AssertionError):
+            out = jit_net(data_t)
 
     def test_register_hook_in_dy2static_mode(self):
         with _test_eager_guard():
diff --git a/python/paddle/fluid/tests/unittests/test_trunc_op.py b/python/paddle/fluid/tests/unittests/test_trunc_op.py
index 08a35db3ac4374e392adadfb06410667448daa5c..b70fa04adc13cfd16c43010cce46b31893052927 100644
--- a/python/paddle/fluid/tests/unittests/test_trunc_op.py
+++ b/python/paddle/fluid/tests/unittests/test_trunc_op.py
@@ -79,6 +79,16 @@ class TestTruncAPI(unittest.TestCase):
         self.assertEqual(np.allclose(out.numpy(), out_ref, rtol=1e-08), True)
         paddle.enable_static()
 
+    def test_api_eager(self):
+        paddle.disable_static(self.place)
+
+        with _test_eager_guard():
+            x_tensor = paddle.to_tensor(self.x)
+            out = paddle.trunc(x_tensor)
+        out_ref = np.trunc(self.x)
+        self.assertEqual(np.allclose(out.numpy(), out_ref, rtol=1e-08), True)
+        paddle.enable_static()
+
     def test_api_eager_dygraph(self):
         with _test_eager_guard():
             self.test_api_dygraph()
diff --git a/python/paddle/fluid/tests/unittests/test_yolo_box_op.py b/python/paddle/fluid/tests/unittests/test_yolo_box_op.py
index 043c5c1651a09ac022d8a694b2e916b613c77f6b..f210d97362cf062260594dce1112059919f179c4 100644
--- a/python/paddle/fluid/tests/unittests/test_yolo_box_op.py
+++ b/python/paddle/fluid/tests/unittests/test_yolo_box_op.py
@@ -260,5 +260,6 @@ class TestYoloBoxOpHW(TestYoloBoxOp):
         self.iou_aware_factor = 0.5
 
 
-if (__name__ == '__main__'):
+if __name__ == '__main__':
+    paddle.enable_static()
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py b/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py
index 1ec1d1527e178a35fba5a2b949191f72e0a7726b..3f0e4f7a4002a29a2f3f17915392b13fdb342677 100644
--- a/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py
+++ b/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py
@@ -305,6 +305,7 @@ class TestYolov3LossDygraph(unittest.TestCase):
             use_label_smooth=True,
             scale_x_y=1.)
         assert loss is not None
+        assert loss.shape == [2]
         paddle.enable_static()
 
 
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_activation_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_activation_op_xpu.py
index 6a7e5f08b5e489511d2060d92dc0390dd4b18dbc..69bca8dd9ef15459021f44fd1b4887e636516ec6 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_activation_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_activation_op_xpu.py
@@ -84,13 +84,33 @@ class XPUTestSigmoidOP(XPUOpTestWrapper):
         def set_case(self):
             self.op_type = "sigmoid"
             self.dtype = self.in_type
+            self.init_config()
+            out = 1 / (1 + np.exp(-self.x))
 
-            x = np.random.uniform(-1, 1, [11, 17]).astype(self.dtype)
-            out = 1 / (1 + np.exp(-x))
             self.attrs = {'use_xpu': True}
-            self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
+            self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(self.x)}
             self.outputs = {'Out': out}
 
+        def init_config(self):
+            self.x = np.random.uniform(-1, 1, [11, 17]).astype(self.dtype)
+
+    class XPUTestSigmoid2(XPUTestSigmoid):
+        def init_config(self):
+            self.x = np.random.uniform(-2, 2, [100]).astype(self.dtype)
+
+    class XPUTestSigmoid3(XPUTestSigmoid):
+        def init_config(self):
+            self.x = np.random.uniform(-2, 2, [10, 12, 15]).astype(self.dtype)
+
+    class XPUTestSigmoid4(XPUTestSigmoid):
+        def init_config(self):
+            self.x = np.random.uniform(-2, 2, [19, 19]).astype(self.dtype)
+
+    class XPUTestSigmoid5(XPUTestSigmoid):
+        def init_config(self):
+            self.x = np.random.uniform(-2, 2,
+                                       [10, 20, 30, 40]).astype(self.dtype)
+
 
 support_types = get_xpu_op_support_types('sigmoid')
 for stype in support_types:
@@ -292,14 +312,32 @@ class XPUTestSquareOP(XPUOpTestWrapper):
         def set_case(self):
             self.op_type = "square"
             self.dtype = self.in_type
-
-            x = np.random.uniform(-1, 1, [11, 17]).astype(self.dtype)
-            out = np.square(x)
+            self.init_config()
+            out = np.square(self.x)
 
             self.attrs = {'use_xpu': True}
-            self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
+            self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(self.x)}
             self.outputs = {'Out': out}
 
+        def init_config(self):
+            self.x = np.random.uniform(-1, 1, [11, 17]).astype(self.dtype)
+
+    class XPUTestSquare2(XPUTestSquare):
+        def init_config(self):
+            self.x = np.random.uniform(-2, 2, [100]).astype(self.dtype)
+
+    class XPUTestSquare3(XPUTestSquare):
+        def init_config(self):
+            self.x = np.random.uniform(-2, 2, [1, 15, 19]).astype(self.dtype)
+
+    class XPUTestSquare4(XPUTestSquare):
+        def init_config(self):
+            self.x = np.random.uniform(-2, 2, [100, 10]).astype(self.dtype)
+
+    class XPUTestSquare5(XPUTestSquare):
+        def init_config(self):
+            self.x = np.random.uniform(-2, 2, [1, 2, 5, 17]).astype(self.dtype)
+
 
 support_types = get_xpu_op_support_types('square')
 for stype in support_types:
@@ -436,5 +474,410 @@ def ref_softplus(x, beta=1, threshold=20):
     return out
 
 
+# XPU_KP unittests, these ops can be found from xpu_op_kpfirst_list.h
+class XPUTestBReluOP(XPUOpTestWrapper):
+    def __init__(self):
+        self.op_name = 'brelu'
+        self.use_dynamic_create_class = False
+
+    class XPUTestBRelu(TestActivationOPBase):
+        def set_case(self):
+            self.op_type = "brelu"
+            self.dtype = self.in_type
+
+            np.random.seed(1024)
+            x = np.random.uniform(-5, 10, [10, 12]).astype(self.dtype)
+            t_min = 1.0
+            t_max = 4.0
+            # The same with TestAbs
+            x[np.abs(x - t_min) < 0.005] = t_min + 0.02
+            x[np.abs(x - t_max) < 0.005] = t_max + 0.02
+            t = np.copy(x)
+            t[t < t_min] = t_min
+            t[t > t_max] = t_max
+
+            self.inputs = {'X': x}
+            self.outputs = {'Out': t}
+            self.attrs = {'use_xpu': True, 't_min': t_min, 't_max': t_max}
+
+
+support_types = get_xpu_op_support_types('brelu')
+for stype in support_types:
+    create_test_class(globals(), XPUTestBReluOP, stype)
+
+
+class XPUTestCeilOP(XPUOpTestWrapper):
+    def __init__(self):
+        self.op_name = 'ceil'
+        self.use_dynamic_create_class = False
+
+    class XPUTestCeil(TestActivationOPBase):
+        def set_case(self):
+            self.op_type = "ceil"
+            self.dtype = self.in_type
+
+            np.random.seed(1024)
+            x = np.random.uniform(-1, 1, [10, 12]).astype(self.dtype)
+            out = np.ceil(x)
+
+            self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
+            self.outputs = {'Out': out}
+            self.attrs = {'use_xpu': True}
+
+
+support_types = get_xpu_op_support_types('ceil')
+for stype in support_types:
+    create_test_class(globals(), XPUTestCeilOP, stype)
+
+
+class XPUTestCeluOP(XPUOpTestWrapper):
+    def __init__(self):
+        self.op_name = 'celu'
+        self.use_dynamic_create_class = False
+
+    class XPUTestCelu(TestActivationOPBase):
+        def set_case(self):
+            self.op_type = "celu"
+            self.dtype = self.in_type
+
+            alpha = 1.5
+            x = np.random.uniform(-3, 3, [10, 12]).astype(self.dtype)
+            out = ref_celu(x, alpha)
+
+            self.inputs = {'X': x}
+            self.outputs = {'Out': out}
+            self.attrs = {'use_xpu': True, 'alpha': alpha}
+
+
+support_types = get_xpu_op_support_types('celu')
+for stype in support_types:
+    create_test_class(globals(), XPUTestCeluOP, stype)
+
+
+def ref_celu(x, alpha):
+    out_ref = np.maximum(0, x) + np.minimum(0, alpha * (np.exp(x / alpha) - 1))
+    return out_ref.astype(x.dtype)
+
+
+class XPUTestEluOP(XPUOpTestWrapper):
+    def __init__(self):
+        self.op_name = 'elu'
+        self.use_dynamic_create_class = False
+
+    class XPUTestElu(TestActivationOPBase):
+        def set_case(self):
+            self.op_type = "elu"
+            self.dtype = self.in_type
+
+            alpha = 1.
+            x = np.random.uniform(-3, 3, [10, 12]).astype(self.dtype)
+            out = ref_elu(x, alpha)
+
+            self.inputs = {'X': x}
+            self.outputs = {'Out': out}
+            self.attrs = {'use_xpu': True, 'alpha': alpha}
+
+
+support_types = get_xpu_op_support_types('elu')
+for stype in support_types:
+    create_test_class(globals(), XPUTestEluOP, stype)
+
+
+def ref_elu(x, alpha):
+    out_ref = np.where(x > 0, x, alpha * (np.exp(x) - 1))
+    return out_ref.astype(x.dtype)
+
+
+class XPUTestFloorOP(XPUOpTestWrapper):
+    def __init__(self):
+        self.op_name = 'floor'
+        self.use_dynamic_create_class = False
+
+    class XPUTestFloor(TestActivationOPBase):
+        def set_case(self):
+            self.op_type = "floor"
+            self.dtype = self.in_type
+
+            np.random.seed(1024)
+            x = np.random.uniform(-1, 1, [10, 12]).astype(self.dtype)
+            out = np.floor(x)
+
+            self.inputs = {'X': x}
+            self.outputs = {'Out': out}
+            self.attrs = {'use_xpu': True}
+
+
+support_types = get_xpu_op_support_types('floor')
+for stype in support_types:
+    create_test_class(globals(), XPUTestFloorOP, stype)
+
+
+class XPUTestHardShrinkOP(XPUOpTestWrapper):
+    def __init__(self):
+        self.op_name = 'hard_shrink'
+        self.use_dynamic_create_class = False
+
+    class XPUTestHardShrink(TestActivationOPBase):
+        def set_case(self):
+            self.op_type = "hard_shrink"
+            self.dtype = self.in_type
+
+            threshold = 0.5
+            # self.set_attrs()
+            np.random.seed(1024)
+            x = np.random.uniform(-1, 1, [10, 12]).astype(self.dtype) * 10
+            out = ref_hardshrink(x, threshold)
+
+            self.attrs = {'use_xpu': True}
+            self.inputs = {'X': x}
+            self.outputs = {'Out': out}
+
+
+support_types = get_xpu_op_support_types('hard_shrink')
+for stype in support_types:
+    create_test_class(globals(), XPUTestHardShrinkOP, stype)
+
+
+def ref_hardshrink(x, threshold):
+    out = np.copy(x)
+    out[(out >= -threshold) & (out <= threshold)] = 0
+    return out
+
+
+class XPUTestHardSigmoidOP(XPUOpTestWrapper):
+    def __init__(self):
+        self.op_name = 'hard_sigmoid'
+        self.use_dynamic_create_class = False
+
+    class XPUTestHardSigmoid(TestActivationOPBase):
+        def set_case(self):
+            self.op_type = "hard_sigmoid"
+            self.dtype = self.in_type
+            self.slope = 0.166666666666667
+            self.offset = 0.5
+
+            x = np.random.uniform(-5, 5, [10, 12]).astype(self.dtype)
+            lower_threshold = -self.offset / self.slope
+            upper_threshold = (1. - self.offset) / self.slope
+
+            # Same reason as TestAbs
+            delta = 0.005
+            x[np.abs(x - lower_threshold) < delta] = lower_threshold - 0.02
+            x[np.abs(x - upper_threshold) < delta] = upper_threshold - 0.02
+
+            out = ref_hardsigmoid(x, self.slope, self.offset)
+
+            self.attrs = {
+                'use_xpu': True,
+                'slope': self.slope,
+                'offset': self.offset
+            }
+            self.inputs = {'X': x}
+            self.outputs = {'Out': out}
+
+
+support_types = get_xpu_op_support_types('hard_sigmoid')
+for stype in support_types:
+    create_test_class(globals(), XPUTestHardSigmoidOP, stype)
+
+
+def ref_hardsigmoid(x, slope=0.166666666666667, offset=0.5):
+    return np.maximum(np.minimum(x * slope + offset, 1.), 0.).astype(x.dtype)
+
+
+class XPUTestLog1pOP(XPUOpTestWrapper):
+    def __init__(self):
+        self.op_name = 'log1p'
+        self.use_dynamic_create_class = False
+
+    class XPUTestLog1p(TestActivationOPBase):
+        def set_case(self):
+            self.op_type = "log1p"
+            self.dtype = self.in_type
+
+            np.random.seed(1024)
+            x = np.random.uniform(0.1, 1, [11, 17]).astype(self.dtype)
+            out = np.log1p(x)
+
+            self.inputs = {'X': x}
+            self.outputs = {'Out': out}
+            self.attrs = {'use_xpu': True}
+
+
+support_types = get_xpu_op_support_types('log1p')
+for stype in support_types:
+    create_test_class(globals(), XPUTestLog1pOP, stype)
+
+
+class XPUTestLogsigmoidOP(XPUOpTestWrapper):
+    def __init__(self):
+        self.op_name = 'logsigmoid'
+        self.use_dynamic_create_class = False
+
+    class XPUTestLogsigmoid(TestActivationOPBase):
+        def set_case(self):
+            self.op_type = "logsigmoid"
+            self.dtype = self.in_type
+
+            np.random.seed(2048)
+            x = np.random.uniform(-1, 1, [11, 17]).astype(self.dtype)
+            out = np.log(1 / (1 + np.exp(-x)))
+
+            self.inputs = {'X': x}
+            self.outputs = {'Out': out}
+            self.attrs = {'use_xpu': True}
+
+
+support_types = get_xpu_op_support_types('logsigmoid')
+for stype in support_types:
+    create_test_class(globals(), XPUTestLogsigmoidOP, stype)
+
+
+class XPUTestRelu6OP(XPUOpTestWrapper):
+    def __init__(self):
+        self.op_name = 'relu6'
+        self.use_dynamic_create_class = False
+
+    class XPUTestRelu6(TestActivationOPBase):
+        def set_case(self):
+            self.op_type = "relu6"
+            self.dtype = self.in_type
+
+            np.random.seed(1024)
+            x = np.random.uniform(-1, 10, [10, 12]).astype(self.dtype)
+            x[np.abs(x) < 0.005] = 0.02
+            out = ref_relu6(x)
+
+            self.attrs = {'use_xpu': True}
+            self.inputs = {'X': x}
+            self.outputs = {'Out': out}
+
+
+support_types = get_xpu_op_support_types('relu6')
+for stype in support_types:
+    create_test_class(globals(), XPUTestRelu6OP, stype)
+
+
+def ref_relu6(x, threshold=6.0):
+    out = np.copy(x)
+    out[np.abs(x - threshold) < 0.005] = threshold + 0.02
+    out = np.minimum(np.maximum(x, 0), threshold)
+    return out
+
+
+class XPUTestSiluOP(XPUOpTestWrapper):
+    def __init__(self):
+        self.op_name = 'silu'
+        self.use_dynamic_create_class = False
+
+    class XPUTestSilu(TestActivationOPBase):
+        def set_case(self):
+            self.op_type = "silu"
+            self.dtype = self.in_type
+
+            np.random.seed(1024)
+            x = np.random.uniform(-1, 1, [11, 17]).astype(self.dtype)
+            out = x / (np.exp(-x) + 1)
+
+            self.inputs = {'X': x}
+            self.outputs = {'Out': out}
+            self.attrs = {'use_xpu': True}
+
+
+support_types = get_xpu_op_support_types('silu')
+for stype in support_types:
+    create_test_class(globals(), XPUTestSiluOP, stype)
+
+
+class XPUTestSoftReluOP(XPUOpTestWrapper):
+    def __init__(self):
+        self.op_name = 'soft_relu'
+        self.use_dynamic_create_class = False
+
+    class XPUTestSoftRelu(TestActivationOPBase):
+        def set_case(self):
+            self.op_type = "soft_relu"
+            self.dtype = self.in_type
+
+            np.random.seed(4096)
+            x = np.random.uniform(-3, 3, [4, 4]).astype(self.dtype)
+            threshold = 2.0
+            # The same reason with TestAbs
+            x[np.abs(x - threshold) < 0.005] = threshold + 0.02
+            x[np.abs(x + threshold) < 0.005] = -threshold - 0.02
+            t = np.copy(x)
+            t[t < -threshold] = -threshold
+            t[t > threshold] = threshold
+            out = np.log((np.exp(t) + 1))
+
+            self.inputs = {'X': x}
+            self.outputs = {'Out': out}
+            self.attrs = {'use_xpu': True, 'threshold': threshold}
+
+
+support_types = get_xpu_op_support_types('soft_relu')
+for stype in support_types:
+    create_test_class(globals(), XPUTestSoftReluOP, stype)
+
+
+class XPUTestSoftSignOP(XPUOpTestWrapper):
+    def __init__(self):
+        self.op_name = 'softsign'
+        self.use_dynamic_create_class = False
+
+    class XPUTestSoftSign(TestActivationOPBase):
+        def set_case(self):
+            self.op_type = "softsign"
+            self.dtype = self.in_type
+
+            np.random.seed(1024)
+            x = np.random.uniform(-1, 1, [10, 12]).astype(self.dtype)
+            out = ref_softsign(x)
+
+            self.inputs = {'X': x}
+            self.outputs = {'Out': out}
+            self.attrs = {'use_xpu': True}
+
+
+support_types = get_xpu_op_support_types('softsign')
+for stype in support_types:
+    create_test_class(globals(), XPUTestSoftSignOP, stype)
+
+
+def ref_softsign(x):
+    out = np.divide(x, 1 + np.abs(x))
+    return out
+
+
+class XPUTestSwishOP(XPUOpTestWrapper):
+    def __init__(self):
+        self.op_name = 'swish'
+        self.use_dynamic_create_class = False
+
+    class XPUTestSwish(TestActivationOPBase):
+        def set_case(self):
+            self.op_type = "swish"
+            self.dtype = self.in_type
+
+            np.random.seed(1024)
+            x = np.random.uniform(-1, 1, [10, 12]).astype(self.dtype)
+            out = ref_swish(x)
+
+            self.inputs = {'X': x}
+            self.outputs = {'Out': out}
+            self.attrs = {'use_xpu': True}
+
+
+support_types = get_xpu_op_support_types('swish')
+for stype in support_types:
+    create_test_class(globals(), XPUTestSwishOP, stype)
+
+
+def ref_swish(x):
+    from scipy.special import expit
+    out = x * expit(x)
+    return out
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_matmul_v2_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_matmul_v2_op_xpu.py
index 45d60c8538e092f4c5d97f6525870af33a6ad9d5..9891da6ea21d9ac7c8c591f71183099858832140 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_matmul_v2_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_matmul_v2_op_xpu.py
@@ -289,6 +289,18 @@ class TestMatMulOp17(TestMatMulV2Op):
         self.trans_y = False
 
 
+class TestMatMulOp18(TestMatMulV2Op):
+    """
+    case 18 : for ppyoloe model
+    """
+
+    def config(self):
+        self.x_shape = (8, 111, 4, 17)
+        self.y_shape = (17)
+        self.trans_x = False
+        self.trans_y = False
+
+
 # class TestMatMulOpBroadcast1(TestMatMulV2Op):
 #     """
 #     case 14_3
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_nearest_interp_v2_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_nearest_interp_v2_op_xpu.py
index 8c1ce68e9d0f8b83b916ab45139745b224eff4f1..7a3b4a5a2179a94ed56ebf7968619c847c0e2090 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_nearest_interp_v2_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_nearest_interp_v2_op_xpu.py
@@ -1,4 +1,4 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#   Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -16,13 +16,14 @@ from __future__ import print_function
 
 import unittest
 import numpy as np
-import paddle
-import paddle.fluid.core as core
 import sys
 sys.path.append("..")
+
+import paddle
+
+from op_test import OpTest
 from op_test_xpu import XPUOpTest
-import paddle.fluid as fluid
-from paddle.fluid import Program, program_guard
+from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper
 
 paddle.enable_static()
 
@@ -158,390 +159,356 @@ def nearest_neighbor_interp3d_np(X,
     return out.astype(X.dtype)
 
 
-class TestNearestInterpOp(XPUOpTest):
-    def setUp(self):
-        self.use_xpu = True
-        self.out_size = None
-        self.actual_shape = None
-        self.data_layout = 'NCHW'
-        self.init_test_case()
-        self.op_type = "nearest_interp_v2"
-        input_np = np.random.random(self.input_shape).astype("float32")
-
-        if self.data_layout == "NCHW" and len(self.input_shape) == 4:
-            in_d = 1
-            in_h = self.input_shape[2]
-            in_w = self.input_shape[3]
-        else:
-            in_d = 1
-            in_h = self.input_shape[1]
-            in_w = self.input_shape[2]
-
-        if self.data_layout == "NCDHW" and len(self.input_shape) == 5:
-            in_d = self.input_shape[2]
-            in_h = self.input_shape[3]
-            in_w = self.input_shape[4]
-        else:
-            in_d = self.input_shape[1]
-            in_h = self.input_shape[2]
-            in_w = self.input_shape[3]
-        scale_d = 0
-        scale_h = 0
-        scale_w = 0
-        if self.scale:
-            if isinstance(self.scale, float) or isinstance(self.scale, int):
-                if self.scale > 0:
-                    scale_d = scale_h = scale_w = float(self.scale)
-            if isinstance(self.scale, list) and len(self.scale) == 1:
-                scale_d = scale_w = scale_h = self.scale[0]
-            elif isinstance(self.scale, list) and len(self.scale) > 1:
-                if len(self.scale) == 5:
-                    scale_w = self.scale[2]
-                    scale_h = self.scale[1]
-                    scale_d = self.scale[0]
-                else:
-                    scale_w = self.scale[1]
-                    scale_h = self.scale[0]
+class XPUNearestInterpOpWrapper(XPUOpTestWrapper):
+    def __init__(self):
+        self.op_name = 'nearest_interp_v2'
+        self.use_dynamic_create_class = False
 
-            out_h = int(in_h * scale_h)
-            out_w = int(in_w * scale_w)
-            out_d = int(in_d * scale_d)
-        else:
-            if len(self.input_shape) == 5:
-                out_d = self.out_d
-            out_h = self.out_h
-            out_w = self.out_w
+    class TestNearestInterpOp(XPUOpTest):
+        def setUp(self):
+            self.place = paddle.XPUPlace(0)
+            self.init_dtype()
 
-        if len(self.input_shape) == 4:
-            output_np = nearest_neighbor_interp_np(
-                input_np, out_h, out_w, scale_h, scale_w, self.out_size,
-                self.actual_shape, self.align_corners, self.data_layout)
-        elif len(self.input_shape) == 5:
-            output_np = nearest_neighbor_interp3d_np(
-                input_np, out_d, out_h, out_w, scale_d, scale_h, scale_w,
-                self.out_size, self.actual_shape, self.align_corners,
-                self.data_layout)
-        self.inputs = {'X': input_np}
-        if self.out_size is not None:
-            self.inputs['OutSize'] = self.out_size
-        if self.actual_shape is not None:
-            self.inputs['OutSize'] = self.actual_shape
-        if len(self.input_shape) == 5:
-            self.attrs = {
-                'out_d': self.out_d,
-                'out_h': self.out_h,
-                'out_w': self.out_w,
-                'interp_method': self.interp_method,
-                'align_corners': self.align_corners,
-                'data_layout': self.data_layout
-            }
-        else:
+            self.out_size = None
+            self.actual_shape = None
+            self.data_layout = 'NCHW'
+
+            self.interp_method = 'nearest'
+            self.scale = 0.
+            self.align_corners = True
+
+            self.init_test_case()
+            self.op_type = "nearest_interp_v2"
+            input_np = np.random.random(self.input_shape).astype(self.dtype)
+
+            # in
+            if self.data_layout == "NCHW" and len(self.input_shape) == 4:
+                in_d = 1
+                in_h = self.input_shape[2]
+                in_w = self.input_shape[3]
+            else:
+                in_d = 1
+                in_h = self.input_shape[1]
+                in_w = self.input_shape[2]
+
+            if self.data_layout == "NCDHW" and len(self.input_shape) == 5:
+                in_d = self.input_shape[2]
+                in_h = self.input_shape[3]
+                in_w = self.input_shape[4]
+            else:
+                in_d = self.input_shape[1]
+                in_h = self.input_shape[2]
+                in_w = self.input_shape[3]
+
+            # scale
+            scale_d = 0
+            scale_h = 0
+            scale_w = 0
+            if self.scale:
+                if isinstance(self.scale, float) or isinstance(self.scale, int):
+                    if self.scale > 0:
+                        scale_d = scale_h = scale_w = float(self.scale)
+                        self.scale = [self.scale]
+                if isinstance(self.scale, list) and len(self.scale) == 1:
+                    scale_d = scale_w = scale_h = self.scale[0]
+                    self.scale = [self.scale[0], self.scale[0]]
+                elif isinstance(self.scale, list) and len(self.scale) > 1:
+                    if len(self.scale) == 5:
+                        scale_w = self.scale[2]
+                        scale_h = self.scale[1]
+                        scale_d = self.scale[0]
+                    else:
+                        scale_w = self.scale[1]
+                        scale_h = self.scale[0]
+
+                out_h = int(in_h * scale_h)
+                out_w = int(in_w * scale_w)
+                out_d = int(in_d * scale_d)
+            else:
+                if len(self.input_shape) == 5:
+                    out_d = self.out_d
+                out_h = self.out_h
+                out_w = self.out_w
+
+            # output_np
+            if len(self.input_shape) == 4:
+                output_np = nearest_neighbor_interp_np(
+                    input_np, out_h, out_w, scale_h, scale_w, self.out_size,
+                    self.actual_shape, self.align_corners, self.data_layout)
+            elif len(self.input_shape) == 5:
+                output_np = nearest_neighbor_interp3d_np(
+                    input_np, out_d, out_h, out_w, scale_d, scale_h, scale_w,
+                    self.out_size, self.actual_shape, self.align_corners,
+                    self.data_layout)
+            self.outputs = {'Out': output_np}
+
+            self.inputs = {'X': input_np}
+            if self.out_size is not None:
+                self.inputs['OutSize'] = self.out_size
+            if self.actual_shape is not None:
+                self.inputs['OutSize'] = self.actual_shape
+
+            if len(self.input_shape) == 5:
+                self.attrs = {
+                    'out_d': self.out_d,
+                    'out_h': self.out_h,
+                    'out_w': self.out_w,
+                    'interp_method': self.interp_method,
+                    'align_corners': self.align_corners,
+                    'data_layout': self.data_layout
+                }
+            else:
+                self.attrs = {
+                    'out_h': self.out_h,
+                    'out_w': self.out_w,
+                    'interp_method': self.interp_method,
+                    'align_corners': self.align_corners,
+                    'data_layout': self.data_layout
+                }
+
+            if self.scale:
+                self.attrs['scale'] = self.scale
+
+        def init_dtype(self):
+            self.dtype = self.in_type
+
+        def test_check_output(self):
+            self.check_output_with_place(self.place)
+
+        def test_check_grad(self):
+            self.check_grad_with_place(self.place, ['X'], 'Out', in_place=True)
+
+        def init_test_case(self):
+            self.input_shape = [2, 3, 4, 5]
+            self.out_h = 2
+            self.out_w = 2
+            self.out_size = np.array([3, 3]).astype("int32")
+
+    """
+    # case copied form gpu but disabled in xpu: not support 5-dim input_shape
+    class TestNearestNeighborInterpCase1(TestNearestInterpOp):
+        def init_test_case(self):
+            self.interp_method = 'nearest'
+            self.input_shape = [4, 1, 1, 7, 8]
+            self.out_d = 1
+            self.out_h = 1
+            self.out_w = 1
+            self.scale = 0.
+            self.align_corners = True
+    """
+
+    class TestNearestNeighborInterpCase2(TestNearestInterpOp):
+        def init_test_case(self):
+            self.input_shape = [3, 3, 9, 6]
+            self.out_h = 12
+            self.out_w = 12
+
+    class TestNearestNeighborInterpCase3(TestNearestInterpOp):
+        def init_test_case(self):
+            self.input_shape = [1, 1, 32, 64]
+            self.out_h = 64
+            self.out_w = 32
+
+    class TestNearestNeighborInterpCase4(TestNearestInterpOp):
+        def init_test_case(self):
+            self.input_shape = [4, 1, 7, 8]
+            self.out_h = 1
+            self.out_w = 1
+            self.out_size = np.array([2, 2]).astype("int32")
+
+    class TestNearestNeighborInterpCase5(TestNearestInterpOp):
+        def init_test_case(self):
+            self.input_shape = [3, 3, 9, 6]
+            self.out_h = 12
+            self.out_w = 12
+            self.out_size = np.array([11, 11]).astype("int32")
+
+    class TestNearestNeighborInterpCase6(TestNearestInterpOp):
+        def init_test_case(self):
+            self.input_shape = [1, 1, 32, 64]
+            self.out_h = 64
+            self.out_w = 32
+            self.out_size = np.array([65, 129]).astype("int32")
+
+    class TestNearestNeighborInterpSame(TestNearestInterpOp):
+        def init_test_case(self):
+            self.input_shape = [2, 3, 32, 64]
+            self.out_h = 32
+            self.out_w = 64
+
+    class TestNearestNeighborInterpActualShape(TestNearestInterpOp):
+        def init_test_case(self):
+            self.input_shape = [3, 2, 32, 16]
+            self.out_h = 64
+            self.out_w = 32
+            self.out_size = np.array([66, 40]).astype("int32")
+
+    """
+    # case copied form gpu but disabled in xpu: not support NHWC data_layout
+    class TestNearestNeighborInterpDataLayout(TestNearestInterpOp):
+        def init_test_case(self):
+            self.interp_method = 'nearest'
+            self.input_shape = [2, 4, 4, 5]
+            self.out_h = 2
+            self.out_w = 2
+            self.scale = 0.
+            self.out_size = np.array([3, 8]).astype("int32")
+            self.align_corners = True
+            self.data_layout = "NHWC"
+    """
+
+    class TestNearestInterpWithoutCorners(TestNearestInterpOp):
+        def set_align_corners(self):
+            self.align_corners = False
+
+    class TestNearestNeighborInterpScale1(TestNearestInterpOp):
+        def init_test_case(self):
+            self.input_shape = [3, 2, 7, 5]
+            self.out_h = 64
+            self.out_w = 32
+            self.scale = 2.
+            self.out_size = np.array([66, 40]).astype("int32")
+
+    class TestNearestNeighborInterpScale2(TestNearestInterpOp):
+        def init_test_case(self):
+            self.input_shape = [3, 2, 5, 7]
+            self.out_h = 64
+            self.out_w = 32
+            self.scale = 1.5
+            self.out_size = np.array([66, 40]).astype("int32")
+
+    class TestNearestNeighborInterpScale3(TestNearestInterpOp):
+        def init_test_case(self):
+            self.input_shape = [3, 2, 7, 5]
+            self.out_h = 64
+            self.out_w = 32
+            self.scale = [2.0, 3.0]
+            self.out_size = np.array([66, 40]).astype("int32")
+
+    """
+    # case copied form gpu but disabled in xpu: not support 5-dim input_shape
+    class TestNearestNeighbor3DInterp(TestNearestInterpOp):
+        def init_test_case(self):
+            self.interp_method = 'nearest'
+            self.input_shape = [3, 2, 4, 7, 5]
+            self.out_d = 8
+            self.out_h = 64
+            self.out_w = 32
+            self.scale = [4.0, 2.0, 3.0]
+            self.out_size = np.array([8, 66, 40]).astype("int32")
+            self.align_corners = True
+    """
+
+    class TestNearestInterpOp_attr_tensor(XPUOpTest):
+        def setUp(self):
+            self.place = paddle.XPUPlace(0)
+            self.init_dtype()
+
+            self.out_size = None
+            self.actual_shape = None
+
+            self.interp_method = 'nearest'
+            self.scale = 0.
+            self.align_corners = True
+
+            self.init_test_case()
+            self.op_type = "nearest_interp_v2"
+            self.shape_by_1Dtensor = False
+            self.scale_by_1Dtensor = False
             self.attrs = {
-                'out_h': self.out_h,
-                'out_w': self.out_w,
                 'interp_method': self.interp_method,
                 'align_corners': self.align_corners,
-                'data_layout': self.data_layout
             }
-        if self.scale:
-            if isinstance(self.scale, float) or isinstance(self.scale, int):
-                if self.scale > 0:
-                    self.scale = [self.scale]
-            if isinstance(self.scale, list) and len(self.scale) == 1:
-                self.scale = [self.scale[0], self.scale[0]]
-            self.attrs['scale'] = self.scale
-        self.outputs = {'Out': output_np}
-
-    def test_check_output(self):
-        if paddle.is_compiled_with_xpu():
-            place = paddle.XPUPlace(0)
-            self.check_output_with_place(place)
-
-    def test_check_grad(self):
-        if paddle.is_compiled_with_xpu():
-            place = paddle.XPUPlace(0)
-            self.check_grad_with_place(place, ['X'], 'Out', in_place=True)
-
-    def init_test_case(self):
-        self.interp_method = 'nearest'
-        self.input_shape = [2, 3, 4, 5]
-        self.out_h = 2
-        self.out_w = 2
-        self.scale = 0.
-        self.out_size = np.array([3, 3]).astype("int32")
-        self.align_corners = True
-
-
-"""
-# case copied form gpu but disabled in xpu: not support 5-dim input_shape
-class TestNearestNeighborInterpCase1(TestNearestInterpOp):
-    def init_test_case(self):
-        self.interp_method = 'nearest'
-        self.input_shape = [4, 1, 1, 7, 8]
-        self.out_d = 1
-        self.out_h = 1
-        self.out_w = 1
-        self.scale = 0.
-        self.align_corners = True
-"""
-
-
-class TestNearestNeighborInterpCase2(TestNearestInterpOp):
-    def init_test_case(self):
-        self.interp_method = 'nearest'
-        self.input_shape = [3, 3, 9, 6]
-        self.out_h = 12
-        self.out_w = 12
-        self.scale = 0.
-        self.align_corners = True
-
-
-class TestNearestNeighborInterpCase3(TestNearestInterpOp):
-    def init_test_case(self):
-        self.interp_method = 'nearest'
-        self.input_shape = [1, 1, 32, 64]
-        self.out_h = 64
-        self.out_w = 32
-        self.scale = 0.
-        self.align_corners = True
-
-
-class TestNearestNeighborInterpCase4(TestNearestInterpOp):
-    def init_test_case(self):
-        self.interp_method = 'nearest'
-        self.input_shape = [4, 1, 7, 8]
-        self.out_h = 1
-        self.out_w = 1
-        self.scale = 0.
-        self.out_size = np.array([2, 2]).astype("int32")
-        self.align_corners = True
-
-
-class TestNearestNeighborInterpCase5(TestNearestInterpOp):
-    def init_test_case(self):
-        self.interp_method = 'nearest'
-        self.input_shape = [3, 3, 9, 6]
-        self.out_h = 12
-        self.out_w = 12
-        self.scale = 0.
-        self.out_size = np.array([11, 11]).astype("int32")
-        self.align_corners = True
-
-
-class TestNearestNeighborInterpCase6(TestNearestInterpOp):
-    def init_test_case(self):
-        self.interp_method = 'nearest'
-        self.input_shape = [1, 1, 32, 64]
-        self.out_h = 64
-        self.out_w = 32
-        self.scale = 0.
-        self.out_size = np.array([65, 129]).astype("int32")
-        self.align_corners = True
-
-
-class TestNearestNeighborInterpSame(TestNearestInterpOp):
-    def init_test_case(self):
-        self.interp_method = 'nearest'
-        self.input_shape = [2, 3, 32, 64]
-        self.out_h = 32
-        self.out_w = 64
-        self.scale = 0.
-        self.align_corners = True
-
-
-class TestNearestNeighborInterpActualShape(TestNearestInterpOp):
-    def init_test_case(self):
-        self.interp_method = 'nearest'
-        self.input_shape = [3, 2, 32, 16]
-        self.out_h = 64
-        self.out_w = 32
-        self.scale = 0.
-        self.out_size = np.array([66, 40]).astype("int32")
-        self.align_corners = True
-
-
-"""
-# case copied form gpu but disabled in xpu: not support NHWC data_layout
-class TestNearestNeighborInterpDataLayout(TestNearestInterpOp):
-    def init_test_case(self):
-        self.interp_method = 'nearest'
-        self.input_shape = [2, 4, 4, 5]
-        self.out_h = 2
-        self.out_w = 2
-        self.scale = 0.
-        self.out_size = np.array([3, 8]).astype("int32")
-        self.align_corners = True
-        self.data_layout = "NHWC"
-"""
-
-
-class TestNearestInterpWithoutCorners(TestNearestInterpOp):
-    def set_align_corners(self):
-        self.align_corners = False
-
-
-class TestNearestNeighborInterpScale1(TestNearestInterpOp):
-    def init_test_case(self):
-        self.interp_method = 'nearest'
-        self.input_shape = [3, 2, 7, 5]
-        self.out_h = 64
-        self.out_w = 32
-        self.scale = 2.
-        self.out_size = np.array([66, 40]).astype("int32")
-        self.align_corners = True
-
-
-class TestNearestNeighborInterpScale2(TestNearestInterpOp):
-    def init_test_case(self):
-        self.interp_method = 'nearest'
-        self.input_shape = [3, 2, 5, 7]
-        self.out_h = 64
-        self.out_w = 32
-        self.scale = 1.5
-        self.out_size = np.array([66, 40]).astype("int32")
-        self.align_corners = True
-
-
-class TestNearestNeighborInterpScale3(TestNearestInterpOp):
-    def init_test_case(self):
-        self.interp_method = 'nearest'
-        self.input_shape = [3, 2, 7, 5]
-        self.out_h = 64
-        self.out_w = 32
-        self.scale = [2.0, 3.0]
-        self.out_size = np.array([66, 40]).astype("int32")
-        self.align_corners = True
-
-
-"""
-# case copied form gpu but disabled in xpu: not support 5-dim input_shape
-class TestNearestNeighbor3DInterp(TestNearestInterpOp):
-    def init_test_case(self):
-        self.interp_method = 'nearest'
-        self.input_shape = [3, 2, 4, 7, 5]
-        self.out_d = 8
-        self.out_h = 64
-        self.out_w = 32
-        self.scale = [4.0, 2.0, 3.0]
-        self.out_size = np.array([8, 66, 40]).astype("int32")
-        self.align_corners = True
-"""
-
-
-class TestNearestInterpOp_attr_tensor(XPUOpTest):
-    def setUp(self):
-        self.use_xpu = True
-        self.out_size = None
-        self.actual_shape = None
-        self.init_test_case()
-        self.op_type = "nearest_interp_v2"
-        self.shape_by_1Dtensor = False
-        self.scale_by_1Dtensor = False
-        self.attrs = {
-            'interp_method': self.interp_method,
-            'align_corners': self.align_corners,
-        }
-
-        input_np = np.random.random(self.input_shape).astype("float32")
-        self.inputs = {'X': input_np}
-
-        if self.scale_by_1Dtensor:
-            self.inputs['Scale'] = np.array([self.scale]).astype("float32")
-        elif self.scale:
-            if isinstance(self.scale, float) or isinstance(self.scale, int):
-                if self.scale > 0:
-                    scale_h = scale_w = float(self.scale)
-            if isinstance(self.scale, list) and len(self.scale) == 1:
-                scale_w = scale_h = self.scale[0]
-            elif isinstance(self.scale, list) and len(self.scale) > 1:
-                scale_w = self.scale[1]
-                scale_h = self.scale[0]
-            out_h = int(self.input_shape[2] * scale_h)
-            out_w = int(self.input_shape[3] * scale_w)
-        else:
-            out_h = self.out_h
-            out_w = self.out_w
-
-        if self.shape_by_1Dtensor:
-            self.inputs['OutSize'] = self.out_size
-        elif self.out_size is not None:
-            size_tensor = []
-            for index, ele in enumerate(self.out_size):
-                size_tensor.append(("x" + str(index), np.ones(
-                    (1)).astype('int32') * ele))
-            self.inputs['SizeTensor'] = size_tensor
-
-        self.attrs['out_h'] = self.out_h
-        self.attrs['out_w'] = self.out_w
-        if self.scale:
-            if isinstance(self.scale, float) or isinstance(self.scale, int):
-                if self.scale > 0:
-                    self.scale = [self.scale]
-            if isinstance(self.scale, list) and len(self.scale) == 1:
-                self.scale = [self.scale[0], self.scale[0]]
-            self.attrs['scale'] = self.scale
-        output_np = nearest_neighbor_interp_np(input_np, out_h, out_w, 0, 0,
-                                               self.out_size, self.actual_shape,
-                                               self.align_corners)
-        self.outputs = {'Out': output_np}
-
-    def test_check_output(self):
-        if paddle.is_compiled_with_xpu():
-            place = paddle.XPUPlace(0)
-            self.check_output_with_place(place)
-
-    def test_check_grad(self):
-        if paddle.is_compiled_with_xpu():
-            place = paddle.XPUPlace(0)
-            self.check_grad_with_place(place, ['X'], 'Out', in_place=True)
-
-    def init_test_case(self):
-        self.interp_method = 'nearest'
-        self.input_shape = [2, 5, 4, 4]
-        self.out_h = 3
-        self.out_w = 3
-        self.scale = 0.
-        self.out_size = [3, 3]
-        self.align_corners = True
-
-
-# out_size is a tensor list
-class TestNearestInterp_attr_tensor_Case1(TestNearestInterpOp_attr_tensor):
-    def init_test_case(self):
-        self.interp_method = 'nearest'
-        self.input_shape = [3, 3, 9, 6]
-        self.out_h = 12
-        self.out_w = 12
-        self.scale = 0.
-        self.out_size = [8, 12]
-        self.align_corners = True
-
-
-# out_size is a 1-D tensor
-class TestNearestInterp_attr_tensor_Case2(TestNearestInterpOp_attr_tensor):
-    def init_test_case(self):
-        self.interp_method = 'nearest'
-        self.input_shape = [3, 2, 32, 16]
-        self.out_h = 64
-        self.out_w = 32
-        self.scale = 0.
-        self.out_size = np.array([66, 40]).astype("int32")
-        self.align_corners = True
-        self.shape_by_1Dtensor = True
-
-
-# scale is a 1-D tensor
-class TestNearestInterp_attr_tensor_Case3(TestNearestInterpOp_attr_tensor):
-    def init_test_case(self):
-        self.interp_method = 'nearest'
-        self.input_shape = [3, 2, 32, 16]
-        self.out_h = 64
-        self.out_w = 32
-        self.scale = 2.0
-        self.out_size = None
-        self.align_corners = True
-        self.scale_by_1Dtensor = True
 
+            input_np = np.random.random(self.input_shape).astype(self.dtype)
+            self.inputs = {'X': input_np}
+
+            if self.scale_by_1Dtensor:
+                self.inputs['Scale'] = np.array([self.scale]).astype("float32")
+            elif self.scale:
+                if isinstance(self.scale, float) or isinstance(self.scale, int):
+                    if self.scale > 0:
+                        scale_h = scale_w = float(self.scale)
+                if isinstance(self.scale, list) and len(self.scale) == 1:
+                    scale_w = scale_h = self.scale[0]
+                elif isinstance(self.scale, list) and len(self.scale) > 1:
+                    scale_w = self.scale[1]
+                    scale_h = self.scale[0]
+                out_h = int(self.input_shape[2] * scale_h)
+                out_w = int(self.input_shape[3] * scale_w)
+            else:
+                out_h = self.out_h
+                out_w = self.out_w
+
+            if self.shape_by_1Dtensor:
+                self.inputs['OutSize'] = self.out_size
+            elif self.out_size is not None:
+                size_tensor = []
+                for index, ele in enumerate(self.out_size):
+                    size_tensor.append(("x" + str(index), np.ones(
+                        (1)).astype('int32') * ele))
+                self.inputs['SizeTensor'] = size_tensor
+
+            self.attrs['out_h'] = self.out_h
+            self.attrs['out_w'] = self.out_w
+            if self.scale:
+                if isinstance(self.scale, float) or isinstance(self.scale, int):
+                    if self.scale > 0:
+                        self.scale = [self.scale]
+                if isinstance(self.scale, list) and len(self.scale) == 1:
+                    self.scale = [self.scale[0], self.scale[0]]
+                self.attrs['scale'] = self.scale
+            output_np = nearest_neighbor_interp_np(
+                input_np, out_h, out_w, 0, 0, self.out_size, self.actual_shape,
+                self.align_corners)
+            self.outputs = {'Out': output_np}
+
+        def init_dtype(self):
+            self.dtype = self.in_type
+
+        def test_check_output(self):
+            self.check_output_with_place(self.place)
+
+        def test_check_grad(self):
+            self.check_grad_with_place(self.place, ['X'], 'Out', in_place=True)
+
+        def init_test_case(self):
+            self.input_shape = [2, 5, 4, 4]
+            self.out_h = 3
+            self.out_w = 3
+            self.out_size = [3, 3]
+
+    # out_size is a tensor list
+    class TestNearestInterp_attr_tensor_Case1(TestNearestInterpOp_attr_tensor):
+        def init_test_case(self):
+            self.input_shape = [3, 3, 9, 6]
+            self.out_h = 12
+            self.out_w = 12
+            self.out_size = [8, 12]
+
+    # out_size is a 1-D tensor
+    class TestNearestInterp_attr_tensor_Case2(TestNearestInterpOp_attr_tensor):
+        def init_test_case(self):
+            self.input_shape = [3, 2, 32, 16]
+            self.out_h = 64
+            self.out_w = 32
+            self.out_size = np.array([66, 40]).astype("int32")
+            self.shape_by_1Dtensor = True
+
+    # scale is a 1-D tensor
+    class TestNearestInterp_attr_tensor_Case3(TestNearestInterpOp_attr_tensor):
+        def init_test_case(self):
+            self.input_shape = [3, 2, 32, 16]
+            self.out_h = 64
+            self.out_w = 32
+            self.scale = 2.0
+            self.out_size = None
+            self.scale_by_1Dtensor = True
+
+
+support_types = get_xpu_op_support_types('nearest_interp_v2')
+for stype in support_types:
+    create_test_class(globals(), XPUNearestInterpOpWrapper, stype)
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_tril_triu_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_tril_triu_op_xpu.py
new file mode 100644
index 0000000000000000000000000000000000000000..785549abba8f3ad732dd49263532fcf453f0517b
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/xpu/test_tril_triu_op_xpu.py
@@ -0,0 +1,143 @@
+#   Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+# # Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at #
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import sys
+sys.path.append("..")
+
+import paddle
+import paddle.fluid.core as core
+import paddle.fluid as fluid
+import paddle.tensor as tensor
+import unittest
+import numpy as np
+from op_test import OpTest
+from op_test_xpu import XPUOpTest
+from paddle.fluid.framework import Program, program_guard
+from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper
+
+paddle.enable_static()
+
+
+class XPUTestTrilTriuOp(XPUOpTestWrapper):
+    def __init__(self):
+        self.op_name = 'tril_triu'
+        self.use_dynamic_create_class = False
+
+    class TestTrilTriuOp(XPUOpTest):
+        def setUp(self):
+            self.init_dtype()
+            self.initTestCase()
+            self.real_op_type = np.random.choice(['triu', 'tril'])
+            self.real_np_op = getattr(np, self.real_op_type)
+            self.set_xpu()
+            self.op_type = "tril_triu"
+            if self.dtype == np.int32:
+                self.X = np.arange(
+                    1, self.get_Xshape_prod() + 1,
+                    dtype=self.dtype).reshape(self.Xshape)
+            else:
+                self.X = np.random.random(self.Xshape).astype(dtype=self.dtype)
+            self.inputs = {'X': self.X}
+            self.attrs = {
+                'diagonal': self.diagonal,
+                'lower': True if self.real_op_type == 'tril' else False,
+            }
+            self.outputs = {
+                'Out': self.real_np_op(self.X, self.diagonal)
+                if self.diagonal else self.real_np_op(self.X)
+            }
+
+        def init_dtype(self):
+            self.dtype = self.in_type
+
+        def get_Xshape_prod(self):
+            ret = 1
+            for v in self.Xshape:
+                ret *= v
+            return ret
+
+        def set_xpu(self):
+            self.__class__.use_xpu = True
+            self.__class__.no_need_check_grad = True
+            self.__class__.op_type = self.real_op_type
+
+        def test_check_output(self):
+            if paddle.is_compiled_with_xpu():
+                place = paddle.XPUPlace(0)
+                self.check_output_with_place(place)
+
+        def initTestCase(self):
+            self.diagonal = None
+            self.Xshape = (10, 10)
+
+    class TestTrilTriuOp1(TestTrilTriuOp):
+        def initTestCase(self):
+            self.diagonal = -3
+            self.Xshape = (5, 5)
+
+    class TestTrilTriuOp2(TestTrilTriuOp):
+        def initTestCase(self):
+            self.diagonal = 4
+            self.Xshape = (11, 17)
+
+    class TestTrilTriuOp3(TestTrilTriuOp):
+        def initTestCase(self):
+            self.diagonal = 10
+            self.Xshape = (25, 25)
+
+    class TestTrilTriuOp4(TestTrilTriuOp):
+        def initTestCase(self):
+            self.diagonal = -10
+            self.Xshape = (33, 11)
+
+    class TestTrilTriuOp5(TestTrilTriuOp):
+        def initTestCase(self):
+            self.diagonal = 11
+            self.Xshape = (1, 99)
+
+
+class TestTrilTriuOpError(unittest.TestCase):
+    def test_errors1(self):
+        paddle.enable_static()
+        data = fluid.data(shape=(20, 22), dtype='float32', name="data1")
+        op_type = np.random.choice(['triu', 'tril'])
+        errmsg = {
+            "diagonal: TypeError":
+            "diagonal in {} must be a python Int".format(op_type),
+        }
+        expected = list(errmsg.keys())[0]
+        with self.assertRaisesRegex(
+                eval(expected.split(':')[-1]), errmsg[expected]):
+            getattr(tensor, op_type)(x=data, diagonal='2022')
+
+    def test_errors2(self):
+        paddle.enable_static()
+        data = fluid.data(shape=(200, ), dtype='float32', name="data2")
+        op_type = np.random.choice(['triu', 'tril'])
+        errmsg = {
+            "input: ValueError":
+            "x shape in {} must be at least 2-D".format(op_type),
+        }
+        expected = list(errmsg.keys())[0]
+        with self.assertRaisesRegex(
+                eval(expected.split(':')[-1]), errmsg[expected]):
+            getattr(tensor, op_type)(x=data, diagonal=[None])
+
+
+support_types = get_xpu_op_support_types('tril_triu')
+for stype in support_types:
+    create_test_class(globals(), XPUTestTrilTriuOp, stype)
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/framework/io.py b/python/paddle/framework/io.py
index 94b8bd29b2c19b84323152c9566d4017ae4772c5..f2d41b5e9b1f079d2ca0956bd4ce6e014ba004d7 100644
--- a/python/paddle/framework/io.py
+++ b/python/paddle/framework/io.py
@@ -46,6 +46,10 @@ def _build_saved_state_dict(state_dict):
             if value.type == core.VarDesc.VarType.VOCAB:
                 save_dict[key] = value.value().get_map_tensor()
             else:
+                if not value.value().get_tensor()._is_initialized():
+                    raise ValueError(
+                        "The saved tensor is not initialized. If you used group sharded, please use save_group_sharded_model."
+                    )
                 save_dict[key] = value.numpy()
             name_table[key] = value.name
         else:
@@ -466,7 +470,9 @@ def _parse_load_result(obj, return_numpy):
 
 def _save_lod_tensor(tensor, file_name):
     if not tensor._is_initialized():
-        raise ValueError("The saved tensor is not initialized.")
+        raise ValueError(
+            "The saved tensor is not initialized. If you used group sharded, please use save_group_sharded_model firstly."
+        )
     if _is_file_path(file_name):
         _seek = core.save_lod_tensor(tensor, file_name)
         # '_seek' is the end position of this tensor in the file.
diff --git a/python/paddle/incubate/multiprocessing/__init__.py b/python/paddle/incubate/multiprocessing/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..27c23be3a89411da702e3071fec4c99186fca4b9
--- /dev/null
+++ b/python/paddle/incubate/multiprocessing/__init__.py
@@ -0,0 +1,27 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .reductions import init_reductions
+import multiprocessing
+
+__all__ = []
+
+from multiprocessing import *  # noqa: F403
+
+__all__ += multiprocessing.__all__  # type: ignore[attr-defined]
+
+# Only support linux for now
+# Only support file_system sharing strategy.
+
+init_reductions()
diff --git a/python/paddle/incubate/multiprocessing/reductions.py b/python/paddle/incubate/multiprocessing/reductions.py
new file mode 100644
index 0000000000000000000000000000000000000000..cfbc55afd3bca87aa279c7aa251aa23671b1a317
--- /dev/null
+++ b/python/paddle/incubate/multiprocessing/reductions.py
@@ -0,0 +1,189 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+
+# TODO: check the hooks of tensor
+# TODO: check serializing named tensor
+# TODO: check influence on autograd
+import os
+import sys
+import warnings
+import math
+import copy
+import threading
+import multiprocessing
+from multiprocessing.util import register_after_fork
+from multiprocessing.reduction import ForkingPickler
+
+from collections import OrderedDict
+
+
+def _supported_check():
+    if sys.platform != "linux":
+        # warnings.warn("`paddle.multiprocessing` only support linux for now, "
+        #               " import this will not take any effect !")
+
+        return False
+
+    if not sys.version_info >= (3, 4):
+        warnings.warn("Use `paddle.multiprocessing` to share paddle tensor "
+                      "requires python version greater than 3.4 ."
+                      " `paddle.multiprocessing` will not take any effect !!!")
+        return False
+
+    return True
+
+
+class LRUSharedCache(OrderedDict):
+    def __init__(self):
+        self.limit = 128
+        self._after_fork()
+        register_after_fork(self, LRUSharedCache._after_fork)
+
+    def _after_fork(self):
+        self.lock = threading.Lock()
+
+    def get(self, key):
+        with self.lock:
+            try:
+                value = super().pop(key)
+                super().__setitem__(key, value)
+                return value
+            except KeyError:
+                return None
+
+    def __setitem__(self, key, value):
+        with self.lock:
+            try:
+                super().__delitem__(key)
+            except KeyError:
+                if len(self) >= self.limit:
+                    super().popitem(last=False)
+            super().__setitem__(key, value)
+
+
+shared_cache = LRUSharedCache()
+
+
+def cuda_from_cache(key):
+    lodtensor = shared_cache.get(key)
+    if lodtensor is None:
+        return None
+    return lodtensor
+
+
+def rebuild_tensor(cls, lodtensor, metadata):
+    if cls == paddle.fluid.framework.ParamBase:
+        tensor = paddle.fluid.framework.ParamBase(lodtensor.shape(),
+                                                  lodtensor._dtype(),
+                                                  **metadata)
+        tensor.value().get_tensor()._share_data_with(lodtensor)
+    else:
+        size, stop_gradient = metadata
+        tensor = paddle.fluid.core.VarBase()
+        if lodtensor._is_initialized():
+            tensor.value().get_tensor()._share_data_with(lodtensor)
+        else:
+            tensor = paddle.to_tensor([], dtype=lodtensor._dtype())
+        tensor.stop_gradient = stop_gradient
+    return tensor
+
+
+def reduce_tensor(tensor):
+    lodtensor = tensor.value().get_tensor()
+
+    if not tensor.stop_gradient and not tensor.is_leaf:
+        raise RuntimeError(
+            "Refusing to serialize non-leaf tensor which not stop_gradient, you can detach it!"
+        )
+    # TODO: add serializing name and  hooks check
+    if tensor.place.is_cpu_place() or tensor.place.is_gpu_place(
+    ) or tensor.place.is_cuda_pinned_place():
+        if type(tensor) == paddle.fluid.framework.ParamBase:
+            metadata = copy.deepcopy(tensor.__dict__)
+        else:
+            metadata = (tensor.size, tensor.stop_gradient)
+
+        return (rebuild_tensor, (type(tensor), lodtensor, metadata))
+    else:
+        raise ValueError(
+            "Only support tensors of CPU/CUDA/CUDAPinned Place, Not support %s for now!"
+            % tensor.place)
+
+
+def rebuild_lodtensor_filename(cls, ipc_name, size, type_idx, dims, lod):
+    lodtensor = cls._new_shared_filename((ipc_name, size, type_idx, dims, lod))
+    lodtensor._shared_decref()
+    return lodtensor
+
+
+def rebuild_cuda_tensor(cls, handle, offset_bytes, size, type_idx, dims, lod,
+                        device_idx):
+    cache_tensor = cuda_from_cache((handle, offset_bytes))
+    if cache_tensor is None:
+        lodtensor = cls._new_shared_cuda(
+            (handle, offset_bytes, size, type_idx, dims, lod, device_idx))
+        # We only cache cuda shared tensor here.
+        # The opening cost of cudaIpcMemoryHandle is very high.
+        # Since we cache the recived tensor directly,
+        # The sender may reallocate the tensor space,
+        # you should manualy maintian the lifecycle of ipc tensor
+        shared_cache[(handle, offset_bytes)] = lodtensor
+    else:
+        lodtensor = paddle.fluid.core.LoDTensor()
+        lodtensor._share_buffer_with(cache_tensor,
+                                     (size, type_idx, dims, lod, device_idx))
+
+    return lodtensor
+
+
+def rebuild_lodtensor_empty(cls):
+    #TODO: check if tensor initialized
+    #TODO: handle the dtype of empty tensor
+    return cls()
+
+
+def reduce_lodtensor(lodtensor):
+    if lodtensor._place().is_cpu_place() or lodtensor._place(
+    ).is_cuda_pinned_place():
+        for dim in lodtensor.shape():
+            if dim == 0:
+                # Empty tensors have nothing be mmapped.
+                return (rebuild_lodtensor_empty, (type(lodtensor), ))
+
+        # Default use share filename stratege
+        metadata = lodtensor._share_filename(
+        )  # ipc_name, size, type_idx, dims, lod
+        rebuild = rebuild_lodtensor_filename
+        lodtensor._shared_incref()
+        # TODO, maintain reference for lodtensor
+        # TODO: support file_discriptor stratege
+    elif lodtensor._place().is_gpu_place():
+        metadata = lodtensor._share_cuda()
+        rebuild = rebuild_cuda_tensor
+    else:
+        raise RuntimeError("We only support pass cpu/gpu lodtensor for now!")
+
+    return (rebuild, (type(lodtensor), ) + metadata)
+
+
+def init_reductions():
+    if not _supported_check():
+        return
+
+    ForkingPickler.register(paddle.Tensor, reduce_tensor)
+    ForkingPickler.register(paddle.fluid.core.VarBase, reduce_tensor)
+    ForkingPickler.register(paddle.fluid.framework.ParamBase, reduce_tensor)
+    ForkingPickler.register(paddle.fluid.core.LoDTensor, reduce_lodtensor)
diff --git a/python/paddle/incubate/nn/functional/fused_transformer.py b/python/paddle/incubate/nn/functional/fused_transformer.py
index d600cda8454cc696579df7fa7f6e6f4d6ae12600..457422ae3a4d602a6782d0949030fb3b3fb797c2 100644
--- a/python/paddle/incubate/nn/functional/fused_transformer.py
+++ b/python/paddle/incubate/nn/functional/fused_transformer.py
@@ -223,12 +223,14 @@ def fused_multi_head_attention(x,
                                pre_ln_epsilon=1e-05,
                                qkv_bias=None,
                                linear_bias=None,
+                               cache_kv=None,
                                attn_mask=None,
                                dropout_rate=0.5,
                                attn_dropout_rate=0.5,
                                ln_epsilon=1e-05,
                                training=True,
                                mode='upscale_in_train',
+                               ring_id=-1,
                                name=None):
     r"""
     Attention mapps queries and a set of key-value pairs to outputs, and
@@ -242,8 +244,8 @@ def fused_multi_head_attention(x,
     	    out = layer_norm(x)
             out = linear(out) + qkv) + bias
     	else:
-	    out = linear(x) + bias
-    	out = transpose(out, perm=[2, 0, 3, 1, 4])
+            out = linear(x) + bias
+            out = transpose(out, perm=[2, 0, 3, 1, 4])
     	# extract q, k and v from out.
     	q = out[0:1,::]
     	k = out[1:2,::]
@@ -257,8 +259,8 @@ def fused_multi_head_attention(x,
     	out = out_linear(out)
     	if pre_layer_norm:
     	    out = x + dropout(linear_bias + out)
-	else:
-    	    out = layer_norm(x + dropout(linear_bias + out))
+        else:
+            out = layer_norm(x + dropout(linear_bias + out))
 
     Parameters:
         x (Tensor): The input tensor of fused_multi_head_attention. The shape is
@@ -276,6 +278,7 @@ def fused_multi_head_attention(x,
         qkv_bias (Tensor, optional): The bias of qkv computation. The shape is `[3, num_head, dim_head]`.
             Default None.
         linear_bias (Tensor, optional): The bias of linear. The shape is `[embed_dim]`. Default None.
+        cache_kv (Tensor, optional): For generation model, cache structure. The shape is `[2, bsz, num_head, seq_len, head_dim]`. Default None.
         attn_mask (Tensor, optional):  A tensor used in multi-head attention to prevents attention to
  	    some unwanted positions, usually the paddings or the subsequent positions. It is a tensor
             with shape broadcasted to `[batch_size, n_head, sequence_length, sequence_length]`. When the
@@ -303,6 +306,7 @@ def fused_multi_head_attention(x,
 
                                   - train: out = input * mask
                                   - inference: out = input * (1.0 - p)
+        ring_id (int, optional): For distributed forward in mp, only support NCCL and forward. Default is -1, means not using mp
         name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
 
     Returns:
@@ -333,7 +337,7 @@ def fused_multi_head_attention(x,
             output = F.fused_multi_head_attention(
                 x, qkv_weight, linear_weight, False,
                 None, None, None, None, 1e-5, qkv_bias,
-                linear_bias, attn_mask)
+                linear_bias, None, attn_mask)
             # [2, 4, 128]
             print(output.shape)
     """
@@ -359,17 +363,20 @@ def fused_multi_head_attention(x,
         assert qkv_weight.shape[1] * qkv_weight.shape[2] == qkv_weight.shape[
             3], "embed_dim must be divisible by num_heads."
 
-        _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, final_out = _C_ops.fused_attention(
-            x, pre_ln_scale, pre_ln_bias, qkv_weight, qkv_bias, attn_mask,
-            linear_weight, linear_bias, ln_scale, ln_bias, 'pre_layer_norm',
-            pre_layer_norm, 'epsilon', pre_ln_epsilon, 'dropout_rate',
-            dropout_rate, 'attn_dropout_rate', attn_dropout_rate, 'ln_epsilon',
-            ln_epsilon, 'attn_dropout_is_test', not training, 'dropout_is_test',
-            not training, 'attn_dropout_fix_seed', seed is not None,
-            'dropout_fix_seed', seed is not None, 'attn_dropout_seed', seed
+        _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, cache_kv_out, final_out = _C_ops.fused_attention(
+            x, pre_ln_scale, pre_ln_bias, qkv_weight, qkv_bias, cache_kv,
+            attn_mask, linear_weight, linear_bias, ln_scale, ln_bias,
+            'pre_layer_norm', pre_layer_norm, 'epsilon', pre_ln_epsilon,
+            'dropout_rate', dropout_rate, 'attn_dropout_rate',
+            attn_dropout_rate, 'ln_epsilon', ln_epsilon, 'attn_dropout_is_test',
+            not training, 'dropout_is_test', not training,
+            'attn_dropout_fix_seed', seed is not None, 'dropout_fix_seed',
+            seed is not None, 'attn_dropout_seed', seed
             if seed is not None else 0, 'dropout_seed', seed
             if seed is not None else 0, 'attn_dropout_implementation', mode,
-            'dropout_implementation', mode)
+            'dropout_implementation', mode, 'ring_id', ring_id)
+        if cache_kv is not None:
+            return final_out, cache_kv_out
         return final_out
     else:
         helper = LayerHelper('fused_multi_head_attention', **locals())
@@ -398,6 +405,7 @@ def fused_multi_head_attention(x,
             inputs['Ln2Scale'] = [ln_scale]
         if ln_bias:
             inputs['Ln2Bias'] = [ln_bias]
+        if cache_kv: inputs['CacheKV'] = [cache_kv]
 
         if (seed is None or seed == 0) and helper.main_program.random_seed != 0:
             seed = helper.main_program.random_seed
@@ -417,6 +425,7 @@ def fused_multi_head_attention(x,
             'dropout_seed': seed if seed is not None else 0,
             'attn_dropout_implementation': mode,
             'dropout_implementation': mode,
+            'ring_id': ring_id
         }
 
         # set outputs
@@ -449,6 +458,7 @@ def fused_multi_head_attention(x,
         bias_dropout_residual_out = helper.create_variable_for_type_inference(
             dtype=dtype)
         final_out = helper.create_variable_for_type_inference(dtype=dtype)
+        cache_kv_out = helper.create_variable_for_type_inference(dtype=dtype)
 
         helper.append_op(
             type='fused_attention',
@@ -472,7 +482,9 @@ def fused_multi_head_attention(x,
                 "Ln2Mean": ln_mean_out,
                 "Ln2Variance": ln_variance_out,
                 "BiasDropoutResidualOut": bias_dropout_residual_out,
-                'Y': final_out
+                'Y': final_out,
+                'CacheKVOut': cache_kv_out
             },
             attrs=attrs)
-        return final_out
+
+        return (final_out, cache_kv_out) if cache_kv else final_out
diff --git a/python/paddle/nn/functional/common.py b/python/paddle/nn/functional/common.py
index ed668ed124c2318c88cde2bd4acb31ce2b2e4f7c..9e78ca6be3f2749e43963f63cdb8b6983f651697 100644
--- a/python/paddle/nn/functional/common.py
+++ b/python/paddle/nn/functional/common.py
@@ -351,7 +351,6 @@ def interpolate(x,
 
     out_shape = size
     scale = scale_factor
-
     if out_shape is not None and scale is not None:
         raise ValueError("Only one of size or scale_factor should be defined.")
     if out_shape is not None:
@@ -362,6 +361,8 @@ def interpolate(x,
             if in_dynamic_mode():
                 if isinstance(out_shape, Variable):
                     out_shape = list(out_shape.numpy())
+                else:
+                    out_shape = list(out_shape)
                 for i, dim in enumerate(out_shape):
                     if isinstance(dim, Variable):
                         out_shape[i] = dim.numpy()[0]
@@ -1818,7 +1819,6 @@ def fold(x,
     can be calculated as following.
 
     .. math::
-
         H_out &= output_size[0]
         W_out &= output_size[1]
         C_out &= C_in / kernel\_sizes[0] / kernel\_sizes[1]
@@ -1826,21 +1826,21 @@ def fold(x,
     Parameters:
         x(Tensor):                3-D Tensor, input tensor of format [N, C, L],
                                   data type can be float32 or float64
-        output_sizes(list):       The size of output size, should be [output_size_h, output_size_w]
+        output_sizes(int|list|tuple):       The size of output size, should be [output_size_h, output_size_w]
                                   or an interger o treated as [o, o].
-        kernel_sizes(int|list):   The size of convolution kernel, should be [k_h, k_w]
+        kernel_sizes(int|list|tuple):   The size of convolution kernel, should be [k_h, k_w]
                                   or an integer k treated as [k, k].
-        strides(int|list):        The strides, should be [stride_h, stride_w]
+        strides(int|list|tuple):        The strides, should be [stride_h, stride_w]
                                   or an integer stride treated as [sride, stride].
                                   For default, strides will be [1, 1].
-        paddings(int|list):       The paddings of each dimension, should be
+        paddings(int|list|tuple):       The paddings of each dimension, should be
                                   [padding_top, padding_left, padding_bottom, padding_right]
                                   or [padding_h, padding_w] or an integer padding.
                                   If [padding_h, padding_w] was given, it will expanded to
                                   [padding_h, padding_w, padding_h, padding_w]. If an integer
                                   padding was given, [padding, padding, padding, padding] will
                                   be used. For default, paddings will be [0, 0, 0, 0]
-        dilations(int|list):      the dilations of convolution kernel, should be
+        dilations(int|list|tuple):      the dilations of convolution kernel, should be
                                   [dilation_h, dilation_w], or an integer dilation treated as
                                   [dilation, dilation]. For default, it will be [1, 1].
         name(str, optional): The default value is None.
@@ -1859,9 +1859,9 @@ def fold(x,
             import paddle
             import paddle.nn.functional as F
 
-            x = paddle.randn([2,12,9])
-            y = F.fold(x, output_sizes=(4, 4), kernel_sizes=2)
-            # y.shape = [2,3,4,4]
+            x = paddle.randn([2,3*2*2,12])
+            y = F.fold(x, output_sizes=[4, 5], kernel_sizes=2)
+            # y.shape = [2,3,4,5]
 
     """
 
@@ -1872,29 +1872,32 @@ def fold(x,
     assert len(x.shape) == 3, \
             "input should be the format of [N, C, L]"
 
+    def _is_list_or_turple_(data):
+        return (isinstance(data, list) or isinstance(data, tuple))
+
     if isinstance(output_sizes, int):
         output_sizes = [output_sizes, output_sizes]
     else:
-        assert isinstance(output_sizes, list) and (len(output_sizes) == 2), \
-            "output_sizes should either be an integer or a list of two integers"
+        assert _is_list_or_turple_(output_sizes) and (len(output_sizes) == 2), \
+            "output_sizes should either be an integer or a list/tuple of two integers"
 
     if isinstance(kernel_sizes, int):
         kernel_sizes = [kernel_sizes, kernel_sizes]
     else:
-        assert isinstance(kernel_sizes, list) and (len(kernel_sizes) == 2), \
-            "kernel_sizes should either be an integer or a list of two integers"
+        assert _is_list_or_turple_(kernel_sizes) and (len(kernel_sizes) == 2), \
+            "kernel_sizes should either be an integer or a list/tuple of two integers"
 
     if isinstance(strides, int):
         strides = [strides, strides]
     else:
-        assert isinstance(strides, list) and (len(strides) == 2), \
-            "strides should either be an integer or a list of two integers"
+        assert _is_list_or_turple_(strides) and (len(strides) == 2), \
+            "strides should either be an integer or a list/tuple of two integers"
 
     if isinstance(dilations, int):
         dilations = [dilations, dilations]
     else:
-        assert isinstance(dilations, list) and (len(dilations) == 2), \
-            "dilations should either be an integer or a list of two integers"
+        assert _is_list_or_turple_(dilations) and (len(dilations) == 2), \
+            "dilations should either be an integer or a list/tuple of two integers"
 
     if isinstance(paddings, int):
         paddings = [paddings] * 4
@@ -1912,16 +1915,21 @@ def fold(x,
             "Unexpected type of paddings, it should be either an integer or a list"
             "of 2 or 4 integers")
 
-    out = helper.create_variable_for_type_inference(dtype=x.dtype)
-    helper.append_op(
-        type="fold",
-        inputs={"X": x},
-        outputs={"Y": out},
-        attrs={
-            "output_sizes": output_sizes,
-            "kernel_sizes": kernel_sizes,
-            "strides": strides,
-            "paddings": paddings,
-            "dilations": dilations
-        })
+    if in_dynamic_mode():
+        out = _C_ops.fold(x, "output_sizes", output_sizes, "kernel_sizes",
+                          kernel_sizes, "strides", strides, "paddings",
+                          paddings, "dilations", dilations)
+    else:
+        out = helper.create_variable_for_type_inference(dtype=x.dtype)
+        helper.append_op(
+            type="fold",
+            inputs={"X": x},
+            outputs={"Y": out},
+            attrs={
+                "output_sizes": output_sizes,
+                "kernel_sizes": kernel_sizes,
+                "strides": strides,
+                "paddings": paddings,
+                "dilations": dilations
+            })
     return out
diff --git a/python/paddle/nn/functional/input.py b/python/paddle/nn/functional/input.py
index de8a7ff6d3c7b6cd87d6301f2cd0bb7af119a74d..4c30ed03735f26b6df77c6a8f5b32391972738e5 100644
--- a/python/paddle/nn/functional/input.py
+++ b/python/paddle/nn/functional/input.py
@@ -19,6 +19,7 @@ from ...fluid.layer_helper import LayerHelper
 from ...fluid.data_feeder import check_variable_and_dtype, check_dtype
 from paddle import _C_ops
 from paddle import in_dynamic_mode
+from paddle.framework import _in_eager_mode
 
 __all__ = []
 
@@ -87,6 +88,8 @@ def one_hot(x, num_classes, name=None):
     """
 
     if in_dynamic_mode():
+        if _in_eager_mode():
+            return _C_ops.final_state_one_hot(x, num_classes)
         return _C_ops.one_hot_v2(x, 'depth', num_classes, 'allow_out_of_range',
                                  False)
     else:
diff --git a/python/paddle/nn/layer/common.py b/python/paddle/nn/layer/common.py
index 19fbcd5b6f85691e57530a442d9f72ce7935692d..dac4cf5f2725333952d3710df3c5629d6566197f 100644
--- a/python/paddle/nn/layer/common.py
+++ b/python/paddle/nn/layer/common.py
@@ -1565,7 +1565,6 @@ class Fold(Layer):
     can be calculated as following.
 
     .. math::
-
         H_out &= output_size[0]
         W_out &= output_size[1]
         C_out &= C_in / kernel\_sizes[0] / kernel\_sizes[1]
@@ -1573,19 +1572,19 @@ class Fold(Layer):
     Parameters:
         output_sizes(list):       The size of output size, should be [output_size_h, output_size_w]
                                   or an interger o treated as [o, o].
-        kernel_sizes(int|list):   The size of convolution kernel, should be [k_h, k_w]
+        kernel_sizes(int|list|tuple):   The size of convolution kernel, should be [k_h, k_w]
                                   or an integer k treated as [k, k].
-        strides(int|list):        The strides, should be [stride_h, stride_w]
+        strides(int|list|tuple):        The strides, should be [stride_h, stride_w]
                                   or an integer stride treated as [sride, stride].
                                   For default, strides will be [1, 1].
-        paddings(int|list):       The paddings of each dimension, should be
+        paddings(int|list|tuple):       The paddings of each dimension, should be
                                   [padding_top, padding_left, padding_bottom, padding_right]
                                   or [padding_h, padding_w] or an integer padding.
                                   If [padding_h, padding_w] was given, it will expanded to
                                   [padding_h, padding_w, padding_h, padding_w]. If an integer
                                   padding was given, [padding, padding, padding, padding] will
                                   be used. For default, paddings will be [0, 0, 0, 0]
-        dilations(int|list):      the dilations of convolution kernel, should be
+        dilations(int|list|tuple):      the dilations of convolution kernel, should be
                                   [dilation_h, dilation_w], or an integer dilation treated as
                                   [dilation, dilation]. For default, it will be [1, 1].
         name(str, optional): The default value is None.
@@ -1604,10 +1603,10 @@ class Fold(Layer):
             import paddle
             import paddle.nn as nn
 
-            x = paddle.randn([2,12,9])
-            fold = nn.Fold(output_sizes=(4, 4), kernel_sizes=2)
+            x = paddle.randn([2,3*2*2,12])
+            fold = nn.Fold(output_sizes=[4, 5], kernel_sizes=2)
             y = fold(x)
-            # y.shape = [2,3,4,4]
+            # y.shape = [2,3,4,5]
    """
 
     def __init__(self,
diff --git a/python/paddle/optimizer/optimizer.py b/python/paddle/optimizer/optimizer.py
index 47dc02705f80bee3ce614846a82c7e44140247b1..96f35eb9d27ec86baa9a7311a4a85a217a7499b8 100644
--- a/python/paddle/optimizer/optimizer.py
+++ b/python/paddle/optimizer/optimizer.py
@@ -42,6 +42,7 @@ from .. import compat as cpt
 from .lr import LRScheduler
 import copy
 from paddle import _C_ops
+from paddle.fluid.framework import _in_eager_mode
 
 __all__ = []
 
@@ -1108,7 +1109,13 @@ class Optimizer(object):
                 for p in param_group['params']:
                     if not p.stop_gradient:
                         param_list.append(p)
-        core.clear_gradients(param_list, set_to_zero)
+
+        if _in_eager_mode():
+            for p in param_list:
+                clear_func = p._zero_grads if set_to_zero else p.clear_gradient
+                clear_func()
+        else:
+            core.clear_gradients(param_list, set_to_zero)
 
     @imperative_base.no_grad
     def minimize(self,
diff --git a/python/paddle/profiler/__init__.py b/python/paddle/profiler/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..4999e703f2a5a31be2cd5c20b70bc7b9dfb7e60a
--- /dev/null
+++ b/python/paddle/profiler/__init__.py
@@ -0,0 +1,26 @@
+#   Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .profiler import ProfilerState, ProfilerTarget
+from .profiler import make_scheduler, export_chrome_tracing, export_protobuf
+from .profiler import Profiler
+from .profiler import TracerEventType
+from .utils import RecordEvent, load_profiler_result
+from .profiler_statistic import SortedKeys
+
+__all__ = [
+    'ProfilerState', 'ProfilerTarget', 'TracerEventType', 'make_scheduler',
+    'export_chrome_tracing', 'export_protobuf', 'Profiler', 'RecordEvent',
+    'load_profiler_result', 'SortedKeys'
+]
diff --git a/python/paddle/profiler/profiler.py b/python/paddle/profiler/profiler.py
new file mode 100644
index 0000000000000000000000000000000000000000..dc637bf983046b8025962257744b0e1bb4763b4b
--- /dev/null
+++ b/python/paddle/profiler/profiler.py
@@ -0,0 +1,469 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import socket
+import datetime
+from enum import Enum
+from typing import Any, Callable, Iterable, Optional, Union
+from warnings import warn
+
+import paddle
+from paddle.fluid.core import (_Profiler, _ProfilerResult, ProfilerOptions,
+                               TracerEventType)
+
+from .utils import RecordEvent, wrap_optimizers
+from .profiler_statistic import SortedKeys
+
+
+class ProfilerState(Enum):
+    r"""
+    Profiler state that can be specified to control profiler action.
+
+    CLOSED: The profilers are closed.
+    READY:  The profilers are open, but the data will not be recorded.
+            This state is used for reducing overhead influence when profilers start.
+    RECORD: The profilers are open, and the data will be recorded.
+    RECORD_AND_RETURN: The profilers are open, and at the last batch of current profiler period, 
+            the collected data will be returned.
+    """
+    CLOSED = 0
+    READY = 1
+    RECORD = 2
+    RECORD_AND_RETURN = 3  # the last step of RECORD 
+
+
+class ProfilerTarget(Enum):
+    r"""
+    Target device for profiling.
+    """
+    CPU = 0
+    GPU = 1
+
+
+def make_scheduler(*,
+                   closed: int,
+                   ready: int,
+                   record: int,
+                   repeat: int=0,
+                   skip_first: int=0) -> Callable:
+    r"""
+    Return a scheduler function, which scheduler the state according to the setting.
+    The state transform confirms to:
+
+    (CLOSED)  (CLOSED)    (CLOSED)  (READY)    (RECORD,last RETURN)      (CLOSED)
+    START -> skip_first -> closed -> ready    ->    record       ->      END
+                            |                        |
+                            |                        | (if has_repeated < repeat)
+                            - - - - - - - - - - - -
+    Note that repeat <= 0 means the cycle will continue until the profiler exits.    
+
+    Parameters:
+        closed(int): The number of steps in state ProfilerState.CLOSED.
+        ready(int):  The number of steps in state ProfilerState.READY. 
+        record(int): The number of steps in state ProfilerState.RECORD.    
+        repeat(int): The number of cycles to repeat above state transform.
+        skip_first(int): The number of first steps to drop, not participate in the state transform.
+
+    Returns:
+        A scheduler function, conforms to above state transform setting.
+
+    Examples:
+        1. profiling range [2, 5]
+        batch 0: closed, batch 1: ready, batch [2, 5] record
+        .. code-block:: python
+        make_scheduler(closed=1, ready=1, record=4, repeat=1)
+        2. profiling range [3,6], [9,12], [15,18]...
+        batch 0: skiped, batch 1: closed, batch 2: ready, batch [3,6]: record, repeat
+        .. code-block:: python
+        make_scheduler(closed=1, ready=1, record=4, skip_first=1)
+    """
+
+    def getScheduleState(step: int) -> ProfilerState:
+        assert step >= 0
+        if step < skip_first:  # within skip_first, just skip
+            return ProfilerState.CLOSED
+        step = step - skip_first
+        period_steps = closed + ready + record
+        has_repeated = step // period_steps
+        if repeat > 0 and has_repeated >= repeat:  # the period has repeated repeat times, return CLOSED state
+            return ProfilerState.CLOSED
+        mod_step = step % period_steps
+        if mod_step < closed:
+            return ProfilerState.CLOSED
+        elif mod_step >= closed and mod_step < closed + ready:
+            return ProfilerState.READY
+        else:
+            if mod_step < period_steps - 1:
+                return ProfilerState.RECORD
+            else:
+                return ProfilerState.RECORD_AND_RETURN
+    assert closed >= 0 and ready >= 0 and record > 0 and \
+             repeat >= 0 and skip_first >= 0, "Invalid profiler scheduler arguments"
+    if ready == 0:
+        warn("Profiler will record data after enabling profiler immediately, \
+          some data collected at the beginning of profiling may be 'noisy' because of overhead."
+             )
+    return getScheduleState
+
+
+def _default_state_scheduler(step: int):
+    r"""
+    A default state scheduler, keep recording from the begining of the profiler until ending.
+    """
+    return ProfilerState.RECORD
+
+
+def export_chrome_tracing(dir_name: str,
+                          worker_name: Optional[str]=None) -> Callable:
+    r"""
+    Return a callable, used for outputing tracing data to chrome tracing format file.
+    The output file will be saved in directory 'dir_name', and file name will be set as worker_name.
+    if worker_name is not set, the default name is [hostname]_[pid].
+
+    Parameters:
+        dir_name(str): Directory to save profiling data.
+        worker_name(Optional[str]): Prefix of the file name saved, default is [hostname]_[pid].
+
+    Examples:
+        .. code-block:: python
+        import paddle.profiler as profiler
+        with profiler.Profiler(targets=[profiler.ProfilerTarget.CPU,
+                                        profiler.ProfilerTarget.GPU],
+                            scheduler = (3, 10),
+                            on_trace_ready = profiler.export_chrome_tracing('./log')
+                            ) as p:
+            for iter in range(N):
+            train()
+            p.step()
+    """
+    if not os.path.exists(dir_name):
+        try:
+            os.makedirs(dir_name, exist_ok=True)
+        except Exception:
+            raise RuntimeError(
+                "Can not create directory '{}' for saving profiling results.".
+                format(dir_name))
+
+    def handle_fn(prof):
+        nonlocal worker_name
+        if not worker_name:
+            worker_name = "host_{}pid_{}".format(socket.gethostname(),
+                                                 str(os.getpid()))
+        now = datetime.datetime.now()
+        filename = '{}_time_{}.paddle_trace.json'.format(
+            worker_name, now.strftime('%Y_%m_%d_%H_%M_%S_%f'))
+        prof.export(os.path.join(dir_name, filename), "json")
+
+    return handle_fn
+
+
+def export_protobuf(dir_name: str, worker_name: Optional[str]=None) -> Callable:
+    r"""
+    Return a callable, used for outputing tracing data to protobuf file.
+    The output file will be saved in directory 'dir_name', and file name will be set as worker_name.
+    if worker_name is not set, the default name is [hostname]_[pid].
+
+    Parameters:
+        dir_name(str): Directory to save profiling data.
+        worker_name(Optional[str]): Prefix of the file name saved, default is [hostname]_[pid].
+
+    Examples:
+        .. code-block:: python
+        import paddle.profiler as profiler
+        with profiler.Profiler(targets=[profiler.ProfilerTarget.CPU,
+                                        profiler.ProfilerTarget.GPU],
+                            scheduler = (3, 10),
+                            on_trace_ready = profiler.export_protobuf('./log')
+                            ) as p:
+            for iter in range(N):
+            train()
+            p.step()
+    """
+    if not os.path.exists(dir_name):
+        try:
+            os.makedirs(dir_name, exist_ok=True)
+        except Exception:
+            raise RuntimeError(
+                "Can not create directory '{}' for saving profiling results.".
+                format(dir_name))
+
+    def handle_fn(prof):
+        nonlocal worker_name
+        if not worker_name:
+            worker_name = "host_{}pid_{}".format(socket.gethostname(),
+                                                 str(os.getpid()))
+        now = datetime.datetime.now()
+        filename = '{}_time_{}.paddle_trace.pb'.format(
+            worker_name, now.strftime('%Y_%m_%d_%H_%M_%S_%f'))
+        prof.export(os.path.join(dir_name, filename), "pb")
+
+    return handle_fn
+
+
+def _get_supported_targets() -> Iterable[ProfilerTarget]:
+    r"""
+    Get the current supported profiler target in the system.
+    """
+    if paddle.device.is_compiled_with_cuda():
+        return [ProfilerTarget.CPU, ProfilerTarget.GPU]
+    return [ProfilerTarget.CPU]
+
+
+class Profiler:
+    r"""
+    Profiler context manager, user interface to manage profile process.
+
+    Parameters:
+        targets (iterable): list of tracing targets, currently supported values:
+        ``paddle.profiler.ProfilerTarget.CPU``,
+        ``paddle.profiler.ProfilerTarget.GPU``.
+        scheduler (callable or tuple): If it is a callable object, it takes a step number as parameter and return the corresponding ``ProfilerState``. 
+            If not provided, the default sheduler will keep tracing until the profiler exits. If it is a tuple, it has two values start_batch and end_batch,
+            which means profiling range [start_batch, end_batch).
+        on_trace_ready (callable): callable object, takes the Profiler object as parameter, which provides a way for users to do post-processing.
+            This callable object will be called when ``sheduler`` returns ``ProfilerState.RECORD_AND_RETURN``.
+            
+    Examples:
+        1. profiling range [2, 5)
+        .. code-block:: python
+        import paddle.profiler as profiler
+        with profiler.Profiler(targets=[profiler.ProfilerTarget.CPU,
+                                        profiler.ProfilerTarget.GPU],
+                            scheduler = (2, 5),
+                            on_trace_ready = profiler.export_chrome_tracing('./log')
+                            ) as p:
+            for iter in range(N):
+            train()
+            p.step()
+        2. profiling range [2,4], [7, 9], [11,13]
+        .. code-block:: python
+        import paddle.profiler as profiler
+        with profiler.Profiler(targets=[profiler.ProfilerTarget.CPU,
+                                        profiler.ProfilerTarget.GPU],
+                            scheduler = profiler.make_scheduler(closed=1, ready=1, record=3, repeat=3),
+                            on_trace_ready = profiler.export_chrome_tracing('./log')
+                            ) as p:
+            for iter in range(N):
+            train()
+            p.step()
+        3. Use profiler without context manager, and use default parameters
+        .. code-block:: python
+        import paddle.profiler as profiler
+        p = profiler.Profiler()
+        p.start()
+        for iter in range(N):
+            train()
+            p.step()
+        p.stop()
+        p.summary()
+    """
+
+    def __init__(
+            self,
+            *,
+            targets: Optional[Iterable[ProfilerTarget]]=None,
+            scheduler: Union[Callable[[int], ProfilerState], tuple, None]=None,
+            on_trace_ready: Optional[Callable[..., Any]]=None):
+        supported_targets = _get_supported_targets()
+        if targets:
+            self.targets = set(targets)
+            for target in targets:
+                if target not in supported_targets:
+                    self.targets.remove(target)
+                    warn("Profiling {} is not supported in current context.".
+                         format(target))
+        else:
+            self.targets = supported_targets
+        profileoption = ProfilerOptions()
+        if ProfilerTarget.CPU in self.targets:
+            profileoption.trace_switch |= 1
+        if ProfilerTarget.GPU in self.targets:
+            profileoption.trace_switch |= (1 << 1)
+        wrap_optimizers()
+        self.profiler = _Profiler.create(profileoption)
+        if callable(scheduler):
+            self.scheduler = scheduler
+        elif isinstance(scheduler, (tuple, list)):
+            assert len(scheduler) == 2 and scheduler[1] > scheduler[0]
+            start_batch, end_batch = scheduler
+            start_batch = max(start_batch, 0)
+            if start_batch >= 1:
+                self.scheduler = make_scheduler(
+                    closed=max(start_batch - 1, 0),
+                    ready=1,
+                    record=(end_batch - start_batch),
+                    repeat=1)
+            else:
+                self.scheduler = make_scheduler(
+                    closed=0,
+                    ready=0,
+                    record=(end_batch - start_batch),
+                    repeat=1)
+        else:
+            self.scheduler = _default_state_scheduler
+
+        if on_trace_ready == None:
+            self.on_trace_ready = export_chrome_tracing('./profiler_log/')
+        else:
+            self.on_trace_ready = on_trace_ready
+        self.step_num = 0
+        self.previous_state = ProfilerState.CLOSED
+        self.current_state = self.scheduler(self.step_num)
+        self.record_event = None
+        self.profiler_result = None
+
+    def __enter__(self):
+        self.start()
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self.stop()
+
+    def start(self):
+        r'''
+        Start profiler and enter the first profiler step(0).
+        State transformed from CLOSED to self.current_state and trigger corresponding action. 
+        '''
+        # CLOSED -> self.current_state
+        if self.current_state == ProfilerState.READY:
+            self.profiler.prepare()
+        elif self.current_state == ProfilerState.RECORD:
+            self.profiler.prepare()
+            self.profiler.start()
+        elif self.current_state == ProfilerState.RECORD_AND_RETURN:
+            self.profiler.prepare()
+            self.profiler.start()
+        self.record_event = RecordEvent(
+            name="ProfileStep#{}".format(self.step_num),
+            event_type=TracerEventType.ProfileStep)
+        self.record_event.begin()
+
+    def stop(self):
+        r'''
+        Stop profiler and State transformed from self.current_state to CLOSED.
+        Trigger corresponding action and post-process profiler result using self.on_trace_ready if result exists.
+        '''
+        # self.current_state -> CLOSED
+        # In this situation, RECORD state is regarded as RECORD_AND_RETURN
+        if self.record_event:
+            self.record_event.end()
+            self.record_event = None
+        if self.current_state == ProfilerState.READY:
+            warn(
+                "Inproper Profiler state transform: READY->CLOSED, profiler will start and stop without saving data"
+            )
+            self.profiler.start()
+            self.profiler.stop()
+        if self.current_state == ProfilerState.RECORD or self.current_state == ProfilerState.RECORD_AND_RETURN:
+            self.profiler_result = self.profiler.stop()
+            if self.on_trace_ready:
+                self.on_trace_ready(self)
+
+    def step(self):
+        r"""
+        Signals the profiler that the next profiling step has started.
+        Get the new ProfilerState and trigger corresponding action.
+        """
+        if self.record_event:
+            self.record_event.end()
+            self.record_event = None
+        self.previous_state = self.current_state
+        self.step_num += 1
+        self.current_state = self.scheduler(self.step_num)
+        self._trigger_action()
+        self.record_event = RecordEvent(
+            name="ProfileStep#{}".format(self.step_num),
+            event_type=TracerEventType.ProfileStep)
+        self.record_event.begin()
+
+    def _trigger_action(self):
+        if self.previous_state == ProfilerState.CLOSED:
+            if self.current_state == ProfilerState.READY:  # CLOSED -> READY
+                self.profiler.prepare()
+            if self.current_state == ProfilerState.RECORD:  # CLOSED -> RECORD
+                self.profiler.prepare()
+                self.profiler.start()
+            if self.current_state == ProfilerState.RECORD_AND_RETURN:  # CLOSED -> RECORD_AND_RETURN
+                self.profiler.prepare()
+                self.profiler.start()
+
+        elif self.previous_state == ProfilerState.READY:
+            if self.current_state == ProfilerState.CLOSED:  # READY -> CLOSED
+                warn(
+                    "Improper schedule: READY->CLOSED, profiler will start and stop without saving data"
+                )
+                self.profiler.start()
+                self.profiler.stop()
+            if self.current_state == ProfilerState.RECORD:  # READY -> RECORD
+                self.profiler.start()
+            if self.current_state == ProfilerState.RECORD_AND_RETURN:  # READY -> RECORD_AND_RETURN
+                self.profiler.start()
+
+        elif self.previous_state == ProfilerState.RECORD:
+            if self.current_state == ProfilerState.CLOSED:  # RECORD -> CLOSED
+                warn(
+                    "Improper schedule: RECORD->CLOSED, profiler will not saving data"
+                )
+                self.profiler.stop()
+
+            if self.current_state == ProfilerState.READY:  # RECORD -> READY
+                warn(
+                    "Improper schedule: RECORD->READY, profiler will stop and re-prepare"
+                )
+                self.profiler.stop()
+                self.profiler.prepare()
+            if self.current_state == ProfilerState.RECORD_AND_RETURN:  # RECORD -> RECORD_AND_RETURN
+                pass
+
+        else:
+            assert self.previous_state == ProfilerState.RECORD_AND_RETURN
+            if self.current_state == ProfilerState.CLOSED:  # RECORD_AND_RETURN -> CLOSED
+                self.profiler_result = self.profiler.stop()
+            if self.current_state == ProfilerState.READY:  # RECORD_AND_RETURN -> READY
+                self.profiler_result = self.profiler.stop()
+                self.profiler.prepare()
+            if self.current_state == ProfilerState.RECORD:  # RECORD_AND_RETURN -> RECORD
+                self.profiler_result = self.profiler.stop()
+                self.profiler.prepare()
+                self.profiler.start()
+            if self.current_state == ProfilerState.RECORD_AND_RETURN:  # RECORD_AND_RETURN -> RECORD_AND_RETURN
+                self.profiler_result = self.profiler.stop()
+                self.profiler.prepare()
+                self.profiler.start()
+            if self.on_trace_ready:
+                self.on_trace_ready(self)
+
+    def export(self, path="", format="json"):
+        r"""
+        Exports the tracing data in Chrome tracing data format.
+        """
+        if self.profiler_result:
+            self.profiler_result.save(path, format)
+
+    def summary(self,
+                sorted_by=SortedKeys.CPUTotal,
+                op_detail=True,
+                thread_sep=False,
+                time_unit='ms'):
+        r"""
+        Print the Summary table.
+
+        Parameters:
+            sorted_by: how to rank the op table items.
+            detail: expand each operator detail information.
+            thread_sep: print op table each thread.
+            time_unit: can be chosen form ['s', 'ms', 'us', 'ns']
+        """
+        pass
diff --git a/python/paddle/profiler/profiler_statistic.py b/python/paddle/profiler/profiler_statistic.py
new file mode 100755
index 0000000000000000000000000000000000000000..7400f21e91365efeaef6a03d008691bdc837131b
--- /dev/null
+++ b/python/paddle/profiler/profiler_statistic.py
@@ -0,0 +1,824 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import collections
+from enum import Enum
+
+from paddle.fluid.core import TracerEventType
+
+from .statistic_helper import *
+
+_AllTracerEventType = [
+    TracerEventType.Operator, TracerEventType.Dataloader,
+    TracerEventType.ProfileStep, TracerEventType.CudaRuntime,
+    TracerEventType.Kernel, TracerEventType.Memcpy, TracerEventType.Memset,
+    TracerEventType.UserDefined, TracerEventType.OperatorInner,
+    TracerEventType.Forward, TracerEventType.Backward,
+    TracerEventType.Optimization, TracerEventType.Communication,
+    TracerEventType.PythonOp, TracerEventType.PythonUserDefined
+]
+
+_CommunicationOpName = ['reduce', 'broadcast', 'rpc']
+
+
+class SortedKeys(Enum):
+    r"""
+    Sorted keys for printing summary table.
+    """
+    CPUTotal = 0
+    CPUAvg = 1
+    CPUMax = 2
+    CPUMin = 3
+    GPUTotal = 4
+    GPUAvg = 5
+    GPUMax = 6
+    GPUMin = 7
+
+
+class HostStatisticNode:
+    r'''
+    Wrap original node for calculating statistic metrics.
+    '''
+
+    def __init__(self, hostnode):
+        self.hostnode = hostnode
+        self.children_node = []
+        self.runtime_node = []
+        self.cpu_time = 0
+        self.self_cpu_time = 0
+        self.gpu_time = 0
+        self.self_gpu_time = 0
+
+    def cal_statistic(self):
+        for child in self.children_node:
+            child.cal_statistic()
+        for rt in self.runtime_node:
+            rt.cal_statistic()
+
+        self.cpu_time = self.hostnode.end_ns - self.hostnode.start_ns
+        for child in self.children_node:
+            self.gpu_time += child.gpu_time
+            self.self_cpu_time -= (child.end_ns - child.start_ns)
+        for rt in self.runtime_node:
+            self.self_cpu_time -= (rt.end_ns - rt.start_ns)
+            self.gpu_time += rt.gpu_time
+            self.self_gpu_time += rt.gpu_time
+        for device in self.hostnode.device_node:
+            self.gpu_time += (device.end_ns - device.start_ns)
+            self.self_gpu_time += (device.end_ns - device.start_ns)
+
+    @property
+    def end_ns(self):
+        return self.hostnode.end_ns
+
+    @property
+    def start_ns(self):
+        return self.hostnode.start_ns
+
+    def __getattr__(self, name):
+        return getattr(self.hostnode, name)
+
+
+def traverse_tree(nodetrees):
+    results = collections.defaultdict(list)
+    for thread_id, rootnode in nodetrees.items():
+        stack = []
+        stack.append(rootnode)
+        threadlist = results[thread_id]
+        while stack:
+            current_node = stack.pop()
+            threadlist.append(current_node)
+            for childnode in current_node.children_node:
+                stack.append(childnode)
+    return results
+
+
+def wrap_tree(nodetrees):
+    '''
+    Using HostStatisticNode to wrap original profiler result tree, and calculate node statistic metrics.
+    '''
+    node_statistic_tree = {}
+    results = collections.defaultdict(list)
+    newresults = collections.defaultdict(list)
+    for thread_id, rootnode in nodetrees.items():
+        stack = []
+        stack.append(rootnode)
+        root_statistic_node = HostStatisticNode(rootnode)
+        newstack = []
+        newstack.append(root_statistic_node)
+        node_statistic_tree[thread_id] = root_statistic_node
+        threadlist = results[thread_id]
+        newthreadlist = newresults[thread_id]
+        while stack:
+            current_node = stack.pop()
+            threadlist.append(current_node)
+            current_statistic_node = newstack.pop()
+            newthreadlist.append(current_statistic_node)
+            for childnode in current_node.children_node:
+                stack.append(childnode)
+                child_statistic_node = HostStatisticNode(childnode)
+                current_statistic_node.children_node.append(
+                    child_statistic_node)
+                newstack.append(child_statistic_node)
+            for runtimenode in current_node.runtime_node:
+                runtime_statistic_node = HostStatisticNode(runtimenode)
+                current_statistic_node.runtime_node.append(
+                    runtime_statistic_node)
+    # recursive calculate node statistic values
+    for thread_id, root_statistic_node in node_statistic_tree.items():
+        root_statistic_node.cal_statistic()
+
+    return node_statistic_tree, newresults
+
+
+class TimeRangeSummary:
+    r"""
+    Analyse time ranges for each TracerEventType, and summarize the time.
+    """
+
+    def __init__(self):
+        self.CPUTimeRange = collections.defaultdict(list)
+        self.GPUTimeRange = collections.defaultdict(
+            lambda: collections.defaultdict(list)
+        )  # GPU events should be divided into different devices
+        self.CPUTimeRangeSum = collections.defaultdict(int)
+        self.GPUTimeRangeSum = collections.defaultdict(
+            lambda: collections.defaultdict(int))
+        self.call_times = collections.defaultdict(int)
+
+    def parse(self, nodetrees):
+        r"""
+        Analysis node trees in profiler result, and get time range for different tracer event type.
+        """
+        thread2hostnodes = traverse_tree(nodetrees)
+        for threadid, hostnodes in thread2hostnodes.items():
+            CPUTimeRange = collections.defaultdict(list)
+            GPUTimeRange = collections.defaultdict(
+                lambda: collections.defaultdict(lambda: collections.defaultdict(list))
+            )  # device_id/type/stream_id
+            for hostnode in hostnodes[1:]:  #skip root node
+                CPUTimeRange[hostnode.type].append(
+                    (hostnode.start_ns, hostnode.end_ns))
+                self.call_times[hostnode.type] += 1
+                if hostnode.type == TracerEventType.Operator and any([
+                        name in hostnode.name for name in _CommunicationOpName
+                ]):  # special case, communication op
+                    CPUTimeRange[TracerEventType.Communication].append(
+                        (hostnode.start_ns, hostnode.end_ns))
+                    self.call_times[TracerEventType.Communication] += 1
+                is_communication_node = (
+                    hostnode.type == TracerEventType.Communication
+                ) or (hostnode.type == TracerEventType.Operator and any(
+                    [name in hostnode.name for name in _CommunicationOpName]))
+                for runtimenode in hostnode.runtime_node:
+                    CPUTimeRange[runtimenode.type].append(
+                        (runtimenode.start_ns, runtimenode.end_ns))
+                    self.call_times[runtimenode.type] += 1
+                    for devicenode in runtimenode.device_node:
+                        GPUTimeRange[devicenode.device_id][devicenode.type][
+                            devicenode.stream_id].append(
+                                (devicenode.start_ns, devicenode.end_ns))
+                        self.call_times[devicenode.type] += 1
+                        if is_communication_node:  # gpu activity for communication node
+                            GPUTimeRange[devicenode.device_id][
+                                TracerEventType.Communication][
+                                    devicenode.stream_id].append((
+                                        devicenode.start_ns, devicenode.end_ns))
+                            self.call_times[TracerEventType.Communication] += 1
+
+            for event_type, time_ranges in CPUTimeRange.items():
+                time_ranges = merge_self_ranges(time_ranges, is_sorted=False)
+                self.CPUTimeRange[event_type] = merge_ranges(
+                    self.CPUTimeRange[event_type], time_ranges, is_sorted=True)
+            for device_id, device_time_ranges in GPUTimeRange.items():
+                for event_type, event_time_ranges in device_time_ranges.items():
+                    for stream_id, time_ranges in event_time_ranges.items():
+                        time_ranges = merge_self_ranges(
+                            time_ranges, is_sorted=False)
+                        self.GPUTimeRange[device_id][event_type] = merge_ranges(
+                            self.GPUTimeRange[device_id][event_type],
+                            time_ranges,
+                            is_sorted=True)
+
+        for event_type, time_ranges in self.CPUTimeRange.items():
+            self.CPUTimeRangeSum[event_type] = sum_ranges(time_ranges)
+        for device_id, device_time_ranges in self.GPUTimeRange.items():
+            for event_type, time_ranges in device_time_ranges.items():
+                self.GPUTimeRangeSum[device_id][event_type] = sum_ranges(
+                    time_ranges)
+
+    def get_gpu_devices(self):
+        return self.GPUTimeRange.keys()
+
+    def get_gpu_range_sum(self, device_id, event_type):
+        return self.GPUTimeRangeSum[device_id][event_type]
+
+    def get_cpu_range_sum(self, event_type):
+        return self.CPUTimeRangeSum[event_type]
+
+
+class EventSummary:
+    r"""
+    Analyse operator event in profiling data, correlate with its device event.
+    """
+
+    class DeviceItem:
+        def __init__(self, name):
+            self.name = name
+            self.call = 0
+            self.gpu_time = 0
+            self.max_gpu_time = 0
+            self.min_gpu_time = float('inf')
+
+        @property
+        def avg_gpu_time(self):
+            return self.gpu_time / self.call
+
+        def add_gpu_time(self, time):
+            if time > self.max_gpu_time:
+                self.max_gpu_time = time
+            if time < self.min_gpu_time:
+                self.min_gpu_time = time
+            self.gpu_time += time
+
+        def add_item(self, node):
+            self.call += 1
+            self.add_gpu_time(node.end_ns - node.start_ns)
+
+    class OperatorItem:
+        def __init__(self, name):
+            self.name = name
+            self.call = 0
+            self.cpu_time = 0
+            self.gpu_time = 0
+            self.max_cpu_time = 0
+            self.min_cpu_time = float('inf')
+            self.max_gpu_time = 0
+            self.min_gpu_time = float('inf')
+            self.devices = {}
+            self.operator_inners = {}
+
+        @property
+        def avg_cpu_time(self):
+            return self.cpu_time / self.call
+
+        @property
+        def avg_gpu_time(self):
+            return self.gpu_time / self.call
+
+        def add_cpu_time(self, time):
+            if time > self.max_cpu_time:
+                self.max_cpu_time = time
+            if time < self.min_cpu_time:
+                self.min_cpu_time = time
+            self.cpu_time += time
+
+        def add_gpu_time(self, time):
+            if time > self.max_gpu_time:
+                self.max_gpu_time = time
+            if time < self.min_gpu_time:
+                self.min_gpu_time = time
+            self.gpu_time += time
+
+        def add_call(self):
+            self.call += 1
+
+        def add_item(self, node):
+            self.add_call()
+            self.add_cpu_time(node.cpu_time)
+            self.add_gpu_time(node.gpu_time)
+            for child in node.children_node:
+                if child.name not in self.operator_inners:
+                    self.operator_inners[
+                        child.name] = EventSummary.OperatorItem(child.name)
+                self.operator_inners[child.name].add_item(child)
+
+            for runtimenode in node.runtime_node:
+                for devicenode in runtimenode.device_node:
+                    if devicenode.name not in self.devices:
+                        self.devices[devicenode.name] = EventSummary.DeviceItem(
+                            devicenode.name)
+                    self.devices[devicenode.name].add_item(devicenode)
+
+    class GeneralItem:
+        def __init__(self, name):
+            self.name = name
+            self.call = 0
+            self.cpu_time = 0
+            self.max_cpu_time = 0
+            self.min_cpu_time = float('inf')
+            self.gpu_time = 0
+            self.max_gpu_time = 0
+            self.min_gpu_time = float('inf')
+
+        @property
+        def avg_cpu_time(self):
+            return self.cpu_time / self.call
+
+        @property
+        def avg_gpu_time(self):
+            return self.gpu_time / self.call
+
+        def add_cpu_time(self, time):
+            if time > self.max_cpu_time:
+                self.max_cpu_time = time
+            if time < self.min_cpu_time:
+                self.min_cpu_time = time
+            self.cpu_time += time
+
+        def add_gpu_time(self, time):
+            if time > self.max_gpu_time:
+                self.max_gpu_time = time
+            if time < self.min_gpu_time:
+                self.min_gpu_time = time
+            self.gpu_time += time
+
+        def add_call(self):
+            self.call += 1
+
+        def add_item(self, node):
+            self.add_call()
+            self.add_cpu_time(node.cpu_time)
+            self.add_gpu_time(node.gpu_time)
+
+    def __init__(self):
+        self.items = {}  # for operator summary
+        self.thread_items = collections.defaultdict(
+            dict)  # for operator summary
+        self.userdefined_items = {}  # for userdefined summary
+        self.userdefined_thread_items = collections.defaultdict(
+            dict)  # for userdefined summary
+        self.model_perspective_items = {}  # for model summary
+        self.memory_manipulation_items = {}  # for memory manipulation summary
+
+    def parse(self, nodetrees):
+        r"""
+        Analysis operator event in the nodetress.
+        """
+        node_statistic_trees, thread2host_statistic_nodes = wrap_tree(nodetrees)
+        for threadid, host_statistic_nodes in thread2host_statistic_nodes.items(
+        ):
+            for host_statistic_node in host_statistic_nodes[
+                    1:]:  #skip root node
+                if host_statistic_node.type == TracerEventType.Operator:
+                    self.add_operator_item(host_statistic_node)
+                if host_statistic_node.type == TracerEventType.UserDefined\
+                    or host_statistic_node.type == TracerEventType.PythonUserDefined:
+                    if 'memcpy' in host_statistic_node.name.lower() or 'memorycopy' in host_statistic_node.name.lower()\
+                        or 'memset' in host_statistic_node.name.lower():
+                        self.add_memory_manipulation_item(host_statistic_node)
+                    else:
+                        self.add_userdefined_item(host_statistic_node)
+
+        for threadid, root_statistic_node in node_statistic_trees.items():
+            deque = collections.deque()
+            deque.append(root_statistic_node)
+            while deque:
+                current_node = deque.popleft()
+                for child in current_node.children_node:
+                    if child.type == TracerEventType.Forward or child.type == TracerEventType.Dataloader\
+                        or child.type == TracerEventType.Backward or child.type == TracerEventType.Optimization:
+                        self.add_model_perspective_item(
+                            child)  #find first model perspective node
+                    else:
+                        deque.append(child)
+
+    def add_operator_item(self, operator_node):
+        if operator_node.name not in self.items:
+            self.items[operator_node.name] = EventSummary.OperatorItem(
+                operator_node.name)
+
+        self.items[operator_node.name].add_item(operator_node)
+
+        if operator_node.name not in self.thread_items[operator_node.thread_id]:
+            self.thread_items[operator_node.thread_id][
+                operator_node.name] = EventSummary.OperatorItem(
+                    operator_node.name)
+        self.thread_items[operator_node.thread_id][operator_node.name].add_item(
+            operator_node)
+
+    def add_userdefined_item(self, userdefined_node):
+        if userdefined_node.name not in self.userdefined_items:
+            self.userdefined_items[
+                userdefined_node.name] = EventSummary.GeneralItem(
+                    userdefined_node.name)
+
+        self.userdefined_items[userdefined_node.name].add_item(userdefined_node)
+
+        if userdefined_node.name not in self.userdefined_thread_items[
+                userdefined_node.thread_id]:
+            self.userdefined_thread_items[userdefined_node.thread_id][
+                userdefined_node.name] = EventSummary.GeneralItem(
+                    userdefined_node.name)
+        self.userdefined_thread_items[userdefined_node.thread_id][
+            userdefined_node.name].add_item(userdefined_node)
+
+    def add_memory_manipulation_item(self, memory_manipulation_node):
+        if memory_manipulation_node.name not in self.memory_manipulation_items:
+            self.memory_manipulation_items[
+                memory_manipulation_node.name] = EventSummary.GeneralItem(
+                    memory_manipulation_node.name)
+        self.memory_manipulation_items[memory_manipulation_node.name].add_item(
+            memory_manipulation_node)
+
+    def add_model_perspective_item(self, model_perspective_node):
+        if model_perspective_node.type == TracerEventType.Forward:
+            name = 'Forward'
+        elif model_perspective_node.type == TracerEventType.Backward:
+            name = 'Backward'
+        elif model_perspective_node.type == TracerEventType.Optimization:
+            name = 'Optimization'
+        elif model_perspective_node.type == TracerEventType.Dataloader:
+            name = 'Dataloader'
+        else:
+            return
+        if name not in self.model_perspective_items:
+            self.model_perspective_items[name] = EventSummary.GeneralItem(name)
+        self.model_perspective_items[name].add_item(model_perspective_node)
+
+
+class StatisticData:
+    r"""
+    Hold all analysed results.
+    """
+
+    def __init__(self, node_trees, extra_info):
+        self.node_trees = node_trees
+        self.extra_info = extra_info
+        self.time_range_summary = TimeRangeSummary()
+        self.event_summary = EventSummary()
+        self.time_range_summary.parse(node_trees)
+        self.event_summary.parse(node_trees)
+
+
+def _build_table(statistic_data,
+                 sorted_by=SortedKeys.CPUTotal,
+                 op_detail=True,
+                 thread_sep=False,
+                 time_unit='ms',
+                 row_limit=100,
+                 max_src_column_width=75):
+    """Prints a summary of events."""
+    # format table row
+    SPACING_SIZE = 2
+    row_format_list = [""]
+    header_sep_list = [""]
+    line_length_list = [-SPACING_SIZE]
+
+    def add_column(padding, text_dir='<'):
+        row_format_list[0] += '{: ' + text_dir + str(padding) + '}' + (
+            ' ' * SPACING_SIZE)
+        header_sep_list[0] += '-' * padding + (' ' * SPACING_SIZE)
+        line_length_list[0] += padding + SPACING_SIZE
+
+    def add_title(padding, text):
+        left_length = padding - len(text)
+        half = left_length // 2
+        return '-' * half + text + '-' * (left_length - half)
+
+    result = []
+
+    def append(s):
+        result.append(s)
+        result.append('\n')
+
+    def format_time(time, unit='ms', indent=0):
+        r"""
+        Transform time in ns to time in unit.
+        """
+        if time == float('inf'):
+            return '-'
+        else:
+            result = float(time)
+            if unit == 's':
+                result /= 1e9
+            elif unit == 'ms':
+                result /= 1e6
+            elif unit == 'us':
+                result /= 1e3
+            return '{}{:.2f}'.format(' ' * indent, result)
+
+    def format_ratio(ratio, indent=0):
+        r"""
+        Transform ratio within [0, 1] to percentage presentation.
+        """
+        return '{}{:.2f}'.format(' ' * indent, ratio * 100)
+
+    total_time = statistic_data.time_range_summary.get_cpu_range_sum(
+        TracerEventType.ProfileStep)
+    ###### Print Device Summary ######
+    headers = ['Device', 'Utilization (%)']
+    name_column_width = 30
+    DEFAULT_COLUMN_WIDTH = 20
+    add_column(name_column_width)
+    for _ in headers[1:]:
+        add_column(DEFAULT_COLUMN_WIDTH)
+
+    row_format = row_format_list[0]
+    header_sep = header_sep_list[0]
+    line_length = line_length_list[0]
+
+    # construct table string
+
+    append(add_title(line_length, "Device Summary"))
+    append('Time unit: {}'.format(time_unit))
+    append(header_sep)
+    append(row_format.format(*headers))
+    append(header_sep)
+    row_values = [
+        'CPU(Process)', format_ratio(
+            float(statistic_data.extra_info['Process Cpu Utilization']))
+    ]
+    append(row_format.format(*row_values))
+    row_values = [
+        'CPU(System)', format_ratio(
+            float(statistic_data.extra_info['System Cpu Utilization']))
+    ]
+    append(row_format.format(*row_values))
+    for gpu_name in statistic_data.time_range_summary.get_gpu_devices():
+        gpu_time = float(
+            statistic_data.time_range_summary.get_gpu_range_sum(
+                gpu_name, TracerEventType.Kernel))
+        utilization = gpu_time / total_time
+        row_values = ['GPU{}'.format(gpu_name), format_ratio(utilization)]
+        append(row_format.format(*row_values))
+
+    append(header_sep)
+    append(
+        "Note:\nCPU(Process) Utilization = Current process CPU time over all cpu cores / elapsed time, so max utilization can be reached 100% * number of cpu cores.\n"
+        "CPU(System) Utilization = All processes CPU time over all cpu cores(busy time) / (busy time + idle time).\n"
+        "GPU Utilization = Current process GPU time / elapsed time")
+    append('-' * line_length)
+    append('')
+    append('')
+
+    if total_time == 0:
+        return ''.join(result)
+
+    ###### Print Overview Summary ######
+    headers = ['Event Type', 'CPU Time', 'Ratio (%)']
+    row_format_list = [""]
+    header_sep_list = [""]
+    line_length_list = [-SPACING_SIZE]
+
+    DEFAULT_COLUMN_WIDTH = 25
+    for _ in headers:
+        add_column(DEFAULT_COLUMN_WIDTH)
+
+    row_format = row_format_list[0]
+    header_sep = header_sep_list[0]
+    line_length = line_length_list[0]
+
+    # construct table string
+    append(add_title(line_length, "Overview Summary"))
+    append('Time unit: {}'.format(time_unit))
+    append(header_sep)
+    append(row_format.format(*headers))
+    append(header_sep)
+    row_values = [
+        'Total Time', format_time(
+            total_time, unit=time_unit), format_ratio(1)
+    ]
+    append(row_format.format(*row_values))
+    cpu_type_time = collections.defaultdict(int)
+    gpu_type_time = collections.defaultdict(int)
+    for event_type, value in statistic_data.time_range_summary.CPUTimeRangeSum.items(
+    ):
+        cpu_type_time[event_type] = value
+
+    gpu_time_range = collections.defaultdict(list)
+    for device_id, device_time_ranges in statistic_data.time_range_summary.GPUTimeRange.items(
+    ):
+        for event_type, time_range in device_time_ranges.items():
+            gpu_time_range[event_type] = merge_ranges(
+                gpu_time_range[event_type], time_range, is_sorted=True)
+    for event_type, time_range in gpu_time_range.items():
+        gpu_type_time[event_type] = sum_ranges(time_range)
+
+    sorted_items = sorted(
+        cpu_type_time.items(), key=lambda x: x[1], reverse=True)
+    for event_type, time in sorted_items:
+        row_values = [
+            '  {}'.format(str(event_type).split('.')[1]), format_time(
+                time, unit=time_unit), format_ratio(float(time) / total_time)
+        ]
+        append(row_format.format(*row_values))
+    append(header_sep)
+    headers = ['', 'GPU Time', 'Ratio (%)']
+    append(row_format.format(*headers))
+    append(header_sep)
+    for event_type, time in gpu_type_time.items():
+        row_values = [
+            '  {}'.format(str(event_type).split('.')[1]), format_time(
+                time, unit=time_unit), format_ratio(float(time) / total_time)
+        ]
+        append(row_format.format(*row_values))
+
+    append(header_sep)
+    append(
+        "Note:\nIn this table, We sum up all collected events in terms of event type.\n"
+        "The time of events collected on host are presented as CPU Time, and as GPU Time if on device.\n"
+        "ratio = CPU(GPU) Time / Total Time."
+        "Events with different types may overlap or inclusion, e.g. Operator includes OperatorInner, so the sum of ratios is not 100%.\n"
+        "The time of events in the same type with overlap will not calculate twice, and all time is summed after merged.\n"
+        "Example:\n"
+        "Thread 1:\n"
+        "  Operator: |___________|     |__________|\n"
+        "Thread 2:\n"
+        "  Operator:   |____________|     |___|\n"
+        "After merged:\n"
+        "  Result:   |______________|  |__________|\n")
+    append('-' * line_length)
+    append('')
+    append('')
+
+    ###### Print Operator Summary Report ######
+    if statistic_data.event_summary.items:
+        headers = [
+            'Name', 'Calls', 'CPU Total / Avg / Max / Min / Ratio(%)',
+            'GPU Total / Avg / Max / Min / Ratio(%)'
+        ]
+        row_format_list = [""]
+        header_sep_list = [""]
+        line_length_list = [-SPACING_SIZE]
+        name_column_width = 50
+        add_column(name_column_width)
+        add_column(6)
+        add_column(40)
+        add_column(40)
+
+        row_format = row_format_list[0]
+        header_sep = header_sep_list[0]
+        line_length = line_length_list[0]
+
+        # construct table string
+        append(add_title(line_length, "Operator Summary"))
+        append('Time unit: {}'.format(time_unit))
+        append(header_sep)
+        append(row_format.format(*headers))
+        append(header_sep)
+        if thread_sep == True:
+            thread_items = statistic_data.event_summary.thread_items
+        else:
+            thread_items = {
+                'All threads merged': statistic_data.event_summary.items
+            }
+        for thread_id, items in thread_items.items():
+            append(add_title(line_length, "Thread: {}".format(thread_id)))
+            if sorted_by == SortedKeys.CPUTotal:
+                sorted_items = sorted(
+                    items.items(), key=lambda x: x[1].cpu_time, reverse=True)
+            elif sorted_by == SortedKeys.CPUAvg:
+                sorted_items = sorted(
+                    items.items(),
+                    key=lambda x: x[1].avg_cpu_time,
+                    reverse=True)
+            elif sorted_by == SortedKeys.CPUMax:
+                sorted_items = sorted(
+                    items.items(),
+                    key=lambda x: x[1].max_cpu_time,
+                    reverse=True)
+            elif sorted_by == SortedKeys.CPUMin:
+                sorted_items = sorted(
+                    items.items(), key=lambda x: x[1].min_cpu_time)
+            elif sorted_by == SortedKeys.GPUTotal:
+                sorted_items = sorted(
+                    items.items(), key=lambda x: x[1].gpu_time, reverse=True)
+            elif sorted_by == SortedKeys.GPUAvg:
+                sorted_items = sorted(
+                    items.items(),
+                    key=lambda x: x[1].avg_gpu_time,
+                    reverse=True)
+            elif sorted_by == SortedKeys.GPUMax:
+                sorted_items = sorted(
+                    items.items(),
+                    key=lambda x: x[1].max_gpu_time,
+                    reverse=True)
+            elif sorted_by == SortedKeys.GPUMin:
+                sorted_items = sorted(
+                    items.items(), key=lambda x: x[1].min_gpu_time)
+
+            total_cpu_time = 0
+            total_gpu_time = 0
+            for name, item in sorted_items:
+                total_cpu_time += item.cpu_time
+                total_gpu_time += item.gpu_time
+            for name, item in sorted_items:
+                row_values = [
+                    name, item.call, '{} / {} / {} / {} / {}'.format(
+                        format_time(
+                            item.cpu_time, unit=time_unit),
+                        format_time(
+                            item.avg_cpu_time, unit=time_unit),
+                        format_time(
+                            item.max_cpu_time, unit=time_unit),
+                        format_time(
+                            item.min_cpu_time, unit=time_unit),
+                        format_ratio(float(item.cpu_time) / total_cpu_time)),
+                    '{} / {} / {} / {} / {}'.format(
+                        format_time(
+                            item.gpu_time, unit=time_unit),
+                        format_time(
+                            item.avg_gpu_time, unit=time_unit),
+                        format_time(
+                            item.max_gpu_time, unit=time_unit),
+                        format_time(
+                            item.min_gpu_time, unit=time_unit),
+                        format_ratio(float(item.gpu_time) / total_gpu_time))
+                ]
+                append(row_format.format(*row_values))
+                if op_detail:
+                    for innerop_name, innerop_node in item.operator_inners.items(
+                    ):
+                        row_values = [
+                            '  {}'.format(innerop_name), innerop_node.call,
+                            '{} / {} / {} / {} / {}'.format(
+                                format_time(
+                                    innerop_node.cpu_time, unit=time_unit),
+                                format_time(
+                                    innerop_node.avg_cpu_time, unit=time_unit),
+                                format_time(
+                                    innerop_node.max_cpu_time, unit=time_unit),
+                                format_time(
+                                    innerop_node.min_cpu_time, unit=time_unit),
+                                format_ratio(
+                                    float(innerop_node.cpu_time) /
+                                    total_cpu_time)),
+                            '{} / {} / {} / {} / {}'.format(
+                                format_time(
+                                    innerop_node.gpu_time, unit=time_unit),
+                                format_time(
+                                    innerop_node.avg_gpu_time, unit=time_unit),
+                                format_time(
+                                    innerop_node.max_gpu_time, unit=time_unit),
+                                format_time(
+                                    innerop_node.min_gpu_time, unit=time_unit),
+                                format_ratio(
+                                    float(innerop_node.gpu_time) /
+                                    total_gpu_time))
+                        ]
+                        append(row_format.format(*row_values))
+                        for device_node_name, devicenode in innerop_node.devices.items(
+                        ):
+                            if len(device_node_name) + 4 > name_column_width:
+                                device_node_name = device_node_name[:
+                                                                    name_column_width
+                                                                    - 7]
+                                device_node_name += "..."
+                            row_values = [
+                                '    {}'.format(device_node_name),
+                                devicenode.call, '- / - / - / - / -',
+                                '{} / {} / {} / {} / {}'.format(
+                                    format_time(
+                                        devicenode.gpu_time, unit=time_unit),
+                                    format_time(
+                                        devicenode.avg_gpu_time,
+                                        unit=time_unit),
+                                    format_time(
+                                        devicenode.max_gpu_time,
+                                        unit=time_unit),
+                                    format_time(
+                                        devicenode.min_gpu_time,
+                                        unit=time_unit),
+                                    format_ratio(
+                                        float(devicenode.gpu_time) /
+                                        total_gpu_time))
+                            ]
+                            append(row_format.format(*row_values))
+                    for device_node_name, device_node in item.devices.items():
+                        if len(device_node_name) + 2 > name_column_width:
+                            device_node_name = device_node_name[:
+                                                                name_column_width
+                                                                - 5]
+                            device_node_name += "..."
+                        row_values = [
+                            '    {}'.format(device_node_name), devicenode.call,
+                            '- / - / - / - / -',
+                            '{} / {} / {} / {} / {}'.format(
+                                format_time(
+                                    devicenode.gpu_time, unit=time_unit),
+                                format_time(
+                                    devicenode.avg_gpu_time, unit=time_unit),
+                                format_time(
+                                    devicenode.max_gpu_time, unit=time_unit),
+                                format_time(
+                                    devicenode.min_gpu_time, unit=time_unit),
+                                format_ratio(
+                                    float(devicenode.gpu_time) /
+                                    total_gpu_time))
+                        ]
+                        append(row_format.format(*row_values))
+        append(header_sep)
+        append('')
+        append('')
+    return ''.join(result)
diff --git a/python/paddle/profiler/statistic_helper.py b/python/paddle/profiler/statistic_helper.py
new file mode 100644
index 0000000000000000000000000000000000000000..1f11649928a7fe72b0a4ae6f368c3c92debf060c
--- /dev/null
+++ b/python/paddle/profiler/statistic_helper.py
@@ -0,0 +1,225 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import collections
+
+
+def sum_ranges(ranges):
+    result = 0
+    for time_range in ranges:
+        result += (time_range[1] - time_range[0])
+    return result
+
+
+def merge_self_ranges(src_ranges, is_sorted=False):
+    merged_ranges = []
+    if len(src_ranges) > 0:
+        if not is_sorted:
+            src_ranges.sort(key=lambda x: x[0])
+        cur_indx = 0
+        merged_ranges.append((src_ranges[cur_indx][0], src_ranges[cur_indx][1]))
+        for cur_indx in range(1, len(src_ranges)):
+            if src_ranges[cur_indx][1] > merged_ranges[-1][1]:
+                if src_ranges[cur_indx][0] <= merged_ranges[-1][1]:
+                    merged_ranges[-1] = (merged_ranges[-1][0],
+                                         src_ranges[cur_indx][1])
+                else:
+                    merged_ranges.append(
+                        (src_ranges[cur_indx][0], src_ranges[cur_indx][1]))
+    return merged_ranges
+
+
+def merge_ranges(range_list1, range_list2, is_sorted=False):
+    merged_ranges = []
+    if not is_sorted:
+        range_list1 = merge_self_ranges(range_list1)
+        range_list2 = merge_self_ranges(range_list2)
+    len1 = len(range_list1)
+    len2 = len(range_list2)
+    if len1 == 0 and len2 == 0:
+        return merged_ranges
+    elif len1 == 0:
+        return range_list2
+    elif len2 == 0:
+        return range_list1
+    else:
+        indx1 = 0
+        indx2 = 0
+        range1 = range_list1[indx1]
+        range2 = range_list2[indx2]
+        if range1[0] < range2[0]:
+            merged_ranges.append(range1)
+            indx1 += 1
+        else:
+            merged_ranges.append(range2)
+            indx2 += 1
+        while indx1 < len1 and indx2 < len2:
+            range1 = range_list1[indx1]
+            range2 = range_list2[indx2]
+            if range1[0] < range2[0]:
+                if range1[1] > merged_ranges[-1][1]:
+                    if range1[0] <= merged_ranges[-1][1]:
+                        merged_ranges[-1] = (merged_ranges[-1][0], range1[1])
+                    else:
+                        merged_ranges.append((range1[0], range1[1]))
+                    indx1 += 1
+                else:
+                    indx1 += 1
+            else:
+                if range2[1] > merged_ranges[-1][1]:
+                    if range2[0] <= merged_ranges[-1][1]:
+                        merged_ranges[-1] = (merged_ranges[-1][0], range2[1])
+                    else:
+                        merged_ranges.append((range2[0], range2[1]))
+                    indx2 += 1
+                else:
+                    indx2 += 1
+
+        while indx1 < len1:
+            range1 = range_list1[indx1]
+            if range1[1] > merged_ranges[-1][1]:
+                if range1[0] <= merged_ranges[-1][1]:
+                    merged_ranges[-1] = (merged_ranges[-1][0], range1[1])
+                else:
+                    merged_ranges.append((range1[0], range1[1]))
+                indx1 += 1
+            else:
+                indx1 += 1
+        while indx2 < len2:
+            if range2[1] > merged_ranges[-1][1]:
+                if range2[0] <= merged_ranges[-1][1]:
+                    merged_ranges[-1] = (merged_ranges[-1][0], range2[1])
+                else:
+                    merged_ranges.append((range2[0], range2[1]))
+                indx2 += 1
+            else:
+                indx2 += 1
+    return merged_ranges
+
+
+def intersection_ranges(range_list1, range_list2, is_sorted=False):
+    result_range = []
+    if len(range_list1) == 0 or len(range_list2) == 0:
+        return result_range
+    if not is_sorted:
+        range_list1 = merge_self_ranges(range_list1)
+        range_list2 = merge_self_ranges(range_list2)
+
+    len1 = len(range_list1)
+    len2 = len(range_list2)
+    indx1 = 0
+    indx2 = 0
+    range1 = range_list1[indx1]
+    range2 = range_list2[indx2]
+    while indx1 < len1 and indx2 < len2:
+        if range2[1] <= range1[0]:
+            indx2 += 1
+            if indx2 == len2:
+                break
+            range2 = range_list2[indx2]
+
+        elif range2[0] <= range1[0] and range2[1] < range1[1]:
+            assert (range2[1] > range1[0])
+            result_range.append((range1[0], range2[1]))
+            range1 = (range2[1], range1[1])
+            indx2 += 1
+            if indx2 == len2:
+                break
+            range2 = range_list2[indx2]
+
+        elif range2[0] <= range1[0]:
+            assert (range2[1] >= range1[1])
+            result_range.append(range1)
+            range2 = (range1[1], range2[1])
+            indx1 += 1
+            if indx1 == len1:
+                break
+            range1 = range_list1[indx1]
+
+        elif range2[1] < range1[1]:
+            assert (range2[0] > range1[0])
+            result_range.append(range2)
+            range1 = (range2[1], range1[1])
+            indx2 += 1
+            if indx2 == len2:
+                break
+            range2 = range_list2[indx2]
+
+        elif range2[0] < range1[1]:
+            assert (range2[1] >= range1[1])
+            result_range.append((range2[0], range1[1]))
+            range2 = (range1[1], range2[1])
+            indx1 += 1
+            if indx1 == len1:
+                break
+            range1 = range_list1[indx1]
+
+        else:
+            assert (range2[0] >= range1[1])
+            indx1 += 1
+            if indx1 == len1:
+                break
+            range1 = range_list1[indx1]
+    return result_range
+
+
+def subtract_ranges(range_list1, range_list2, is_sorted=False):
+    result_range = []
+    if not is_sorted:
+        range_list1 = merge_self_ranges(range_list1)
+        range_list2 = merge_self_ranges(range_list2)
+    if len(range_list1) == 0:
+        return result_range
+    if len(range_list2) == 0:
+        return range_list1
+
+    len1 = len(range_list1)
+    len2 = len(range_list2)
+    indx1 = 0
+    indx2 = 0
+    range1 = range_list1[indx1]
+    range2 = range_list2[indx2]
+
+    while indx1 < len(range_list1):
+        if indx2 == len(range_list2):
+            result_range.append(range1)
+            indx1 += 1
+            if indx1 == len1:
+                break
+            range1 = range_list1[indx1]
+        elif range2[1] <= range1[0]:
+            indx2 += 1
+            if indx2 != len2:
+                range2 = range_list2[indx2]
+        elif range2[0] <= range1[0] and range2[1] < range1[1]:
+            range1 = (range2[1], range1[1])
+            indx2 += 1
+            if indx2 != len2:
+                range2 = range_list2[indx2]
+        elif range2[0] <= range1[0]:
+            assert (range2[1] >= range1[1])
+            range2 = (range1[1], range2[1])
+            indx1 += 1
+            if indx1 != len1:
+                range1 = range_list1[indx1]
+        elif range2[0] < range1[1]:
+            assert (range2[0] > range1[0])
+            result_range.append((range1[0], range2[0]))
+            range1 = (range2[0], range1[1])
+        else:
+            assert (range2[0] >= range1[1])
+            result_range.append(range1)
+            indx1 += 1
+            if indx1 != len1:
+                range1 = range_list1[indx1]
+    return result_range
diff --git a/python/paddle/profiler/utils.py b/python/paddle/profiler/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..642001dfbfc5a307d5064860136034ba7b3bdbc5
--- /dev/null
+++ b/python/paddle/profiler/utils.py
@@ -0,0 +1,90 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.fluid.core import (_RecordEvent, TracerEventType,
+                               load_profiler_result)
+from typing import Any
+from warnings import warn
+import functools
+from contextlib import ContextDecorator
+
+_AllowedEventTypeList = [
+    TracerEventType.Dataloader, TracerEventType.ProfileStep,
+    TracerEventType.UserDefined, TracerEventType.Forward,
+    TracerEventType.Backward, TracerEventType.Optimization,
+    TracerEventType.PythonOp, TracerEventType.PythonUserDefined
+]
+
+
+class RecordEvent(ContextDecorator):
+    r"""
+    Interface for recording a time range.
+
+    Parameters:
+    name(str): Name of the record event
+    event_type(TracerEventType): Type of the record event, can be used for statistics.
+
+    Examples:
+        .. code-block:: python
+        import paddle.profiler as profiler
+        with profiler.RecordEvent(name='op1', event_type=TracerEventType=TracerEventType.UserDefined):
+            op1()
+    """
+
+    def __init__(self,
+                 name: str,
+                 event_type: TracerEventType=TracerEventType.UserDefined):
+        self.name = name
+        self.event_type = event_type
+        self.event = None
+
+    def __enter__(self):
+        self.begin()
+        return self
+
+    def __exit__(self, exc_type: Any, exc_value: Any, traceback: Any):
+        self.end()
+
+    def begin(self):
+        if self.event_type not in _AllowedEventTypeList:
+            warn("Only TracerEvent Type in [{}, {}, {}, {}, {}, {},{}]\
+                  can be recorded.".format(*_AllowedEventTypeList))
+            self.event = None
+        else:
+            if self.event_type == TracerEventType.UserDefined:
+                self.event_type == TracerEventType.PythonUserDefined
+            self.event = _RecordEvent(self.name, self.event_type)
+
+    def end(self):
+        if self.event:
+            self.event.end()
+
+
+def wrap_optimizers():
+    def optimizer_warpper(func):
+        @functools.wraps(func)
+        def warpper(*args, **kwargs):
+            with RecordEvent(
+                    'Optimization Step',
+                    event_type=TracerEventType.Optimization):
+                return func(*args, **kwargs)
+
+        return warpper
+
+    import paddle.optimizer as optimizer
+    for classname in optimizer.__all__:
+        if classname != 'Optimizer':
+            classobject = getattr(optimizer, classname)
+            if getattr(classobject, 'step', None) != None:
+                classobject.step = optimizer_warpper(classobject.step)
diff --git a/python/paddle/tensor/creation.py b/python/paddle/tensor/creation.py
index bddc45bc9612c34c4fab1b6c4ce6ae1fc47a1052..6555ba0812d08c0ca3a21641b5b28d5a3763f2c4 100644
--- a/python/paddle/tensor/creation.py
+++ b/python/paddle/tensor/creation.py
@@ -283,7 +283,7 @@ def ones_like(x, dtype=None, name=None):
     Args:
         x(Tensor): The input tensor which specifies shape and dtype. The
             dtype of ``x`` can be bool, float16, float32, float64, int32, int64.
-        dtype(str|np.dtype|core.VarDesc.VarType, optional): The data type of the
+        dtype(str|np.dtype, optional): The data type of the
             output tensor. Supported data types: bool, float16, float32, float64,
             int32, int64. If ``dtype`` is None, the data type is the same as ``x``.
             Default is None.
@@ -358,7 +358,7 @@ def zeros_like(x, dtype=None, name=None):
     Args:
         x(Tensor): The input tensor which specifies shape and dtype. The
             dtype of ``x`` can be bool, float16, float32, float64, int32, int64.
-        dtype(str|np.dtype|core.VarDesc.VarType, optional): The data type of the
+        dtype(str|np.dtype, optional): The data type of the
             output tensor. Supported data types: bool, float16, float32, float64,
             int32, int64. If ``dtype`` is None, the data type is the same as ``x``.
             Default is None.
diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py
index fbd6197c1b92ee8481a1ce6f4a2cec8482eaefb0..32ccecbc6d9f0282b86f100e1b910667fab41cb2 100755
--- a/python/paddle/tensor/manipulation.py
+++ b/python/paddle/tensor/manipulation.py
@@ -1402,7 +1402,8 @@ def gather(x, index, axis=None, name=None):
         return _C_ops.gather(x, index, None, "axis", axis, "overwrite", False)
 
     check_variable_and_dtype(
-        x, 'x', ['float16', 'float32', 'float64', 'int32', 'int64', 'uint8'],
+        x, 'x',
+        ['float16', 'float32', 'float64', 'int16', 'int32', 'int64', 'uint8'],
         'gather')
     check_variable_and_dtype(index, 'index', ['int32', 'int64'], 'gather')
 
diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py
index ce29e9dce81809a9745d6efbe6da419878423e00..9a0139105651b53781f9c76189abb1b7d8ddefe9 100755
--- a/python/paddle/tensor/math.py
+++ b/python/paddle/tensor/math.py
@@ -27,7 +27,7 @@ from paddle.tensor import cast
 from paddle.tensor.attribute import _complex_to_real_dtype
 import paddle
 from paddle.static import Variable
-from ..framework import core
+from ..framework import core, _in_eager_mode
 from ..framework import _varbase_creator, convert_np_dtype_to_dtype_
 from ..fluid.layer_helper import LayerHelper
 from ..fluid.data_feeder import check_variable_and_dtype, check_type, check_dtype, convert_dtype
@@ -1083,6 +1083,8 @@ def trunc(input, name=None):
             #         [0., 0.]]))
     '''
     if paddle.in_dynamic_mode():
+        if _in_eager_mode():
+            return  _C_ops.final_state_trunc(input)
         return _C_ops.trunc(input)
     else:
         inputs = {"X": input}
@@ -2425,6 +2427,8 @@ def diagonal(x, offset=0, axis1=0, axis2=1, name=None):
             
     """
     if paddle.in_dynamic_mode():
+        if _in_eager_mode():
+            return _C_ops.final_state_diagonal(x, offset, axis1, axis2)
         return _C_ops.diagonal(x, 'offset', offset, 'axis1', axis1, 'axis2', axis2)
 
     def __check_input(input, offset, dim1, dim2):
@@ -3184,6 +3188,8 @@ def digamma(x, name=None):
     """
 
     if paddle.in_dynamic_mode():
+        if _in_eager_mode():
+            return _C_ops.final_state_digamma(x)
         return _C_ops.digamma(x)
 
     check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'digamma')
diff --git a/python/paddle/tensor/stat.py b/python/paddle/tensor/stat.py
index 468aa460486275f78d240bdee40b9d73f07dbcda..dd0da03e4fd2816bcd28ea76cc4a0712451c3e39 100644
--- a/python/paddle/tensor/stat.py
+++ b/python/paddle/tensor/stat.py
@@ -321,6 +321,9 @@ def median(x, axis=None, keepdim=False, name=None):
             paddle.slice(
                 tensor_topk, axes=[axis], starts=[kth], ends=[kth + 1]),
             dtype=dtype)
+    out_tensor = out_tensor + paddle.sum(
+        paddle.cast(
+            paddle.isnan(x), dtype=dtype) * x, axis=axis, keepdim=True)
     if not keepdim or is_flatten:
         if not is_flatten:
             newshape = x.shape[:axis] + x.shape[axis + 1:]
diff --git a/python/paddle/tests/test_pretrained_model.py b/python/paddle/tests/test_pretrained_model.py
index bbde64f2e609cd511ca17b3a3c3e8d11712c84a5..4441faee14e021ab4c6d45c6bad36653e79db37e 100644
--- a/python/paddle/tests/test_pretrained_model.py
+++ b/python/paddle/tests/test_pretrained_model.py
@@ -61,6 +61,8 @@ class TestPretrainedModel(unittest.TestCase):
         arches = [
             'mobilenet_v1',
             'mobilenet_v2',
+            'mobilenet_v3_small',
+            'mobilenet_v3_large',
             'squeezenet1_0',
             'shufflenet_v2_x0_25',
         ]
diff --git a/python/paddle/tests/test_vision_models.py b/python/paddle/tests/test_vision_models.py
index 547c53345995c28c0f4b4fbdfa9ae17477f09864..dc98fc3219bff6d2ae5e65a4ad21e4303baba8c7 100644
--- a/python/paddle/tests/test_vision_models.py
+++ b/python/paddle/tests/test_vision_models.py
@@ -40,6 +40,12 @@ class TestVisonModels(unittest.TestCase):
     def test_mobilenetv1(self):
         self.models_infer('mobilenet_v1')
 
+    def test_mobilenetv3_small(self):
+        self.models_infer('mobilenet_v3_small')
+
+    def test_mobilenetv3_large(self):
+        self.models_infer('mobilenet_v3_large')
+
     def test_vgg11(self):
         self.models_infer('vgg11')
 
diff --git a/python/paddle/utils/code_gen/api.yaml b/python/paddle/utils/code_gen/api.yaml
index 45a6aae5e6dddd89509c120ffbb67540669f796a..d24b64bf661778618a93b867a296b0b9c23c5fe8 100644
--- a/python/paddle/utils/code_gen/api.yaml
+++ b/python/paddle/utils/code_gen/api.yaml
@@ -121,10 +121,10 @@
   backward : matmul_grad
 
 - api : mean
-  args : (Tensor x, int64_t[] axis={}, bool keep_dim=false)
+  args : (Tensor x, int64[] axis={}, bool keep_dim=false)
   output : Tensor
   infer_meta :
-    func : MeanInferMeta
+    func : ReduceInferMeta
   kernel :
     func : mean
 
@@ -150,6 +150,15 @@
     func : reshape
   inplace : (x -> out)
 
+- api : relu
+  args : (Tensor x)
+  output : Tensor
+  infer_meta :
+    func : UnchangedInferMeta
+  kernel :
+    func : relu
+  inplace : (x -> out)
+
 - api : scale
   args : (Tensor x, Scalar scale, float bias, bool bias_after_scale)
   output : Tensor
@@ -158,6 +167,7 @@
     param : [x]
   kernel :
     func : scale, scale_sr
+  inplace : (x -> out)
 
 - api : sign
   args : (Tensor x)
@@ -167,6 +177,14 @@
   kernel :
     func : sign
 
+- api : softmax
+  args : (Tensor x, int axis)
+  output : Tensor
+  infer_meta :
+    func : SoftmaxInferMeta
+  kernel :
+    func : sotfmax
+
 - api : split
   args : (Tensor x, ScalarArray num_or_sections, Scalar axis)
   output : Tensor[]
@@ -181,7 +199,7 @@
     func : subtract
 
 - api : sum
-  args : (Tensor x, int64_t[] axis={}, DataType dtype=DataType::UNDEFINED, bool keep_dim=false)
+  args : (Tensor x, int64[] axis={}, DataType dtype=DataType::UNDEFINED, bool keep_dim=false)
   output : Tensor
   infer_meta :
     func : SumInferMeta
@@ -193,3 +211,58 @@
   args : (Tensor x, DataType dtype=DataType::UNDEFINED, Backend place=Backend::UNDEFINED)
   output : Tensor
   invoke : full_like(x, 0, dtype, place)
+
+
+- api : one_hot
+  args : (Tensor x, Scalar num_classes)
+  output : Tensor
+  infer_meta :
+    func : OneHotInferMeta
+  kernel :
+    func : one_hot
+    
+- api : digamma
+  args : (Tensor x)
+  output : Tensor
+  infer_meta :
+    func : UnchangedInferMeta
+  kernel :
+    func : digamma
+  backward : digamma_grad
+
+- api : abs
+  args : (Tensor x)
+  output : Tensor
+  infer_meta :
+    func : UnchangedInferMeta
+  kernel :
+    func : abs
+  backward : abs_grad
+
+- api : trunc
+  args : (Tensor x)
+  output : Tensor
+  infer_meta :
+    func : UnchangedInferMeta
+  kernel :
+    func : trunc
+  backward : trunc_grad
+
+# - api : norm
+#   args : (Tensor x, int axis, float epsilon, bool is_test)
+#   output : Tensor(out), Tensor(norm)
+#   infer_meta :
+#     func : NormInferMeta
+#   kernel :
+#     func : norm
+#   intermediate : norm
+#   backward : norm_grad
+
+- api : diagonal
+  args : (Tensor x, int offset, int axis1, int axis2)
+  output : Tensor
+  infer_meta :
+    func : DiagonalInferMeta
+  kernel :
+    func : diagonal
+  backward : diagonal_grad
diff --git a/python/paddle/utils/code_gen/api_base.py b/python/paddle/utils/code_gen/api_base.py
index cfd817c24c7367f69673353a8aaceeedec506e15..d91b76bb70314a2d516b8a384cf3406b7f9e4d0d 100644
--- a/python/paddle/utils/code_gen/api_base.py
+++ b/python/paddle/utils/code_gen/api_base.py
@@ -43,7 +43,9 @@ class BaseAPI(object):
             self.is_base_api = False
             self.invoke = api_item_yaml['invoke']
         else:
-            self.infer_meta = self.parse_infer_meta(api_item_yaml['infer_meta'])
+            if 'infer_meta' in api_item_yaml:
+                self.infer_meta = self.parse_infer_meta(api_item_yaml[
+                    'infer_meta'])
             self.kernel = self.parse_kernel(api_item_yaml['kernel'])
             self.support_selected_rows_kernel = False if len(self.kernel[
                 'func']) == 1 else True
@@ -87,18 +89,20 @@ class BaseAPI(object):
         attr_types_map = {
             'ScalarArray': 'const ScalarArray&',
             'Scalar': 'const Scalar&',
+            'uint8': 'uint8_t',
             'int': 'int',
-            'int32_t': 'int32_t',
-            'int64_t': 'int64_t',
+            'int32': 'int32_t',
+            'int64': 'int64_t',
             'long': 'long',
             'size_t': 'size_t',
             'float': 'float',
             'double': 'double',
             'bool': 'bool',
+            'str': 'const std::string&',
             'Backend': 'Backend',
             'DataLayout': 'DataLayout',
             'DataType': 'DataType',
-            'int64_t[]': 'const std::vector<int64_t>&',
+            'int64[]': 'const std::vector<int64_t>&',
             'int[]': 'const std::vector<int>&',
             'long[]': 'const std::vector<int64_t>&'
         }
@@ -108,8 +112,8 @@ class BaseAPI(object):
             'ScalarArray': 'const paddle::optional<ScalarArray>&',
             'Scalar': 'const paddle::optional<Scalar>&',
             'int': 'paddle::optional<int>',
-            'int32_t': 'paddle::optional<int32_t>',
-            'int64_t': 'paddle::optional<int64_t>',
+            'int32': 'paddle::optional<int32_t>',
+            'int64': 'paddle::optional<int64_t>',
             'size_t': 'paddle::optional<size_t>',
             'float': 'paddle::optional<float>',
             'double': 'paddle::optional<double>',
@@ -117,7 +121,7 @@ class BaseAPI(object):
             'Backend': 'paddle::optional<Backend>',
             'DataLayout': 'paddle::optional<DataLayout>',
             'DataType': 'paddle::optional<DataType>',
-            'int64_t[]': 'paddle::optional<std::vector<int64_t>>',
+            'int64[]': 'paddle::optional<std::vector<int64_t>>',
             'int[]': 'paddle::optional<std::vector<int>>'
         }
 
@@ -182,9 +186,9 @@ class BaseAPI(object):
                 'Tensor': 'Tensor',
                 'Tensor[]': 'std::vector<Tensor>'
             }
-            if re.search(r'\(\w*\)', output_item):
+            if re.search(r'\([a-zA-Z0-9_@]*\)', output_item):
                 result = re.search(
-                    r"(?P<out_type>[a-zA-Z0-9_[\]]+)\s*\((?P<name>\w+)\)",
+                    r"(?P<out_type>[a-zA-Z0-9_[\]]+)\s*\((?P<name>[a-zA-Z0-9_@]+)\)",
                     output_item)
                 out_type = result.group('out_type')
                 assert out_type in output_type_map, \
@@ -297,12 +301,12 @@ class BaseAPI(object):
 
     def gene_api_declaration(self):
         api_declaration = f"""
-PADDLE_API {self.outputs['return_type']} {self.get_api_func_name()}({self.args_str['args_declare']});
+PADDLE_API {self.gene_return_type_code()} {self.get_api_func_name()}({self.args_str['args_declare']});
 """
 
         if self.is_base_api and self.inplace_map is not None:
             api_declaration = api_declaration + f"""
-PADDLE_API {self.outputs['return_type']} {self.get_api_func_name() + '_'}({self.args_str['args_declare']});
+PADDLE_API {self.gene_return_type_code()} {self.get_api_func_name() + '_'}({self.args_str['args_declare']});
 """
 
         return api_declaration
@@ -456,7 +460,7 @@ PADDLE_API {self.outputs['return_type']} {self.get_api_func_name() + '_'}({self.
                 elif self.inputs['input_info'][
                         param] == "const std::vector<Tensor>&":
                     meta_tensor_code = meta_tensor_code + f"""
-{code_indent}  auto {param}_meta_vec = MakeMetaTensor(*{PREFIX_TENSOR_NAME}{param});
+{code_indent}  auto {param}_meta_vec = MakeMetaTensor({PREFIX_TENSOR_NAME}{param});
 {code_indent}  std::vector<phi::MetaTensor*> {param}_metas({param}_meta_vec.size());
 {code_indent}  for (size_t i = 0; i < {param}_meta_vec.size(); ++i) {{
 {code_indent}    {param}_metas[i] = &{param}_meta_vec[i];
@@ -499,11 +503,8 @@ PADDLE_API {self.outputs['return_type']} {self.get_api_func_name() + '_'}({self.
     def get_kernel_args(self, code_indent):
         input_trans_map = {
             'const Tensor&': 'const phi::DenseTensor&',
-            'const Tensor &': 'const phi::DenseTensor&',
             'const std::vector<Tensor>&':
-            'const std::vector<phi::DenseTensor>&',
-            'const std::vector<Tensor> &':
-            'const std::vector<phi::DenseTensor>&',
+            'const std::vector<const phi::DenseTensor*>&',
             'const paddle::optional<Tensor>&':
             'paddle::optional<const phi::DenseTensor&>',
             'const paddle::optional<std::vector<Tensor>>&':
@@ -540,9 +541,22 @@ PADDLE_API {self.outputs['return_type']} {self.get_api_func_name() + '_'}({self.
 {code_indent}  }}"""
 
                 else:
-                    input_tensor_code = input_tensor_code + f"""
+                    if self.inputs['input_info'][input_name] == "const Tensor&":
+                        input_tensor_code = input_tensor_code + f"""
 {code_indent}  auto {PREFIX_TENSOR_NAME}{input_name} = PrepareData({input_name}, kernel.InputAt({i}), {trans_flag});"""
 
+                    elif self.inputs['input_info'][
+                            input_name] == "const std::vector<Tensor>&":
+                        input_tensor_code = input_tensor_code + f"""
+{code_indent}  auto {PREFIX_TENSOR_NAME}{input_name}_vec = PrepareData({input_name}, kernel.InputAt({i}), {trans_flag});
+{code_indent}  std::vector<const phi::DenseTensor*> {PREFIX_TENSOR_NAME}{input_name}({PREFIX_TENSOR_NAME}{input_name}_vec->size());
+{code_indent}  for (size_t i = 0; i < {PREFIX_TENSOR_NAME}{input_name}.size(); ++i) {{
+{code_indent}    {PREFIX_TENSOR_NAME}{input_name}[i] = &{PREFIX_TENSOR_NAME}{input_name}_vec->at(i);
+{code_indent}  }}"""
+
+                    else:
+                        # do nothing
+                        pass
             else:
                 if input_name in self.optional_vars:
                     input_tensor_code = input_tensor_code + f"""
@@ -562,7 +576,14 @@ PADDLE_API {self.outputs['return_type']} {self.get_api_func_name() + '_'}({self.
                 if param in self.optional_vars:
                     kernel_args = kernel_args + PREFIX_TENSOR_NAME + param + ", "
                 else:
-                    kernel_args = kernel_args + "*" + PREFIX_TENSOR_NAME + param + ", "
+                    if self.inputs['input_info'][param] == "const Tensor&":
+                        kernel_args = kernel_args + "*" + PREFIX_TENSOR_NAME + param + ", "
+                    elif self.inputs['input_info'][
+                            input_name] == "const std::vector<Tensor>&":
+                        kernel_args = kernel_args + PREFIX_TENSOR_NAME + param + ", "
+                    else:
+                        # do nothing
+                        pass
                 kernel_in_type = input_trans_map[input_infos[param]]
                 kernel_args_type_list.append(kernel_in_type)
             elif param in attr_names:
@@ -592,7 +613,6 @@ PADDLE_API {self.outputs['return_type']} {self.get_api_func_name() + '_'}({self.
     def get_selected_rows_kernel_args(self, code_indent):
         input_trans_map = {
             'const Tensor&': 'const phi::SelectedRows&',
-            'const Tensor &': 'const phi::SelectedRows&',
             'const paddle::optional<Tensor>&':
             'paddle::optional<const phi::SelectedRows&>'
         }
@@ -655,6 +675,14 @@ PADDLE_API {self.outputs['return_type']} {self.get_api_func_name() + '_'}({self.
 
         return input_tensor_code, kernel_args[:-2], kernel_signature
 
+    # Override by child class
+    def gene_return_type_code(self):
+        return self.outputs['return_type']
+
+    # Override by child class
+    def gene_return_code(self):
+        return "api_output"
+
     # Override by child class
     def gene_output(self,
                     output_type_list,
@@ -668,6 +696,7 @@ PADDLE_API {self.outputs['return_type']} {self.get_api_func_name() + '_'}({self.
             code_indent)
         outputs_args, kernel_output_names, output_create = self.gene_output(
             self.outputs['types'], 'SetKernelOutput', code_indent, inplace_flag)
+        api_func_name = self.get_api_func_name() + ('_' if inplace_flag else '')
         return f"""
 {code_indent}  auto kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError(
 {code_indent}      "{self.kernel['func'][0]}", {{kernel_backend, kernel_layout, kernel_data_type}});
@@ -681,9 +710,12 @@ PADDLE_API {self.outputs['return_type']} {self.get_api_func_name() + '_'}({self.
 
 {code_indent}  using kernel_signature = {kernel_signature};
 {code_indent}  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
-{code_indent}  (*kernel_fn)({kernel_args}, {outputs_args});
+{code_indent}  {{
+{code_indent}    paddle::platform::RecordEvent kernel_record_event(\"{api_func_name} compute\", paddle::platform::TracerEventType::Operator, 1);
+{code_indent}    (*kernel_fn)({kernel_args}, {outputs_args});
+{code_indent}  }}
 
-{code_indent}  return out;"""
+{code_indent}  return {self.gene_return_code()};"""
 
     def gen_selected_rows_kernel_code(self, code_indent, inplace_flag=False):
         input_tensors, kernel_args, kernel_signature = self.get_selected_rows_kernel_args(
@@ -691,6 +723,7 @@ PADDLE_API {self.outputs['return_type']} {self.get_api_func_name() + '_'}({self.
         outputs_args, kernel_output_names, output_create = self.gene_output(
             self.outputs['types'], 'SetSelectedRowsKernelOutput', code_indent,
             inplace_flag)
+        api_func_name = self.get_api_func_name() + ('_' if inplace_flag else '')
         return f"""
 {code_indent}  auto kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError(
 {code_indent}      "{self.kernel['func'][1]}", {{kernel_backend, kernel_layout, kernel_data_type}});
@@ -704,14 +737,17 @@ PADDLE_API {self.outputs['return_type']} {self.get_api_func_name() + '_'}({self.
 
 {code_indent}  using kernel_signature = {kernel_signature};
 {code_indent}  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
-{code_indent}  (*kernel_fn)({kernel_args}, {outputs_args});
+{code_indent}  {{
+{code_indent}    paddle::platform::RecordEvent kernel_record_event(\"{api_func_name} compute\", paddle::platform::TracerEventType::Operator, 1);
+{code_indent}    (*kernel_fn)({kernel_args}, {outputs_args});
+{code_indent}  }}
 
-{code_indent}  return out;"""
+{code_indent}  return {self.gene_return_code()};"""
 
     def gene_base_api_code(self, inplace_flag=False):
         api_func_name = self.get_api_func_name() + ('_' if inplace_flag else '')
         api_code = f"""
-PADDLE_API {self.outputs['return_type']} {api_func_name}({self.args_str["args_define"]}) {{
+PADDLE_API {self.gene_return_type_code()} {api_func_name}({self.args_str["args_define"]}) {{
 {self.gene_kernel_select()}
 """
 
diff --git a/python/paddle/utils/code_gen/api_gen.py b/python/paddle/utils/code_gen/api_gen.py
index a26630ad04100fbebdb7c270b83912bb722040d4..98a3606952bbb13d3b20c55427b9747f1a4a5624 100644
--- a/python/paddle/utils/code_gen/api_gen.py
+++ b/python/paddle/utils/code_gen/api_gen.py
@@ -23,7 +23,8 @@ from api_base import BaseAPI
 class ForwardAPI(BaseAPI):
     def __init__(self, api_item_yaml):
         super(ForwardAPI, self).__init__(api_item_yaml)
-        self.is_dygraph_api = self.parse_intermediate(api_item_yaml)
+        self.is_dygraph_api, self.intermediate_outs = self.parse_intermediate(
+            api_item_yaml)
 
     def get_api_func_name(self):
         if self.is_dygraph_api:
@@ -33,15 +34,47 @@ class ForwardAPI(BaseAPI):
 
     def parse_intermediate(self, api_item_yaml):
         if 'intermediate' in api_item_yaml:
-            return True
+            intermediate_outs = [
+                item.strip()
+                for item in api_item_yaml['intermediate'].split(',')
+            ]
+            return True, intermediate_outs
         else:
-            return False
+            return False, []
 
     def get_return_type(self, out_type_list):
         return out_type_list[0] if len(
             out_type_list) == 1 else "std::tuple<" + ",".join(
                 out_type_list) + ">"
 
+    def gene_return_type_code(self):
+        if self.is_dygraph_api or len(self.intermediate_outs) == 0:
+            return self.outputs['return_type']
+        else:
+            return_out_list = []
+            for i, name in enumerate(self.outputs['names']):
+                if name not in self.intermediate_outs:
+                    return_out_list.append(self.outputs['types'][i])
+            return return_out_list[0] if len(
+                return_out_list) == 1 else "std::tuple<" + ",".join(
+                    return_out_list) + ">"
+
+    def gene_return_code(self):
+        if self.is_dygraph_api or len(self.intermediate_outs) == 0:
+            return "api_output"
+        else:
+            return_out_list = []
+            for i, name in enumerate(self.outputs['names']):
+                if name not in self.intermediate_outs:
+                    return_out_list.append(i)
+            if len(return_out_list) == 1:
+                return f"std::get<{return_out_list[0]}>(api_output)"
+            else:
+                selected_code = [
+                    f"std::get<{i}>(api_output)" for i in return_out_list
+                ]
+            return '{' + ", ".join(selected_code) + '}'
+
     def gene_output(self,
                     output_type_list,
                     set_out_func,
@@ -58,12 +91,12 @@ class ForwardAPI(BaseAPI):
                 0]] if inplace_flag and self.inplace_map is not None and self.outputs[
                     'names'][0] in self.inplace_map else ""
             output_create = f"""
-{code_indent}  {self.outputs['return_type']} out{inplace_assign};
-{code_indent}  auto kernel_out = {set_out_func}(kernel_backend, &out);"""
+{code_indent}  {self.outputs['return_type']} api_output{inplace_assign};
+{code_indent}  auto kernel_out = {set_out_func}(kernel_backend, &api_output);"""
 
         elif len(output_type_list) > 1:
             output_create = f"""
-{code_indent}  {self.outputs['return_type']} out;"""
+{code_indent}  {self.outputs['return_type']} api_output;"""
 
             for i in range(len(output_type_list)):
                 kernel_output = kernel_output + f'kernel_out_{i}, '
@@ -71,10 +104,10 @@ class ForwardAPI(BaseAPI):
                 if inplace_flag and self.inplace_map is not None and self.outputs[
                         'names'][i] in self.inplace_map:
                     output_create = output_create + f"""
-{code_indent}  std::get<{i}>(out) = {self.inplace_map[self.outputs['names'][i]]};"""
+{code_indent}  std::get<{i}>(api_output) = {self.inplace_map[self.outputs['names'][i]]};"""
 
                 output_create = output_create + f"""
-{code_indent}  auto kernel_out_{i} = {set_out_func}(kernel_backend, &std::get<{i}>(out));"""
+{code_indent}  auto kernel_out_{i} = {set_out_func}(kernel_backend, &std::get<{i}>(api_output));"""
 
             kernel_output = kernel_output[:-2]
         else:
@@ -105,7 +138,7 @@ def source_include(header_file_path):
 
 #include "paddle/phi/api/lib/api_custom_impl.h"
 #include "paddle/phi/api/lib/api_registry.h"
-#include "paddle/phi/api/lib/api_utils.h"
+#include "paddle/phi/api/lib/api_gen_utils.h"
 #include "paddle/phi/api/lib/data_transform.h"
 #include "paddle/phi/api/lib/kernel_dispatch.h"
 #include "paddle/phi/api/lib/utils/storage.h"
@@ -115,6 +148,8 @@ def source_include(header_file_path):
 #include "paddle/phi/infermeta/nullary.h"
 #include "paddle/phi/infermeta/unary.h"
 #include "paddle/phi/kernels/declarations.h"
+
+#include "paddle/fluid/platform/profiler/event_tracing.h"
 """
 
 
@@ -169,6 +204,10 @@ def generate_api(api_yaml_path, header_file_path, source_file_path,
         if foward_api.is_dygraph_api:
             dygraph_header_file.write(foward_api.gene_api_declaration())
             dygraph_source_file.write(foward_api.gene_api_code())
+
+            foward_api.is_dygraph_api = False
+            header_file.write(foward_api.gene_api_declaration())
+            source_file.write(foward_api.gene_api_code())
         else:
             header_file.write(foward_api.gene_api_declaration())
             source_file.write(foward_api.gene_api_code())
diff --git a/python/paddle/utils/code_gen/backward.yaml b/python/paddle/utils/code_gen/backward.yaml
index cdda5cb1f05e84f05f468837dae1f59116fa293f..c69bbf35b97263fb2c153839ac0105427a87e118 100644
--- a/python/paddle/utils/code_gen/backward.yaml
+++ b/python/paddle/utils/code_gen/backward.yaml
@@ -25,6 +25,61 @@
   output : Tensor(x_grad)
   invoke : scale(out_grad, scale, bias, bias_after_scale)
 
+- backward_api : digamma_grad
+  forward : digamma (Tensor x) -> Tensor(out)
+  args : (Tensor x, Tensor out_grad)
+  output : Tensor(x_grad)
+  infer_meta :
+    func : UnchangedInferMeta
+    param : [x]
+  kernel :
+    func : digamma_grad
+
+- backward_api : abs_grad
+  forward : abs (Tensor x) -> Tensor(out)
+  args : (Tensor x, Tensor out_grad)
+  output : Tensor(x_grad)
+  infer_meta :
+    func : UnchangedInferMeta
+    param : [x]
+  kernel :
+    func : abs_grad
+
+- backward_api : trunc_grad
+  forward : trunc (Tensor x) -> Tensor(out)
+  args : (Tensor out_grad)
+  output : Tensor(x_grad)
+  infer_meta :
+    func : UnchangedInferMeta
+    param : [out_grad]
+  kernel :
+    func : trunc_grad
+
+# - backward_api : norm_grad
+#   forward : norm (Tensor x, int axis, float epsilon, bool is_test) -> Tensor(out), Tensor(norm)
+#   args : (Tensor out_grad, Tensor x, Tensor norm, int axis, float epsilon, bool is_test)
+#   output : Tensor(x_grad)
+#   infer_meta :
+#     func : UnchangedInferMeta
+#     param : [x]
+#   kernel :
+#     func : norm_grad
+
+- backward_api : diagonal_grad
+  forward : diagonal (Tensor x, int offset, int axis1, int axis2) -> Tensor(out)
+  args : (Tensor x, Tensor out_grad, int offset = 0, int axis1 = 0, int axis2 = 1)
+  output : Tensor(x_grad)
+  infer_meta :
+    func : UnchangedInferMeta
+    param : [x]
+  kernel :
+    func : diagonal_grad
+
+# - backward_api : split_grad
+#   forward : split (Tensor x, ScalarArray num_or_sections, Scalar axis) -> Tensor[](out)
+#   args : (Tensor[] out_grad, Scalar axis)
+#   output : Tensor(x_grad)    
+#   invoke : concat( out_grad, axis)
 # TODO(zhangyunfei) The config of double grad and triple grad will be supported in the future.
 
 # - backward_api : matmul_triple_grad
diff --git a/python/paddle/utils/code_gen/backward_api_gen.py b/python/paddle/utils/code_gen/backward_api_gen.py
index 2d33cd5b1812ada8fca118c0e0f616cfbe511dd1..5506f71f4b671da282fb933436b3c17d4a47a8fb 100644
--- a/python/paddle/utils/code_gen/backward_api_gen.py
+++ b/python/paddle/utils/code_gen/backward_api_gen.py
@@ -35,6 +35,7 @@ class BackwardAPI(BaseAPI):
             forward_config)
         api = result.group('api')
         _, outputs, _ = self.parse_output(self.api, result.group('outputs'))
+        outputs = [item.split('@')[0] for item in outputs]
         fw_inputs, fw_attrs, _, = self.parse_input_and_attr(
             api, result.group('args'))
 
@@ -56,8 +57,9 @@ class BackwardAPI(BaseAPI):
 
         # check the attributes of backward
         for attr in self.attrs['names']:
-            assert attr in fw_attrs['names'] and self.attrs['attr_info'][attr][0] == fw_attrs['attr_info'][attr][0], \
-                f"{self.api} : Attribute error: The attribute({attr}) of backward isn't consistent with forward api. \
+            assert (attr in fw_attrs['names'] and self.attrs['attr_info'][attr][0] == fw_attrs['attr_info'][attr][0]) or \
+                 self.attrs['attr_info'][attr][1] is not None, \
+                f"{self.api} : Attribute error: The attribute({attr}) of backward isn't consistent with forward api or doesn't have default value. \
                  Please check the args of {self.api} in yaml."
 
         # check the output of backward
@@ -85,33 +87,33 @@ class BackwardAPI(BaseAPI):
                 0]] if inplace_flag and self.inplace_map is not None and self.outputs[
                     'names'][0] in self.inplace_map else ""
             output_create = f"""
-{code_indent}  {self.outputs['return_type']} out{inplace_assign};
-{code_indent}  auto kernel_out = {set_out_func}(kernel_backend, &out);"""
+{code_indent}  {self.outputs['return_type']} api_output{inplace_assign};
+{code_indent}  auto kernel_out = {set_out_func}(kernel_backend, &api_output);"""
 
         elif len(output_type_list) > 1:
             output_create = f"""
-{code_indent}  {self.outputs['return_type']} out({len(output_type_list)});"""
+{code_indent}  {self.outputs['return_type']} api_output({len(output_type_list)});"""
 
             for i, out_type_item in enumerate(output_type_list):
                 kernel_output = kernel_output + f'kernel_out_{i}, '
                 output_names.append(f'kernel_out_{i}')
                 if out_type_item == 'Tensor':
-                    get_out_code = f'&out[{i}][0]'
+                    get_out_code = f'&api_output[{i}][0]'
                     if inplace_flag and self.inplace_map is not None and self.outputs[
                             'names'][i] in self.inplace_map:
                         output_create = output_create + f"""
-{code_indent}  out[{i}].emplace_back({self.inplace_map[self.outputs['names'][i]]});"""
+{code_indent}  api_output[{i}].emplace_back({self.inplace_map[self.outputs['names'][i]]});"""
 
                     else:
                         output_create = output_create + f"""
-{code_indent}  out[{i}].emplace_back();"""
+{code_indent}  api_output[{i}].emplace_back();"""
 
                 else:
-                    get_out_code = f'&out[{i}]'
+                    get_out_code = f'&api_output[{i}]'
                     if inplace_flag and self.inplace_map is not None and self.outputs[
                             'names'][i] in self.inplace_map:
                         output_create = output_create + f"""
-{code_indent}  out[{i}] = {self.inplace_map[self.outputs['names'][i]]};"""
+{code_indent}  api_output[{i}] = {self.inplace_map[self.outputs['names'][i]]};"""
 
                 output_create = output_create + f"""
 {code_indent}  auto kernel_out_{i} = {set_out_func}(kernel_backend, {get_out_code});"""
@@ -145,13 +147,15 @@ def source_include(header_file_path):
 
 #include "paddle/phi/api/lib/api_custom_impl.h"
 #include "paddle/phi/api/lib/api_registry.h"
-#include "paddle/phi/api/lib/api_utils.h"
+#include "paddle/phi/api/lib/api_gen_utils.h"
 #include "paddle/phi/api/lib/data_transform.h"
 #include "paddle/phi/api/lib/kernel_dispatch.h"
 #include "paddle/phi/api/lib/utils/storage.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/api/include/api.h"
 #include "paddle/phi/infermeta/backward.h"
+
+#include "paddle/fluid/platform/profiler/event_tracing.h"
 """
 
 
diff --git a/python/paddle/utils/code_gen/sparse_api.yaml b/python/paddle/utils/code_gen/sparse_api.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..9c859022e8ad1d910ecf44426b2850496e793cee
--- /dev/null
+++ b/python/paddle/utils/code_gen/sparse_api.yaml
@@ -0,0 +1,21 @@
+- api : conv3d
+  args : (Tensor x, Tensor kernel, int[] paddings, int[] dilations, int[] strides, int groups, bool subm)
+  output : Tensor(out@SparseCooTensor), Tensor(rulebook@DenseTensor)
+  kernel :
+    func : sparse_conv3d
+    layout : x
+
+- api : to_dense
+  args : (Tensor x, Backend backend)
+  output : Tensor(out@DenseTensor)
+  invoke : to_dense_impl(x, backend)
+
+- api : to_sparse_coo
+  args : (Tensor x, Backend backend, int64 sparse_dim)
+  output : Tensor(out@SparseCooTensor)
+  invoke : to_sparse_coo_impl(x, backend, sparse_dim)
+
+- api : to_sparse_csr
+  args : (Tensor x, Backend backend)
+  output : Tensor(out@SparseCsrTensor)
+  invoke : to_sparse_csr_impl(x, backend)
diff --git a/python/paddle/utils/code_gen/sparse_api_gen.py b/python/paddle/utils/code_gen/sparse_api_gen.py
new file mode 100644
index 0000000000000000000000000000000000000000..dd22e16dc64f0a12c3575b6f2a0d2c21cd97955b
--- /dev/null
+++ b/python/paddle/utils/code_gen/sparse_api_gen.py
@@ -0,0 +1,273 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import yaml
+import argparse
+import re
+
+from api_gen import ForwardAPI
+
+
+class SparseAPI(ForwardAPI):
+    def __init__(self, api_item_yaml):
+        super(SparseAPI, self).__init__(api_item_yaml)
+
+    def get_api_func_name(self):
+        return self.api
+
+    def gene_api_declaration(self):
+        return f"""
+// {", ".join(self.outputs['names'])}
+PADDLE_API {self.outputs['return_type']} {self.get_api_func_name()}({self.args_str['args_declare']});
+"""
+
+    def get_kernel_tensor_out_type(self, output_name):
+        sparse_type = 'TensorType::DENSE_TENSOR'
+        if output_name.endswith('@SparseCooTensor'):
+            sparse_type = 'TensorType::SPARSE_COO'
+        elif output_name.endswith('@SparseCsrTensor'):
+            sparse_type = 'TensorType::SPARSE_CSR'
+        return sparse_type
+
+    def gene_output(self,
+                    output_type_list,
+                    set_out_func,
+                    code_indent,
+                    inplace_flag=False):
+        kernel_output = ""
+        output_names = []
+        output_create = ""
+
+        if len(output_type_list) == 1:
+            kernel_output = 'kernel_out'
+            output_names.append('kernel_out')
+            inplace_assign = " = " + self.inplace_map[self.outputs['names'][
+                0]] if inplace_flag and self.inplace_map is not None and self.outputs[
+                    'names'][0] in self.inplace_map else ""
+            output_create = f"""
+  {self.outputs['return_type']} api_output{inplace_assign};
+  auto* kernel_out = {set_out_func}(&api_output, {self.get_kernel_tensor_out_type(self.outputs['names'][0])});"""
+
+        elif len(output_type_list) > 1:
+            output_create = f"""
+  {self.outputs['return_type']} api_output;"""
+
+            for i in range(len(output_type_list)):
+                kernel_output = kernel_output + f'kernel_out_{i}, '
+                output_names.append(f'kernel_out_{i}')
+                if inplace_flag and self.inplace_map is not None and self.outputs[
+                        'names'][i] in self.inplace_map:
+                    output_create = output_create + f"""
+  std::get<{i}>(api_output) = {self.inplace_map[self.outputs['names'][i]]};"""
+
+                output_create = output_create + f"""
+  auto* kernel_out_{i} = {set_out_func}(&std::get<{i}>(api_output), {self.get_kernel_tensor_out_type(self.outputs['names'][i])});"""
+
+            kernel_output = kernel_output[:-2]
+        else:
+            raise ValueError(
+                "{} : Output error: the output should not be empty.".format(
+                    self.api))
+
+        return kernel_output, output_names, output_create
+
+    def gen_sparse_kernel_context(self, kernel_output_names):
+        input_trans_map = {
+            'const Tensor&': 'const phi::TenseBase&',
+            'const std::vector<Tensor>&': 'const std::vector<phi::TenseBase>&',
+            'const paddle::optional<Tensor>&':
+            'paddle::optional<const phi::TenseBase&>'
+        }
+        out_trans_map = {
+            'Tensor': 'phi::TenseBase*',
+            'std::vector<Tensor>': 'std::vector<phi::TenseBase*>'
+        }
+        input_names = self.inputs['names']
+        input_infos = self.inputs['input_info']
+
+        attr_names = self.attrs['names']
+        kernel_param = self.kernel['param']
+        if kernel_param is None:
+            kernel_param = input_names + attr_names
+
+        kernel_context_code = ""
+        for param in kernel_param:
+            if param in input_names:
+                if param in self.optional_vars:
+                    raise ValueError(
+                        f"{self.api} : Unsupport optional input({param}) for sparse api."
+                    )
+                else:
+                    kernel_context_code = kernel_context_code + f"""
+  kernel_context.EmplaceBackInput({param}.impl().get());"""
+
+                continue
+            if param in attr_names:
+                # set attr for kernel_context
+                if 'ScalarArray' in self.attrs['attr_info'][param][0]:
+                    param = 'phi::ScalarArray(' + param + ')'
+                elif 'Scalar' in self.attrs['attr_info'][param][0]:
+                    param = 'phi::Scalar(' + param + ')'
+            elif isinstance(param, bool):
+                param = str(param).lower()
+            else:
+                param + str(param) + ", "
+            kernel_context_code = kernel_context_code + f"""
+  kernel_context.EmplaceBackAttr({param});"""
+
+        for out_name in kernel_output_names:
+            kernel_context_code = kernel_context_code + f"""
+  kernel_context.EmplaceBackOutput({out_name});"""
+
+        return kernel_context_code
+
+    def gen_sparse_kernel_code(self, inplace_flag=False):
+        _, kernel_output_names, output_create = self.gene_output(
+            self.outputs['types'], 'SetSparseKernelOutput', '', inplace_flag)
+
+        kernel_context_code = self.gen_sparse_kernel_context(
+            kernel_output_names)
+
+        return f"""
+  auto phi_kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError(
+      "{self.kernel['func'][0]}", {{kernel_backend, kernel_layout, kernel_data_type}});
+  VLOG(6) << "{self.api} api sparse kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
+  VLOG(6) << "{self.api} api sparse kernel: " << phi_kernel;
+
+  auto* dev_ctx = GetDeviceContextByBackend(kernel_backend);
+  auto kernel_context = phi::KernelContext(dev_ctx);
+{output_create}
+{kernel_context_code}
+  phi_kernel(&kernel_context);
+
+  return api_output;"""
+
+    def gene_base_api_code(self, inplace_flag=False):
+        api_func_name = self.get_api_func_name()
+        return f"""
+PADDLE_API {self.outputs['return_type']} {api_func_name}({self.args_str["args_define"]}) {{
+{self.gene_kernel_select()}
+{self.gen_sparse_kernel_code(inplace_flag)}
+}}
+"""
+
+
+def header_include():
+    return """
+#include <tuple>
+
+#include "paddle/phi/api/include/tensor.h"
+#include "paddle/phi/common/scalar.h"
+#include "paddle/phi/common/scalar_array.h"
+#include "paddle/utils/optional.h"
+"""
+
+
+def source_include(header_file_path):
+    return f"""
+#include "{header_file_path}"
+#include <memory>
+
+#include "glog/logging.h"
+
+#include "paddle/phi/api/lib/api_registry.h"
+#include "paddle/phi/api/lib/api_gen_utils.h"
+#include "paddle/phi/api/lib/data_transform.h"
+#include "paddle/phi/api/lib/kernel_dispatch.h"
+#include "paddle/phi/api/lib/sparse_api_custom_impl.h"
+#include "paddle/phi/core/kernel_registry.h"
+"""
+
+
+def api_register():
+    return """
+PD_REGISTER_API(Test);
+"""
+
+
+def api_namespace():
+    return ("""
+namespace paddle {
+namespace experimental {
+namespace sparse {
+
+""", """
+
+}  // namespace sparse
+}  // namespace experimental
+}  // namespace paddle
+""")
+
+
+def generate_api(api_yaml_path, header_file_path, source_file_path):
+
+    with open(api_yaml_path, 'r') as f:
+        apis = yaml.load(f, Loader=yaml.FullLoader)
+    header_file = open(header_file_path, 'w')
+    source_file = open(source_file_path, 'w')
+
+    namespace = api_namespace()
+
+    header_file.write("#pragma once\n")
+    header_file.write(header_include())
+    header_file.write(namespace[0])
+
+    include_header_file = "paddle/phi/api/include/sparse_api.h"
+    source_file.write(source_include(include_header_file))
+    source_file.write(namespace[0])
+
+    for api in apis:
+        sparse_api = SparseAPI(api)
+        header_file.write(sparse_api.gene_api_declaration())
+        source_file.write(sparse_api.gene_api_code())
+
+    header_file.write(namespace[1])
+    source_file.write(namespace[1])
+
+    source_file.write(api_register())
+
+    header_file.close()
+    source_file.close()
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description='Generate PaddlePaddle C++ Sparse API files')
+    parser.add_argument(
+        '--api_yaml_path',
+        help='path to sparse api yaml file',
+        default='python/paddle/utils/code_gen/sparse_api.yaml')
+
+    parser.add_argument(
+        '--api_header_path',
+        help='output of generated api header code file',
+        default='paddle/phi/api/include/sparse_api.h')
+
+    parser.add_argument(
+        '--api_source_path',
+        help='output of generated api source code file',
+        default='paddle/phi/api/lib/sparse_api.cc')
+
+    options = parser.parse_args()
+
+    api_yaml_path = options.api_yaml_path
+    header_file_path = options.api_header_path
+    source_file_path = options.api_source_path
+
+    generate_api(api_yaml_path, header_file_path, source_file_path)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/python/paddle/utils/code_gen/sparse_bw_api.yaml b/python/paddle/utils/code_gen/sparse_bw_api.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..6532f103cbf86288ffc739656440dc378d48eb2d
--- /dev/null
+++ b/python/paddle/utils/code_gen/sparse_bw_api.yaml
@@ -0,0 +1,6 @@
+- backward_api : conv3d_grad
+  forward : conv3d (Tensor x, Tensor kernel, int[] paddings, int[] dilations, int[] strides, int groups, bool subm) -> Tensor(out@SparseCooTensor), Tensor(rulebook@DenseTensor)
+  args : (Tensor x, Tensor kernel, Tensor rulebook, Tensor out_grad, int[] paddings, int[] dilations, int[] strides, int groups, bool subm)
+  output : Tensor(x_grad@DenseTensor), Tensor(kernel_grad@DenseTensor)
+  kernel :
+    func : sparse_conv3d_grad
diff --git a/python/paddle/utils/code_gen/sparse_bw_api_gen.py b/python/paddle/utils/code_gen/sparse_bw_api_gen.py
new file mode 100644
index 0000000000000000000000000000000000000000..561e198a41b99c2362f5e14ac7fcd6da051c8875
--- /dev/null
+++ b/python/paddle/utils/code_gen/sparse_bw_api_gen.py
@@ -0,0 +1,196 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import yaml
+import argparse
+import re
+
+from sparse_api_gen import SparseAPI
+from backward_api_gen import BackwardAPI
+
+
+class SparseBackwardAPI(SparseAPI, BackwardAPI):
+    def __init__(self, bw_api_item_yaml):
+        BackwardAPI.__init__(self, bw_api_item_yaml)
+
+    def get_api_func_name(self):
+        return self.api
+
+    def get_return_type(self, out_type_list):
+        return BackwardAPI.get_return_type(self, out_type_list)
+
+    def gene_api_declaration(self):
+        return SparseAPI.gene_api_declaration(self)
+
+    def gene_output(self,
+                    output_type_list,
+                    set_out_func,
+                    code_indent,
+                    inplace_flag=False):
+        kernel_output = ""
+        output_names = []
+        output_create = ""
+
+        if len(output_type_list) == 1:
+            kernel_output = 'kernel_out'
+            output_names.append('kernel_out')
+            inplace_assign = " = " + self.inplace_map[self.outputs['names'][
+                0]] if inplace_flag and self.inplace_map is not None and self.outputs[
+                    'names'][0] in self.inplace_map else ""
+            output_create = f"""
+  {self.outputs['return_type']} api_output{inplace_assign};
+  auto kernel_out = {set_out_func}(&api_output, {self.get_kernel_tensor_out_type(self.outputs['names'][0])});"""
+
+        elif len(output_type_list) > 1:
+            output_create = f"""
+  {self.outputs['return_type']} api_output({len(output_type_list)});"""
+
+            for i, out_type_item in enumerate(output_type_list):
+                kernel_output = kernel_output + f'kernel_out_{i}, '
+                output_names.append(f'kernel_out_{i}')
+                if out_type_item == 'Tensor':
+                    get_out_code = f'&api_output[{i}][0]'
+                    if inplace_flag and self.inplace_map is not None and self.outputs[
+                            'names'][i] in self.inplace_map:
+                        output_create = output_create + f"""
+  api_output[{i}].emplace_back({self.inplace_map[self.outputs['names'][i]]});"""
+
+                    else:
+                        output_create = output_create + f"""
+  api_output[{i}].emplace_back();"""
+
+                else:
+                    get_out_code = f'&api_output[{i}]'
+                    if inplace_flag and self.inplace_map is not None and self.outputs[
+                            'names'][i] in self.inplace_map:
+                        output_create = output_create + f"""
+  api_output[{i}] = {self.inplace_map[self.outputs['names'][i]]};"""
+
+                output_create = output_create + f"""
+  auto kernel_out_{i} = {set_out_func}({get_out_code}, {self.get_kernel_tensor_out_type(self.outputs['names'][i])});"""
+
+            kernel_output = kernel_output[:-2]
+        else:
+            raise ValueError(
+                "{} : Output error: the output should not be empty.".format(
+                    self.api))
+
+        return kernel_output, output_names, output_create
+
+
+def header_include():
+    return """
+#include "paddle/phi/api/include/tensor.h"
+#include "paddle/phi/common/scalar.h"
+#include "paddle/phi/common/scalar_array.h"
+#include "paddle/utils/optional.h"
+"""
+
+
+def source_include(header_file_path):
+    return f"""
+#include "{header_file_path}"
+#include <memory>
+
+#include "glog/logging.h"
+
+#include "paddle/phi/api/lib/api_registry.h"
+#include "paddle/phi/api/lib/api_gen_utils.h"
+#include "paddle/phi/api/lib/kernel_dispatch.h"
+#include "paddle/phi/api/lib/sparse_api_custom_impl.h"
+#include "paddle/phi/core/kernel_registry.h"
+"""
+
+
+def api_register():
+    return """
+PD_REGISTER_API(Test);
+"""
+
+
+def api_namespace():
+    return ("""
+namespace paddle {
+namespace experimental {
+namespace sparse {
+
+""", """
+
+}  // namespace sparse
+}  // namespace experimental
+}  // namespace paddle
+""")
+
+
+def generate_api(api_yaml_path, header_file_path, source_file_path):
+
+    with open(api_yaml_path, 'r') as f:
+        apis = yaml.load(f, Loader=yaml.FullLoader)
+    header_file = open(header_file_path, 'w')
+    source_file = open(source_file_path, 'w')
+
+    namespace = api_namespace()
+
+    header_file.write("#pragma once\n")
+    header_file.write(header_include())
+    header_file.write(namespace[0])
+
+    include_header_file = "paddle/phi/api/backward/sparse_bw_api.h"
+    source_file.write(source_include(include_header_file))
+    source_file.write(namespace[0])
+
+    for api in apis:
+        sparse_bw_api = SparseBackwardAPI(api)
+        header_file.write(sparse_bw_api.gene_api_declaration())
+        source_file.write(sparse_bw_api.gene_api_code())
+
+    header_file.write(namespace[1])
+    source_file.write(namespace[1])
+
+    source_file.write(api_register())
+
+    header_file.close()
+    source_file.close()
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description='Generate PaddlePaddle C++ Sparse API files')
+    parser.add_argument(
+        '--api_yaml_path',
+        help='path to sparse api yaml file',
+        default='python/paddle/utils/code_gen/sparse_bw_api.yaml')
+
+    parser.add_argument(
+        '--api_header_path',
+        help='output of generated api header code file',
+        default='paddle/phi/api/backward/sparse_bw_api.h')
+
+    parser.add_argument(
+        '--api_source_path',
+        help='output of generated api source code file',
+        default='paddle/phi/api/lib/sparse_bw_api.cc')
+
+    options = parser.parse_args()
+
+    api_yaml_path = options.api_yaml_path
+    header_file_path = options.api_header_path
+    source_file_path = options.api_source_path
+
+    generate_api(api_yaml_path, header_file_path, source_file_path)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/python/paddle/utils/cpp_extension/extension_utils.py b/python/paddle/utils/cpp_extension/extension_utils.py
index 853a98a62b504d94617127bd35212d2412719e1c..b0a5d37a535df7e83c08f18e624402294bf29539 100644
--- a/python/paddle/utils/cpp_extension/extension_utils.py
+++ b/python/paddle/utils/cpp_extension/extension_utils.py
@@ -146,6 +146,9 @@ def custom_write_stub(resource, pyfile):
         import types
         import paddle
         
+        cur_dir = os.path.dirname(os.path.abspath(__file__))
+        so_path = os.path.join(cur_dir, "{resource}")
+        
         def inject_ext_module(module_name, api_names):
             if module_name in sys.modules:
                 return sys.modules[module_name]
@@ -157,9 +160,6 @@ def custom_write_stub(resource, pyfile):
             return new_module
 
         def __bootstrap__():
-            cur_dir = os.path.dirname(os.path.abspath(__file__))
-            so_path = os.path.join(cur_dir, "{resource}")
-
             assert os.path.exists(so_path)
 
             # load custom op shared library with abs path
@@ -169,6 +169,7 @@ def custom_write_stub(resource, pyfile):
         __bootstrap__()
 
         {custom_api}
+
         """).lstrip()
 
     # Parse registerring op information
@@ -900,7 +901,7 @@ def _generate_python_module(module_name,
     # delete the temp file before exit python process    
     atexit.register(lambda: remove_if_exit(api_file))
 
-    # write into .py file with RWLock
+    # write into .py file with RWLockc
     api_content = [_custom_api_content(op_name) for op_name in op_names]
     with open(api_file, 'w') as f:
         f.write('\n\n'.join(api_content))
@@ -911,13 +912,15 @@ def _generate_python_module(module_name,
 
 
 def _custom_api_content(op_name):
-    params_str, ins_str, attrs_str, outs_str = _get_api_inputs_str(op_name)
-
+    params_str, ins_str, attrs_str, outs_str, in_names, attrs_names = _get_api_inputs_str(
+        op_name)
+    lower_in_names = [p.split("@")[0].lower() for p in in_names]
     API_TEMPLATE = textwrap.dedent("""
-        from paddle.fluid.core import VarBase
-        from paddle.fluid.framework import in_dygraph_mode, _dygraph_tracer
+        import paddle.fluid.core as core
+        from paddle.fluid.core import VarBase, CustomOpKernelContext
+        from paddle.fluid.framework import in_dygraph_mode, _dygraph_tracer, _in_eager_mode
         from paddle.fluid.layer_helper import LayerHelper
-
+        
         def {op_name}({inputs}):
             # prepare inputs and outputs
             ins = {ins}
@@ -928,9 +931,20 @@ def _custom_api_content(op_name):
             # The output variable's dtype use default value 'float32',
             # and the actual dtype of output variable will be inferred in runtime.
             if in_dygraph_mode():
-                for out_name in out_names:
-                    outs[out_name] = VarBase()
-                _dygraph_tracer().trace_op(type="{op_name}", inputs=ins, outputs=outs, attrs=attrs)
+                if _in_eager_mode():
+                    ctx = CustomOpKernelContext()
+                    for i in {in_names}:
+                        ctx.add_inputs(i)
+                    for j in {attr_names}:
+                        ctx.add_attr(j)
+                    for out_name in out_names:
+                        outs[out_name] = core.eager.Tensor()
+                        ctx.add_outputs(outs[out_name])
+                    core.eager._run_custom_op(ctx, "{op_name}", True)
+                else:
+                    for out_name in out_names:
+                        outs[out_name] = VarBase()
+                    _dygraph_tracer().trace_op(type="{op_name}", inputs=ins, outputs=outs, attrs=attrs)
             else:
                 helper = LayerHelper("{op_name}", **locals())
                 for out_name in out_names:
@@ -949,6 +963,9 @@ def _custom_api_content(op_name):
         inputs=params_str,
         ins=ins_str,
         attrs=attrs_str,
+        # "[x, y, z]""
+        in_names="[" + ",".join(lower_in_names) + "]",
+        attr_names="[" + ",".join(attrs_names) + "]",
         out_names=outs_str)
 
     return api_content
@@ -996,7 +1013,7 @@ def _get_api_inputs_str(op_name):
     ])
     # e.g: ['Out', 'Index']
     outs_str = "[%s]" % ','.join(["'{}'".format(name) for name in out_names])
-    return params_str, ins_str, attrs_str, outs_str
+    return params_str, ins_str, attrs_str, outs_str, in_names, attr_names
 
 
 def _write_setup_file(name,
diff --git a/python/paddle/vision/__init__.py b/python/paddle/vision/__init__.py
index 37520175a719f7aedabf23e89b4436bb04469bd7..3749e0f64fc6a83433dd0533cc91736503a7495b 100644
--- a/python/paddle/vision/__init__.py
+++ b/python/paddle/vision/__init__.py
@@ -40,6 +40,10 @@ from .models import MobileNetV1  # noqa: F401
 from .models import mobilenet_v1  # noqa: F401
 from .models import MobileNetV2  # noqa: F401
 from .models import mobilenet_v2  # noqa: F401
+from .models import MobileNetV3Small  # noqa: F401
+from .models import MobileNetV3Large  # noqa: F401
+from .models import mobilenet_v3_small  # noqa: F401
+from .models import mobilenet_v3_large  # noqa: F401
 from .models import SqueezeNet  # noqa: F401
 from .models import squeezenet1_0  # noqa: F401
 from .models import squeezenet1_1  # noqa: F401
diff --git a/python/paddle/vision/models/__init__.py b/python/paddle/vision/models/__init__.py
index 044be6a42b7c28699747910bd610c86103a32d73..5ff3562e56ea8c2984d14cc9da00bdaccba9017b 100644
--- a/python/paddle/vision/models/__init__.py
+++ b/python/paddle/vision/models/__init__.py
@@ -24,6 +24,10 @@ from .mobilenetv1 import MobileNetV1  # noqa: F401
 from .mobilenetv1 import mobilenet_v1  # noqa: F401
 from .mobilenetv2 import MobileNetV2  # noqa: F401
 from .mobilenetv2 import mobilenet_v2  # noqa: F401
+from .mobilenetv3 import MobileNetV3Small  # noqa: F401
+from .mobilenetv3 import MobileNetV3Large  # noqa: F401
+from .mobilenetv3 import mobilenet_v3_small  # noqa: F401
+from .mobilenetv3 import mobilenet_v3_large  # noqa: F401
 from .vgg import VGG  # noqa: F401
 from .vgg import vgg11  # noqa: F401
 from .vgg import vgg13  # noqa: F401
@@ -79,6 +83,10 @@ __all__ = [ #noqa
     'mobilenet_v1',
     'MobileNetV2',
     'mobilenet_v2',
+    'MobileNetV3Small',
+    'MobileNetV3Large',
+    'mobilenet_v3_small',
+    'mobilenet_v3_large',
     'LeNet',
     'DenseNet',
     'densenet121',
diff --git a/python/paddle/vision/models/mobilenetv2.py b/python/paddle/vision/models/mobilenetv2.py
index 74071fc121688eafbf17833a6410b94d34191ec4..6c486037c7d30539eb7996f836028be2109413fd 100644
--- a/python/paddle/vision/models/mobilenetv2.py
+++ b/python/paddle/vision/models/mobilenetv2.py
@@ -12,14 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import numpy as np
 import paddle
-
 import paddle.nn as nn
-import paddle.nn.functional as F
-
 from paddle.utils.download import get_weights_path_from_url
 
+from .utils import _make_divisible
+
 __all__ = []
 
 model_urls = {
@@ -29,16 +27,6 @@ model_urls = {
 }
 
 
-def _make_divisible(v, divisor, min_value=None):
-    if min_value is None:
-        min_value = divisor
-    new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
-
-    if new_v < 0.9 * v:
-        new_v += divisor
-    return new_v
-
-
 class ConvBNReLU(nn.Sequential):
     def __init__(self,
                  in_planes,
diff --git a/python/paddle/vision/models/mobilenetv3.py b/python/paddle/vision/models/mobilenetv3.py
new file mode 100644
index 0000000000000000000000000000000000000000..da7ae010c58f6babbe5a949c642cdbcace4e951c
--- /dev/null
+++ b/python/paddle/vision/models/mobilenetv3.py
@@ -0,0 +1,445 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import paddle
+import paddle.nn as nn
+from paddle.utils.download import get_weights_path_from_url
+from functools import partial
+
+from .utils import _make_divisible
+from ..ops import ConvNormActivation
+
+__all__ = []
+
+model_urls = {
+    "mobilenet_v3_small_x1.0":
+    ("https://paddle-hapi.bj.bcebos.com/models/mobilenet_v3_small_x1.0.pdparams",
+     "34fe0e7c1f8b00b2b056ad6788d0590c"),
+    "mobilenet_v3_large_x1.0":
+    ("https://paddle-hapi.bj.bcebos.com/models/mobilenet_v3_large_x1.0.pdparams",
+     "118db5792b4e183b925d8e8e334db3df"),
+}
+
+
+class SqueezeExcitation(nn.Layer):
+    """
+    This block implements the Squeeze-and-Excitation block from https://arxiv.org/abs/1709.01507 (see Fig. 1).
+    Parameters ``activation``, and ``scale_activation`` correspond to ``delta`` and ``sigma`` in in eq. 3.
+    This code is based on the torchvision code with modifications.
+    You can also see at https://github.com/pytorch/vision/blob/main/torchvision/ops/misc.py#L127
+    Args:
+        input_channels (int): Number of channels in the input image
+        squeeze_channels (int): Number of squeeze channels
+        activation (Callable[..., paddle.nn.Layer], optional): ``delta`` activation. Default: ``paddle.nn.ReLU``
+        scale_activation (Callable[..., paddle.nn.Layer]): ``sigma`` activation. Default: ``paddle.nn.Sigmoid``
+    """
+
+    def __init__(self,
+                 input_channels,
+                 squeeze_channels,
+                 activation=nn.ReLU,
+                 scale_activation=nn.Sigmoid):
+        super().__init__()
+        self.avgpool = nn.AdaptiveAvgPool2D(1)
+        self.fc1 = nn.Conv2D(input_channels, squeeze_channels, 1)
+        self.fc2 = nn.Conv2D(squeeze_channels, input_channels, 1)
+        self.activation = activation()
+        self.scale_activation = scale_activation()
+
+    def _scale(self, input):
+        scale = self.avgpool(input)
+        scale = self.fc1(scale)
+        scale = self.activation(scale)
+        scale = self.fc2(scale)
+        return self.scale_activation(scale)
+
+    def forward(self, input):
+        scale = self._scale(input)
+        return scale * input
+
+
+class InvertedResidualConfig:
+    def __init__(self,
+                 in_channels,
+                 kernel,
+                 expanded_channels,
+                 out_channels,
+                 use_se,
+                 activation,
+                 stride,
+                 scale=1.0):
+        self.in_channels = self.adjust_channels(in_channels, scale=scale)
+        self.kernel = kernel
+        self.expanded_channels = self.adjust_channels(
+            expanded_channels, scale=scale)
+        self.out_channels = self.adjust_channels(out_channels, scale=scale)
+        self.use_se = use_se
+        if activation is None:
+            self.activation_layer = None
+        elif activation == "relu":
+            self.activation_layer = nn.ReLU
+        elif activation == "hardswish":
+            self.activation_layer = nn.Hardswish
+        else:
+            raise RuntimeError("The activation function is not supported: {}".
+                               format(activation))
+        self.stride = stride
+
+    @staticmethod
+    def adjust_channels(channels, scale=1.0):
+        return _make_divisible(channels * scale, 8)
+
+
+class InvertedResidual(nn.Layer):
+    def __init__(self, in_channels, expanded_channels, out_channels,
+                 filter_size, stride, use_se, activation_layer, norm_layer):
+        super().__init__()
+        self.use_res_connect = stride == 1 and in_channels == out_channels
+        self.use_se = use_se
+        self.expand = in_channels != expanded_channels
+
+        if self.expand:
+            self.expand_conv = ConvNormActivation(
+                in_channels=in_channels,
+                out_channels=expanded_channels,
+                kernel_size=1,
+                stride=1,
+                padding=0,
+                norm_layer=norm_layer,
+                activation_layer=activation_layer)
+
+        self.bottleneck_conv = ConvNormActivation(
+            in_channels=expanded_channels,
+            out_channels=expanded_channels,
+            kernel_size=filter_size,
+            stride=stride,
+            padding=int((filter_size - 1) // 2),
+            groups=expanded_channels,
+            norm_layer=norm_layer,
+            activation_layer=activation_layer)
+
+        if self.use_se:
+            self.mid_se = SqueezeExcitation(
+                expanded_channels,
+                _make_divisible(expanded_channels // 4),
+                scale_activation=nn.Hardsigmoid)
+
+        self.linear_conv = ConvNormActivation(
+            in_channels=expanded_channels,
+            out_channels=out_channels,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            norm_layer=norm_layer,
+            activation_layer=None)
+
+    def forward(self, x):
+        identity = x
+        if self.expand:
+            x = self.expand_conv(x)
+        x = self.bottleneck_conv(x)
+        if self.use_se:
+            x = self.mid_se(x)
+        x = self.linear_conv(x)
+        if self.use_res_connect:
+            x = paddle.add(identity, x)
+        return x
+
+
+class MobileNetV3(nn.Layer):
+    """MobileNetV3 model from
+    `"Searching for MobileNetV3" <https://arxiv.org/abs/1905.02244>`_.
+
+    Args:
+        config (list[InvertedResidualConfig]): MobileNetV3 depthwise blocks config.
+        last_channel (int): The number of channels on the penultimate layer.
+        scale (float, optional): Scale of channels in each layer. Default: 1.0.
+        num_classes (int, optional): Output dim of last fc layer. If num_classes <=0, last fc layer
+                            will not be defined. Default: 1000.
+        with_pool (bool, optional): Use pool before the last fc layer or not. Default: True.
+    """
+
+    def __init__(self,
+                 config,
+                 last_channel,
+                 scale=1.0,
+                 num_classes=1000,
+                 with_pool=True):
+        super().__init__()
+
+        self.config = config
+        self.scale = scale
+        self.last_channel = last_channel
+        self.num_classes = num_classes
+        self.with_pool = with_pool
+        self.firstconv_in_channels = config[0].in_channels
+        self.lastconv_in_channels = config[-1].in_channels
+        self.lastconv_out_channels = self.lastconv_in_channels * 6
+        norm_layer = partial(nn.BatchNorm2D, epsilon=0.001, momentum=0.99)
+
+        self.conv = ConvNormActivation(
+            in_channels=3,
+            out_channels=self.firstconv_in_channels,
+            kernel_size=3,
+            stride=2,
+            padding=1,
+            groups=1,
+            activation_layer=nn.Hardswish,
+            norm_layer=norm_layer)
+
+        self.blocks = nn.Sequential(*[
+            InvertedResidual(
+                in_channels=cfg.in_channels,
+                expanded_channels=cfg.expanded_channels,
+                out_channels=cfg.out_channels,
+                filter_size=cfg.kernel,
+                stride=cfg.stride,
+                use_se=cfg.use_se,
+                activation_layer=cfg.activation_layer,
+                norm_layer=norm_layer) for cfg in self.config
+        ])
+
+        self.lastconv = ConvNormActivation(
+            in_channels=self.lastconv_in_channels,
+            out_channels=self.lastconv_out_channels,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            groups=1,
+            norm_layer=norm_layer,
+            activation_layer=nn.Hardswish)
+
+        if with_pool:
+            self.avgpool = nn.AdaptiveAvgPool2D(1)
+
+        if num_classes > 0:
+            self.classifier = nn.Sequential(
+                nn.Linear(self.lastconv_out_channels, self.last_channel),
+                nn.Hardswish(),
+                nn.Dropout(p=0.2),
+                nn.Linear(self.last_channel, num_classes))
+
+    def forward(self, x):
+        x = self.conv(x)
+        x = self.blocks(x)
+        x = self.lastconv(x)
+
+        if self.with_pool:
+            x = self.avgpool(x)
+
+        if self.num_classes > 0:
+            x = paddle.flatten(x, 1)
+            x = self.classifier(x)
+
+        return x
+
+
+class MobileNetV3Small(MobileNetV3):
+    """MobileNetV3 Small architecture model from
+    `"Searching for MobileNetV3" <https://arxiv.org/abs/1905.02244>`_.
+
+    Args:
+        scale (float, optional): Scale of channels in each layer. Default: 1.0.
+        num_classes (int, optional): Output dim of last fc layer. If num_classes <=0, last fc layer
+                            will not be defined. Default: 1000.
+        with_pool (bool, optional): Use pool before the last fc layer or not. Default: True.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            from paddle.vision.models import MobileNetV3Small
+
+            # build model
+            model = MobileNetV3Small(scale=1.0)
+
+            x = paddle.rand([1, 3, 224, 224])
+            out = model(x)
+
+            print(out.shape)
+    """
+
+    def __init__(self, scale=1.0, num_classes=1000, with_pool=True):
+        config = [
+            InvertedResidualConfig(16, 3, 16, 16, True, "relu", 2, scale),
+            InvertedResidualConfig(16, 3, 72, 24, False, "relu", 2, scale),
+            InvertedResidualConfig(24, 3, 88, 24, False, "relu", 1, scale),
+            InvertedResidualConfig(24, 5, 96, 40, True, "hardswish", 2, scale),
+            InvertedResidualConfig(40, 5, 240, 40, True, "hardswish", 1, scale),
+            InvertedResidualConfig(40, 5, 240, 40, True, "hardswish", 1, scale),
+            InvertedResidualConfig(40, 5, 120, 48, True, "hardswish", 1, scale),
+            InvertedResidualConfig(48, 5, 144, 48, True, "hardswish", 1, scale),
+            InvertedResidualConfig(48, 5, 288, 96, True, "hardswish", 2, scale),
+            InvertedResidualConfig(96, 5, 576, 96, True, "hardswish", 1, scale),
+            InvertedResidualConfig(96, 5, 576, 96, True, "hardswish", 1, scale),
+        ]
+        last_channel = _make_divisible(1024 * scale, 8)
+        super().__init__(
+            config,
+            last_channel=last_channel,
+            scale=scale,
+            with_pool=with_pool,
+            num_classes=num_classes)
+
+
+class MobileNetV3Large(MobileNetV3):
+    """MobileNetV3 Large architecture model from
+    `"Searching for MobileNetV3" <https://arxiv.org/abs/1905.02244>`_.
+
+    Args:
+        scale (float, optional): Scale of channels in each layer. Default: 1.0.
+        num_classes (int, optional): Output dim of last fc layer. If num_classes <=0, last fc layer
+                            will not be defined. Default: 1000.
+        with_pool (bool, optional): Use pool before the last fc layer or not. Default: True.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            from paddle.vision.models import MobileNetV3Large
+
+            # build model
+            model = MobileNetV3Large(scale=1.0)
+
+            x = paddle.rand([1, 3, 224, 224])
+            out = model(x)
+
+            print(out.shape)
+    """
+
+    def __init__(self, scale=1.0, num_classes=1000, with_pool=True):
+        config = [
+            InvertedResidualConfig(16, 3, 16, 16, False, "relu", 1, scale),
+            InvertedResidualConfig(16, 3, 64, 24, False, "relu", 2, scale),
+            InvertedResidualConfig(24, 3, 72, 24, False, "relu", 1, scale),
+            InvertedResidualConfig(24, 5, 72, 40, True, "relu", 2, scale),
+            InvertedResidualConfig(40, 5, 120, 40, True, "relu", 1, scale),
+            InvertedResidualConfig(40, 5, 120, 40, True, "relu", 1, scale),
+            InvertedResidualConfig(40, 3, 240, 80, False, "hardswish", 2,
+                                   scale),
+            InvertedResidualConfig(80, 3, 200, 80, False, "hardswish", 1,
+                                   scale),
+            InvertedResidualConfig(80, 3, 184, 80, False, "hardswish", 1,
+                                   scale),
+            InvertedResidualConfig(80, 3, 184, 80, False, "hardswish", 1,
+                                   scale),
+            InvertedResidualConfig(80, 3, 480, 112, True, "hardswish", 1,
+                                   scale),
+            InvertedResidualConfig(112, 3, 672, 112, True, "hardswish", 1,
+                                   scale),
+            InvertedResidualConfig(112, 5, 672, 160, True, "hardswish", 2,
+                                   scale),
+            InvertedResidualConfig(160, 5, 960, 160, True, "hardswish", 1,
+                                   scale),
+            InvertedResidualConfig(160, 5, 960, 160, True, "hardswish", 1,
+                                   scale),
+        ]
+        last_channel = _make_divisible(1280 * scale, 8)
+        super().__init__(
+            config,
+            last_channel=last_channel,
+            scale=scale,
+            with_pool=with_pool,
+            num_classes=num_classes)
+
+
+def _mobilenet_v3(arch, pretrained=False, scale=1.0, **kwargs):
+    if arch == "mobilenet_v3_large":
+        model = MobileNetV3Large(scale=scale, **kwargs)
+    else:
+        model = MobileNetV3Small(scale=scale, **kwargs)
+    if pretrained:
+        arch = "{}_x{}".format(arch, scale)
+        assert (
+            arch in model_urls
+        ), "{} model do not have a pretrained model now, you should set pretrained=False".format(
+            arch)
+        weight_path = get_weights_path_from_url(model_urls[arch][0],
+                                                model_urls[arch][1])
+
+        param = paddle.load(weight_path)
+        model.set_dict(param)
+    return model
+
+
+def mobilenet_v3_small(pretrained=False, scale=1.0, **kwargs):
+    """MobileNetV3 Small architecture model from
+    `"Searching for MobileNetV3" <https://arxiv.org/abs/1905.02244>`_.
+
+    Args:
+        pretrained (bool): If True, returns a model pre-trained on ImageNet. Default: False.
+        scale (float, optional): Scale of channels in each layer. Default: 1.0.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            from paddle.vision.models import mobilenet_v3_small
+
+            # build model
+            model = mobilenet_v3_small()
+
+            # build model and load imagenet pretrained weight
+            # model = mobilenet_v3_small(pretrained=True)
+
+            # build mobilenet v3 small model with scale=0.5
+            model = mobilenet_v3_small(scale=0.5)
+
+            x = paddle.rand([1, 3, 224, 224])
+            out = model(x)
+
+            print(out.shape)
+
+    """
+    model = _mobilenet_v3(
+        "mobilenet_v3_small", scale=scale, pretrained=pretrained, **kwargs)
+    return model
+
+
+def mobilenet_v3_large(pretrained=False, scale=1.0, **kwargs):
+    """MobileNetV3 Large architecture model from
+    `"Searching for MobileNetV3" <https://arxiv.org/abs/1905.02244>`_.
+
+    Args:
+        pretrained (bool): If True, returns a model pre-trained on ImageNet. Default: False.
+        scale (float, optional): Scale of channels in each layer. Default: 1.0.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            from paddle.vision.models import mobilenet_v3_large
+
+            # build model
+            model = mobilenet_v3_large()
+
+            # build model and load imagenet pretrained weight
+            # model = mobilenet_v3_large(pretrained=True)
+
+            # build mobilenet v3 large model with scale=0.5
+            model = mobilenet_v3_large(scale=0.5)
+
+            x = paddle.rand([1, 3, 224, 224])
+            out = model(x)
+
+            print(out.shape)
+
+    """
+    model = _mobilenet_v3(
+        "mobilenet_v3_large", scale=scale, pretrained=pretrained, **kwargs)
+    return model
diff --git a/python/paddle/vision/models/utils.py b/python/paddle/vision/models/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..f61d0d601a44f5b47b5f36a1488a736f28df298c
--- /dev/null
+++ b/python/paddle/vision/models/utils.py
@@ -0,0 +1,32 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+def _make_divisible(v, divisor=8, min_value=None):
+    """
+    This function ensures that all layers have a channel number that is divisible by divisor
+    You can also see at https://github.com/keras-team/keras/blob/8ecef127f70db723c158dbe9ed3268b3d610ab55/keras/applications/mobilenet_v2.py#L505
+
+    Args:
+        divisor (int): The divisor for number of channels. Default: 8.
+        min_value (int, optional): The minimum value of number of channels, if it is None,
+                the default is divisor. Default: None.
+    """
+    if min_value is None:
+        min_value = divisor
+    new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
+    # Make sure that round down does not go down by more than 10%.
+    if new_v < 0.9 * v:
+        new_v += divisor
+    return new_v
diff --git a/python/paddle/vision/ops.py b/python/paddle/vision/ops.py
index 03060e92bdb69b1ec6022d887d01c514cb11b45d..b65bfa502c4dfefdc446d9661a6dc6dfbfa0872d 100644
--- a/python/paddle/vision/ops.py
+++ b/python/paddle/vision/ops.py
@@ -17,7 +17,7 @@ from ..fluid.layer_helper import LayerHelper
 from ..fluid.data_feeder import check_variable_and_dtype, check_type, check_dtype
 from ..fluid import core, layers
 from ..fluid.layers import nn, utils
-from ..nn import Layer
+from ..nn import Layer, Conv2D, Sequential, ReLU, BatchNorm2D
 from ..fluid.initializer import Normal
 
 from paddle.common_ops_import import *
@@ -195,7 +195,7 @@ def yolo_loss(x,
     """
 
     if in_dygraph_mode() and gt_score is None:
-        loss = _C_ops.yolov3_loss(
+        loss, _, _ = _C_ops.yolov3_loss(
             x, gt_box, gt_label, 'anchors', anchors, 'anchor_mask', anchor_mask,
             'class_num', class_num, 'ignore_thresh', ignore_thresh,
             'downsample_ratio', downsample_ratio, 'use_label_smooth',
@@ -1297,3 +1297,57 @@ class RoIAlign(Layer):
             output_size=self._output_size,
             spatial_scale=self._spatial_scale,
             aligned=aligned)
+
+
+class ConvNormActivation(Sequential):
+    """
+    Configurable block used for Convolution-Normalzation-Activation blocks.
+    This code is based on the torchvision code with modifications.
+    You can also see at https://github.com/pytorch/vision/blob/main/torchvision/ops/misc.py#L68
+    Args:
+        in_channels (int): Number of channels in the input image
+        out_channels (int): Number of channels produced by the Convolution-Normalzation-Activation block
+        kernel_size: (int, optional): Size of the convolving kernel. Default: 3
+        stride (int, optional): Stride of the convolution. Default: 1
+        padding (int, tuple or str, optional): Padding added to all four sides of the input. Default: None,
+            in wich case it will calculated as ``padding = (kernel_size - 1) // 2 * dilation``
+        groups (int, optional): Number of blocked connections from input channels to output channels. Default: 1
+        norm_layer (Callable[..., paddle.nn.Layer], optional): Norm layer that will be stacked on top of the convolutiuon layer.
+            If ``None`` this layer wont be used. Default: ``paddle.nn.BatchNorm2d``
+        activation_layer (Callable[..., paddle.nn.Layer], optional): Activation function which will be stacked on top of the normalization
+            layer (if not ``None``), otherwise on top of the conv layer. If ``None`` this layer wont be used. Default: ``paddle.nn.ReLU``
+        dilation (int): Spacing between kernel elements. Default: 1
+        bias (bool, optional): Whether to use bias in the convolution layer. By default, biases are included if ``norm_layer is None``.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size=3,
+                 stride=1,
+                 padding=None,
+                 groups=1,
+                 norm_layer=BatchNorm2D,
+                 activation_layer=ReLU,
+                 dilation=1,
+                 bias=None):
+        if padding is None:
+            padding = (kernel_size - 1) // 2 * dilation
+        if bias is None:
+            bias = norm_layer is None
+        layers = [
+            Conv2D(
+                in_channels,
+                out_channels,
+                kernel_size,
+                stride,
+                padding,
+                dilation=dilation,
+                groups=groups,
+                bias_attr=bias)
+        ]
+        if norm_layer is not None:
+            layers.append(norm_layer(out_channels))
+        if activation_layer is not None:
+            layers.append(activation_layer())
+        super().__init__(*layers)
diff --git a/python/setup.py.in b/python/setup.py.in
index ec1b1cbcb9510c80a42dff49fa1a5121a9cb487f..44998bd3e1675f2a3f77edd26c9cd8fa85121b6a 100755
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -280,6 +280,7 @@ packages=['paddle',
           'paddle.incubate.nn',
           'paddle.incubate.passes',
           'paddle.distribution',
+          'paddle.distributed.sharding',
           'paddle.distributed.fleet',
           'paddle.distributed.fleet.base',
           'paddle.distributed.fleet.elastic',
@@ -299,6 +300,7 @@ packages=['paddle',
           'paddle.distributed.fleet.meta_parallel.parallel_layers',
           'paddle.distributed.auto_parallel',
           'paddle.distributed.auto_parallel.operators',
+          'paddle.distributed.auto_parallel.tuner',
           'paddle.distributed.passes',
           'paddle.framework',
           'paddle.jit',
@@ -372,6 +374,7 @@ packages=['paddle',
           'paddle.device',
           'paddle.device.cuda',
           'paddle.version',
+          'paddle.profiler'
           ]
 
 with open('@PADDLE_SOURCE_DIR@/python/requirements.txt') as f:
@@ -503,6 +506,18 @@ if '${WITH_MKLDNN}' == 'ON':
     else:
         package_data['paddle.libs']+=['mkldnn.dll']
 
+if '${WITH_ONNXRUNTIME}' == 'ON':
+    shutil.copy('${ONNXRUNTIME_SHARED_LIB}', libs_path)
+    if os.name == 'nt':
+        shutil.copy('${PADDLE2ONNX_SHARED_LIB}', libs_path)
+        package_data['paddle.libs']+=['paddle2onnx.dll', 'onnxruntime.dll']
+    else:
+        shutil.copy('${PADDLE2ONNX_LIB}', libs_path)
+        if sys.platform == 'darwin':
+            package_data['paddle.libs']+=['libpaddle2onnx.dylib', 'libonnxruntime.1.10.0.dylib']
+        else:
+            package_data['paddle.libs']+=['libpaddle2onnx.so', 'libonnxruntime.so.1.10.0']
+
 if '${WITH_XPU}' == 'ON':
     # only change rpath in Release mode,
     if '${CMAKE_BUILD_TYPE}' == 'Release':
@@ -576,11 +591,14 @@ headers = (
     list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/phi/api/include')) +  # phi api
     list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/phi/common')) +  # phi common headers
     # phi level api headers (low level api)
-    list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/phi/core', recursive=True)) +  # phi core headers
+    list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/phi')) +  # phi extension header
+    list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/phi/include', recursive=True)) +  # phi include headers
     list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/phi/backends', recursive=True)) +  # phi backends headers
+    list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/phi/core', recursive=True)) +  # phi core headers
+    list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/phi/infermeta', recursive=True)) +  # phi infermeta headers
+    list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/phi/kernels', recursive=True)) +  # phi kernels headers
     # utila api headers
-    list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/utils', recursive=True)) +  # paddle utils headers
-    ['@PADDLE_SOURCE_DIR@/paddle/fluid/platform/device/device_ext.h'])
+    list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/utils', recursive=True)))  # paddle utils headers
 
 if '${WITH_MKLDNN}' == 'ON':
     headers += list(find_files('*', '${MKLDNN_INSTALL_DIR}/include')) # mkldnn
@@ -625,8 +643,6 @@ class InstallHeaders(Command):
         elif 'third_party' not in header:
             # paddle headers
             install_dir = re.sub('@PADDLE_SOURCE_DIR@/', '', header)
-            if 'device_ext.h' in header:
-                install_dir = "paddle/"
         else:
             # third_party
             install_dir = re.sub('${THIRD_PARTY_PATH}', 'third_party', header)
diff --git a/tools/check_file_diff_approvals.sh b/tools/check_file_diff_approvals.sh
index 55d2d59c7ece6a4639b1227f600a7d208a69f2e7..9c802a56a7b6e29bc89ad164a15f2f6d4749734e 100644
--- a/tools/check_file_diff_approvals.sh
+++ b/tools/check_file_diff_approvals.sh
@@ -198,7 +198,9 @@ if [ ${HAS_BOOST_GET} ] && [ "${GIT_PR_ID}" != "" ]; then
     check_approval 1 6836917 47554610 22561442
 fi
 
-HAS_LOG_FATAL=`git diff -U0 upstream/$BRANCH $FILTER |grep "^+" |grep -o -m 1 "LOG(FATAL)" || true`
+# infrt needs to temporarily use LOG(FATAL) during the debugging period, and will replace it with standard error format in the future.
+NO_INFRT_FILES=`git diff --name-only upstream/develop | grep -v "tools/\|paddle/infrt/" || true`
+HAS_LOG_FATAL=`git diff -U0 upstream/$BRANCH $NO_INFRT_FILES |grep "^+" |grep -o -m 1 "LOG(FATAL)" || true`
 if [ ${HAS_LOG_FATAL} ] && [ "${GIT_PR_ID}" != "" ]; then
     echo_line="LOG(FATAL) is not recommended, because it will throw exception without standard stack information, so please use PADDLE_THROW macro here. If you have to use LOG(FATAL) here, please request chenwhql (Recommend), luotao1 or lanxianghit review and approve.\n"
     check_approval 1 6836917 47554610 22561442
diff --git a/tools/document_preview.sh b/tools/document_preview.sh
index 83c758d0aa8b8faf35a21fe8b8b794745669b147..424169bbc512798c66c9422f1f92bf243caaae57 100755
--- a/tools/document_preview.sh
+++ b/tools/document_preview.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 # 
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -14,19 +14,155 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-PADDLE_ROOT=/home
-mkdir ${PADDLE_ROOT}
-cd ${PADDLE_ROOT}
-pip install /paddle/build/opt/paddle/share/wheels/*.whl
-git clone https://github.com/PaddlePaddle/FluidDoc
-git clone https://github.com/tianshuo78520a/PaddlePaddle.org.git
-cd  ${PADDLE_ROOT}/PaddlePaddle.org
-git reset 3feaa68376d8423e41d076814e901e6bf108c705
-cd ${PADDLE_ROOT}/FluidDoc/doc/fluid/api
-sh gen_doc.sh
-apt-get update && apt-get install -y python-dev build-essential
-cd ${PADDLE_ROOT}/PaddlePaddle.org/portal
-pip install -r requirements.txt
-#If the default port is not occupied, you can use port 8000, you need to replace it with a random port on the CI.
-sed -i "s#8000#$1#g" runserver
-nohup ./runserver --paddle ${PADDLE_ROOT}/FluidDoc &
+is_shell_attribute_set() { # attribute, like "x"
+  case "$-" in
+    *"$1"*) return 0 ;;
+    *)    return 1 ;;
+  esac
+}
+function get_docs_pr_num_from_paddle_pr_info(){
+    # get_repo_pr_info's output
+    pr_info_file=$1
+    if [ ! -r ${pr_info_file} ] ; then
+        return 1
+    fi
+
+    declare -A arr_kv
+    while read line
+    do
+        echo "$line" | grep '^\w\+\s*=\s*.*' > /dev/null
+        if [ $? = 0 ] ; then
+            kv=($(echo $line | sed 's/=/\n/g'))
+            k=($(echo "${kv[0]}" | sed 's/\s//g'))
+            v=($(echo "${kv[1]}" | sed 's/^\s*//g' | sed 's/\s*$//g'))
+            # arr_kv[${kv[1]}]=${kv[2]}
+            arr_kv[${k}]=${v}
+        fi
+    done < <(jq -r '.body' ${pr_info_file})
+
+    echo ${arr_kv[PADDLEDOCS_PR]}
+    return 0
+}
+
+# Attention:
+# 1. /FluidDoc will be used as the workspace of PaddlePaddle/docs. 
+# 2. And /docs is used as the output of doc-build process.
+# 3. If conflicted with yours, please modify the defination of FLUIDDOCDIR and
+#    OUTPUTDIR in the subsequent codes.
+# 4. The doc-build process is controlled under EnvVar BUILD_DOC and UPLOAD_DOC.
+#    All the Chinese and English docs will be generated, and then uploaded.
+
+PREVIEW_URL_PROMPT="ipipe_log_param_preview_url: None"
+BUILD_DOC=${BUILD_DOC:=false}
+UPLOAD_DOC=${UPLOAD_DOC:=false}
+
+CURPWD=${PWD}
+
+if [ -f /usr/local/python3.7.0/bin/sphinx-build ] ; then
+    if [ -f /usr/local/bin/sphinx-build ] ; then
+        rm /usr/local/bin/sphinx-build
+    fi
+    ln -s /usr/local/python3.7.0/bin/sphinx-build /usr/local/bin/sphinx-build
+fi
+
+if [ "${BUILD_DOC}" = "true" ] &&  [ -x /usr/local/bin/sphinx-build ] ; then
+    export FLUIDDOCDIR=${FLUIDDOCDIR:=/FluidDoc}
+    export OUTPUTDIR=${OUTPUTDIR:=/docs}
+    export VERSIONSTR=$(echo ${BRANCH} | sed 's@release/@@g')
+
+    if [ -d ${FLUIDDOCDIR} ] ; then
+        echo "${FLUIDDOCDIR} exists, git clone will be skipped, but git clean will be done."
+        cd ${FLUIDDOCDIR}
+        git reset --hard
+        git clean -dfx
+        cd ${CURPWD}
+    else
+        git clone -b ${BRANCH} --depth=1 https://github.com/PaddlePaddle/docs.git ${FLUIDDOCDIR}
+        if [ ! "$?" = "0" ] ; then
+            git clone --depth=1 https://github.com/PaddlePaddle/docs.git ${FLUIDDOCDIR}
+        fi
+    fi
+    if [ -d ${OUTPUTDIR} ] ; then
+        echo "$0: rm -rf ${OUTPUTDIR}"
+        rm -rf ${OUTPUTDIR}
+        mkdir -p ${OUTPUTDIR}
+    fi
+
+    # install requirements
+    export no_proxy=mirror.baidu.com,${no_proxy}
+    apt-get install -y --no-install-recommends doxygen jq
+    echo 'beautifulsoup4
+Markdown
+sphinx-sitemap
+sphinx-markdown-tables
+breathe
+exhale
+sphinx_design
+nbsphinx
+' >/tmp/doc-build.requirements && \
+    pip install --no-cache-dir -i https://mirror.baidu.com/pypi/simple -r /tmp/doc-build.requirements && \
+    rm /tmp/doc-build.requirements
+
+
+    source ${FLUIDDOCDIR}/ci_scripts/utils.sh
+    paddle_pr_info=$(get_repo_pr_info "PaddlePaddle/Paddle" ${GIT_PR_ID})
+    docs_pr_id=$(get_docs_pr_num_from_paddle_pr_info ${paddle_pr_info})
+    if [ -n "${docs_pr_id}" ] ; then
+        cd ${FLUIDDOCDIR}
+        git fetch --depth=1 origin pull/${docs_pr_id}/head
+        git checkout -b "pr${docs_pr_id}" FETCH_HEAD
+        git log --pretty=oneline -10
+    fi
+    echo "docs_pr_id=${docs_pr_id}"
+
+
+    # build doc
+    /bin/bash -x ${FLUIDDOCDIR}/ci_scripts/gendoc.sh
+    if [ $? -ne 0 ];then
+        echo 'gendoc error'
+        exit 1
+    fi
+
+    if [ "${UPLOAD_DOC}" = "true" ] ; then
+        curl -o /tmp/linux-bcecmd-0.3.0.zip https://sdk.bce.baidu.com/console-sdk/linux-bcecmd-0.3.0.zip && \
+        python -m zipfile -e /tmp/linux-bcecmd-0.3.0.zip /opt && \
+        chmod +x /opt/linux-bcecmd-0.3.0/bcecmd && \
+        rm /tmp/linux-bcecmd-0.3.0.zip && \
+        curl -o /tmp/boscmdconfig.tgz https://paddle-dev-tools-open.bj.bcebos.com/fluiddoc-preview/boscmdconfig.tgz && \
+        tar xzf /tmp/boscmdconfig.tgz -C /opt/linux-bcecmd-0.3.0/ && \
+        rm /tmp/boscmdconfig.tgz
+
+        # credentials file is empty, please build it if need.
+        BCECMD=/opt/linux-bcecmd-0.3.0/bcecmd
+        BCECMD_CONFIG=/opt/linux-bcecmd-0.3.0/boscmdconfig
+
+        is_shell_attribute_set x
+        xdebug_setted=$?
+        if [ $xdebug_setted ] ; then
+            set +x
+        fi
+        if [ -n "${BOS_CREDENTIAL_AK}" ] && [ -n "${BOS_CREDENTIAL_SK}" ] ; then
+            echo "Ak = ${BOS_CREDENTIAL_AK}" >> ${BCECMD_CONFIG}/credentials
+            echo "Sk = ${BOS_CREDENTIAL_SK}" >> ${BCECMD_CONFIG}/credentials
+        fi
+        if [ $xdebug_setted ] ; then
+            set -x
+        fi
+
+        PREVIEW_JOB_NAME="preview-paddle-pr-${GIT_PR_ID}"
+        BOSBUCKET=${BOSBUCKET:=paddle-site-web-dev}
+        ${BCECMD} --conf-path ${BCECMD_CONFIG} bos sync "${OUTPUTDIR}/en/${VERSIONSTR}" "bos:/${BOSBUCKET}/documentation/en/${PREVIEW_JOB_NAME}" \
+            --delete --yes --exclude "${OUTPUTDIR}/en/${VERSIONSTR}/_sources/"
+        ${BCECMD} --conf-path ${BCECMD_CONFIG} bos sync "${OUTPUTDIR}/en/${VERSIONSTR}" "bos:/${BOSBUCKET}/documentation/en/${PREVIEW_JOB_NAME}" \
+            --delete --yes --exclude "${OUTPUTDIR}/en/${VERSIONSTR}/_sources/"
+        ${BCECMD} --conf-path ${BCECMD_CONFIG} bos sync "${OUTPUTDIR}/zh/${VERSIONSTR}" "bos:/${BOSBUCKET}/documentation/zh/${PREVIEW_JOB_NAME}" \
+            --delete --yes --exclude "${OUTPUTDIR}/zh/${VERSIONSTR}/_sources/"
+        ${BCECMD} --conf-path ${BCECMD_CONFIG} bos sync "${OUTPUTDIR}/zh/${VERSIONSTR}" "bos:/${BOSBUCKET}/documentation/zh/${PREVIEW_JOB_NAME}" \
+            --delete --yes --exclude "${OUTPUTDIR}/zh/${VERSIONSTR}/_sources/"
+        PREVIEW_URL_PROMPT="ipipe_log_param_preview_url: http://${PREVIEW_JOB_NAME}.${PREVIEW_SITE:-paddle.run}/documentation/docs/zh/api/index_cn.html"
+    fi
+fi
+
+cd ${CURPWD}
+# print the preview url
+echo "${PREVIEW_URL_PROMPT}"
diff --git a/tools/infrt/custom_pdop.td b/tools/infrt/custom_pdop.td
index 83e2957831296932242e984f83110fba897e5c03..f754767259563f2cd64bac92adf76249b18af11f 100644
--- a/tools/infrt/custom_pdop.td
+++ b/tools/infrt/custom_pdop.td
@@ -1,4 +1,4 @@
-def PD_FeedOp : PD_Op<"feed"> {
+def PD_FeedOp : PD_Op<"feed", [NoSideEffect]> {
   let summary = "Feed Op";
 
   let description = [{
@@ -33,7 +33,7 @@ def PD_ReturnOp : PD_Op<"return", [Terminator]> {
   let arguments = (ins Variadic<PD_Tensor>:$inputs);
 }
 
-def PD_GraphOp : PD_Op<"graph", [SingleBlockImplicitTerminator<"ReturnOp">]> {
+def PD_GraphOp : PD_Op<"graph", [SingleBlockImplicitTerminator<"::infrt::ReturnOp">]> {
   let summary = "paddle graph Op";
   let description = [{
     Describe a paddle graph or subgraph.
diff --git a/tools/infrt/fake_models/multi_fc.py b/tools/infrt/fake_models/multi_fc.py
index 03cf6828cc7e19eedd7b2cd1375b93859e9f3cfa..0d633cfc60a9b6cddc669da0dbc87667f8211714 100644
--- a/tools/infrt/fake_models/multi_fc.py
+++ b/tools/infrt/fake_models/multi_fc.py
@@ -19,7 +19,6 @@ import sys, os
 import numpy as np
 import paddle
 import paddle.fluid as fluid
-from paddle.fluid.backward import append_backward
 
 size = 2
 num_layers = 4
diff --git a/tools/infrt/generate_phi_kernel_dialect.py b/tools/infrt/generate_phi_kernel_dialect.py
index 80cf3958b156d79b4b1253d198042c63458663b7..36561d4e71da8b669f1e06b0240a4d4b3b2ca92e 100644
--- a/tools/infrt/generate_phi_kernel_dialect.py
+++ b/tools/infrt/generate_phi_kernel_dialect.py
@@ -14,9 +14,16 @@
 
 import json
 import sys
-
-attr_type_converter = {"i": 'SI32Attr', "b": 'BoolAttr', "l": 'SI64Attr'}
-supported_kernels = ['sign', 'dot', 'digamma', 'conj']
+import os
+from get_compat_kernel_signature import get_compat_kernels_info
+
+#TODO @DannyIsFunny: more attr types need to be supported.
+attr_type_converter = {
+    "i": 'SI32Attr',
+    "b": 'BoolAttr',
+    "l": 'SI64Attr',
+    "f": 'F32Attr'
+}
 
 target_type_converter = {"CPU": "CPU", "GPU": "GPU"}
 layout_type_converter = {
@@ -39,39 +46,34 @@ precision_type_converter = {
     "bool": "BOOL"
 }
 
+kernel_types_info_file = "./kernels.json"
+kernel_signature_info_file = "./kernel_signature.json"
+
 
 def generate_kernel_name(op_name, place_str):
     [target_, layout_, precision_] = place_str[1:-1].split(',')
     target_ = target_type_converter[target_.strip()]
     layout_ = layout_type_converter[layout_.strip()]
     precision_ = precision_type_converter[precision_.strip()]
+    class_name_ = "{}{}".format(
+        op_name.replace("_", "").title(), "".join([
+            target_.strip().title(), precision_.strip(), layout_.strip().title()
+            .title()
+        ]))
     alias_ = "{}.{}".format(op_name, ".".join(
-        [target_.strip(), layout_.strip(), precision_.strip()]))
-    return alias_
+        [target_.strip(), precision_.strip(), layout_.strip()]))
+    return alias_, class_name_
 
 
 def generate_attrs_info(op_name, attrs_info):
-    kernel_attrs_names = {
-        'split': ['sections', 'num', 'axis', 'mkldnn_data_type'],
-        'sign': [],
-        'masked_select': [],
-        'trace': ['offset', 'axis1', 'axis2'],
-        'concat': ['axis'],
-        'empty': ['shape', 'dtype'],
-        'conj': [],
-        'norm': ['axis', 'epsilon', 'is_test'],
-        'histogram': ['bins', 'min', 'max'],
-        'dot': [],
-        'scale': ['scale', 'bias', 'bias_after_scale'],
-        'digamma': [],
-        'lerp': [],
-        'cast': ['out_dtype', 'in_dtype'],
-        'abs': []
-    }
+    kernel_attrs_names = {}
     attrs_args_ = ""
-    if len(kernel_attrs_names[op_name]) == len(attrs_info):
+    with open(kernel_signature_info_file) as f:
+        kernel_attrs_names = json.load(f)
+        kernel_attrs_names.update(get_compat_kernels_info())
+    if len(kernel_attrs_names[op_name]["attrs"]) == len(attrs_info):
         for index in range(len(attrs_info)):
-            attr_name = kernel_attrs_names[op_name][index]
+            attr_name = kernel_attrs_names[op_name]["attrs"][index]
             attr_type = attr_type_converter[attrs_info[index]]
             attrs_args_ += '{type_}:${name_},'.format(
                 type_=attr_type, name_=attr_name)
@@ -95,8 +97,12 @@ def generate_inputs_info(input_info):
 def generate_arguments_info(op_name, input_info, attr_info):
     input_args = generate_inputs_info(input_info)
     attr_args = generate_attrs_info(op_name, attr_info)
-    context_args = "CPU_Context:$dev_ctx"
-    argument_ = "{},{},{}".format(context_args, input_args, attr_args)
+    context_args = "Context:$dev_ctx"
+    argument_list = [context_args] + input_args.split(",") + attr_args.split(
+        ",")
+    while ("" in argument_list):
+        argument_list.remove("")
+    argument_ = ",".join(argument_list)
     return (("let arguments = (ins {});".format(argument_.strip(","))))
 
 
@@ -115,6 +121,10 @@ def generate_results_info(output_info):
 
 def generate_supported_kernel_list(load_dict):
     supported_kernels_list_ = []
+    kernel_attrs_names = {}
+    with open(kernel_signature_info_file) as f:
+        kernel_attrs_names = json.load(f)
+        kernel_attrs_names.update(get_compat_kernels_info())
     for op_name in load_dict:
         kernel_list = load_dict[op_name]
         for kernel_info in kernel_list:
@@ -124,13 +134,10 @@ def generate_supported_kernel_list(load_dict):
                 for attribute in attributes:
                     if attribute not in attr_type_converter:
                         flag = False
-                if flag:
+                if flag and op_name in kernel_attrs_names:
                     supported_kernels_list_.append(op_name)
-
-                alias_ = generate_kernel_dialect(op_name, kernel_alias_,
-                                                 kernel_info[kernel_alias_])
     supported_kernels_list_ = list(set(supported_kernels_list_))
-    print(supported_kernels_list_)
+    return supported_kernels_list_
 
 
 def scan_kernel_info(load_dict):
@@ -155,16 +162,14 @@ def scan_kernel_info(load_dict):
 
 def generate_cpu_kernel_dialect(op_name, kernel_alias_, kernel_info):
 
-    alias = generate_kernel_name(op_name, kernel_alias_)
+    alias, class_name = generate_kernel_name(op_name, kernel_alias_)
     summary = 'let summary = "{name}";'.format(name=alias)
     dialect_name = alias.split(".")
     dialect_name = dialect_name[0] + "." + dialect_name[2] + "." + dialect_name[
         3]
 
     header = 'def {kernel_name} : PDTCPU_Kernel<"{name}",[NoSideEffect]> {left_brace}'.format(
-        kernel_name=alias.replace(".", ""),
-        name=dialect_name.lower(),
-        left_brace="{")
+        kernel_name=class_name, name=dialect_name.lower(), left_brace="{")
 
     inputs_ = kernel_info["input"]
     attributes = kernel_info["attribute"]
@@ -184,16 +189,14 @@ def generate_cpu_kernel_dialect(op_name, kernel_alias_, kernel_info):
 
 def generate_gpu_kernel_dialect(op_name, kernel_alias_, kernel_info):
 
-    alias = generate_kernel_name(op_name, kernel_alias_)
+    alias, class_name = generate_kernel_name(op_name, kernel_alias_)
     summary = 'let summary = "{name}";'.format(name=alias)
     dialect_name = alias.split(".")
     dialect_name = dialect_name[0] + "." + dialect_name[2] + "." + dialect_name[
         3]
 
     header = 'def {kernel_name} : PDTGPU_Kernel<"{name}",[NoSideEffect]> {left_brace}'.format(
-        kernel_name=alias.replace(".", ""),
-        name=dialect_name.lower(),
-        left_brace="{")
+        kernel_name=class_name, name=dialect_name.lower(), left_brace="{")
     inputs_ = kernel_info["input"]
     attributes = kernel_info["attribute"]
     arguments = generate_arguments_info(op_name, inputs_, attributes)
@@ -235,14 +238,17 @@ def get_kernel_target(kernel_alias_):
     return target[0]
 
 
-def main(path_):
-    with open(path_, "r") as f:
+def main():
+    with open(kernel_types_info_file, "r") as f:
         load_dict = json.load(f)
 
         head = generate_dialect_head()
 
         cpu_registry_ = ""
         gpu_registry_ = ""
+        supported_kernels = generate_supported_kernel_list(load_dict)
+        print("Supported kernels:")
+        print(supported_kernels)
         for op_name in load_dict:
             if op_name not in supported_kernels:
                 continue
@@ -272,5 +278,12 @@ def main(path_):
 
 
 if __name__ == '__main__':
-    path = sys.argv[1]
-    main(path)
+    if not os.path.exists(kernel_types_info_file):
+        print("Error: '{file_name}' not exist!".format(
+            file_name=kernel_types_info_file))
+    if not os.path.exists(kernel_signature_info_file):
+        print("Error: '{file_name}' not exist!".format(
+            file_name=kernel_signature_info_file))
+    if os.path.exists(kernel_types_info_file) and os.path.exists(
+            kernel_signature_info_file):
+        main()
diff --git a/tools/infrt/get_compat_kernel_signature.py b/tools/infrt/get_compat_kernel_signature.py
new file mode 100644
index 0000000000000000000000000000000000000000..0680e87b38b3f6c29e7f813474d947598912437d
--- /dev/null
+++ b/tools/infrt/get_compat_kernel_signature.py
@@ -0,0 +1,82 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import re
+import json
+
+skip_list = []
+
+
+def parse_compat_registry(kernel_info):
+    name, inputs_str, attrs_str, outputs_str = kernel_info.split(",{")
+    kernel_info = {}
+    kernel_info["inputs"] = inputs_str[:-1].split(",")
+    kernel_info["attrs"] = attrs_str[:-1].split(",")
+    kernel_info["outputs"] = outputs_str[:-1].split(",")
+    return name, kernel_info
+
+
+def remove_grad_registry(kernels_registry):
+    clean_kernel_registry = {}
+    for registry in kernels_registry:
+        if (not "_grad" in registry):
+            clean_kernel_registry[registry] = kernels_registry[registry]
+    return clean_kernel_registry
+
+
+def get_compat_kernels_info():
+    kernels_info = {}
+    compat_files = os.listdir("../../paddle/phi/ops/compat")
+    for file_ in compat_files:
+        if not ".cc" in file_:
+            compat_files.remove(file_)
+
+    for file_ in compat_files:
+        if file_ in skip_list:
+            continue
+        with open("../../paddle/phi/ops/compat/" + file_) as in_file:
+            txt = in_file.readlines()
+            content = ""
+            registry = False
+            for line in txt:
+                if ("KernelSignature(" in line):
+                    content = ""
+                    registry = True
+                if (registry):
+                    content += line
+                if (registry and ";" in line):
+                    data = content.replace("\n", "").replace(
+                        " ",
+                        "").strip("return").strip("KernelSignature(").strip(
+                            "\);").replace("\"", "").replace("\\", "")
+                    registry = False
+                    name, registry_info = parse_compat_registry(data)
+
+                    if name in kernels_info:
+                        cur_reg = kernels_info[name]
+                        kernels_info[name]["inputs"] = list(
+                            set(registry_info["inputs"] + kernels_info[name][
+                                "inputs"]))
+                        kernels_info[name]["attrs"] = list(
+                            set(registry_info["attrs"] + kernels_info[name][
+                                "attrs"]))
+                        kernels_info[name]["outputs"] = list(
+                            set(registry_info["outputs"] + kernels_info[name][
+                                "outputs"]))
+                    else:
+                        kernels_info[name] = registry_info
+
+    compat_registry_ = remove_grad_registry(kernels_info)
+    return compat_registry_
diff --git a/tools/infrt/get_phi_kernel_info.py b/tools/infrt/get_phi_kernel_info.py
index 9ea3fef003054c2091553f6168299a450a94d8b2..774f6cd6bf3648a0de7a34e01e893d212bce9770 100644
--- a/tools/infrt/get_phi_kernel_info.py
+++ b/tools/infrt/get_phi_kernel_info.py
@@ -219,8 +219,8 @@ def gen_register_info(resources: List[List[str]]):
         for ir_dtype, origin_dtype in zip(ir_dtypes, origin_dtypes):
             kernel_func = gen_kernel_func(update_item[3], ctx_name,
                                           origin_dtype)
-            ir_name = 'phi_cpu.' + update_item[0].lower() + '.' + update_item[
-                2].lower() + '.' + ir_dtype
+            ir_name = 'phi_cpu.' + update_item[0].lower(
+            ) + '.' + ir_dtype + '.' + update_item[2].lower()
             res += f"""
   registry->AddKernel("{ir_name}","""
 
diff --git a/tools/sampcd_processor.py b/tools/sampcd_processor.py
index d8cb70c9dd107b076648bd7ae032bd7f3db573df..2d8692c5bc7e5c7389af5b07afebbc6de7d96ced 100644
--- a/tools/sampcd_processor.py
+++ b/tools/sampcd_processor.py
@@ -550,6 +550,42 @@ def get_incrementapi():
                 f.write('\n')
 
 
+def exec_gen_doc():
+    result = True
+    cmd = ["bash", "document_preview.sh"]
+    logger.info("----exec gen_doc----")
+    start_time = time.time()
+    subprc = subprocess.Popen(
+        cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+    output, error = subprc.communicate()
+    msg = "".join(output.decode(encoding='utf-8'))
+    err = "".join(error.decode(encoding='utf-8'))
+    end_time = time.time()
+
+    if subprc.returncode != 0:
+        logger.info("----gen_doc msg----")
+        logger.info(msg)
+        logger.error("----gen_doc error msg----")
+        logger.error(err)
+        logger.error("----exec gen_doc failed----")
+        result = False
+    else:
+        logger.info("----gen_doc msg----")
+        logger.info(msg)
+        logger.info("----exec gen_doc success----")
+
+    for fn in [
+            '/docs/en/develop/index_en.html', '/docs/zh/develop/index_cn.html'
+    ]:
+        if os.path.exists(fn):
+            logger.info('%s exists.', fn)
+        else:
+            logger.error('%s not exists.', fn)
+
+    # msg is the returned code execution report
+    return result, msg, end_time - start_time
+
+
 arguments = [
     # flags, dest, type, default, help
     ['--gpu_id', 'gpu_id', int, 0, 'GPU device id to use [0]'],
@@ -570,6 +606,11 @@ def parse_args():
     parser.add_argument('--debug', dest='debug', action="store_true")
     parser.add_argument('--full-test', dest='full_test', action="store_true")
     parser.add_argument('mode', type=str, help='run on device', default='cpu')
+    parser.add_argument(
+        '--build-doc',
+        dest='build_doc',
+        action='store_true',
+        help='build doc if need.')
     for item in arguments:
         parser.add_argument(
             item[0], dest=item[1], help=item[4], type=item[2], default=item[3])
@@ -702,3 +743,7 @@ if __name__ == '__main__':
             exit(1)
 
     logger.info("Sample code check is successful!")
+
+    if args.mode == "cpu":
+        # As cpu mode is also run with the GPU whl, so skip it in gpu mode.
+        exec_gen_doc()
diff --git a/tools/static_mode_white_list.py b/tools/static_mode_white_list.py
index 7356f0c8db02551c930e424571cf779f0c3dbc9c..365047f7e8382afa1646df2d4ff491471fa829c2 100755
--- a/tools/static_mode_white_list.py
+++ b/tools/static_mode_white_list.py
@@ -1,4 +1,5 @@
 # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2022 NVIDIA Corporation. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -729,4 +730,6 @@ STATIC_MODE_TESTING_LIST = [
     'test_lu_op',
     'test_margin_cross_entropy_op',
     'test_pull_gpups_sparse_op',
+    'test_fused_gemm_epilogue_op',
+    'test_fused_gemm_epilogue_grad_op',
 ]